* [dpdk-dev] [PATCH RFC] lib/librte_vhost: vhost-user @ 2014-11-15 1:14 Huawei Xie 2014-11-17 6:04 ` Tetsuya Mukawa ` (2 more replies) 0 siblings, 3 replies; 6+ messages in thread From: Huawei Xie @ 2014-11-15 1:14 UTC (permalink / raw) To: dev implement socket server fd event dispatch mechanism vhost sock message handling memory map for each region VHOST_USER_SET_VRING_KICK_FD as the indicator that vring is available VHOST_USER_GET_VRING_BASE as the message that vring should be released The message flow between vhost-user and vhost-cuse is kind of different, which makes virtio-net common message handler layer difficult and complicated to handle both cases in new_device/destroy_device/memory map/resource cleanup. Will only leave the most common message handling in virtio-net, and move the control logic to cuse/fuse layer. Signed-off-by: Huawei Xie <huawei.xie@intel.com> --- lib/librte_vhost/Makefile | 14 +- lib/librte_vhost/eventfd_link/eventfd_link.c | 27 +- lib/librte_vhost/eventfd_link/eventfd_link.h | 48 +- lib/librte_vhost/libvirt/qemu-wrap.py | 367 --------------- lib/librte_vhost/rte_virtio_net.h | 106 ++--- lib/librte_vhost/vhost-cuse/vhost-net-cdev.c | 436 ++++++++++++++++++ lib/librte_vhost/vhost-cuse/virtio-net-cdev.c | 314 +++++++++++++ lib/librte_vhost/vhost-cuse/virtio-net-cdev.h | 43 ++ lib/librte_vhost/vhost-net-cdev.c | 389 ---------------- lib/librte_vhost/vhost-net-cdev.h | 113 ----- lib/librte_vhost/vhost-user/fd_man.c | 158 +++++++ lib/librte_vhost/vhost-user/fd_man.h | 31 ++ lib/librte_vhost/vhost-user/vhost-net-user.c | 417 +++++++++++++++++ lib/librte_vhost/vhost-user/vhost-net-user.h | 74 +++ lib/librte_vhost/vhost-user/virtio-net-user.c | 208 +++++++++ lib/librte_vhost/vhost-user/virtio-net-user.h | 11 + lib/librte_vhost/vhost_rxtx.c | 625 ++++---------------- lib/librte_vhost/virtio-net.c | 450 ++++--------------- 18 files changed, 1939 insertions(+), 1892 deletions(-) delete mode 100755 lib/librte_vhost/libvirt/qemu-wrap.py create 
mode 100644 lib/librte_vhost/vhost-cuse/vhost-net-cdev.c create mode 100644 lib/librte_vhost/vhost-cuse/virtio-net-cdev.c create mode 100644 lib/librte_vhost/vhost-cuse/virtio-net-cdev.h delete mode 100644 lib/librte_vhost/vhost-net-cdev.c delete mode 100644 lib/librte_vhost/vhost-net-cdev.h create mode 100644 lib/librte_vhost/vhost-user/fd_man.c create mode 100644 lib/librte_vhost/vhost-user/fd_man.h create mode 100644 lib/librte_vhost/vhost-user/vhost-net-user.c create mode 100644 lib/librte_vhost/vhost-user/vhost-net-user.h create mode 100644 lib/librte_vhost/vhost-user/virtio-net-user.c create mode 100644 lib/librte_vhost/vhost-user/virtio-net-user.h diff --git a/lib/librte_vhost/Makefile b/lib/librte_vhost/Makefile index c008d64..cb4e172 100644 --- a/lib/librte_vhost/Makefile +++ b/lib/librte_vhost/Makefile @@ -34,17 +34,19 @@ include $(RTE_SDK)/mk/rte.vars.mk # library name LIB = librte_vhost.a -CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -O3 -D_FILE_OFFSET_BITS=64 -lfuse +CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -I. 
-I vhost-user -I vhost-cuse -O3 -D_FILE_OFFSET_BITS=64 -lfuse LDFLAGS += -lfuse # all source are stored in SRCS-y -SRCS-$(CONFIG_RTE_LIBRTE_VHOST) := vhost-net-cdev.c virtio-net.c vhost_rxtx.c +#SRCS-$(CONFIG_RTE_LIBRTE_VHOST) := vhost-cuse/vhost-net-cdev.c vhost-cuse/virtio-net-cdev.c + +SRCS-$(CONFIG_RTE_LIBRTE_VHOST) := vhost-user/fd_man.c vhost-user/vhost-net-user.c vhost-user/virtio-net-user.c + +SRCS-$(CONFIG_RTE_LIBRTE_VHOST) += virtio-net.c vhost_rxtx.c # install includes SYMLINK-$(CONFIG_RTE_LIBRTE_VHOST)-include += rte_virtio_net.h -# dependencies -DEPDIRS-$(CONFIG_RTE_LIBRTE_VHOST) += lib/librte_eal -DEPDIRS-$(CONFIG_RTE_LIBRTE_VHOST) += lib/librte_ether -DEPDIRS-$(CONFIG_RTE_LIBRTE_VHOST) += lib/librte_mbuf +# this lib needs eal +DEPDIRS-$(CONFIG_RTE_LIBRTE_VHOST) += lib/librte_eal lib/librte_mbuf include $(RTE_SDK)/mk/rte.lib.mk diff --git a/lib/librte_vhost/eventfd_link/eventfd_link.c b/lib/librte_vhost/eventfd_link/eventfd_link.c index 7755dd6..4c9b628 100644 --- a/lib/librte_vhost/eventfd_link/eventfd_link.c +++ b/lib/librte_vhost/eventfd_link/eventfd_link.c @@ -13,8 +13,7 @@ * General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + * along with this program; If not, see <http://www.gnu.org/licenses/>. * The full GNU General Public License is included in this distribution * in the file called LICENSE.GPL. 
* @@ -78,8 +77,7 @@ eventfd_link_ioctl(struct file *f, unsigned int ioctl, unsigned long arg) switch (ioctl) { case EVENTFD_COPY: - if (copy_from_user(&eventfd_copy, argp, - sizeof(struct eventfd_copy))) + if (copy_from_user(&eventfd_copy, argp, sizeof(struct eventfd_copy))) return -EFAULT; /* @@ -88,28 +86,28 @@ eventfd_link_ioctl(struct file *f, unsigned int ioctl, unsigned long arg) task_target = pid_task(find_vpid(eventfd_copy.target_pid), PIDTYPE_PID); if (task_target == NULL) { - pr_debug("Failed to get mem ctx for target pid\n"); + printk(KERN_DEBUG "Failed to get mem ctx for target pid\n"); return -EFAULT; } files = get_files_struct(current); if (files == NULL) { - pr_debug("Failed to get files struct\n"); + printk(KERN_DEBUG "Failed to get files struct\n"); return -EFAULT; } rcu_read_lock(); file = fcheck_files(files, eventfd_copy.source_fd); if (file) { - if (file->f_mode & FMODE_PATH || - !atomic_long_inc_not_zero(&file->f_count)) + if (file->f_mode & FMODE_PATH + || !atomic_long_inc_not_zero(&file->f_count)) file = NULL; } rcu_read_unlock(); put_files_struct(files); if (file == NULL) { - pr_debug("Failed to get file from source pid\n"); + printk(KERN_DEBUG "Failed to get file from source pid\n"); return 0; } @@ -128,25 +126,26 @@ eventfd_link_ioctl(struct file *f, unsigned int ioctl, unsigned long arg) files = get_files_struct(task_target); if (files == NULL) { - pr_debug("Failed to get files struct\n"); + printk(KERN_DEBUG "Failed to get files struct\n"); return -EFAULT; } rcu_read_lock(); file = fcheck_files(files, eventfd_copy.target_fd); if (file) { - if (file->f_mode & FMODE_PATH || - !atomic_long_inc_not_zero(&file->f_count)) - file = NULL; + if (file->f_mode & FMODE_PATH + || !atomic_long_inc_not_zero(&file->f_count)) + file = NULL; } rcu_read_unlock(); put_files_struct(files); if (file == NULL) { - pr_debug("Failed to get file from target pid\n"); + printk(KERN_DEBUG "Failed to get file from target pid\n"); return 0; } + /* * Install the file 
struct from the target process into the * file desciptor of the source process, diff --git a/lib/librte_vhost/eventfd_link/eventfd_link.h b/lib/librte_vhost/eventfd_link/eventfd_link.h index ea619ec..38052e2 100644 --- a/lib/librte_vhost/eventfd_link/eventfd_link.h +++ b/lib/librte_vhost/eventfd_link/eventfd_link.h @@ -1,7 +1,4 @@ /*- - * This file is provided under a dual BSD/GPLv2 license. When using or - * redistributing this file, you may do so under either license. - * * GPL LICENSE SUMMARY * * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. @@ -16,61 +13,28 @@ * General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + * along with this program; If not, see <http://www.gnu.org/licenses/>. * The full GNU General Public License is included in this distribution * in the file called LICENSE.GPL. * * Contact Information: * Intel Corporation - * - * BSD LICENSE - * - * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * */ #ifndef _EVENTFD_LINK_H_ #define _EVENTFD_LINK_H_ /* - * ioctl to copy an fd entry in calling process to an fd in a target process + * ioctl to copy an fd entry in calling process to an fd in a target process */ #define EVENTFD_COPY 1 /* - * arguements for the EVENTFD_COPY ioctl + * arguements for the EVENTFD_COPY ioctl */ struct eventfd_copy { - unsigned target_fd; /* fd in the target pid */ - unsigned source_fd; /* fd in the calling pid */ - pid_t target_pid; /* pid of the target pid */ + unsigned target_fd; /**< fd in the target pid */ + unsigned source_fd; /**< fd in the calling pid */ + pid_t target_pid; /**< pid of the target pid */ }; #endif /* _EVENTFD_LINK_H_ */ diff --git a/lib/librte_vhost/libvirt/qemu-wrap.py b/lib/librte_vhost/libvirt/qemu-wrap.py deleted file mode 100755 index e2d68a0..0000000 --- a/lib/librte_vhost/libvirt/qemu-wrap.py +++ /dev/null @@ -1,367 +0,0 @@ -#!/usr/bin/python -#/* -# * BSD LICENSE -# * -# * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. -# * All rights reserved. 
-# * -# * Redistribution and use in source and binary forms, with or without -# * modification, are permitted provided that the following conditions -# * are met: -# * -# * * Redistributions of source code must retain the above copyright -# * notice, this list of conditions and the following disclaimer. -# * * Redistributions in binary form must reproduce the above copyright -# * notice, this list of conditions and the following disclaimer in -# * the documentation and/or other materials provided with the -# * distribution. -# * * Neither the name of Intel Corporation nor the names of its -# * contributors may be used to endorse or promote products derived -# * from this software without specific prior written permission. -# * -# * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -# */ - -##################################################################### -# This script is designed to modify the call to the QEMU emulator -# to support userspace vhost when starting a guest machine through -# libvirt with vhost enabled. The steps to enable this are as follows -# and should be run as root: -# -# 1. 
Place this script in a libvirtd's binary search PATH ($PATH) -# A good location would be in the same directory that the QEMU -# binary is located -# -# 2. Ensure that the script has the same owner/group and file -# permissions as the QEMU binary -# -# 3. Update the VM xml file using "virsh edit VM.xml" -# -# 3.a) Set the VM to use the launch script -# -# Set the emulator path contained in the -# <emulator><emulator/> tags -# -# e.g replace <emulator>/usr/bin/qemu-kvm<emulator/> -# with <emulator>/usr/bin/qemu-wrap.py<emulator/> -# -# 3.b) Set the VM's device's to use vhost-net offload -# -# <interface type="network"> -# <model type="virtio"/> -# <driver name="vhost"/> -# <interface/> -# -# 4. Enable libvirt to access our userpace device file by adding it to -# controllers cgroup for libvirtd using the following steps -# -# 4.a) In /etc/libvirt/qemu.conf add/edit the following lines: -# 1) cgroup_controllers = [ ... "devices", ... ] -# 2) clear_emulator_capabilities = 0 -# 3) user = "root" -# 4) group = "root" -# 5) cgroup_device_acl = [ -# "/dev/null", "/dev/full", "/dev/zero", -# "/dev/random", "/dev/urandom", -# "/dev/ptmx", "/dev/kvm", "/dev/kqemu", -# "/dev/rtc", "/dev/hpet", "/dev/net/tun", -# "/dev/<devbase-name>-<index>", -# ] -# -# 4.b) Disable SELinux or set to permissive mode -# -# 4.c) Mount cgroup device controller -# "mkdir /dev/cgroup" -# "mount -t cgroup none /dev/cgroup -o devices" -# -# 4.d) Set hugetlbfs_mount variable - ( Optional ) -# VMs using userspace vhost must use hugepage backed -# memory. This can be enabled in the libvirt XML -# config by adding a memory backing section to the -# XML config e.g. -# <memoryBacking> -# <hugepages/> -# </memoryBacking> -# This memory backing section should be added after the -# <memory> and <currentMemory> sections. This will add -# flags "-mem-prealloc -mem-path <path>" to the QEMU -# command line. The hugetlbfs_mount variable can be used -# to override the default <path> passed through by libvirt. 
-# -# if "-mem-prealloc" or "-mem-path <path>" are not passed -# through and a vhost device is detected then these options will -# be automatically added by this script. This script will detect -# the system hugetlbfs mount point to be used for <path>. The -# default <path> for this script can be overidden by the -# hugetlbfs_dir variable in the configuration section of this script. -# -# -# 4.e) Restart the libvirtd system process -# e.g. on Fedora "systemctl restart libvirtd.service" -# -# -# 4.f) Edit the Configuration Parameters section of this script -# to point to the correct emulator location and set any -# addition options -# -# The script modifies the libvirtd Qemu call by modifying/adding -# options based on the configuration parameters below. -# NOTE: -# emul_path and us_vhost_path must be set -# All other parameters are optional -##################################################################### - - -############################################# -# Configuration Parameters -############################################# -#Path to QEMU binary -emul_path = "/usr/local/bin/qemu-system-x86_64" - -#Path to userspace vhost device file -# This filename should match the --dev-basename --dev-index parameters of -# the command used to launch the userspace vhost sample application e.g. -# if the sample app lauch command is: -# ./build/vhost-switch ..... --dev-basename usvhost --dev-index 1 -# then this variable should be set to: -# us_vhost_path = "/dev/usvhost-1" -us_vhost_path = "/dev/usvhost-1" - -#List of additional user defined emulation options. These options will -#be added to all Qemu calls -emul_opts_user = [] - -#List of additional user defined emulation options for vhost only. 
-#These options will only be added to vhost enabled guests -emul_opts_user_vhost = [] - -#For all VHOST enabled VMs, the VM memory is preallocated from hugetlbfs -# Set this variable to one to enable this option for all VMs -use_huge_all = 0 - -#Instead of autodetecting, override the hugetlbfs directory by setting -#this variable -hugetlbfs_dir = "" - -############################################# - - -############################################# -# ****** Do Not Modify Below this Line ****** -############################################# - -import sys, os, subprocess - - -#List of open userspace vhost file descriptors -fd_list = [] - -#additional virtio device flags when using userspace vhost -vhost_flags = [ "csum=off", - "gso=off", - "guest_tso4=off", - "guest_tso6=off", - "guest_ecn=off" - ] - - -############################################# -# Find the system hugefile mount point. -# Note: -# if multiple hugetlbfs mount points exist -# then the first one found will be used -############################################# -def find_huge_mount(): - - if (len(hugetlbfs_dir)): - return hugetlbfs_dir - - huge_mount = "" - - if (os.access("/proc/mounts", os.F_OK)): - f = open("/proc/mounts", "r") - line = f.readline() - while line: - line_split = line.split(" ") - if line_split[2] == 'hugetlbfs': - huge_mount = line_split[1] - break - line = f.readline() - else: - print "/proc/mounts not found" - exit (1) - - f.close - if len(huge_mount) == 0: - print "Failed to find hugetlbfs mount point" - exit (1) - - return huge_mount - - -############################################# -# Get a userspace Vhost file descriptor -############################################# -def get_vhost_fd(): - - if (os.access(us_vhost_path, os.F_OK)): - fd = os.open( us_vhost_path, os.O_RDWR) - else: - print ("US-Vhost file %s not found" %us_vhost_path) - exit (1) - - return fd - - -############################################# -# Check for vhostfd. 
if found then replace -# with our own vhost fd and append any vhost -# flags onto the end -############################################# -def modify_netdev_arg(arg): - - global fd_list - vhost_in_use = 0 - s = '' - new_opts = [] - netdev_opts = arg.split(",") - - for opt in netdev_opts: - #check if vhost is used - if "vhost" == opt[:5]: - vhost_in_use = 1 - else: - new_opts.append(opt) - - #if using vhost append vhost options - if vhost_in_use == 1: - #append vhost on option - new_opts.append('vhost=on') - #append vhostfd ption - new_fd = get_vhost_fd() - new_opts.append('vhostfd=' + str(new_fd)) - fd_list.append(new_fd) - - #concatenate all options - for opt in new_opts: - if len(s) > 0: - s+=',' - - s+=opt - - return s - - -############################################# -# Main -############################################# -def main(): - - global fd_list - global vhost_in_use - new_args = [] - num_cmd_args = len(sys.argv) - emul_call = '' - mem_prealloc_set = 0 - mem_path_set = 0 - num = 0; - - #parse the parameters - while (num < num_cmd_args): - arg = sys.argv[num] - - #Check netdev +1 parameter for vhostfd - if arg == '-netdev': - num_vhost_devs = len(fd_list) - new_args.append(arg) - - num+=1 - arg = sys.argv[num] - mod_arg = modify_netdev_arg(arg) - new_args.append(mod_arg) - - #append vhost flags if this is a vhost device - # and -device is the next arg - # i.e -device -opt1,-opt2,...,-opt3,%vhost - if (num_vhost_devs < len(fd_list)): - num+=1 - arg = sys.argv[num] - if arg == '-device': - new_args.append(arg) - num+=1 - new_arg = sys.argv[num] - for flag in vhost_flags: - new_arg = ''.join([new_arg,',',flag]) - new_args.append(new_arg) - else: - new_args.append(arg) - elif arg == '-mem-prealloc': - mem_prealloc_set = 1 - new_args.append(arg) - elif arg == '-mem-path': - mem_path_set = 1 - new_args.append(arg) - - else: - new_args.append(arg) - - num+=1 - - #Set Qemu binary location - emul_call+=emul_path - emul_call+=" " - - #Add prealloc mem options if 
using vhost and not already added - if ((len(fd_list) > 0) and (mem_prealloc_set == 0)): - emul_call += "-mem-prealloc " - - #Add mempath mem options if using vhost and not already added - if ((len(fd_list) > 0) and (mem_path_set == 0)): - #Detect and add hugetlbfs mount point - mp = find_huge_mount() - mp = "".join(["-mem-path ", mp]) - emul_call += mp - emul_call += " " - - - #add user options - for opt in emul_opts_user: - emul_call += opt - emul_call += " " - - #Add add user vhost only options - if len(fd_list) > 0: - for opt in emul_opts_user_vhost: - emul_call += opt - emul_call += " " - - #Add updated libvirt options - iter_args = iter(new_args) - #skip 1st arg i.e. call to this script - next(iter_args) - for arg in iter_args: - emul_call+=str(arg) - emul_call+= " " - - #Call QEMU - subprocess.call(emul_call, shell=True) - - - #Close usvhost files - for fd in fd_list: - os.close(fd) - - -if __name__ == "__main__": - main() - diff --git a/lib/librte_vhost/rte_virtio_net.h b/lib/librte_vhost/rte_virtio_net.h index 00b1328..7a05dab 100644 --- a/lib/librte_vhost/rte_virtio_net.h +++ b/lib/librte_vhost/rte_virtio_net.h @@ -34,11 +34,6 @@ #ifndef _VIRTIO_NET_H_ #define _VIRTIO_NET_H_ -/** - * @file - * Interface to vhost net - */ - #include <stdint.h> #include <linux/virtio_ring.h> #include <linux/virtio_net.h> @@ -48,66 +43,38 @@ #include <rte_mempool.h> #include <rte_mbuf.h> -/* Used to indicate that the device is running on a data core */ -#define VIRTIO_DEV_RUNNING 1 - -/* Backend value set by guest. */ -#define VIRTIO_DEV_STOPPED -1 - +#define VIRTIO_DEV_RUNNING 1 /**< Used to indicate that the device is running on a data core. */ +#define VIRTIO_DEV_STOPPED -1 /**< Backend value set by guest. */ /* Enum for virtqueue management. */ enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM}; -#define BUF_VECTOR_MAX 256 - -/** - * Structure contains buffer address, length and descriptor index - * from vring to do scatter RX. 
- */ -struct buf_vector { - uint64_t buf_addr; - uint32_t buf_len; - uint32_t desc_idx; -}; - /** * Structure contains variables relevant to RX/TX virtqueues. */ struct vhost_virtqueue { - struct vring_desc *desc; /**< Virtqueue descriptor ring. */ - struct vring_avail *avail; /**< Virtqueue available ring. */ - struct vring_used *used; /**< Virtqueue used ring. */ - uint32_t size; /**< Size of descriptor ring. */ - uint32_t backend; /**< Backend value to determine if device should started/stopped. */ - uint16_t vhost_hlen; /**< Vhost header length (varies depending on RX merge buffers. */ - volatile uint16_t last_used_idx; /**< Last index used on the available ring */ - volatile uint16_t last_used_idx_res; /**< Used for multiple devices reserving buffers. */ - eventfd_t callfd; /**< Currently unused as polling mode is enabled. */ - eventfd_t kickfd; /**< Used to notify the guest (trigger interrupt). */ - struct buf_vector buf_vec[BUF_VECTOR_MAX]; /**< for scatter RX. */ -} __rte_cache_aligned; - -/** - * Device structure contains all configuration information relating to the device. - */ -struct virtio_net { - struct vhost_virtqueue *virtqueue[VIRTIO_QNUM]; /**< Contains all virtqueue information. */ - struct virtio_memory *mem; /**< QEMU memory and memory region information. */ - uint64_t features; /**< Negotiated feature set. */ - uint64_t device_fh; /**< device identifier. */ - uint32_t flags; /**< Device flags. Only used to check if device is running on data core. */ - void *priv; /**< private context */ + struct vring_desc *desc; /**< descriptor ring. */ + struct vring_avail *avail; /**< available ring. */ + struct vring_used *used; /**< used ring. */ + uint32_t size; /**< Size of descriptor ring. */ + uint32_t backend; /**< Backend value to determine if device should be started/stopped. */ + uint16_t vhost_hlen; /**< Vhost header length (varies depending on RX merge buffers. */ + volatile uint16_t last_used_idx; /**< Last index used on the available ring. 
*/ + volatile uint16_t last_used_idx_res; /**< Used for multiple devices reserving buffers. */ + eventfd_t callfd; /**< Currently unused as polling mode is enabled. */ + eventfd_t kickfd; /**< Used to notify the guest (trigger interrupt). */ } __rte_cache_aligned; /** - * Information relating to memory regions including offsets to addresses in QEMUs memory file. + * Information relating to memory regions including offsets to + * addresses in QEMUs memory file. */ struct virtio_memory_regions { - uint64_t guest_phys_address; /**< Base guest physical address of region. */ - uint64_t guest_phys_address_end; /**< End guest physical address of region. */ - uint64_t memory_size; /**< Size of region. */ - uint64_t userspace_address; /**< Base userspace address of region. */ - uint64_t address_offset; /**< Offset of region for address translation. */ + uint64_t guest_phys_address; /**< Base guest physical address of region. */ + uint64_t guest_phys_address_end; /**< End guest physical address of region. */ + uint64_t memory_size; /**< Size of region. */ + uint64_t userspace_address; /**< Base userspace address of region. */ + uint64_t address_offset; /**< Offset of region for address translation. */ }; @@ -115,21 +82,34 @@ struct virtio_memory_regions { * Memory structure includes region and mapping information. */ struct virtio_memory { - uint64_t base_address; /**< Base QEMU userspace address of the memory file. */ - uint64_t mapped_address; /**< Mapped address of memory file base in our applications memory space. */ - uint64_t mapped_size; /**< Total size of memory file. */ - uint32_t nregions; /**< Number of memory regions. */ + uint64_t base_address; /**< Base QEMU userspace address of the memory file. */ + uint64_t mapped_address; /**< Mapped address of memory file base in our applications memory space. */ + uint64_t mapped_size; /**< Total size of memory file. */ + uint32_t nregions; /**< Number of memory regions. 
*/ struct virtio_memory_regions regions[0]; /**< Memory region information. */ }; /** + * Device structure contains all configuration information relating to the device. + */ +struct virtio_net { + struct vhost_virtqueue *virtqueue[VIRTIO_QNUM]; /**< Contains all virtqueue information. */ + struct virtio_memory *mem; /**< QEMU memory and memory region information. */ + uint64_t features; /**< Negotiated feature set. */ + uint64_t device_fh; /**< Device identifier. */ + uint32_t flags; /**< Device flags. Only used to check if device is running on data core. */ + void *priv; +} __rte_cache_aligned; + +/** * Device operations to add/remove device. */ struct virtio_net_device_ops { - int (*new_device)(struct virtio_net *); /**< Add device. */ - void (*destroy_device)(volatile struct virtio_net *); /**< Remove device. */ + int (*new_device)(struct virtio_net *); /**< Add device. */ + void (*destroy_device)(struct virtio_net *); /**< Remove device. */ }; + static inline uint16_t __attribute__((always_inline)) rte_vring_available_entries(struct virtio_net *dev, uint16_t queue_id) { @@ -179,7 +159,7 @@ int rte_vhost_driver_register(const char *dev_name); /* Register callbacks. */ int rte_vhost_driver_callback_register(struct virtio_net_device_ops const * const); -/* Start vhost driver session blocking loop. 
*/ + int rte_vhost_driver_session_start(void); /** @@ -192,8 +172,8 @@ int rte_vhost_driver_session_start(void); * @return * num of packets enqueued */ -uint16_t rte_vhost_enqueue_burst(struct virtio_net *dev, uint16_t queue_id, - struct rte_mbuf **pkts, uint16_t count); +uint32_t rte_vhost_enqueue_burst(struct virtio_net *dev, uint16_t queue_id, + struct rte_mbuf **pkts, uint32_t count); /** * This function gets guest buffers from the virtio device TX virtqueue, @@ -206,7 +186,7 @@ uint16_t rte_vhost_enqueue_burst(struct virtio_net *dev, uint16_t queue_id, * @return * num of packets dequeued */ -uint16_t rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id, - struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count); +uint32_t rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id, + struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint32_t count); #endif /* _VIRTIO_NET_H_ */ diff --git a/lib/librte_vhost/vhost-cuse/vhost-net-cdev.c b/lib/librte_vhost/vhost-cuse/vhost-net-cdev.c new file mode 100644 index 0000000..4671643 --- /dev/null +++ b/lib/librte_vhost/vhost-cuse/vhost-net-cdev.c @@ -0,0 +1,436 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <stdint.h> +#include <fuse/cuse_lowlevel.h> +#include <linux/limits.h> +#include <linux/vhost.h> +#include <linux/virtio_net.h> +#include <string.h> +#include <unistd.h> +#include <sys/ioctl.h> + +#include <rte_ethdev.h> +#include <rte_log.h> +#include <rte_string_fns.h> +#include <rte_virtio_net.h> + +#include "virtio-net-cdev.h" +#include "vhost-net.h" +#include "eventfd_link/eventfd_link.h" + +#define FUSE_OPT_DUMMY "\0\0" +#define FUSE_OPT_FORE "-f\0\0" +#define FUSE_OPT_NOMULTI "-s\0\0" + +static const uint32_t default_major = 231; +static const uint32_t default_minor = 1; +static const char cuse_device_name[] = "/dev/cuse"; +static const char default_cdev[] = "vhost-net"; +static const char eventfd_cdev[] = "/dev/eventfd-link"; + +static struct fuse_session *session; +const struct vhost_net_device_ops const *ops; + +/* + * Returns vhost_device_ctx from given fuse_req_t. The index is populated later + * when the device is added to the device linked list. 
+ */ +static struct vhost_device_ctx +fuse_req_to_vhost_ctx(fuse_req_t req, struct fuse_file_info *fi) +{ + struct vhost_device_ctx ctx; + struct fuse_ctx const *const req_ctx = fuse_req_ctx(req); + + ctx.pid = req_ctx->pid; + ctx.fh = fi->fh; + + return ctx; +} + +/* + * When the device is created in QEMU it gets initialised here and + * added to the device linked list. + */ +static void +vhost_net_open(fuse_req_t req, struct fuse_file_info *fi) +{ + struct vhost_device_ctx ctx = fuse_req_to_vhost_ctx(req, fi); + int err = 0; + + err = ops->new_device(ctx); + if (err == -1) { + fuse_reply_err(req, EPERM); + return; + } + + fi->fh = err; + + RTE_LOG(INFO, VHOST_CONFIG, + "(%"PRIu64") Device configuration started\n", fi->fh); + fuse_reply_open(req, fi); +} + +/* + * When QEMU is shutdown or killed the device gets released. + */ +static void +vhost_net_release(fuse_req_t req, struct fuse_file_info *fi) +{ + int err = 0; + struct vhost_device_ctx ctx = fuse_req_to_vhost_ctx(req, fi); + + ops->destroy_device(ctx); + RTE_LOG(INFO, VHOST_CONFIG, "(%"PRIu64") Device released\n", ctx.fh); + fuse_reply_err(req, err); +} + +/* + * Boilerplate code for CUSE IOCTL + * Implicit arguments: ctx, req, result. + */ +#define VHOST_IOCTL(func) do { \ + result = (func)(ctx); \ + fuse_reply_ioctl(req, result, NULL, 0); \ +} while (0) + +/* + * Boilerplate IOCTL RETRY + * Implicit arguments: req. + */ +#define VHOST_IOCTL_RETRY(size_r, size_w) do { \ + struct iovec iov_r = { arg, (size_r) }; \ + struct iovec iov_w = { arg, (size_w) }; \ + fuse_reply_ioctl_retry(req, &iov_r, \ + (size_r) ? 1 : 0, &iov_w, (size_w) ? 1 : 0);\ +} while (0) + +/* + * Boilerplate code for CUSE Read IOCTL + * Implicit arguments: ctx, req, result, in_bufsz, in_buf. 
+ */ +#define VHOST_IOCTL_R(type, var, func) do { \ + if (!in_bufsz) { \ + VHOST_IOCTL_RETRY(sizeof(type), 0);\ + } else { \ + (var) = *(const type*)in_buf; \ + result = func(ctx, &(var)); \ + fuse_reply_ioctl(req, result, NULL, 0);\ + } \ +} while (0) + +/* + * Boilerplate code for CUSE Write IOCTL + * Implicit arguments: ctx, req, result, out_bufsz. + */ +#define VHOST_IOCTL_W(type, var, func) do { \ + if (!out_bufsz) { \ + VHOST_IOCTL_RETRY(0, sizeof(type));\ + } else { \ + result = (func)(ctx, &(var));\ + fuse_reply_ioctl(req, result, &(var), sizeof(type));\ + } \ +} while (0) + +/* + * Boilerplate code for CUSE Read/Write IOCTL + * Implicit arguments: ctx, req, result, in_bufsz, in_buf. + */ +#define VHOST_IOCTL_RW(type1, var1, type2, var2, func) do { \ + if (!in_bufsz) { \ + VHOST_IOCTL_RETRY(sizeof(type1), sizeof(type2));\ + } else { \ + (var1) = *(const type1*) (in_buf); \ + result = (func)(ctx, (var1), &(var2)); \ + fuse_reply_ioctl(req, result, &(var2), sizeof(type2));\ + } \ +} while (0) + +/* + * This function uses the eventfd_link kernel module to copy an eventfd file + * descriptor provided by QEMU in to our process space. + */ +static int +eventfd_copy(int target_fd, int target_pid) +{ + int eventfd_link, ret; + struct eventfd_copy eventfd_copy; + int fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC); + + if (fd == -1) + return -1; + + /* Open the character device to the kernel module. */ + /* TODO: check this earlier rather than fail until VM boots! */ + eventfd_link = open(eventfd_cdev, O_RDWR); + if (eventfd_link < 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "eventfd_link module is not loaded\n"); + return -1; + } + + eventfd_copy.source_fd = fd; + eventfd_copy.target_fd = target_fd; + eventfd_copy.target_pid = target_pid; + /* Call the IOCTL to copy the eventfd. 
*/ + ret = ioctl(eventfd_link, EVENTFD_COPY, &eventfd_copy); + close(eventfd_link); + + if (ret < 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "EVENTFD_COPY ioctl failed\n"); + return -1; + } + + return fd; +} + +/* + * The IOCTLs are handled using CUSE/FUSE in userspace. Depending on + * the type of IOCTL a buffer is requested to read or to write. This + * request is handled by FUSE and the buffer is then given to CUSE. + */ +static void +vhost_net_ioctl(fuse_req_t req, int cmd, void *arg, + struct fuse_file_info *fi, __rte_unused unsigned flags, + const void *in_buf, size_t in_bufsz, size_t out_bufsz) +{ + struct vhost_device_ctx ctx = fuse_req_to_vhost_ctx(req, fi); + struct vhost_vring_file file; + struct vhost_vring_state state; + struct vhost_vring_addr addr; + uint64_t features; + uint32_t index; + int result = 0; + + switch (cmd) { + case VHOST_NET_SET_BACKEND: + LOG_DEBUG(VHOST_CONFIG, + "(%"PRIu64") IOCTL: VHOST_NET_SET_BACKEND\n", ctx.fh); + VHOST_IOCTL_R(struct vhost_vring_file, file, ops->set_backend); + break; + + case VHOST_GET_FEATURES: + LOG_DEBUG(VHOST_CONFIG, + "(%"PRIu64") IOCTL: VHOST_GET_FEATURES\n", ctx.fh); + VHOST_IOCTL_W(uint64_t, features, ops->get_features); + break; + + case VHOST_SET_FEATURES: + LOG_DEBUG(VHOST_CONFIG, + "(%"PRIu64") IOCTL: VHOST_SET_FEATURES\n", ctx.fh); + VHOST_IOCTL_R(uint64_t, features, ops->set_features); + break; + + case VHOST_RESET_OWNER: + LOG_DEBUG(VHOST_CONFIG, + "(%"PRIu64") IOCTL: VHOST_RESET_OWNER\n", ctx.fh); + VHOST_IOCTL(ops->reset_owner); + break; + + case VHOST_SET_OWNER: + LOG_DEBUG(VHOST_CONFIG, + "(%"PRIu64") IOCTL: VHOST_SET_OWNER\n", ctx.fh); + VHOST_IOCTL(ops->set_owner); + break; + + case VHOST_SET_MEM_TABLE: + /*TODO fix race condition.*/ + LOG_DEBUG(VHOST_CONFIG, + "(%"PRIu64") IOCTL: VHOST_SET_MEM_TABLE\n", ctx.fh); + static struct vhost_memory mem_temp; + switch (in_bufsz) { + case 0: + VHOST_IOCTL_RETRY(sizeof(struct vhost_memory), 0); + break; + + case sizeof(struct vhost_memory): + mem_temp = 
*(const struct vhost_memory *) in_buf; + + if (mem_temp.nregions > 0) { + VHOST_IOCTL_RETRY(sizeof(struct vhost_memory) + + (sizeof(struct vhost_memory_region) * + mem_temp.nregions), 0); + } else { + result = -1; + fuse_reply_ioctl(req, result, NULL, 0); + } + break; + + default: + result = cuse_set_mem_table(ctx, in_buf, + mem_temp.nregions); + if (result) + fuse_reply_err(req, EINVAL); + else + fuse_reply_ioctl(req, result, NULL, 0); + } + break; + + case VHOST_SET_VRING_NUM: + LOG_DEBUG(VHOST_CONFIG, + "(%"PRIu64") IOCTL: VHOST_SET_VRING_NUM\n", ctx.fh); + VHOST_IOCTL_R(struct vhost_vring_state, state, ops->set_vring_num); + break; + + case VHOST_SET_VRING_BASE: + LOG_DEBUG(VHOST_CONFIG, + "(%"PRIu64") IOCTL: VHOST_SET_VRING_BASE\n", ctx.fh); + VHOST_IOCTL_R(struct vhost_vring_state, state, ops->set_vring_base); + break; + + case VHOST_GET_VRING_BASE: + LOG_DEBUG(VHOST_CONFIG, + "(%"PRIu64") IOCTL: VHOST_GET_VRING_BASE\n", ctx.fh); + VHOST_IOCTL_RW(uint32_t, index, + struct vhost_vring_state, state, ops->get_vring_base); + break; + + case VHOST_SET_VRING_ADDR: + LOG_DEBUG(VHOST_CONFIG, + "(%"PRIu64") IOCTL: VHOST_SET_VRING_ADDR\n", ctx.fh); + VHOST_IOCTL_R(struct vhost_vring_addr, addr, ops->set_vring_addr); + break; + + case VHOST_SET_VRING_KICK: + case VHOST_SET_VRING_CALL: + if (!in_buf) { + VHOST_IOCTL_RETRY(sizeof(struct vhost_vring_file), 0); + } else { + int fd; + file = *(const struct vhost_vring_file *)in_buf; + LOG_DEBUG(VHOST_CONFIG, + "kick/call idx:%d fd:%d\n", file.index, file.fd); + if ((fd = eventfd_copy(file.fd, ctx.pid)) < 0){ + fuse_reply_ioctl(req, -1, NULL, 0); + } + file.fd = fd; + if (cmd == VHOST_SET_VRING_KICK) { + VHOST_IOCTL_R(struct vhost_vring_file, file, ops->set_vring_call); + } + else { + VHOST_IOCTL_R(struct vhost_vring_file, file, ops->set_vring_kick); + } + } + break; + + default: + RTE_LOG(ERR, VHOST_CONFIG, + "(%"PRIu64") IOCTL: DOESN NOT EXIST\n", ctx.fh); + result = -1; + fuse_reply_ioctl(req, result, NULL, 0); + } + + if 
(result < 0) + LOG_DEBUG(VHOST_CONFIG, + "(%"PRIu64") IOCTL: FAIL\n", ctx.fh); + else + LOG_DEBUG(VHOST_CONFIG, + "(%"PRIu64") IOCTL: SUCCESS\n", ctx.fh); +} + +/* + * Structure handling open, release and ioctl function pointers is populated. + */ +static const struct cuse_lowlevel_ops vhost_net_ops = { + .open = vhost_net_open, + .release = vhost_net_release, + .ioctl = vhost_net_ioctl, +}; + +/* + * cuse_info is populated and used to register the cuse device. + * vhost_net_device_ops are also passed when the device is registered in app. + */ +int +rte_vhost_driver_register(const char *dev_name) +{ + struct cuse_info cuse_info; + char device_name[PATH_MAX] = ""; + char char_device_name[PATH_MAX] = ""; + const char *device_argv[] = { device_name }; + + char fuse_opt_dummy[] = FUSE_OPT_DUMMY; + char fuse_opt_fore[] = FUSE_OPT_FORE; + char fuse_opt_nomulti[] = FUSE_OPT_NOMULTI; + char *fuse_argv[] = {fuse_opt_dummy, fuse_opt_fore, fuse_opt_nomulti}; + + if (access(cuse_device_name, R_OK | W_OK) < 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "char device %s can't be accessed, maybe not exist\n", + cuse_device_name); + return -1; + } + + /* + * The device name is created. This is passed to QEMU so that it can + * register the device with our application. + */ + snprintf(device_name, PATH_MAX, "DEVNAME=%s", dev_name); + snprintf(char_device_name, PATH_MAX, "/dev/%s", dev_name); + + /* Check if device already exists. 
*/ + if (access(char_device_name, F_OK) != -1) { + RTE_LOG(ERR, VHOST_CONFIG, + "char device %s already exists\n", char_device_name); + return -1; + } + + memset(&cuse_info, 0, sizeof(cuse_info)); + cuse_info.dev_major = default_major; + cuse_info.dev_minor = default_minor; + cuse_info.dev_info_argc = 1; + cuse_info.dev_info_argv = device_argv; + cuse_info.flags = CUSE_UNRESTRICTED_IOCTL; + + ops = get_virtio_net_callbacks(); + + session = cuse_lowlevel_setup(3, fuse_argv, + &cuse_info, &vhost_net_ops, 0, NULL); + if (session == NULL) + return -1; + + return 0; +} + +/** + * The CUSE session is launched allowing the application to receive open, + * release and ioctl calls. + */ +int +rte_vhost_driver_session_start(void) +{ + fuse_session_loop(session); + + return 0; +} diff --git a/lib/librte_vhost/vhost-cuse/virtio-net-cdev.c b/lib/librte_vhost/vhost-cuse/virtio-net-cdev.c new file mode 100644 index 0000000..5c16aa5 --- /dev/null +++ b/lib/librte_vhost/vhost-cuse/virtio-net-cdev.c @@ -0,0 +1,314 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <stdint.h> +#include <dirent.h> +#include <linux/vhost.h> +#include <linux/virtio_net.h> +#include <fuse/cuse_lowlevel.h> +#include <stddef.h> +#include <string.h> +#include <stdlib.h> +#include <sys/eventfd.h> +#include <sys/mman.h> +#include <sys/types.h> +#include <unistd.h> +#include <errno.h> + +#include <rte_log.h> + +#include "vhost-net.h" +#include "virtio-net-cdev.h" + +extern struct vhost_net_device_ops const *ops; + +/* Line size for reading maps file. */ +static const uint32_t BUFSIZE = PATH_MAX; + +/* Size of prot char array in procmap. */ +#define PROT_SZ 5 + +/* Number of elements in procmap struct. */ +#define PROCMAP_SZ 8 + +/* Structure containing information gathered from maps file. */ +struct procmap { + uint64_t va_start; /* Start virtual address in file. */ + uint64_t len; /* Size of file. */ + uint64_t pgoff; /* Not used. */ + uint32_t maj; /* Not used. */ + uint32_t min; /* Not used. */ + uint32_t ino; /* Not used. */ + char prot[PROT_SZ]; /* Not used. */ + char fname[PATH_MAX]; /* File name. */ +}; + +/* + * Locate the file containing QEMU's memory space and + * map it to our address space. 
+ */ +static int +host_memory_map(pid_t pid, uint64_t addr, + uint64_t *mapped_address, uint64_t *mapped_size) +{ + struct dirent *dptr = NULL; + struct procmap procmap; + DIR *dp = NULL; + int fd; + int i; + char memfile[PATH_MAX]; + char mapfile[PATH_MAX]; + char procdir[PATH_MAX]; + char resolved_path[PATH_MAX]; + FILE *fmap; + void *map; + uint8_t found = 0; + char line[BUFSIZE]; + char dlm[] = "- : "; + char *str, *sp, *in[PROCMAP_SZ]; + char *end = NULL; + + /* Path where mem files are located. */ + snprintf(procdir, PATH_MAX, "/proc/%u/fd/", pid); + /* Maps file used to locate mem file. */ + snprintf(mapfile, PATH_MAX, "/proc/%u/maps", pid); + + fmap = fopen(mapfile, "r"); + if (fmap == NULL) { + RTE_LOG(ERR, VHOST_CONFIG, + "Failed to open maps file for pid %d\n", pid); + return -1; + } + + /* Read through maps file until we find out base_address. */ + while (fgets(line, BUFSIZE, fmap) != 0) { + str = line; + errno = 0; + /* Split line in to fields. */ + for (i = 0; i < PROCMAP_SZ; i++) { + in[i] = strtok_r(str, &dlm[i], &sp); + if ((in[i] == NULL) || (errno != 0)) { + fclose(fmap); + return -1; + } + str = NULL; + } + + /* Convert/Copy each field as needed. 
*/ + procmap.va_start = strtoull(in[0], &end, 16); + if ((in[0] == '\0') || (end == NULL) || (*end != '\0') || + (errno != 0)) { + fclose(fmap); + return -1; + } + + procmap.len = strtoull(in[1], &end, 16); + if ((in[1] == '\0') || (end == NULL) || (*end != '\0') || + (errno != 0)) { + fclose(fmap); + return -1; + } + + procmap.pgoff = strtoull(in[3], &end, 16); + if ((in[3] == '\0') || (end == NULL) || (*end != '\0') || + (errno != 0)) { + fclose(fmap); + return -1; + } + + procmap.maj = strtoul(in[4], &end, 16); + if ((in[4] == '\0') || (end == NULL) || (*end != '\0') || + (errno != 0)) { + fclose(fmap); + return -1; + } + + procmap.min = strtoul(in[5], &end, 16); + if ((in[5] == '\0') || (end == NULL) || (*end != '\0') || + (errno != 0)) { + fclose(fmap); + return -1; + } + + procmap.ino = strtoul(in[6], &end, 16); + if ((in[6] == '\0') || (end == NULL) || (*end != '\0') || + (errno != 0)) { + fclose(fmap); + return -1; + } + + memcpy(&procmap.prot, in[2], PROT_SZ); + memcpy(&procmap.fname, in[7], PATH_MAX); + + if (procmap.va_start == addr) { + procmap.len = procmap.len - procmap.va_start; + found = 1; + break; + } + } + fclose(fmap); + + if (!found) { + RTE_LOG(ERR, VHOST_CONFIG, + "Failed to find memory file in pid %d maps file\n", pid); + return -1; + } + + /* Find the guest memory file among the process fds. */ + dp = opendir(procdir); + if (dp == NULL) { + RTE_LOG(ERR, VHOST_CONFIG, + "Cannot open pid %d process directory\n", + pid); + return -1; + + } + + found = 0; + + /* Read the fd directory contents. 
*/ + while (NULL != (dptr = readdir(dp))) { + snprintf(memfile, PATH_MAX, "/proc/%u/fd/%s", + pid, dptr->d_name); + realpath(memfile, resolved_path); + if (resolved_path == NULL) { + RTE_LOG(ERR, VHOST_CONFIG, + "Failed to resolve fd directory\n"); + closedir(dp); + return -1; + } + if (strncmp(resolved_path, procmap.fname, + strnlen(procmap.fname, PATH_MAX)) == 0) { + found = 1; + break; + } + } + + closedir(dp); + + if (found == 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "Failed to find memory file for pid %d\n", + pid); + return -1; + } + /* Open the shared memory file and map the memory into this process. */ + fd = open(memfile, O_RDWR); + + if (fd == -1) { + RTE_LOG(ERR, VHOST_CONFIG, + "Failed to open %s for pid %d\n", + memfile, pid); + return -1; + } + + map = mmap(0, (size_t)procmap.len, PROT_READ|PROT_WRITE , + MAP_POPULATE|MAP_SHARED, fd, 0); + close(fd); + + if (map == MAP_FAILED) { + RTE_LOG(ERR, VHOST_CONFIG, + "Error mapping the file %s for pid %d\n", + memfile, pid); + return -1; + } + + /* Store the memory address and size in the device data structure */ + *mapped_address = (uint64_t)(uintptr_t)map; + *mapped_size = procmap.len; + + LOG_DEBUG(VHOST_CONFIG, + "Mem File: %s->%s - Size: %llu - VA: %p\n", + memfile, resolved_path, + (unsigned long long)mapped_size, map); + + return 0; +} + +int +cuse_set_mem_table(struct vhost_device_ctx ctx, const struct vhost_memory *mem_regions_addr, + uint32_t nregions) +{ + uint64_t size = offsetof(struct vhost_memory, regions); + uint32_t idx; + struct virtio_memory_regions regions[8]; /* VHOST_MAX_MEMORY_REGIONS */ + struct vhost_memory_region *mem_regions = (void *)(uintptr_t) + ((uint64_t)(uintptr_t)mem_regions_addr + size); + uint64_t base_address = 0, mapped_address, mapped_size; + + for (idx = 0; idx < nregions; idx++) { + regions[idx].guest_phys_address = + mem_regions[idx].guest_phys_addr; + regions[idx].guest_phys_address_end = + regions[idx].guest_phys_address + + mem_regions[idx].memory_size; + 
regions[idx].memory_size = + mem_regions[idx].memory_size; + regions[idx].userspace_address = + mem_regions[idx].userspace_addr; + + LOG_DEBUG(VHOST_CONFIG, "REGION: %u - GPA: %p - QEMU VA: %p - SIZE (%"PRIu64")\n", + idx, + (void *)(uintptr_t)regions[idx].guest_phys_address, + (void *)(uintptr_t)regions[idx].userspace_address, + regions[idx].memory_size); + + /*set the base address mapping*/ + if (regions[idx].guest_phys_address == 0x0) { + base_address = + regions[idx].userspace_address; + /* Map VM memory file */ + if (host_memory_map(ctx.pid, base_address, + &mapped_address, &mapped_size) != 0) { + return -1; + } + } + } + + /* Check that we have a valid base address. */ + if (base_address == 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "Failed to find base address of qemu memory file.\n"); + return -1; + } + + for (idx = 0; idx < nregions; idx++) { + regions[idx].address_offset = + mapped_address - base_address + + regions[idx].userspace_address - + regions[idx].guest_phys_address; + } + + ops->set_mem_table(ctx, ®ions[0], nregions); + return 0; +} diff --git a/lib/librte_vhost/vhost-cuse/virtio-net-cdev.h b/lib/librte_vhost/vhost-cuse/virtio-net-cdev.h new file mode 100644 index 0000000..6f98ce8 --- /dev/null +++ b/lib/librte_vhost/vhost-cuse/virtio-net-cdev.h @@ -0,0 +1,43 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef _VIRTIO_NET_CDEV_H +#define _VIRTIO_NET_CDEV_H +#include <stdint.h> + +#include "vhost-net.h" + +int +cuse_set_mem_table(struct vhost_device_ctx ctx, const struct vhost_memory *mem_regions_addr, + uint32_t nregions); + +#endif diff --git a/lib/librte_vhost/vhost-net-cdev.c b/lib/librte_vhost/vhost-net-cdev.c deleted file mode 100644 index 57c76cb..0000000 --- a/lib/librte_vhost/vhost-net-cdev.c +++ /dev/null @@ -1,389 +0,0 @@ -/*- - * BSD LICENSE - * - * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. 
- * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * * Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include <errno.h> -#include <fuse/cuse_lowlevel.h> -#include <linux/limits.h> -#include <linux/vhost.h> -#include <stdint.h> -#include <string.h> -#include <unistd.h> - -#include <rte_ethdev.h> -#include <rte_log.h> -#include <rte_string_fns.h> -#include <rte_virtio_net.h> - -#include "vhost-net-cdev.h" - -#define FUSE_OPT_DUMMY "\0\0" -#define FUSE_OPT_FORE "-f\0\0" -#define FUSE_OPT_NOMULTI "-s\0\0" - -static const uint32_t default_major = 231; -static const uint32_t default_minor = 1; -static const char cuse_device_name[] = "/dev/cuse"; -static const char default_cdev[] = "vhost-net"; - -static struct fuse_session *session; -static struct vhost_net_device_ops const *ops; - -/* - * Returns vhost_device_ctx from given fuse_req_t. 
The index is populated later - * when the device is added to the device linked list. - */ -static struct vhost_device_ctx -fuse_req_to_vhost_ctx(fuse_req_t req, struct fuse_file_info *fi) -{ - struct vhost_device_ctx ctx; - struct fuse_ctx const *const req_ctx = fuse_req_ctx(req); - - ctx.pid = req_ctx->pid; - ctx.fh = fi->fh; - - return ctx; -} - -/* - * When the device is created in QEMU it gets initialised here and - * added to the device linked list. - */ -static void -vhost_net_open(fuse_req_t req, struct fuse_file_info *fi) -{ - struct vhost_device_ctx ctx = fuse_req_to_vhost_ctx(req, fi); - int err = 0; - - err = ops->new_device(ctx); - if (err == -1) { - fuse_reply_err(req, EPERM); - return; - } - - fi->fh = err; - - RTE_LOG(INFO, VHOST_CONFIG, - "(%"PRIu64") Device configuration started\n", fi->fh); - fuse_reply_open(req, fi); -} - -/* - * When QEMU is shutdown or killed the device gets released. - */ -static void -vhost_net_release(fuse_req_t req, struct fuse_file_info *fi) -{ - int err = 0; - struct vhost_device_ctx ctx = fuse_req_to_vhost_ctx(req, fi); - - ops->destroy_device(ctx); - RTE_LOG(INFO, VHOST_CONFIG, "(%"PRIu64") Device released\n", ctx.fh); - fuse_reply_err(req, err); -} - -/* - * Boilerplate code for CUSE IOCTL - * Implicit arguments: ctx, req, result. - */ -#define VHOST_IOCTL(func) do { \ - result = (func)(ctx); \ - fuse_reply_ioctl(req, result, NULL, 0); \ -} while (0) - -/* - * Boilerplate IOCTL RETRY - * Implicit arguments: req. - */ -#define VHOST_IOCTL_RETRY(size_r, size_w) do { \ - struct iovec iov_r = { arg, (size_r) }; \ - struct iovec iov_w = { arg, (size_w) }; \ - fuse_reply_ioctl_retry(req, &iov_r, \ - (size_r) ? 1 : 0, &iov_w, (size_w) ? 1 : 0);\ -} while (0) - -/* - * Boilerplate code for CUSE Read IOCTL - * Implicit arguments: ctx, req, result, in_bufsz, in_buf. 
- */ -#define VHOST_IOCTL_R(type, var, func) do { \ - if (!in_bufsz) { \ - VHOST_IOCTL_RETRY(sizeof(type), 0);\ - } else { \ - (var) = *(const type*)in_buf; \ - result = func(ctx, &(var)); \ - fuse_reply_ioctl(req, result, NULL, 0);\ - } \ -} while (0) - -/* - * Boilerplate code for CUSE Write IOCTL - * Implicit arguments: ctx, req, result, out_bufsz. - */ -#define VHOST_IOCTL_W(type, var, func) do { \ - if (!out_bufsz) { \ - VHOST_IOCTL_RETRY(0, sizeof(type));\ - } else { \ - result = (func)(ctx, &(var));\ - fuse_reply_ioctl(req, result, &(var), sizeof(type));\ - } \ -} while (0) - -/* - * Boilerplate code for CUSE Read/Write IOCTL - * Implicit arguments: ctx, req, result, in_bufsz, in_buf. - */ -#define VHOST_IOCTL_RW(type1, var1, type2, var2, func) do { \ - if (!in_bufsz) { \ - VHOST_IOCTL_RETRY(sizeof(type1), sizeof(type2));\ - } else { \ - (var1) = *(const type1*) (in_buf); \ - result = (func)(ctx, (var1), &(var2)); \ - fuse_reply_ioctl(req, result, &(var2), sizeof(type2));\ - } \ -} while (0) - -/* - * The IOCTLs are handled using CUSE/FUSE in userspace. Depending on the type - * of IOCTL a buffer is requested to read or to write. This request is handled - * by FUSE and the buffer is then given to CUSE. 
- */ -static void -vhost_net_ioctl(fuse_req_t req, int cmd, void *arg, - struct fuse_file_info *fi, __rte_unused unsigned flags, - const void *in_buf, size_t in_bufsz, size_t out_bufsz) -{ - struct vhost_device_ctx ctx = fuse_req_to_vhost_ctx(req, fi); - struct vhost_vring_file file; - struct vhost_vring_state state; - struct vhost_vring_addr addr; - uint64_t features; - uint32_t index; - int result = 0; - - switch (cmd) { - case VHOST_NET_SET_BACKEND: - LOG_DEBUG(VHOST_CONFIG, - "(%"PRIu64") IOCTL: VHOST_NET_SET_BACKEND\n", ctx.fh); - VHOST_IOCTL_R(struct vhost_vring_file, file, ops->set_backend); - break; - - case VHOST_GET_FEATURES: - LOG_DEBUG(VHOST_CONFIG, - "(%"PRIu64") IOCTL: VHOST_GET_FEATURES\n", ctx.fh); - VHOST_IOCTL_W(uint64_t, features, ops->get_features); - break; - - case VHOST_SET_FEATURES: - LOG_DEBUG(VHOST_CONFIG, - "(%"PRIu64") IOCTL: VHOST_SET_FEATURES\n", ctx.fh); - VHOST_IOCTL_R(uint64_t, features, ops->set_features); - break; - - case VHOST_RESET_OWNER: - LOG_DEBUG(VHOST_CONFIG, - "(%"PRIu64") IOCTL: VHOST_RESET_OWNER\n", ctx.fh); - VHOST_IOCTL(ops->reset_owner); - break; - - case VHOST_SET_OWNER: - LOG_DEBUG(VHOST_CONFIG, - "(%"PRIu64") IOCTL: VHOST_SET_OWNER\n", ctx.fh); - VHOST_IOCTL(ops->set_owner); - break; - - case VHOST_SET_MEM_TABLE: - /*TODO fix race condition.*/ - LOG_DEBUG(VHOST_CONFIG, - "(%"PRIu64") IOCTL: VHOST_SET_MEM_TABLE\n", ctx.fh); - static struct vhost_memory mem_temp; - - switch (in_bufsz) { - case 0: - VHOST_IOCTL_RETRY(sizeof(struct vhost_memory), 0); - break; - - case sizeof(struct vhost_memory): - mem_temp = *(const struct vhost_memory *) in_buf; - - if (mem_temp.nregions > 0) { - VHOST_IOCTL_RETRY(sizeof(struct vhost_memory) + - (sizeof(struct vhost_memory_region) * - mem_temp.nregions), 0); - } else { - result = -1; - fuse_reply_ioctl(req, result, NULL, 0); - } - break; - - default: - result = ops->set_mem_table(ctx, - in_buf, mem_temp.nregions); - if (result) - fuse_reply_err(req, EINVAL); - else - 
fuse_reply_ioctl(req, result, NULL, 0); - } - break; - - case VHOST_SET_VRING_NUM: - LOG_DEBUG(VHOST_CONFIG, - "(%"PRIu64") IOCTL: VHOST_SET_VRING_NUM\n", ctx.fh); - VHOST_IOCTL_R(struct vhost_vring_state, state, - ops->set_vring_num); - break; - - case VHOST_SET_VRING_BASE: - LOG_DEBUG(VHOST_CONFIG, - "(%"PRIu64") IOCTL: VHOST_SET_VRING_BASE\n", ctx.fh); - VHOST_IOCTL_R(struct vhost_vring_state, state, - ops->set_vring_base); - break; - - case VHOST_GET_VRING_BASE: - LOG_DEBUG(VHOST_CONFIG, - "(%"PRIu64") IOCTL: VHOST_GET_VRING_BASE\n", ctx.fh); - VHOST_IOCTL_RW(uint32_t, index, - struct vhost_vring_state, state, ops->get_vring_base); - break; - - case VHOST_SET_VRING_ADDR: - LOG_DEBUG(VHOST_CONFIG, - "(%"PRIu64") IOCTL: VHOST_SET_VRING_ADDR\n", ctx.fh); - VHOST_IOCTL_R(struct vhost_vring_addr, addr, - ops->set_vring_addr); - break; - - case VHOST_SET_VRING_KICK: - LOG_DEBUG(VHOST_CONFIG, - "(%"PRIu64") IOCTL: VHOST_SET_VRING_KICK\n", ctx.fh); - VHOST_IOCTL_R(struct vhost_vring_file, file, - ops->set_vring_kick); - break; - - case VHOST_SET_VRING_CALL: - LOG_DEBUG(VHOST_CONFIG, - "(%"PRIu64") IOCTL: VHOST_SET_VRING_CALL\n", ctx.fh); - VHOST_IOCTL_R(struct vhost_vring_file, file, - ops->set_vring_call); - break; - - default: - RTE_LOG(ERR, VHOST_CONFIG, - "(%"PRIu64") IOCTL: DOESN NOT EXIST\n", ctx.fh); - result = -1; - fuse_reply_ioctl(req, result, NULL, 0); - } - - if (result < 0) - LOG_DEBUG(VHOST_CONFIG, - "(%"PRIu64") IOCTL: FAIL\n", ctx.fh); - else - LOG_DEBUG(VHOST_CONFIG, - "(%"PRIu64") IOCTL: SUCCESS\n", ctx.fh); -} - -/* - * Structure handling open, release and ioctl function pointers is populated. - */ -static const struct cuse_lowlevel_ops vhost_net_ops = { - .open = vhost_net_open, - .release = vhost_net_release, - .ioctl = vhost_net_ioctl, -}; - -/* - * cuse_info is populated and used to register the cuse device. - * vhost_net_device_ops are also passed when the device is registered in app. 
- */ -int -rte_vhost_driver_register(const char *dev_name) -{ - struct cuse_info cuse_info; - char device_name[PATH_MAX] = ""; - char char_device_name[PATH_MAX] = ""; - const char *device_argv[] = { device_name }; - - char fuse_opt_dummy[] = FUSE_OPT_DUMMY; - char fuse_opt_fore[] = FUSE_OPT_FORE; - char fuse_opt_nomulti[] = FUSE_OPT_NOMULTI; - char *fuse_argv[] = {fuse_opt_dummy, fuse_opt_fore, fuse_opt_nomulti}; - - if (access(cuse_device_name, R_OK | W_OK) < 0) { - RTE_LOG(ERR, VHOST_CONFIG, - "char device %s can't be accessed, maybe not exist\n", - cuse_device_name); - return -1; - } - - /* - * The device name is created. This is passed to QEMU so that it can - * register the device with our application. - */ - snprintf(device_name, PATH_MAX, "DEVNAME=%s", dev_name); - snprintf(char_device_name, PATH_MAX, "/dev/%s", dev_name); - - /* Check if device already exists. */ - if (access(char_device_name, F_OK) != -1) { - RTE_LOG(ERR, VHOST_CONFIG, - "char device %s already exists\n", char_device_name); - return -1; - } - - memset(&cuse_info, 0, sizeof(cuse_info)); - cuse_info.dev_major = default_major; - cuse_info.dev_minor = default_minor; - cuse_info.dev_info_argc = 1; - cuse_info.dev_info_argv = device_argv; - cuse_info.flags = CUSE_UNRESTRICTED_IOCTL; - - ops = get_virtio_net_callbacks(); - - session = cuse_lowlevel_setup(3, fuse_argv, - &cuse_info, &vhost_net_ops, 0, NULL); - if (session == NULL) - return -1; - - return 0; -} - -/** - * The CUSE session is launched allowing the application to receive open, - * release and ioctl calls. - */ -int -rte_vhost_driver_session_start(void) -{ - fuse_session_loop(session); - - return 0; -} diff --git a/lib/librte_vhost/vhost-net-cdev.h b/lib/librte_vhost/vhost-net-cdev.h deleted file mode 100644 index 03a5c57..0000000 --- a/lib/librte_vhost/vhost-net-cdev.h +++ /dev/null @@ -1,113 +0,0 @@ -/*- - * BSD LICENSE - * - * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * * Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -#ifndef _VHOST_NET_CDEV_H_ -#define _VHOST_NET_CDEV_H_ -#include <stdint.h> -#include <stdio.h> -#include <sys/types.h> -#include <unistd.h> -#include <linux/vhost.h> - -#include <rte_log.h> - -/* Macros for printing using RTE_LOG */ -#define RTE_LOGTYPE_VHOST_CONFIG RTE_LOGTYPE_USER1 -#define RTE_LOGTYPE_VHOST_DATA RTE_LOGTYPE_USER1 - -#ifdef RTE_LIBRTE_VHOST_DEBUG -#define VHOST_MAX_PRINT_BUFF 6072 -#define LOG_LEVEL RTE_LOG_DEBUG -#define LOG_DEBUG(log_type, fmt, args...) RTE_LOG(DEBUG, log_type, fmt, ##args) -#define PRINT_PACKET(device, addr, size, header) do { \ - char *pkt_addr = (char *)(addr); \ - unsigned int index; \ - char packet[VHOST_MAX_PRINT_BUFF]; \ - \ - if ((header)) \ - snprintf(packet, VHOST_MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size)); \ - else \ - snprintf(packet, VHOST_MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size)); \ - for (index = 0; index < (size); index++) { \ - snprintf(packet + strnlen(packet, VHOST_MAX_PRINT_BUFF), VHOST_MAX_PRINT_BUFF - strnlen(packet, VHOST_MAX_PRINT_BUFF), \ - "%02hhx ", pkt_addr[index]); \ - } \ - snprintf(packet + strnlen(packet, VHOST_MAX_PRINT_BUFF), VHOST_MAX_PRINT_BUFF - strnlen(packet, VHOST_MAX_PRINT_BUFF), "\n"); \ - \ - LOG_DEBUG(VHOST_DATA, "%s", packet); \ -} while (0) -#else -#define LOG_LEVEL RTE_LOG_INFO -#define LOG_DEBUG(log_type, fmt, args...) do {} while (0) -#define PRINT_PACKET(device, addr, size, header) do {} while (0) -#endif - - -/* - * Structure used to identify device context. - */ -struct vhost_device_ctx { - pid_t pid; /* PID of process calling the IOCTL. */ - uint64_t fh; /* Populated with fi->fh to track the device index. */ -}; - -/* - * Structure contains function pointers to be defined in virtio-net.c. These - * functions are called in CUSE context and are used to configure devices. 
- */ -struct vhost_net_device_ops { - int (*new_device)(struct vhost_device_ctx); - void (*destroy_device)(struct vhost_device_ctx); - - int (*get_features)(struct vhost_device_ctx, uint64_t *); - int (*set_features)(struct vhost_device_ctx, uint64_t *); - - int (*set_mem_table)(struct vhost_device_ctx, const void *, uint32_t); - - int (*set_vring_num)(struct vhost_device_ctx, struct vhost_vring_state *); - int (*set_vring_addr)(struct vhost_device_ctx, struct vhost_vring_addr *); - int (*set_vring_base)(struct vhost_device_ctx, struct vhost_vring_state *); - int (*get_vring_base)(struct vhost_device_ctx, uint32_t, struct vhost_vring_state *); - - int (*set_vring_kick)(struct vhost_device_ctx, struct vhost_vring_file *); - int (*set_vring_call)(struct vhost_device_ctx, struct vhost_vring_file *); - - int (*set_backend)(struct vhost_device_ctx, struct vhost_vring_file *); - - int (*set_owner)(struct vhost_device_ctx); - int (*reset_owner)(struct vhost_device_ctx); -}; - - -struct vhost_net_device_ops const *get_virtio_net_callbacks(void); -#endif /* _VHOST_NET_CDEV_H_ */ diff --git a/lib/librte_vhost/vhost-user/fd_man.c b/lib/librte_vhost/vhost-user/fd_man.c new file mode 100644 index 0000000..c7fd3f2 --- /dev/null +++ b/lib/librte_vhost/vhost-user/fd_man.c @@ -0,0 +1,158 @@ +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <sys/socket.h> +#include <sys/select.h> +#include <sys/time.h> +#include <sys/types.h> +#include <unistd.h> + +#include <rte_log.h> + +#include "fd_man.h" + +/** + * Returns the index in the fdset for a fd. + * If fd is -1, it means to search for a free entry. + * @return + * Index for the fd, or -1 if fd isn't in the fdset. + */ +static int +fdset_find_fd(struct fdset *pfdset, int fd) +{ + int i; + + for (i = 0; i < pfdset->num && pfdset->fd[i].fd != fd; i++); + + return i == pfdset->num ? 
-1 : i; +} + +static int +fdset_find_free_slot(struct fdset *pfdset) +{ + return fdset_find_fd(pfdset, -1); + +} + +static void +fdset_add_fd(struct fdset *pfdset, int idx, int fd, fd_cb rcb, + fd_cb wcb, uint64_t dat) +{ + struct fdentry *pfdentry = &pfdset->fd[idx]; + + pfdentry->fd = fd; + pfdentry->rcb = rcb; + pfdentry->wcb = wcb; + pfdentry->dat = dat; +} + +/** + * Fill the read/write fdset with the fds in the fdset. + * @return + * the maximum fds filled in the read/write fd_set. + */ +static int +fdset_fill(fd_set *rfset, fd_set *wfset, struct fdset *pfdset) +{ + struct fdentry *pfdentry; + int i, maxfds = -1; + int num = MAX_FDS; + + for (i = 0; i < num ; i++) { + pfdentry = &pfdset->fd[i]; + if (pfdentry->fd != -1) { + int added = 0; + if (pfdentry->rcb && rfset) { + FD_SET(pfdentry->fd, rfset); + added = 1; + } + if (pfdentry->wcb && wfset) { + FD_SET(pfdentry->fd, wfset); + added = 1; + } + if (added) + maxfds = pfdentry->fd < maxfds ? + maxfds : pfdentry->fd; + } + } + return maxfds; +} + +void +fdset_init(struct fdset *pfdset) +{ + int i; + + for (i = 0; i < MAX_FDS; i++) + pfdset->fd[i].fd = -1; + pfdset->num = MAX_FDS; + +} + +/** + * Register the fd in the fdset with its read/write handler and context. + */ +int +fdset_add(struct fdset *pfdset, int fd, fd_cb rcb, fd_cb wcb, uint64_t dat) +{ + int i; + + if (fd == -1) + return -1; + + /* Find a free slot in the list. */ + i = fdset_find_free_slot(pfdset); + if (i == -1) + return -2; + + fdset_add_fd(pfdset, i, fd, rcb, wcb, dat); + + return 0; +} + +/** + * Unregister the fd from the fdset. 
+ */ +void +fdset_del(struct fdset *pfdset, int fd) +{ + int i; + + i = fdset_find_fd(pfdset, fd); + if (i != -1) { + pfdset->fd[i].fd = -1; + } +} + + +void +fdset_event_dispatch(struct fdset *pfdset) +{ + fd_set rfds,wfds; + int i, maxfds; + struct fdentry *pfdentry; + int num = MAX_FDS; + + if (pfdset == NULL) + return; + while (1) { + FD_ZERO(&rfds); + FD_ZERO(&wfds); + maxfds = fdset_fill(&rfds, &wfds, pfdset); + /* fd management runs in one thread */ + if (maxfds == -1) { + return; + } + + select(maxfds + 1, &rfds, &wfds, NULL, NULL); + + for (i = 0; i < num; i++) { + pfdentry = &pfdset->fd[i]; + if (FD_ISSET(pfdentry->fd, &rfds)) + pfdentry->rcb(pfdentry->fd, pfdentry->dat); + if (FD_ISSET(pfdentry->fd, &wfds)) + pfdentry->wcb(pfdentry->fd, pfdentry->dat); + } + + } +} diff --git a/lib/librte_vhost/vhost-user/fd_man.h b/lib/librte_vhost/vhost-user/fd_man.h new file mode 100644 index 0000000..57cc81d --- /dev/null +++ b/lib/librte_vhost/vhost-user/fd_man.h @@ -0,0 +1,31 @@ +#ifndef _FD_MAN_H_ +#define _FD_MAN_H_ +#include <stdint.h> + +#define MAX_FDS 1024 + +typedef void (*fd_cb)(int fd, uint64_t dat); + +struct fdentry { + int fd; /* -1 indicates this entry is empty */ + fd_cb rcb; /* callback when this fd is readable. */ + fd_cb wcb; /* callback when this fd is writeable.*/ + uint64_t dat; /* fd context */ +}; + +struct fdset { + struct fdentry fd[MAX_FDS]; + int num; +}; + + +void fdset_init(struct fdset *pfdset); + +int fdset_add(struct fdset *pfdset, int fd, fd_cb rcb, + fd_cb wcb, uint64_t ctx); + +void fdset_del(struct fdset *pfdset, int fd); + +void fdset_event_dispatch(struct fdset *pfdset); + +#endif diff --git a/lib/librte_vhost/vhost-user/vhost-net-user.c b/lib/librte_vhost/vhost-user/vhost-net-user.c new file mode 100644 index 0000000..34450f4 --- /dev/null +++ b/lib/librte_vhost/vhost-user/vhost-net-user.c @@ -0,0 +1,417 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <stdint.h> +#include <stdio.h> +#include <limits.h> +#include <stdlib.h> +#include <unistd.h> +#include <string.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/un.h> +#include <errno.h> + +#include <rte_log.h> +#include <rte_virtio_net.h> + +#include "fd_man.h" +#include "vhost-net-user.h" +#include "vhost-net.h" +#include "virtio-net-user.h" + +static void vserver_new_vq_conn(int fd, uint64_t data); +static void vserver_message_handler(int fd, uint64_t dat); +const struct vhost_net_device_ops *ops; + +static struct vhost_server *g_vhost_server; + +static const char *vhost_message_str[VHOST_USER_MAX] = +{ + [VHOST_USER_NONE] = "VHOST_USER_NONE", + [VHOST_USER_GET_FEATURES] = "VHOST_USER_GET_FEATURES", + [VHOST_USER_SET_FEATURES] = "VHOST_USER_SET_FEATURES", + [VHOST_USER_SET_OWNER] = "VHOST_USER_SET_OWNER", + [VHOST_USER_RESET_OWNER] = "VHOST_USER_RESET_OWNER", + [VHOST_USER_SET_MEM_TABLE] = "VHOST_USER_SET_MEM_TABLE", + [VHOST_USER_SET_LOG_BASE] = "VHOST_USER_SET_LOG_BASE", + [VHOST_USER_SET_LOG_FD] = "VHOST_USER_SET_LOG_FD", + [VHOST_USER_SET_VRING_NUM] = "VHOST_USER_SET_VRING_NUM", + [VHOST_USER_SET_VRING_ADDR] = "VHOST_USER_SET_VRING_ADDR", + [VHOST_USER_SET_VRING_BASE] = "VHOST_USER_SET_VRING_BASE", + [VHOST_USER_GET_VRING_BASE] = "VHOST_USER_GET_VRING_BASE", + [VHOST_USER_SET_VRING_KICK] = "VHOST_USER_SET_VRING_KICK", + [VHOST_USER_SET_VRING_CALL] = "VHOST_USER_SET_VRING_CALL", + [VHOST_USER_SET_VRING_ERR] = "VHOST_USER_SET_VRING_ERR" +}; + +/** + * Create a unix domain socket and bind to path. 
+ * @return + * socket fd or -1 on failure + */ +static int +uds_socket(const char *path) +{ + struct sockaddr_un un; + int sockfd; + int ret; + + if (path == NULL) + return -1; + + sockfd = socket(AF_UNIX, SOCK_STREAM, 0); + if (sockfd < 0) + return -1; + RTE_LOG(INFO, VHOST_CONFIG, "socket created, fd:%d\n", sockfd); + + memset(&un, 0, sizeof(un)); + un.sun_family = AF_UNIX; + snprintf(un.sun_path, sizeof(un.sun_path), "%s", path); + ret = bind(sockfd, (struct sockaddr *)&un, sizeof(un)); + if (ret == -1) + goto err; + RTE_LOG(INFO, VHOST_CONFIG, "bind to %s\n", path); + + ret = listen(sockfd, 1); + if (ret == -1) + goto err; + + return sockfd; + +err: + close(sockfd); + return -1; +} + + +/* return bytes# of read */ +static int +read_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num) +{ + + struct iovec iov; + struct msghdr msgh = { 0 }; + size_t fdsize = fd_num * sizeof(int); + char control[CMSG_SPACE(fdsize)]; + struct cmsghdr *cmsg; + int ret; + + iov.iov_base = buf; + iov.iov_len = buflen; + + msgh.msg_iov = &iov; + msgh.msg_iovlen = 1; + msgh.msg_control = control; + msgh.msg_controllen = sizeof(control); + + ret = recvmsg(sockfd, &msgh, 0); + if (ret <= 0) { + RTE_LOG(ERR, VHOST_CONFIG, "%s failed\n", __func__); + return ret; + } + /* ret == buflen */ + if (msgh.msg_flags & (MSG_TRUNC | MSG_CTRUNC)) { + RTE_LOG(ERR, VHOST_CONFIG, "%s failed\n", __func__); + return -1; + } + + for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL; + cmsg = CMSG_NXTHDR(&msgh, cmsg)) { + if ( (cmsg->cmsg_level == SOL_SOCKET) && + (cmsg->cmsg_type == SCM_RIGHTS)) { + memcpy(fds, CMSG_DATA(cmsg), fdsize); + break; + } + } + return ret; +} + +static int +read_vhost_message(int sockfd, struct VhostUserMsg *msg) +{ + int ret; + + ret = read_fd_message(sockfd, (char *)msg, VHOST_USER_HDR_SIZE, + msg->fds, VHOST_MEMORY_MAX_NREGIONS); + if (ret <= 0) + return ret; + + if (msg->size) { + if (msg->size > sizeof(msg->payload)) { + RTE_LOG(ERR, VHOST_CONFIG, + "%s: invalid 
size:%d\n", __func__, msg->size); + return -1; + } + ret = read(sockfd, &msg->payload, msg->size); + if (ret == 0) + return 0; + if (ret != (int)msg->size) { + printf("read control message failed\n"); + return -1; + } + } + + return ret; +} + +static int +send_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num) +{ + + struct iovec iov; + struct msghdr msgh = { 0 }; + size_t fdsize = fd_num * sizeof(int); + char control[CMSG_SPACE(fdsize)]; + struct cmsghdr *cmsg; + int ret; + + iov.iov_base = buf; + iov.iov_len = buflen; + msgh.msg_iov = &iov; + msgh.msg_iovlen = 1; + + if (fds && fd_num > 0) { + msgh.msg_control = control; + msgh.msg_controllen = sizeof(control); + cmsg = CMSG_FIRSTHDR(&msgh); + cmsg->cmsg_len = CMSG_LEN(fdsize); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + memcpy(CMSG_DATA(cmsg), fds, fdsize); + } else { + msgh.msg_control = NULL; + msgh.msg_controllen = 0; + } + + do { + ret = sendmsg(sockfd, &msgh, 0); + } while (ret < 0 && errno == EINTR); + + if (ret < 0) { + RTE_LOG(ERR, VHOST_CONFIG, "sendmsg error\n"); + return -1; + } + + return 0; +} + +static int +send_vhost_message(int sockfd, struct VhostUserMsg *msg) +{ + int ret; + + msg->flags &= ~VHOST_USER_VERSION_MASK; + msg->flags |= VHOST_USER_VERSION; + msg->flags |= VHOST_USER_REPLY_MASK; + + ret = send_fd_message(sockfd, (char *)msg, + VHOST_USER_HDR_SIZE + msg->size, NULL, 0); + + return ret; +} + +/* call back when there is new connection. 
*/ +static void +vserver_new_vq_conn(int fd, uint64_t dat) +{ + struct vhost_server *vserver = (void *)(uintptr_t)dat; + int conn_fd; + uint32_t fh; + struct vhost_device_ctx vdev_ctx = { 0 }; + + conn_fd = accept(fd, NULL, NULL); + RTE_LOG(INFO, VHOST_CONFIG, + "%s: new connection is %d\n", __func__, conn_fd); + if (conn_fd < 0) + return; + + fh = ops->new_device(vdev_ctx); + RTE_LOG(INFO, VHOST_CONFIG, "new device, handle is %d\n", fh); + + fdset_add(&vserver->fdset, + conn_fd, vserver_message_handler, NULL, fh); +} + +/* callback when there is message on the connfd */ +static void +vserver_message_handler(int connfd, uint64_t dat) +{ + struct vhost_device_ctx ctx; + uint32_t fh = (uint32_t)dat; + struct VhostUserMsg msg; + uint64_t features; + int ret; + + ctx.fh = fh; + ret = read_vhost_message(connfd, &msg); + if (ret < 0) { + printf("vhost read message failed\n"); + + /*TODO: cleanup */ + close(connfd); + fdset_del(&g_vhost_server->fdset, connfd); + ops->destroy_device(ctx); + + return; + } else if (ret == 0) { + /*TODO: cleanup */ + RTE_LOG(INFO, VHOST_CONFIG, + "vhost peer closed\n"); + close(connfd); + fdset_del(&g_vhost_server->fdset, connfd); + ops->destroy_device(ctx); + + return; + } + if (msg.request > VHOST_USER_MAX) { + /*TODO: cleanup */ + RTE_LOG(INFO, VHOST_CONFIG, + "vhost read incorrect message\n"); + close(connfd); + fdset_del(&g_vhost_server->fdset, connfd); + + return; + } + + RTE_LOG(INFO, VHOST_CONFIG, "read message %s\n", + vhost_message_str[msg.request]); + switch (msg.request) { + case VHOST_USER_GET_FEATURES: + ret = ops->get_features(ctx, &features); + msg.payload.u64 = ret; + msg.size = sizeof(msg.payload.u64); + send_vhost_message(connfd, &msg); + break; + case VHOST_USER_SET_FEATURES: + ops->set_features(ctx, &features); + break; + + case VHOST_USER_SET_OWNER: + ops->set_owner(ctx); + break; + case VHOST_USER_RESET_OWNER: + ops->reset_owner(ctx); + break; + + case VHOST_USER_SET_MEM_TABLE: + user_set_mem_table(ctx, &msg); + break; 
+ + case VHOST_USER_SET_LOG_BASE: + case VHOST_USER_SET_LOG_FD: + RTE_LOG(INFO, VHOST_CONFIG, "not implemented.\n"); + break; + + case VHOST_USER_SET_VRING_NUM: + ops->set_vring_num(ctx, &msg.payload.state); + break; + case VHOST_USER_SET_VRING_ADDR: + ops->set_vring_addr(ctx, &msg.payload.addr); + break; + case VHOST_USER_SET_VRING_BASE: + ops->set_vring_base(ctx, &msg.payload.state); + break; + + case VHOST_USER_GET_VRING_BASE: + ret = ops->get_vring_base(ctx, msg.payload.state.index, + &msg.payload.state); + msg.size = sizeof(msg.payload.state); + send_vhost_message(connfd, &msg); + break; + + case VHOST_USER_SET_VRING_KICK: + user_set_vring_kick(ctx, &msg); + break; + case VHOST_USER_SET_VRING_CALL: + user_set_vring_call(ctx, &msg); + break; + + case VHOST_USER_SET_VRING_ERR: + RTE_LOG(INFO, VHOST_CONFIG, "not implemented\n"); + break; + + default: + break; + + } +} + + +/** + * Creates and initialise the vhost server. + */ +int +rte_vhost_driver_register(const char *path) +{ + + struct vhost_server *vserver; + + if (g_vhost_server != NULL) + return -1; + + vserver = calloc(sizeof(struct vhost_server), 1); + /*TODO: all allocation is through DPDK memory allocation */ + if (vserver == NULL) + return -1; + + fdset_init(&vserver->fdset); + + unlink(path); + + vserver->listenfd = uds_socket(path); + if (vserver->listenfd < 0) { + free(vserver); + return -1; + } + vserver->path = path; + + fdset_add(&vserver->fdset, vserver->listenfd, + vserver_new_vq_conn, NULL, + (uint64_t)(uintptr_t)vserver); + + ops = get_virtio_net_callbacks(); + + g_vhost_server = vserver; + + return 0; +} + + +int +rte_vhost_driver_session_start(void) +{ + fdset_event_dispatch(&g_vhost_server->fdset); + return 0; +} + diff --git a/lib/librte_vhost/vhost-user/vhost-net-user.h b/lib/librte_vhost/vhost-user/vhost-net-user.h new file mode 100644 index 0000000..c9df9fa --- /dev/null +++ b/lib/librte_vhost/vhost-user/vhost-net-user.h @@ -0,0 +1,74 @@ +#ifndef _VHOST_NET_USER_H +#define 
_VHOST_NET_USER_H +#include <stdint.h> +#include <linux/vhost.h> + +#include "fd_man.h" + +struct vhost_server { + const char *path; /**< The path the uds is bind to. */ + int listenfd; /**< The listener sockfd. */ + struct fdset fdset; /**< The fd list this vhost server manages. */ +}; + +/*********** FROM hw/virtio/vhost-user.c *************************************/ + +#define VHOST_MEMORY_MAX_NREGIONS 8 + +typedef enum VhostUserRequest { + VHOST_USER_NONE = 0, + VHOST_USER_GET_FEATURES = 1, + VHOST_USER_SET_FEATURES = 2, + VHOST_USER_SET_OWNER = 3, + VHOST_USER_RESET_OWNER = 4, + VHOST_USER_SET_MEM_TABLE = 5, + VHOST_USER_SET_LOG_BASE = 6, + VHOST_USER_SET_LOG_FD = 7, + VHOST_USER_SET_VRING_NUM = 8, + VHOST_USER_SET_VRING_ADDR = 9, + VHOST_USER_SET_VRING_BASE = 10, + VHOST_USER_GET_VRING_BASE = 11, + VHOST_USER_SET_VRING_KICK = 12, + VHOST_USER_SET_VRING_CALL = 13, + VHOST_USER_SET_VRING_ERR = 14, + VHOST_USER_MAX +} VhostUserRequest; + +typedef struct VhostUserMemoryRegion { + uint64_t guest_phys_addr; + uint64_t memory_size; + uint64_t userspace_addr; + uint64_t mmap_offset; +} VhostUserMemoryRegion; + +typedef struct VhostUserMemory { + uint32_t nregions; + uint32_t padding; + VhostUserMemoryRegion regions[VHOST_MEMORY_MAX_NREGIONS]; +} VhostUserMemory; + +typedef struct VhostUserMsg { + VhostUserRequest request; + +#define VHOST_USER_VERSION_MASK (0x3) +#define VHOST_USER_REPLY_MASK (0x1 << 2) + uint32_t flags; + uint32_t size; /* the following payload size */ + union { +#define VHOST_USER_VRING_IDX_MASK (0xff) +#define VHOST_USER_VRING_NOFD_MASK (0x1<<8) + uint64_t u64; + struct vhost_vring_state state; + struct vhost_vring_addr addr; + VhostUserMemory memory; + } payload; + int fds[VHOST_MEMORY_MAX_NREGIONS]; +} __attribute__((packed)) VhostUserMsg; + +#define VHOST_USER_HDR_SIZE (intptr_t)(&((VhostUserMsg *)0)->payload.u64) + +/* The version of the protocol we support */ +#define VHOST_USER_VERSION (0x1) + 
+/*****************************************************************************/ +#endif diff --git a/lib/librte_vhost/vhost-user/virtio-net-user.c b/lib/librte_vhost/vhost-user/virtio-net-user.c new file mode 100644 index 0000000..f38e6cc --- /dev/null +++ b/lib/librte_vhost/vhost-user/virtio-net-user.c @@ -0,0 +1,208 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <sys/mman.h> + +#include <rte_log.h> + +#include "virtio-net-user.h" +#include "vhost-net-user.h" +#include "vhost-net.h" + +extern const struct vhost_net_device_ops *ops; + +#if 0 +int +user_set_mem_table(struct vhost_device_ctx ctx, struct VhostUserMsg *pmsg) +{ + unsigned int idx; + struct VhostUserMemory memory = pmsg->payload.memory; + struct virtio_memory_regions regions[VHOST_MEMORY_MAX_NREGIONS]; + uint64_t mapped_address, base_address = 0, mem_size = 0; + + for (idx = 0; idx < memory.nregions; idx++) { + if (memory.regions[idx].guest_phys_addr == 0) + base_address = memory.regions[idx].userspace_addr; + } + if (base_address == 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "couldn't find the mem region whose gpa is 0.\n"); + return -1; + } + + for (idx = 0; idx < memory.nregions; idx++) { + uint64_t size = memory.regions[idx].userspace_addr - + base_address + memory.regions[idx].memory_size; + if (mem_size < size) + mem_size = size; + } + + /* + * here we assume qemu will map only one file for memory allocation, + * we only use fds[0] with offset 0. 
+ */ + mapped_address = (uint64_t)(uintptr_t)mmap(NULL, mem_size, + PROT_READ | PROT_WRITE, MAP_SHARED, pmsg->fds[0], 0); + + if (mapped_address == (uint64_t)(uintptr_t)MAP_FAILED) { + RTE_LOG(ERR, VHOST_CONFIG, " mmap qemu guest failed.\n"); + return -1; + } + + for (idx = 0; idx < memory.nregions; idx++) { + regions[idx].guest_phys_address = + memory.regions[idx].guest_phys_addr; + regions[idx].guest_phys_address_end = + memory.regions[idx].guest_phys_addr + + memory.regions[idx].memory_size; + regions[idx].memory_size = memory.regions[idx].memory_size; + regions[idx].userspace_address = + memory.regions[idx].userspace_addr; + + regions[idx].address_offset = mapped_address - base_address + + regions[idx].userspace_address - + regions[idx].guest_phys_address; + LOG_DEBUG(VHOST_CONFIG, + "REGION: %u - GPA: %p - QEMU VA: %p - SIZE (%"PRIu64")\n", + idx, + (void *)(uintptr_t)regions[idx].guest_phys_address, + (void *)(uintptr_t)regions[idx].userspace_address, + regions[idx].memory_size); + } + ops->set_mem_table(ctx, regions, memory.nregions); + return 0; +} + +#else + +int +user_set_mem_table(struct vhost_device_ctx ctx, struct VhostUserMsg *pmsg) +{ + unsigned int idx; + struct VhostUserMemory memory = pmsg->payload.memory; + struct virtio_memory_regions regions[VHOST_MEMORY_MAX_NREGIONS]; + uint64_t mapped_address, base_address = 0; + + for (idx = 0; idx < memory.nregions; idx++) { + if (memory.regions[idx].guest_phys_addr == 0) + base_address = memory.regions[idx].userspace_addr; + } + if (base_address == 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "couldn't find the mem region whose gpa is 0.\n"); + return -1; + } + + + for (idx = 0; idx < memory.nregions; idx++) { + regions[idx].guest_phys_address = + memory.regions[idx].guest_phys_addr; + regions[idx].guest_phys_address_end = + memory.regions[idx].guest_phys_addr + + memory.regions[idx].memory_size; + regions[idx].memory_size = memory.regions[idx].memory_size; + regions[idx].userspace_address = + 
memory.regions[idx].userspace_addr; +/* + mapped_address = (uint64_t)(uintptr_t)mmap(NULL, + regions[idx].memory_size, + PROT_READ | PROT_WRITE, MAP_SHARED, + pmsg->fds[idx], + memory.regions[idx].mmap_offset); +*/ + +/* This is ugly */ + mapped_address = (uint64_t)(uintptr_t)mmap(NULL, + regions[idx].memory_size + + memory.regions[idx].mmap_offset, + PROT_READ | PROT_WRITE, MAP_SHARED, + pmsg->fds[idx], + 0); + printf("mapped to %p\n", (void *)mapped_address); + + if (mapped_address == (uint64_t)(uintptr_t)MAP_FAILED) { + RTE_LOG(ERR, VHOST_CONFIG, " mmap qemu guest failed.\n"); + return -1; + } + +// printf("ret=%d\n", munmap((void *)mapped_address, (regions[idx].memory_size + memory.regions[idx].mmap_offset + 0x3FFFFFFF) & ~0x3FFFFFFF)); +// printf("unaligned ret=%d\n", munmap((void *)mapped_address, (regions[idx].memory_size + memory.regions[idx].mmap_offset ) )); + mapped_address += memory.regions[idx].mmap_offset; + + regions[idx].address_offset = mapped_address - + regions[idx].guest_phys_address; + LOG_DEBUG(VHOST_CONFIG, + "REGION: %u - GPA: %p - QEMU VA: %p - SIZE (%"PRIu64")\n", + idx, + (void *)(uintptr_t)regions[idx].guest_phys_address, + (void *)(uintptr_t)regions[idx].userspace_address, + regions[idx].memory_size); + } + ops->set_mem_table(ctx, regions, memory.nregions); + return 0; +} + + + + +#endif + + +void +user_set_vring_call(struct vhost_device_ctx ctx, struct VhostUserMsg *pmsg) +{ + struct vhost_vring_file file; + + file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + file.fd = pmsg->fds[0]; + RTE_LOG(INFO, VHOST_CONFIG, + "vring call idx:%d file:%d\n", file.index, file.fd); + ops->set_vring_call(ctx, &file); +} + + +void +user_set_vring_kick(struct vhost_device_ctx ctx, struct VhostUserMsg *pmsg) +{ + struct vhost_vring_file file; + + file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + file.fd = pmsg->fds[0]; + RTE_LOG(INFO, VHOST_CONFIG, + "vring kick idx:%d file:%d\n", file.index, file.fd); + ops->set_vring_kick(ctx, 
&file); +} diff --git a/lib/librte_vhost/vhost-user/virtio-net-user.h b/lib/librte_vhost/vhost-user/virtio-net-user.h new file mode 100644 index 0000000..0969376 --- /dev/null +++ b/lib/librte_vhost/vhost-user/virtio-net-user.h @@ -0,0 +1,11 @@ +#ifndef _VIRTIO_NET_USER_H +#define _VIRTIO_NET_USER_H + +#include "vhost-net.h" +#include "vhost-net-user.h" + +int user_set_mem_table(struct vhost_device_ctx, struct VhostUserMsg *); +void user_set_vring_kick(struct vhost_device_ctx, struct VhostUserMsg *); +void user_set_vring_call(struct vhost_device_ctx, struct VhostUserMsg *); + +#endif diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c index ccfd82f..8ff0301 100644 --- a/lib/librte_vhost/vhost_rxtx.c +++ b/lib/librte_vhost/vhost_rxtx.c @@ -38,19 +38,14 @@ #include <rte_memcpy.h> #include <rte_virtio_net.h> -#include "vhost-net-cdev.h" +#include "vhost-net.h" -#define MAX_PKT_BURST 32 +#define VHOST_MAX_PKT_BURST 64 +#define VHOST_MAX_MRG_PKT_BURST 64 -/** - * This function adds buffers to the virtio devices RX virtqueue. Buffers can - * be received from the physical port or from another virtio device. A packet - * count is returned to indicate the number of packets that are succesfully - * added to the RX queue. This function works when mergeable is disabled. 
- */ -static inline uint32_t __attribute__((always_inline)) -virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id, - struct rte_mbuf **pkts, uint32_t count) + +uint32_t +rte_vhost_enqueue_burst(struct virtio_net *dev, uint16_t queue_id, struct rte_mbuf **pkts, uint32_t count) { struct vhost_virtqueue *vq; struct vring_desc *desc; @@ -59,26 +54,23 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id, struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0}; uint64_t buff_addr = 0; uint64_t buff_hdr_addr = 0; - uint32_t head[MAX_PKT_BURST], packet_len = 0; + uint32_t head[VHOST_MAX_PKT_BURST], packet_len = 0; uint32_t head_idx, packet_success = 0; + uint32_t mergeable, mrg_count = 0; uint16_t avail_idx, res_cur_idx; uint16_t res_base_idx, res_end_idx; uint16_t free_entries; uint8_t success = 0; - LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh); + LOG_DEBUG(VHOST_DATA, "(%"PRIu64") %s()\n", dev->device_fh, __func__); if (unlikely(queue_id != VIRTIO_RXQ)) { LOG_DEBUG(VHOST_DATA, "mq isn't supported in this version.\n"); return 0; } vq = dev->virtqueue[VIRTIO_RXQ]; - count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count; - - /* - * As many data cores may want access to available buffers, - * they need to be reserved. - */ + count = (count > VHOST_MAX_PKT_BURST) ? VHOST_MAX_PKT_BURST : count; + /* As many data cores may want access to available buffers, they need to be reserved. */ do { res_base_idx = vq->last_used_idx_res; avail_idx = *((volatile uint16_t *)&vq->avail->idx); @@ -93,21 +85,25 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id, res_end_idx = res_base_idx + count; /* vq->last_used_idx_res is atomically updated. */ - /* TODO: Allow to disable cmpset if no concurrency in application. */ + /* TODO: Allow to disable cmpset if no concurrency in application */ success = rte_atomic16_cmpset(&vq->last_used_idx_res, res_base_idx, res_end_idx); + /* If there is contention here and failed, try again. 
*/ } while (unlikely(success == 0)); res_cur_idx = res_base_idx; LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n", - dev->device_fh, res_cur_idx, res_end_idx); + dev->device_fh, + res_cur_idx, res_end_idx); /* Prefetch available ring to retrieve indexes. */ rte_prefetch0(&vq->avail->ring[res_cur_idx & (vq->size - 1)]); + /* Check if the VIRTIO_NET_F_MRG_RXBUF feature is enabled. */ + mergeable = dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF); + /* Retrieve all of the head indexes first to avoid caching issues. */ for (head_idx = 0; head_idx < count; head_idx++) - head[head_idx] = vq->avail->ring[(res_cur_idx + head_idx) & - (vq->size - 1)]; + head[head_idx] = vq->avail->ring[(res_cur_idx + head_idx) & (vq->size - 1)]; /*Prefetch descriptor index. */ rte_prefetch0(&vq->desc[head[packet_success]]); @@ -123,46 +119,57 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id, /* Prefetch buffer address. */ rte_prefetch0((void *)(uintptr_t)buff_addr); - /* Copy virtio_hdr to packet and increment buffer address */ - buff_hdr_addr = buff_addr; - packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen; - - /* - * If the descriptors are chained the header and data are - * placed in separate buffers. - */ - if (desc->flags & VRING_DESC_F_NEXT) { - desc->len = vq->vhost_hlen; - desc = &vq->desc[desc->next]; - /* Buffer address translation. */ - buff_addr = gpa_to_vva(dev, desc->addr); - desc->len = rte_pktmbuf_data_len(buff); + if (mergeable && (mrg_count != 0)) { + desc->len = packet_len = rte_pktmbuf_data_len(buff); } else { - buff_addr += vq->vhost_hlen; - desc->len = packet_len; + /* Copy virtio_hdr to packet and increment buffer address */ + buff_hdr_addr = buff_addr; + packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen; + + /* + * If the descriptors are chained the header and data are placed in + * separate buffers. 
+ */ + if (desc->flags & VRING_DESC_F_NEXT) { + desc->len = vq->vhost_hlen; + desc = &vq->desc[desc->next]; + /* Buffer address translation. */ + buff_addr = gpa_to_vva(dev, desc->addr); + desc->len = rte_pktmbuf_data_len(buff); + } else { + buff_addr += vq->vhost_hlen; + desc->len = packet_len; + } } + VHOST_PRINT_PACKET(dev, (uintptr_t)buff_addr, rte_pktmbuf_data_len(buff), 0); + /* Update used ring with desc information */ - vq->used->ring[res_cur_idx & (vq->size - 1)].id = - head[packet_success]; + vq->used->ring[res_cur_idx & (vq->size - 1)].id = head[packet_success]; vq->used->ring[res_cur_idx & (vq->size - 1)].len = packet_len; /* Copy mbuf data to buffer */ - /* FIXME for sg mbuf and the case that desc couldn't hold the mbuf data */ - rte_memcpy((void *)(uintptr_t)buff_addr, - rte_pktmbuf_mtod(buff, const void *), - rte_pktmbuf_data_len(buff)); - PRINT_PACKET(dev, (uintptr_t)buff_addr, - rte_pktmbuf_data_len(buff), 0); + /* TODO fixme for sg mbuf and the case that desc couldn't hold the mbuf data */ + rte_memcpy((void *)(uintptr_t)buff_addr, (const void *)buff->pkt.data, rte_pktmbuf_data_len(buff)); res_cur_idx++; packet_success++; - rte_memcpy((void *)(uintptr_t)buff_hdr_addr, - (const void *)&virtio_hdr, vq->vhost_hlen); - - PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1); - + /* If mergeable is disabled then a header is required per buffer. */ + if (!mergeable) { + rte_memcpy((void *)(uintptr_t)buff_hdr_addr, (const void *)&virtio_hdr, vq->vhost_hlen); + VHOST_PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1); + } else { + mrg_count++; + /* Merge buffer can only handle so many buffers at a time. Tell the guest if this limit is reached. 
*/ + if ((mrg_count == VHOST_MAX_MRG_PKT_BURST) || (res_cur_idx == res_end_idx)) { + virtio_hdr.num_buffers = mrg_count; + LOG_DEBUG(VHOST_DATA, "(%"PRIu64") RX: Num merge buffers %d\n", dev->device_fh, virtio_hdr.num_buffers); + rte_memcpy((void *)(uintptr_t)buff_hdr_addr, (const void *)&virtio_hdr, vq->vhost_hlen); + VHOST_PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1); + mrg_count = 0; + } + } if (res_cur_idx < res_end_idx) { /* Prefetch descriptor index. */ rte_prefetch0(&vq->desc[head[packet_success]]); @@ -184,357 +191,18 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id, return count; } -static inline uint32_t __attribute__((always_inline)) -copy_from_mbuf_to_vring(struct virtio_net *dev, uint16_t res_base_idx, - uint16_t res_end_idx, struct rte_mbuf *pkt) -{ - uint32_t vec_idx = 0; - uint32_t entry_success = 0; - struct vhost_virtqueue *vq; - /* The virtio_hdr is initialised to 0. */ - struct virtio_net_hdr_mrg_rxbuf virtio_hdr = { - {0, 0, 0, 0, 0, 0}, 0}; - uint16_t cur_idx = res_base_idx; - uint64_t vb_addr = 0; - uint64_t vb_hdr_addr = 0; - uint32_t seg_offset = 0; - uint32_t vb_offset = 0; - uint32_t seg_avail; - uint32_t vb_avail; - uint32_t cpy_len, entry_len; - - if (pkt == NULL) - return 0; - - LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| " - "End Index %d\n", - dev->device_fh, cur_idx, res_end_idx); - - /* - * Convert from gpa to vva - * (guest physical addr -> vhost virtual addr) - */ - vq = dev->virtqueue[VIRTIO_RXQ]; - vb_addr = - gpa_to_vva(dev, vq->buf_vec[vec_idx].buf_addr); - vb_hdr_addr = vb_addr; - - /* Prefetch buffer address. 
*/ - rte_prefetch0((void *)(uintptr_t)vb_addr); - - virtio_hdr.num_buffers = res_end_idx - res_base_idx; - - LOG_DEBUG(VHOST_DATA, "(%"PRIu64") RX: Num merge buffers %d\n", - dev->device_fh, virtio_hdr.num_buffers); - rte_memcpy((void *)(uintptr_t)vb_hdr_addr, - (const void *)&virtio_hdr, vq->vhost_hlen); - - PRINT_PACKET(dev, (uintptr_t)vb_hdr_addr, vq->vhost_hlen, 1); - - seg_avail = rte_pktmbuf_data_len(pkt); - vb_offset = vq->vhost_hlen; - vb_avail = - vq->buf_vec[vec_idx].buf_len - vq->vhost_hlen; - - entry_len = vq->vhost_hlen; - - if (vb_avail == 0) { - uint32_t desc_idx = - vq->buf_vec[vec_idx].desc_idx; - vq->desc[desc_idx].len = vq->vhost_hlen; - - if ((vq->desc[desc_idx].flags - & VRING_DESC_F_NEXT) == 0) { - /* Update used ring with desc information */ - vq->used->ring[cur_idx & (vq->size - 1)].id - = vq->buf_vec[vec_idx].desc_idx; - vq->used->ring[cur_idx & (vq->size - 1)].len - = entry_len; - - entry_len = 0; - cur_idx++; - entry_success++; - } - - vec_idx++; - vb_addr = - gpa_to_vva(dev, vq->buf_vec[vec_idx].buf_addr); - - /* Prefetch buffer address. */ - rte_prefetch0((void *)(uintptr_t)vb_addr); - vb_offset = 0; - vb_avail = vq->buf_vec[vec_idx].buf_len; - } - - cpy_len = RTE_MIN(vb_avail, seg_avail); - - while (cpy_len > 0) { - /* Copy mbuf data to vring buffer */ - rte_memcpy((void *)(uintptr_t)(vb_addr + vb_offset), - (const void *)(rte_pktmbuf_mtod(pkt, char*) + seg_offset), - cpy_len); - - PRINT_PACKET(dev, - (uintptr_t)(vb_addr + vb_offset), - cpy_len, 0); - - seg_offset += cpy_len; - vb_offset += cpy_len; - seg_avail -= cpy_len; - vb_avail -= cpy_len; - entry_len += cpy_len; - - if (seg_avail != 0) { - /* - * The virtio buffer in this vring - * entry reach to its end. - * But the segment doesn't complete. 
- */ - if ((vq->desc[vq->buf_vec[vec_idx].desc_idx].flags & - VRING_DESC_F_NEXT) == 0) { - /* Update used ring with desc information */ - vq->used->ring[cur_idx & (vq->size - 1)].id - = vq->buf_vec[vec_idx].desc_idx; - vq->used->ring[cur_idx & (vq->size - 1)].len - = entry_len; - entry_len = 0; - cur_idx++; - entry_success++; - } - - vec_idx++; - vb_addr = gpa_to_vva(dev, - vq->buf_vec[vec_idx].buf_addr); - vb_offset = 0; - vb_avail = vq->buf_vec[vec_idx].buf_len; - cpy_len = RTE_MIN(vb_avail, seg_avail); - } else { - /* - * This current segment complete, need continue to - * check if the whole packet complete or not. - */ - pkt = pkt->next; - if (pkt != NULL) { - /* - * There are more segments. - */ - if (vb_avail == 0) { - /* - * This current buffer from vring is - * used up, need fetch next buffer - * from buf_vec. - */ - uint32_t desc_idx = - vq->buf_vec[vec_idx].desc_idx; - vq->desc[desc_idx].len = vb_offset; - - if ((vq->desc[desc_idx].flags & - VRING_DESC_F_NEXT) == 0) { - uint16_t wrapped_idx = - cur_idx & (vq->size - 1); - /* - * Update used ring with the - * descriptor information - */ - vq->used->ring[wrapped_idx].id - = desc_idx; - vq->used->ring[wrapped_idx].len - = entry_len; - entry_success++; - entry_len = 0; - cur_idx++; - } - - /* Get next buffer from buf_vec. */ - vec_idx++; - vb_addr = gpa_to_vva(dev, - vq->buf_vec[vec_idx].buf_addr); - vb_avail = - vq->buf_vec[vec_idx].buf_len; - vb_offset = 0; - } - - seg_offset = 0; - seg_avail = rte_pktmbuf_data_len(pkt); - cpy_len = RTE_MIN(vb_avail, seg_avail); - } else { - /* - * This whole packet completes. 
- */ - uint32_t desc_idx = - vq->buf_vec[vec_idx].desc_idx; - vq->desc[desc_idx].len = vb_offset; - - while (vq->desc[desc_idx].flags & - VRING_DESC_F_NEXT) { - desc_idx = vq->desc[desc_idx].next; - vq->desc[desc_idx].len = 0; - } - - /* Update used ring with desc information */ - vq->used->ring[cur_idx & (vq->size - 1)].id - = vq->buf_vec[vec_idx].desc_idx; - vq->used->ring[cur_idx & (vq->size - 1)].len - = entry_len; - entry_len = 0; - cur_idx++; - entry_success++; - seg_avail = 0; - cpy_len = RTE_MIN(vb_avail, seg_avail); - } - } - } - - return entry_success; -} - -/* - * This function works for mergeable RX. - */ -static inline uint32_t __attribute__((always_inline)) -virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id, - struct rte_mbuf **pkts, uint32_t count) +uint32_t +rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id, struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint32_t count) { - struct vhost_virtqueue *vq; - uint32_t pkt_idx = 0, entry_success = 0; - uint16_t avail_idx, res_cur_idx; - uint16_t res_base_idx, res_end_idx; - uint8_t success = 0; - - LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_merge_rx()\n", - dev->device_fh); - if (unlikely(queue_id != VIRTIO_RXQ)) { - LOG_DEBUG(VHOST_DATA, "mq isn't supported in this version.\n"); - } - - vq = dev->virtqueue[VIRTIO_RXQ]; - count = RTE_MIN((uint32_t)MAX_PKT_BURST, count); - - if (count == 0) - return 0; - - for (pkt_idx = 0; pkt_idx < count; pkt_idx++) { - uint32_t secure_len = 0; - uint16_t need_cnt; - uint32_t vec_idx = 0; - uint32_t pkt_len = pkts[pkt_idx]->pkt_len + vq->vhost_hlen; - uint16_t i, id; - - do { - /* - * As many data cores may want access to available - * buffers, they need to be reserved. 
- */ - res_base_idx = vq->last_used_idx_res; - res_cur_idx = res_base_idx; - - do { - avail_idx = *((volatile uint16_t *)&vq->avail->idx); - if (unlikely(res_cur_idx == avail_idx)) { - LOG_DEBUG(VHOST_DATA, - "(%"PRIu64") Failed " - "to get enough desc from " - "vring\n", - dev->device_fh); - return pkt_idx; - } else { - uint16_t wrapped_idx = - (res_cur_idx) & (vq->size - 1); - uint32_t idx = - vq->avail->ring[wrapped_idx]; - uint8_t next_desc; - - do { - next_desc = 0; - secure_len += vq->desc[idx].len; - if (vq->desc[idx].flags & - VRING_DESC_F_NEXT) { - idx = vq->desc[idx].next; - next_desc = 1; - } - } while (next_desc); - - res_cur_idx++; - } - } while (pkt_len > secure_len); - - /* vq->last_used_idx_res is atomically updated. */ - success = rte_atomic16_cmpset(&vq->last_used_idx_res, - res_base_idx, - res_cur_idx); - } while (success == 0); - - id = res_base_idx; - need_cnt = res_cur_idx - res_base_idx; - - for (i = 0; i < need_cnt; i++, id++) { - uint16_t wrapped_idx = id & (vq->size - 1); - uint32_t idx = vq->avail->ring[wrapped_idx]; - uint8_t next_desc; - do { - next_desc = 0; - vq->buf_vec[vec_idx].buf_addr = - vq->desc[idx].addr; - vq->buf_vec[vec_idx].buf_len = - vq->desc[idx].len; - vq->buf_vec[vec_idx].desc_idx = idx; - vec_idx++; - - if (vq->desc[idx].flags & VRING_DESC_F_NEXT) { - idx = vq->desc[idx].next; - next_desc = 1; - } - } while (next_desc); - } - - res_end_idx = res_cur_idx; - - entry_success = copy_from_mbuf_to_vring(dev, res_base_idx, - res_end_idx, pkts[pkt_idx]); - - rte_compiler_barrier(); - - /* - * Wait until it's our turn to add our buffer - * to the used ring. - */ - while (unlikely(vq->last_used_idx != res_base_idx)) - rte_pause(); - - *(volatile uint16_t *)&vq->used->idx += entry_success; - vq->last_used_idx = res_end_idx; - - /* Kick the guest if necessary. 
*/ - if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) - eventfd_write((int)vq->kickfd, 1); - } - - return count; -} - -uint16_t -rte_vhost_enqueue_burst(struct virtio_net *dev, uint16_t queue_id, - struct rte_mbuf **pkts, uint16_t count) -{ - if (unlikely(dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF))) - return virtio_dev_merge_rx(dev, queue_id, pkts, count); - else - return virtio_dev_rx(dev, queue_id, pkts, count); -} - -uint16_t -rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id, - struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count) -{ - struct rte_mbuf *m, *prev; + struct rte_mbuf *mbuf; struct vhost_virtqueue *vq; struct vring_desc *desc; - uint64_t vb_addr = 0; - uint32_t head[MAX_PKT_BURST]; + uint64_t buff_addr = 0; + uint32_t head[VHOST_MAX_PKT_BURST]; uint32_t used_idx; uint32_t i; - uint16_t free_entries, entry_success = 0; + uint16_t free_entries, packet_success = 0; uint16_t avail_idx; if (unlikely(queue_id != VIRTIO_TXQ)) { @@ -549,8 +217,8 @@ rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id, if (vq->last_used_idx == avail_idx) return 0; - LOG_DEBUG(VHOST_DATA, "%s (%"PRIu64")\n", __func__, - dev->device_fh); + LOG_DEBUG(VHOST_DATA, "(%"PRIu64") %s(%d->%d)\n", + dev->device_fh, __func__, vq->last_used_idx, avail_idx); /* Prefetch available ring to retrieve head indexes. */ rte_prefetch0(&vq->avail->ring[vq->last_used_idx & (vq->size - 1)]); @@ -558,173 +226,68 @@ rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id, /*get the number of free entries in the ring*/ free_entries = (avail_idx - vq->last_used_idx); - free_entries = RTE_MIN(free_entries, count); + if (free_entries > count) + free_entries = count; /* Limit to MAX_PKT_BURST. 
*/ - free_entries = RTE_MIN(free_entries, MAX_PKT_BURST); + if (free_entries > VHOST_MAX_PKT_BURST) + free_entries = VHOST_MAX_PKT_BURST; - LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n", - dev->device_fh, free_entries); + LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n", dev->device_fh, free_entries); /* Retrieve all of the head indexes first to avoid caching issues. */ for (i = 0; i < free_entries; i++) head[i] = vq->avail->ring[(vq->last_used_idx + i) & (vq->size - 1)]; /* Prefetch descriptor index. */ - rte_prefetch0(&vq->desc[head[entry_success]]); + rte_prefetch0(&vq->desc[head[packet_success]]); rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]); - while (entry_success < free_entries) { - uint32_t vb_avail, vb_offset; - uint32_t seg_avail, seg_offset; - uint32_t cpy_len; - uint32_t seg_num = 0; - struct rte_mbuf *cur; - uint8_t alloc_err = 0; - - desc = &vq->desc[head[entry_success]]; + while (packet_success < free_entries) { + desc = &vq->desc[head[packet_success]]; /* Discard first buffer as it is the virtio header */ desc = &vq->desc[desc->next]; /* Buffer address translation. */ - vb_addr = gpa_to_vva(dev, desc->addr); + buff_addr = gpa_to_vva(dev, desc->addr); /* Prefetch buffer address. */ - rte_prefetch0((void *)(uintptr_t)vb_addr); + rte_prefetch0((void *)(uintptr_t)buff_addr); used_idx = vq->last_used_idx & (vq->size - 1); - if (entry_success < (free_entries - 1)) { + if (packet_success < (free_entries - 1)) { /* Prefetch descriptor index. */ - rte_prefetch0(&vq->desc[head[entry_success+1]]); + rte_prefetch0(&vq->desc[head[packet_success+1]]); rte_prefetch0(&vq->used->ring[(used_idx + 1) & (vq->size - 1)]); } /* Update used index buffer information. */ - vq->used->ring[used_idx].id = head[entry_success]; + vq->used->ring[used_idx].id = head[packet_success]; vq->used->ring[used_idx].len = 0; - vb_offset = 0; - vb_avail = desc->len; - /* Allocate an mbuf and populate the structure. 
*/ - m = rte_pktmbuf_alloc(mbuf_pool); - if (unlikely(m == NULL)) { - RTE_LOG(ERR, VHOST_DATA, - "Failed to allocate memory for mbuf.\n"); - return entry_success; + mbuf = rte_pktmbuf_alloc(mbuf_pool); + if (unlikely(mbuf == NULL)) { + RTE_LOG(ERR, VHOST_DATA, "Failed to allocate memory for mbuf.\n"); + return packet_success; } - seg_offset = 0; - seg_avail = m->buf_len - RTE_PKTMBUF_HEADROOM; - cpy_len = RTE_MIN(vb_avail, seg_avail); - - PRINT_PACKET(dev, (uintptr_t)vb_addr, desc->len, 0); - - seg_num++; - cur = m; - prev = m; - while (cpy_len != 0) { - rte_memcpy((void *)(rte_pktmbuf_mtod(cur, char *) + seg_offset), - (void *)((uintptr_t)(vb_addr + vb_offset)), - cpy_len); - - seg_offset += cpy_len; - vb_offset += cpy_len; - vb_avail -= cpy_len; - seg_avail -= cpy_len; - - if (vb_avail != 0) { - /* - * The segment reachs to its end, - * while the virtio buffer in TX vring has - * more data to be copied. - */ - cur->data_len = seg_offset; - m->pkt_len += seg_offset; - /* Allocate mbuf and populate the structure. */ - cur = rte_pktmbuf_alloc(mbuf_pool); - if (unlikely(cur == NULL)) { - RTE_LOG(ERR, VHOST_DATA, "Failed to " - "allocate memory for mbuf.\n"); - rte_pktmbuf_free(m); - alloc_err = 1; - break; - } - - seg_num++; - prev->next = cur; - prev = cur; - seg_offset = 0; - seg_avail = cur->buf_len - RTE_PKTMBUF_HEADROOM; - } else { - if (desc->flags & VRING_DESC_F_NEXT) { - /* - * There are more virtio buffers in - * same vring entry need to be copied. - */ - if (seg_avail == 0) { - /* - * The current segment hasn't - * room to accomodate more - * data. - */ - cur->data_len = seg_offset; - m->pkt_len += seg_offset; - /* - * Allocate an mbuf and - * populate the structure. 
- */ - cur = rte_pktmbuf_alloc(mbuf_pool); - if (unlikely(cur == NULL)) { - RTE_LOG(ERR, - VHOST_DATA, - "Failed to " - "allocate memory " - "for mbuf\n"); - rte_pktmbuf_free(m); - alloc_err = 1; - break; - } - seg_num++; - prev->next = cur; - prev = cur; - seg_offset = 0; - seg_avail = cur->buf_len - RTE_PKTMBUF_HEADROOM; - } - - desc = &vq->desc[desc->next]; - - /* Buffer address translation. */ - vb_addr = gpa_to_vva(dev, desc->addr); - /* Prefetch buffer address. */ - rte_prefetch0((void *)(uintptr_t)vb_addr); - vb_offset = 0; - vb_avail = desc->len; - - PRINT_PACKET(dev, (uintptr_t)vb_addr, - desc->len, 0); - } else { - /* The whole packet completes. */ - cur->data_len = seg_offset; - m->pkt_len += seg_offset; - vb_avail = 0; - } - } + mbuf->pkt.data_len = desc->len; + mbuf->pkt.pkt_len = mbuf->pkt.data_len; - cpy_len = RTE_MIN(vb_avail, seg_avail); - } + rte_memcpy((void *) mbuf->pkt.data, + (const void *) buff_addr, mbuf->pkt.data_len); - if (unlikely(alloc_err == 1)) - break; + pkts[packet_success] = mbuf; - m->nb_segs = seg_num; + VHOST_PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0); - pkts[entry_success] = m; vq->last_used_idx++; - entry_success++; + packet_success++; } rte_compiler_barrier(); - vq->used->idx += entry_success; + vq->used->idx += packet_success; /* Kick guest if required. */ if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) eventfd_write((int)vq->kickfd, 1); - return entry_success; + + return packet_success; } diff --git a/lib/librte_vhost/virtio-net.c b/lib/librte_vhost/virtio-net.c index 852b6d1..516e743 100644 --- a/lib/librte_vhost/virtio-net.c +++ b/lib/librte_vhost/virtio-net.c @@ -31,17 +31,14 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#include <dirent.h> -#include <fuse/cuse_lowlevel.h> #include <linux/vhost.h> #include <linux/virtio_net.h> #include <stddef.h> #include <stdint.h> #include <stdlib.h> -#include <sys/eventfd.h> -#include <sys/ioctl.h> #include <sys/mman.h> #include <unistd.h> +#include <assert.h> #include <rte_ethdev.h> #include <rte_log.h> @@ -49,10 +46,8 @@ #include <rte_memory.h> #include <rte_virtio_net.h> -#include "vhost-net-cdev.h" -#include "eventfd_link/eventfd_link.h" - -/* +#include "vhost-net.h" +/** * Device linked list structure for configuration. */ struct virtio_net_config_ll { @@ -60,38 +55,15 @@ struct virtio_net_config_ll { struct virtio_net_config_ll *next; /* Next dev on linked list.*/ }; -const char eventfd_cdev[] = "/dev/eventfd-link"; - -/* device ops to add/remove device to/from data core. */ +/* device ops to add/remove device to data core. */ static struct virtio_net_device_ops const *notify_ops; -/* root address of the linked list of managed virtio devices */ +/* root address of the linked list in the configuration core. */ static struct virtio_net_config_ll *ll_root; /* Features supported by this lib. */ -#define VHOST_SUPPORTED_FEATURES ((1ULL << VIRTIO_NET_F_MRG_RXBUF) | \ - (1ULL << VIRTIO_NET_F_CTRL_RX)) +#define VHOST_SUPPORTED_FEATURES (1ULL << VIRTIO_NET_F_MRG_RXBUF) static uint64_t VHOST_FEATURES = VHOST_SUPPORTED_FEATURES; -/* Line size for reading maps file. */ -static const uint32_t BUFSIZE = PATH_MAX; - -/* Size of prot char array in procmap. */ -#define PROT_SZ 5 - -/* Number of elements in procmap struct. */ -#define PROCMAP_SZ 8 - -/* Structure containing information gathered from maps file. */ -struct procmap { - uint64_t va_start; /* Start virtual address in file. */ - uint64_t len; /* Size of file. */ - uint64_t pgoff; /* Not used. */ - uint32_t maj; /* Not used. */ - uint32_t min; /* Not used. */ - uint32_t ino; /* Not used. */ - char prot[PROT_SZ]; /* Not used. */ - char fname[PATH_MAX]; /* File name. 
*/ -}; /* * Converts QEMU virtual address to Vhost virtual address. This function is @@ -110,199 +82,15 @@ qva_to_vva(struct virtio_net *dev, uint64_t qemu_va) if ((qemu_va >= region->userspace_address) && (qemu_va <= region->userspace_address + region->memory_size)) { - vhost_va = dev->mem->mapped_address + qemu_va - - dev->mem->base_address; + vhost_va = qemu_va + region->guest_phys_address + + region->address_offset - + region->userspace_address; break; } } return vhost_va; } -/* - * Locate the file containing QEMU's memory space and - * map it to our address space. - */ -static int -host_memory_map(struct virtio_net *dev, struct virtio_memory *mem, - pid_t pid, uint64_t addr) -{ - struct dirent *dptr = NULL; - struct procmap procmap; - DIR *dp = NULL; - int fd; - int i; - char memfile[PATH_MAX]; - char mapfile[PATH_MAX]; - char procdir[PATH_MAX]; - char resolved_path[PATH_MAX]; - char *path = NULL; - FILE *fmap; - void *map; - uint8_t found = 0; - char line[BUFSIZE]; - char dlm[] = "- : "; - char *str, *sp, *in[PROCMAP_SZ]; - char *end = NULL; - - /* Path where mem files are located. */ - snprintf(procdir, PATH_MAX, "/proc/%u/fd/", pid); - /* Maps file used to locate mem file. */ - snprintf(mapfile, PATH_MAX, "/proc/%u/maps", pid); - - fmap = fopen(mapfile, "r"); - if (fmap == NULL) { - RTE_LOG(ERR, VHOST_CONFIG, - "(%"PRIu64") Failed to open maps file for pid %d\n", - dev->device_fh, pid); - return -1; - } - - /* Read through maps file until we find out base_address. */ - while (fgets(line, BUFSIZE, fmap) != 0) { - str = line; - errno = 0; - /* Split line into fields. */ - for (i = 0; i < PROCMAP_SZ; i++) { - in[i] = strtok_r(str, &dlm[i], &sp); - if ((in[i] == NULL) || (errno != 0)) { - fclose(fmap); - return -1; - } - str = NULL; - } - - /* Convert/Copy each field as needed. 
*/ - procmap.va_start = strtoull(in[0], &end, 16); - if ((in[0] == '\0') || (end == NULL) || (*end != '\0') || - (errno != 0)) { - fclose(fmap); - return -1; - } - - procmap.len = strtoull(in[1], &end, 16); - if ((in[1] == '\0') || (end == NULL) || (*end != '\0') || - (errno != 0)) { - fclose(fmap); - return -1; - } - - procmap.pgoff = strtoull(in[3], &end, 16); - if ((in[3] == '\0') || (end == NULL) || (*end != '\0') || - (errno != 0)) { - fclose(fmap); - return -1; - } - - procmap.maj = strtoul(in[4], &end, 16); - if ((in[4] == '\0') || (end == NULL) || (*end != '\0') || - (errno != 0)) { - fclose(fmap); - return -1; - } - - procmap.min = strtoul(in[5], &end, 16); - if ((in[5] == '\0') || (end == NULL) || (*end != '\0') || - (errno != 0)) { - fclose(fmap); - return -1; - } - - procmap.ino = strtoul(in[6], &end, 16); - if ((in[6] == '\0') || (end == NULL) || (*end != '\0') || - (errno != 0)) { - fclose(fmap); - return -1; - } - - memcpy(&procmap.prot, in[2], PROT_SZ); - memcpy(&procmap.fname, in[7], PATH_MAX); - - if (procmap.va_start == addr) { - procmap.len = procmap.len - procmap.va_start; - found = 1; - break; - } - } - fclose(fmap); - - if (!found) { - RTE_LOG(ERR, VHOST_CONFIG, - "(%"PRIu64") Failed to find memory file in pid %d maps file\n", - dev->device_fh, pid); - return -1; - } - - /* Find the guest memory file among the process fds. */ - dp = opendir(procdir); - if (dp == NULL) { - RTE_LOG(ERR, VHOST_CONFIG, - "(%"PRIu64") Cannot open pid %d process directory\n", - dev->device_fh, pid); - return -1; - } - - found = 0; - - /* Read the fd directory contents. 
*/ - while (NULL != (dptr = readdir(dp))) { - snprintf(memfile, PATH_MAX, "/proc/%u/fd/%s", - pid, dptr->d_name); - path = realpath(memfile, resolved_path); - if ((path == NULL) && (strlen(resolved_path) == 0)) { - RTE_LOG(ERR, VHOST_CONFIG, - "(%"PRIu64") Failed to resolve fd directory\n", - dev->device_fh); - closedir(dp); - return -1; - } - if (strncmp(resolved_path, procmap.fname, - strnlen(procmap.fname, PATH_MAX)) == 0) { - found = 1; - break; - } - } - - closedir(dp); - - if (found == 0) { - RTE_LOG(ERR, VHOST_CONFIG, - "(%"PRIu64") Failed to find memory file for pid %d\n", - dev->device_fh, pid); - return -1; - } - /* Open the shared memory file and map the memory into this process. */ - fd = open(memfile, O_RDWR); - - if (fd == -1) { - RTE_LOG(ERR, VHOST_CONFIG, - "(%"PRIu64") Failed to open %s for pid %d\n", - dev->device_fh, memfile, pid); - return -1; - } - - map = mmap(0, (size_t)procmap.len, PROT_READ|PROT_WRITE, - MAP_POPULATE|MAP_SHARED, fd, 0); - close(fd); - - if (map == MAP_FAILED) { - RTE_LOG(ERR, VHOST_CONFIG, - "(%"PRIu64") Error mapping the file %s for pid %d\n", - dev->device_fh, memfile, pid); - return -1; - } - - /* Store the memory address and size in the device data structure */ - mem->mapped_address = (uint64_t)(uintptr_t)map; - mem->mapped_size = procmap.len; - - LOG_DEBUG(VHOST_CONFIG, - "(%"PRIu64") Mem File: %s->%s - Size: %llu - VA: %p\n", - dev->device_fh, - memfile, resolved_path, - (unsigned long long)mem->mapped_size, map); - - return 0; -} /* * Retrieves an entry from the devices configuration linked list. @@ -376,7 +164,7 @@ add_config_ll_entry(struct virtio_net_config_ll *new_ll_dev) } } - +/*TODO dpdk alloc/free if possible */ /* * Unmap any memory, close any file descriptors and * free any memory owned by a device. 
@@ -389,16 +177,17 @@ cleanup_device(struct virtio_net *dev) munmap((void *)(uintptr_t)dev->mem->mapped_address, (size_t)dev->mem->mapped_size); free(dev->mem); + dev->mem = NULL; } /* Close any event notifiers opened by device. */ - if (dev->virtqueue[VIRTIO_RXQ]->callfd) + if (dev->virtqueue[VIRTIO_RXQ]->callfd > 0) close((int)dev->virtqueue[VIRTIO_RXQ]->callfd); - if (dev->virtqueue[VIRTIO_RXQ]->kickfd) + if (dev->virtqueue[VIRTIO_RXQ]->kickfd > 0) close((int)dev->virtqueue[VIRTIO_RXQ]->kickfd); - if (dev->virtqueue[VIRTIO_TXQ]->callfd) + if (dev->virtqueue[VIRTIO_TXQ]->callfd > 0) close((int)dev->virtqueue[VIRTIO_TXQ]->callfd); - if (dev->virtqueue[VIRTIO_TXQ]->kickfd) + if (dev->virtqueue[VIRTIO_TXQ]->kickfd > 0) close((int)dev->virtqueue[VIRTIO_TXQ]->kickfd); } @@ -522,8 +311,8 @@ new_device(struct vhost_device_ctx ctx) } /* - * Function is called from the CUSE release function. This function will - * cleanup the device and remove it from device configuration linked list. + * Function is called from the CUSE release function. This function will cleanup + * the device and remove it from device configuration linked list. */ static void destroy_device(struct vhost_device_ctx ctx) @@ -569,6 +358,7 @@ set_owner(struct vhost_device_ctx ctx) return -1; return 0; + /* TODO check ctx.fh is meaningfull here */ } /* @@ -651,14 +441,12 @@ set_features(struct vhost_device_ctx ctx, uint64_t *pu) * This includes storing offsets used to translate buffer addresses. 
*/ static int -set_mem_table(struct vhost_device_ctx ctx, const void *mem_regions_addr, - uint32_t nregions) +set_mem_table(struct vhost_device_ctx ctx, + const struct virtio_memory_regions *regions, uint32_t nregions) { struct virtio_net *dev; - struct vhost_memory_region *mem_regions; struct virtio_memory *mem; - uint64_t size = offsetof(struct vhost_memory, regions); - uint32_t regionidx, valid_regions; + uint32_t regionidx; dev = get_device(ctx); if (dev == NULL) @@ -682,107 +470,24 @@ set_mem_table(struct vhost_device_ctx ctx, const void *mem_regions_addr, mem->nregions = nregions; - mem_regions = (void *)(uintptr_t) - ((uint64_t)(uintptr_t)mem_regions_addr + size); - for (regionidx = 0; regionidx < mem->nregions; regionidx++) { /* Populate the region structure for each region. */ - mem->regions[regionidx].guest_phys_address = - mem_regions[regionidx].guest_phys_addr; - mem->regions[regionidx].guest_phys_address_end = - mem->regions[regionidx].guest_phys_address + - mem_regions[regionidx].memory_size; - mem->regions[regionidx].memory_size = - mem_regions[regionidx].memory_size; - mem->regions[regionidx].userspace_address = - mem_regions[regionidx].userspace_addr; - - LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") REGION: %u - GPA: %p - QEMU VA: %p - SIZE (%"PRIu64")\n", dev->device_fh, - regionidx, - (void *)(uintptr_t)mem->regions[regionidx].guest_phys_address, - (void *)(uintptr_t)mem->regions[regionidx].userspace_address, - mem->regions[regionidx].memory_size); - - /*set the base address mapping*/ + mem->regions[regionidx] = regions[regionidx]; if (mem->regions[regionidx].guest_phys_address == 0x0) { mem->base_address = mem->regions[regionidx].userspace_address; - /* Map VM memory file */ - if (host_memory_map(dev, mem, ctx.pid, - mem->base_address) != 0) { - free(mem); - return -1; - } + mem->mapped_address = + mem->regions[regionidx].address_offset; } } - /* Check that we have a valid base address. 
*/ - if (mem->base_address == 0) { - RTE_LOG(ERR, VHOST_CONFIG, "(%"PRIu64") Failed to find base address of qemu memory file.\n", dev->device_fh); - free(mem); - return -1; - } - - /* - * Check if all of our regions have valid mappings. - * Usually one does not exist in the QEMU memory file. - */ - valid_regions = mem->nregions; - for (regionidx = 0; regionidx < mem->nregions; regionidx++) { - if ((mem->regions[regionidx].userspace_address < - mem->base_address) || - (mem->regions[regionidx].userspace_address > - (mem->base_address + mem->mapped_size))) - valid_regions--; - } - - /* - * If a region does not have a valid mapping, - * we rebuild our memory struct to contain only valid entries. - */ - if (valid_regions != mem->nregions) { - LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") Not all memory regions exist in the QEMU mem file. Re-populating mem structure\n", - dev->device_fh); - - /* - * Re-populate the memory structure with only valid regions. - * Invalid regions are over-written with memmove. - */ - valid_regions = 0; - - for (regionidx = mem->nregions; 0 != regionidx--;) { - if ((mem->regions[regionidx].userspace_address < - mem->base_address) || - (mem->regions[regionidx].userspace_address > - (mem->base_address + mem->mapped_size))) { - memmove(&mem->regions[regionidx], - &mem->regions[regionidx + 1], - sizeof(struct virtio_memory_regions) * - valid_regions); - } else { - valid_regions++; - } - } - } - mem->nregions = valid_regions; + /*TODO addback the logic that remove invalid memory regions */ dev->mem = mem; - /* - * Calculate the address offset for each region. - * This offset is used to identify the vhost virtual address - * corresponding to a QEMU guest physical address. 
- */ - for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) { - dev->mem->regions[regionidx].address_offset = - dev->mem->regions[regionidx].userspace_address - - dev->mem->base_address + - dev->mem->mapped_address - - dev->mem->regions[regionidx].guest_phys_address; - - } return 0; } + /* * Called from CUSE IOCTL: VHOST_SET_VRING_NUM * The virtio device sends us the size of the descriptor ring. @@ -896,38 +601,62 @@ get_vring_base(struct vhost_device_ctx ctx, uint32_t index, /* State->index refers to the queue index. The txq is 1, rxq is 0. */ state->num = dev->virtqueue[state->index]->last_used_idx; - return 0; -} + if (dev->flags & VIRTIO_DEV_RUNNING) { + RTE_LOG(INFO, VHOST_CONFIG, + "get_vring_base message is for release\n"); + notify_ops->destroy_device(dev); + /* + * sync call. + * when it returns, it means it si removed from data core. + */ + } + /* TODO fix all munmap */ + if (dev->mem) { + munmap((void *)(uintptr_t)dev->mem->mapped_address, + (size_t)dev->mem->mapped_size); + free(dev->mem); + dev->mem = NULL; + } -/* - * This function uses the eventfd_link kernel module to copy an eventfd file - * descriptor provided by QEMU in to our process space. - */ -static int -eventfd_copy(struct virtio_net *dev, struct eventfd_copy *eventfd_copy) -{ - int eventfd_link, ret; - /* Open the character device to the kernel module. 
*/ - eventfd_link = open(eventfd_cdev, O_RDWR); - if (eventfd_link < 0) { - RTE_LOG(ERR, VHOST_CONFIG, - "(%"PRIu64") eventfd_link module is not loaded\n", - dev->device_fh); - return -1; - } + if (dev->virtqueue[VIRTIO_RXQ]->callfd > 0) + close((int)dev->virtqueue[VIRTIO_RXQ]->callfd); + dev->virtqueue[VIRTIO_RXQ]->callfd = -1; + if (dev->virtqueue[VIRTIO_TXQ]->callfd > 0) + close((int)dev->virtqueue[VIRTIO_TXQ]->callfd); + dev->virtqueue[VIRTIO_TXQ]->callfd = -1; + /* We don't cleanup callfd here as we willn't get CALLFD again */ + + dev->virtqueue[VIRTIO_RXQ]->desc = NULL; + dev->virtqueue[VIRTIO_RXQ]->avail = NULL; + dev->virtqueue[VIRTIO_RXQ]->used = NULL; + dev->virtqueue[VIRTIO_RXQ]->last_used_idx = 0; + dev->virtqueue[VIRTIO_RXQ]->last_used_idx_res = 0; + + dev->virtqueue[VIRTIO_TXQ]->desc = NULL; + dev->virtqueue[VIRTIO_TXQ]->avail = NULL; + dev->virtqueue[VIRTIO_TXQ]->used = NULL; + dev->virtqueue[VIRTIO_TXQ]->last_used_idx = 0; + dev->virtqueue[VIRTIO_TXQ]->last_used_idx_res = 0; - /* Call the IOCTL to copy the eventfd. 
*/ - ret = ioctl(eventfd_link, EVENTFD_COPY, eventfd_copy); - close(eventfd_link); - if (ret < 0) { - RTE_LOG(ERR, VHOST_CONFIG, - "(%"PRIu64") EVENTFD_COPY ioctl failed\n", - dev->device_fh); - return -1; - } + return 0; +} +static int +virtio_is_ready(struct virtio_net *dev, int index) +{ + struct vhost_virtqueue *vq1, *vq2; + /* mq support in future.*/ + vq1 = dev->virtqueue[index]; + vq2 = dev->virtqueue[index ^ 1]; + if (vq1 && vq2 && vq1->desc && vq2->desc && + (vq1->kickfd > 0) && (vq1->callfd > 0) && + (vq2->kickfd > 0) && (vq2->callfd > 0)) { + LOG_DEBUG(VHOST_CONFIG, "virtio is ready for processing.\n"); + return 1; + } + LOG_DEBUG(VHOST_CONFIG, "virtio isn't ready for processing.\n"); return 0; } @@ -940,7 +669,6 @@ static int set_vring_call(struct vhost_device_ctx ctx, struct vhost_vring_file *file) { struct virtio_net *dev; - struct eventfd_copy eventfd_kick; struct vhost_virtqueue *vq; dev = get_device(ctx); @@ -953,14 +681,7 @@ set_vring_call(struct vhost_device_ctx ctx, struct vhost_vring_file *file) if (vq->kickfd) close((int)vq->kickfd); - /* Populate the eventfd_copy structure and call eventfd_copy. */ - vq->kickfd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC); - eventfd_kick.source_fd = vq->kickfd; - eventfd_kick.target_fd = file->fd; - eventfd_kick.target_pid = ctx.pid; - - if (eventfd_copy(dev, &eventfd_kick)) - return -1; + vq->kickfd = file->fd; return 0; } @@ -974,7 +695,6 @@ static int set_vring_kick(struct vhost_device_ctx ctx, struct vhost_vring_file *file) { struct virtio_net *dev; - struct eventfd_copy eventfd_call; struct vhost_virtqueue *vq; dev = get_device(ctx); @@ -986,16 +706,11 @@ set_vring_kick(struct vhost_device_ctx ctx, struct vhost_vring_file *file) if (vq->callfd) close((int)vq->callfd); + vq->callfd = file->fd; - /* Populate the eventfd_copy structure and call eventfd_copy. 
*/ - vq->callfd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC); - eventfd_call.source_fd = vq->callfd; - eventfd_call.target_fd = file->fd; - eventfd_call.target_pid = ctx.pid; - - if (eventfd_copy(dev, &eventfd_call)) - return -1; - + if (virtio_is_ready(dev, file->index) && + !(dev->flags & VIRTIO_DEV_RUNNING)) + notify_ops->new_device(dev); return 0; } @@ -1024,6 +739,7 @@ set_backend(struct vhost_device_ctx ctx, struct vhost_vring_file *file) * If the device isn't already running and both backend fds are set, * we add the device. */ + LOG_DEBUG(VHOST_CONFIG, "%s %d\n", __func__, file->fd); if (!(dev->flags & VIRTIO_DEV_RUNNING)) { if (((int)dev->virtqueue[VIRTIO_TXQ]->backend != VIRTIO_DEV_STOPPED) && ((int)dev->virtqueue[VIRTIO_RXQ]->backend != VIRTIO_DEV_STOPPED)) -- 1.8.1.4 ^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [dpdk-dev] [PATCH RFC] lib/librte_vhost: vhost-user 2014-11-15 1:14 [dpdk-dev] [PATCH RFC] lib/librte_vhost: vhost-user Huawei Xie @ 2014-11-17 6:04 ` Tetsuya Mukawa 2014-11-17 6:11 ` Tetsuya Mukawa 2014-11-17 6:06 ` [dpdk-dev] [RFC PATCH] lib/librte_vhost: cleanup white spaces, tabs and indents Tetsuya Mukawa 2014-11-17 6:07 ` [dpdk-dev] [RFC PATCH 1/2] lib/librte_vhost: change macro name of include guard Tetsuya Mukawa 2 siblings, 1 reply; 6+ messages in thread From: Tetsuya Mukawa @ 2014-11-17 6:04 UTC (permalink / raw) To: Huawei Xie, dev Hi Xie, (2014/11/15 10:14), Huawei Xie wrote: > implement socket server > fd event dispatch mechanism > vhost sock message handling > memory map for each region > VHOST_USER_SET_VRING_KICK_FD as the indicator that vring is available > VHOST_USER_GET_VRING_BASE as the message that vring should be released > > The message flow between vhost-user and vhost-cuse is kindof different, > which makes virtio-net common message handler layer difficult and complicated to handle > both cases in new_device/destroy_device/memory map/resource cleanup. > > Will only leave the most common messag handling in virtio-net, and move the > control logic to cuse/fuse layer. > > > Signed-off-by: Huawei Xie <huawei.xie@intel.com> Great patch! I guess we can start from this patch to implement vhost-user and abstraction layer. I've checked patch. 1. White space, tab and indent patch. I will send patch that clears white space, tab and indent. Could you please check it? It might be difficult to see the difference, if your editor doesn't show a space or tab. 2. Some files are based on old codes. At least, following patch is not included. - vhost: fix build without unused result Also vhost_rxtx.c isn't probably based on latest code. 3. Device abstraction layer code I will send the device abstraction layer code after this email. Anyway, I guess we need to decide whether, or not we still keep vhost-cuse code 4. Multiple devices operation. 
For example, when thread1 opens vhost-user device1 and thread2 opens vhost-user device2, each thread may want to register own callbacks. Current implementation may not allow this. I guess we need to eliminate global variables in librte_vhost as much as possible. Thanks, Tetsuya > --- > lib/librte_vhost/Makefile | 14 +- > lib/librte_vhost/eventfd_link/eventfd_link.c | 27 +- > lib/librte_vhost/eventfd_link/eventfd_link.h | 48 +- > lib/librte_vhost/libvirt/qemu-wrap.py | 367 --------------- > lib/librte_vhost/rte_virtio_net.h | 106 ++--- > lib/librte_vhost/vhost-cuse/vhost-net-cdev.c | 436 ++++++++++++++++++ > lib/librte_vhost/vhost-cuse/virtio-net-cdev.c | 314 +++++++++++++ > lib/librte_vhost/vhost-cuse/virtio-net-cdev.h | 43 ++ > lib/librte_vhost/vhost-net-cdev.c | 389 ---------------- > lib/librte_vhost/vhost-net-cdev.h | 113 ----- > lib/librte_vhost/vhost-user/fd_man.c | 158 +++++++ > lib/librte_vhost/vhost-user/fd_man.h | 31 ++ > lib/librte_vhost/vhost-user/vhost-net-user.c | 417 +++++++++++++++++ > lib/librte_vhost/vhost-user/vhost-net-user.h | 74 +++ > lib/librte_vhost/vhost-user/virtio-net-user.c | 208 +++++++++ > lib/librte_vhost/vhost-user/virtio-net-user.h | 11 + > lib/librte_vhost/vhost_rxtx.c | 625 ++++---------------------- > lib/librte_vhost/virtio-net.c | 450 ++++--------------- > 18 files changed, 1939 insertions(+), 1892 deletions(-) > delete mode 100755 lib/librte_vhost/libvirt/qemu-wrap.py > create mode 100644 lib/librte_vhost/vhost-cuse/vhost-net-cdev.c > create mode 100644 lib/librte_vhost/vhost-cuse/virtio-net-cdev.c > create mode 100644 lib/librte_vhost/vhost-cuse/virtio-net-cdev.h > delete mode 100644 lib/librte_vhost/vhost-net-cdev.c > delete mode 100644 lib/librte_vhost/vhost-net-cdev.h > create mode 100644 lib/librte_vhost/vhost-user/fd_man.c > create mode 100644 lib/librte_vhost/vhost-user/fd_man.h > create mode 100644 lib/librte_vhost/vhost-user/vhost-net-user.c > create mode 100644 lib/librte_vhost/vhost-user/vhost-net-user.h > create 
mode 100644 lib/librte_vhost/vhost-user/virtio-net-user.c > create mode 100644 lib/librte_vhost/vhost-user/virtio-net-user.h > > diff --git a/lib/librte_vhost/Makefile b/lib/librte_vhost/Makefile > index c008d64..cb4e172 100644 > --- a/lib/librte_vhost/Makefile > +++ b/lib/librte_vhost/Makefile > @@ -34,17 +34,19 @@ include $(RTE_SDK)/mk/rte.vars.mk > # library name > LIB = librte_vhost.a > > -CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -O3 -D_FILE_OFFSET_BITS=64 -lfuse > +CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -I. -I vhost-user -I vhost-cuse -O3 -D_FILE_OFFSET_BITS=64 -lfuse > LDFLAGS += -lfuse > # all source are stored in SRCS-y > -SRCS-$(CONFIG_RTE_LIBRTE_VHOST) := vhost-net-cdev.c virtio-net.c vhost_rxtx.c > +#SRCS-$(CONFIG_RTE_LIBRTE_VHOST) := vhost-cuse/vhost-net-cdev.c vhost-cuse/virtio-net-cdev.c > + > +SRCS-$(CONFIG_RTE_LIBRTE_VHOST) := vhost-user/fd_man.c vhost-user/vhost-net-user.c vhost-user/virtio-net-user.c > + > +SRCS-$(CONFIG_RTE_LIBRTE_VHOST) += virtio-net.c vhost_rxtx.c > > # install includes > SYMLINK-$(CONFIG_RTE_LIBRTE_VHOST)-include += rte_virtio_net.h > > -# dependencies > -DEPDIRS-$(CONFIG_RTE_LIBRTE_VHOST) += lib/librte_eal > -DEPDIRS-$(CONFIG_RTE_LIBRTE_VHOST) += lib/librte_ether > -DEPDIRS-$(CONFIG_RTE_LIBRTE_VHOST) += lib/librte_mbuf > +# this lib needs eal > +DEPDIRS-$(CONFIG_RTE_LIBRTE_VHOST) += lib/librte_eal lib/librte_mbuf > > include $(RTE_SDK)/mk/rte.lib.mk > diff --git a/lib/librte_vhost/eventfd_link/eventfd_link.c b/lib/librte_vhost/eventfd_link/eventfd_link.c > index 7755dd6..4c9b628 100644 > --- a/lib/librte_vhost/eventfd_link/eventfd_link.c > +++ b/lib/librte_vhost/eventfd_link/eventfd_link.c > @@ -13,8 +13,7 @@ > * General Public License for more details. > * > * You should have received a copy of the GNU General Public License > - * along with this program; if not, write to the Free Software > - * Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. 
> + * along with this program; If not, see <http://www.gnu.org/licenses/>. > * The full GNU General Public License is included in this distribution > * in the file called LICENSE.GPL. > * > @@ -78,8 +77,7 @@ eventfd_link_ioctl(struct file *f, unsigned int ioctl, unsigned long arg) > > switch (ioctl) { > case EVENTFD_COPY: > - if (copy_from_user(&eventfd_copy, argp, > - sizeof(struct eventfd_copy))) > + if (copy_from_user(&eventfd_copy, argp, sizeof(struct eventfd_copy))) > return -EFAULT; > > /* > @@ -88,28 +86,28 @@ eventfd_link_ioctl(struct file *f, unsigned int ioctl, unsigned long arg) > task_target = > pid_task(find_vpid(eventfd_copy.target_pid), PIDTYPE_PID); > if (task_target == NULL) { > - pr_debug("Failed to get mem ctx for target pid\n"); > + printk(KERN_DEBUG "Failed to get mem ctx for target pid\n"); > return -EFAULT; > } > > files = get_files_struct(current); > if (files == NULL) { > - pr_debug("Failed to get files struct\n"); > + printk(KERN_DEBUG "Failed to get files struct\n"); > return -EFAULT; > } > > rcu_read_lock(); > file = fcheck_files(files, eventfd_copy.source_fd); > if (file) { > - if (file->f_mode & FMODE_PATH || > - !atomic_long_inc_not_zero(&file->f_count)) > + if (file->f_mode & FMODE_PATH > + || !atomic_long_inc_not_zero(&file->f_count)) > file = NULL; > } > rcu_read_unlock(); > put_files_struct(files); > > if (file == NULL) { > - pr_debug("Failed to get file from source pid\n"); > + printk(KERN_DEBUG "Failed to get file from source pid\n"); > return 0; > } > > @@ -128,25 +126,26 @@ eventfd_link_ioctl(struct file *f, unsigned int ioctl, unsigned long arg) > > files = get_files_struct(task_target); > if (files == NULL) { > - pr_debug("Failed to get files struct\n"); > + printk(KERN_DEBUG "Failed to get files struct\n"); > return -EFAULT; > } > > rcu_read_lock(); > file = fcheck_files(files, eventfd_copy.target_fd); > if (file) { > - if (file->f_mode & FMODE_PATH || > - !atomic_long_inc_not_zero(&file->f_count)) > - file = NULL; > + if 
(file->f_mode & FMODE_PATH > + || !atomic_long_inc_not_zero(&file->f_count)) > + file = NULL; > } > rcu_read_unlock(); > put_files_struct(files); > > if (file == NULL) { > - pr_debug("Failed to get file from target pid\n"); > + printk(KERN_DEBUG "Failed to get file from target pid\n"); > return 0; > } > > + > /* > * Install the file struct from the target process into the > * file desciptor of the source process, > diff --git a/lib/librte_vhost/eventfd_link/eventfd_link.h b/lib/librte_vhost/eventfd_link/eventfd_link.h > index ea619ec..38052e2 100644 > --- a/lib/librte_vhost/eventfd_link/eventfd_link.h > +++ b/lib/librte_vhost/eventfd_link/eventfd_link.h > @@ -1,7 +1,4 @@ > /*- > - * This file is provided under a dual BSD/GPLv2 license. When using or > - * redistributing this file, you may do so under either license. > - * > * GPL LICENSE SUMMARY > * > * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. > @@ -16,61 +13,28 @@ > * General Public License for more details. > * > * You should have received a copy of the GNU General Public License > - * along with this program; if not, write to the Free Software > - * Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. > + * along with this program; If not, see <http://www.gnu.org/licenses/>. > * The full GNU General Public License is included in this distribution > * in the file called LICENSE.GPL. > * > * Contact Information: > * Intel Corporation > - * > - * BSD LICENSE > - * > - * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. > - * All rights reserved. > - * > - * Redistribution and use in source and binary forms, with or without > - * modification, are permitted provided that the following conditions > - * are met: > - * > - * Redistributions of source code must retain the above copyright > - * notice, this list of conditions and the following disclaimer. 
> - * Redistributions in binary form must reproduce the above copyright > - * notice, this list of conditions and the following disclaimer in > - * the documentation and/or other materials provided with the > - * distribution. > - * Neither the name of Intel Corporation nor the names of its > - * contributors may be used to endorse or promote products derived > - * from this software without specific prior written permission. > - * > - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS > - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT > - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR > - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT > - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, > - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT > - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, > - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY > - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT > - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE > - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
> - * > */ > > #ifndef _EVENTFD_LINK_H_ > #define _EVENTFD_LINK_H_ > > /* > - * ioctl to copy an fd entry in calling process to an fd in a target process > + * ioctl to copy an fd entry in calling process to an fd in a target process > */ > #define EVENTFD_COPY 1 > > /* > - * arguements for the EVENTFD_COPY ioctl > + * arguements for the EVENTFD_COPY ioctl > */ > struct eventfd_copy { > - unsigned target_fd; /* fd in the target pid */ > - unsigned source_fd; /* fd in the calling pid */ > - pid_t target_pid; /* pid of the target pid */ > + unsigned target_fd; /**< fd in the target pid */ > + unsigned source_fd; /**< fd in the calling pid */ > + pid_t target_pid; /**< pid of the target pid */ > }; > #endif /* _EVENTFD_LINK_H_ */ > diff --git a/lib/librte_vhost/libvirt/qemu-wrap.py b/lib/librte_vhost/libvirt/qemu-wrap.py > deleted file mode 100755 > index e2d68a0..0000000 > --- a/lib/librte_vhost/libvirt/qemu-wrap.py > +++ /dev/null > @@ -1,367 +0,0 @@ > -#!/usr/bin/python > -#/* > -# * BSD LICENSE > -# * > -# * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. > -# * All rights reserved. > -# * > -# * Redistribution and use in source and binary forms, with or without > -# * modification, are permitted provided that the following conditions > -# * are met: > -# * > -# * * Redistributions of source code must retain the above copyright > -# * notice, this list of conditions and the following disclaimer. > -# * * Redistributions in binary form must reproduce the above copyright > -# * notice, this list of conditions and the following disclaimer in > -# * the documentation and/or other materials provided with the > -# * distribution. > -# * * Neither the name of Intel Corporation nor the names of its > -# * contributors may be used to endorse or promote products derived > -# * from this software without specific prior written permission. 
> -# * > -# * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS > -# * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT > -# * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR > -# * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT > -# * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, > -# * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT > -# * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, > -# * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY > -# * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT > -# * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE > -# * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. > -# */ > - > -##################################################################### > -# This script is designed to modify the call to the QEMU emulator > -# to support userspace vhost when starting a guest machine through > -# libvirt with vhost enabled. The steps to enable this are as follows > -# and should be run as root: > -# > -# 1. Place this script in a libvirtd's binary search PATH ($PATH) > -# A good location would be in the same directory that the QEMU > -# binary is located > -# > -# 2. Ensure that the script has the same owner/group and file > -# permissions as the QEMU binary > -# > -# 3. Update the VM xml file using "virsh edit VM.xml" > -# > -# 3.a) Set the VM to use the launch script > -# > -# Set the emulator path contained in the > -# <emulator><emulator/> tags > -# > -# e.g replace <emulator>/usr/bin/qemu-kvm<emulator/> > -# with <emulator>/usr/bin/qemu-wrap.py<emulator/> > -# > -# 3.b) Set the VM's device's to use vhost-net offload > -# > -# <interface type="network"> > -# <model type="virtio"/> > -# <driver name="vhost"/> > -# <interface/> > -# > -# 4. 
Enable libvirt to access our userpace device file by adding it to > -# controllers cgroup for libvirtd using the following steps > -# > -# 4.a) In /etc/libvirt/qemu.conf add/edit the following lines: > -# 1) cgroup_controllers = [ ... "devices", ... ] > -# 2) clear_emulator_capabilities = 0 > -# 3) user = "root" > -# 4) group = "root" > -# 5) cgroup_device_acl = [ > -# "/dev/null", "/dev/full", "/dev/zero", > -# "/dev/random", "/dev/urandom", > -# "/dev/ptmx", "/dev/kvm", "/dev/kqemu", > -# "/dev/rtc", "/dev/hpet", "/dev/net/tun", > -# "/dev/<devbase-name>-<index>", > -# ] > -# > -# 4.b) Disable SELinux or set to permissive mode > -# > -# 4.c) Mount cgroup device controller > -# "mkdir /dev/cgroup" > -# "mount -t cgroup none /dev/cgroup -o devices" > -# > -# 4.d) Set hugetlbfs_mount variable - ( Optional ) > -# VMs using userspace vhost must use hugepage backed > -# memory. This can be enabled in the libvirt XML > -# config by adding a memory backing section to the > -# XML config e.g. > -# <memoryBacking> > -# <hugepages/> > -# </memoryBacking> > -# This memory backing section should be added after the > -# <memory> and <currentMemory> sections. This will add > -# flags "-mem-prealloc -mem-path <path>" to the QEMU > -# command line. The hugetlbfs_mount variable can be used > -# to override the default <path> passed through by libvirt. > -# > -# if "-mem-prealloc" or "-mem-path <path>" are not passed > -# through and a vhost device is detected then these options will > -# be automatically added by this script. This script will detect > -# the system hugetlbfs mount point to be used for <path>. The > -# default <path> for this script can be overidden by the > -# hugetlbfs_dir variable in the configuration section of this script. > -# > -# > -# 4.e) Restart the libvirtd system process > -# e.g. 
on Fedora "systemctl restart libvirtd.service" > -# > -# > -# 4.f) Edit the Configuration Parameters section of this script > -# to point to the correct emulator location and set any > -# addition options > -# > -# The script modifies the libvirtd Qemu call by modifying/adding > -# options based on the configuration parameters below. > -# NOTE: > -# emul_path and us_vhost_path must be set > -# All other parameters are optional > -##################################################################### > - > - > -############################################# > -# Configuration Parameters > -############################################# > -#Path to QEMU binary > -emul_path = "/usr/local/bin/qemu-system-x86_64" > - > -#Path to userspace vhost device file > -# This filename should match the --dev-basename --dev-index parameters of > -# the command used to launch the userspace vhost sample application e.g. > -# if the sample app lauch command is: > -# ./build/vhost-switch ..... --dev-basename usvhost --dev-index 1 > -# then this variable should be set to: > -# us_vhost_path = "/dev/usvhost-1" > -us_vhost_path = "/dev/usvhost-1" > - > -#List of additional user defined emulation options. These options will > -#be added to all Qemu calls > -emul_opts_user = [] > - > -#List of additional user defined emulation options for vhost only. 
> -#These options will only be added to vhost enabled guests > -emul_opts_user_vhost = [] > - > -#For all VHOST enabled VMs, the VM memory is preallocated from hugetlbfs > -# Set this variable to one to enable this option for all VMs > -use_huge_all = 0 > - > -#Instead of autodetecting, override the hugetlbfs directory by setting > -#this variable > -hugetlbfs_dir = "" > - > -############################################# > - > - > -############################################# > -# ****** Do Not Modify Below this Line ****** > -############################################# > - > -import sys, os, subprocess > - > - > -#List of open userspace vhost file descriptors > -fd_list = [] > - > -#additional virtio device flags when using userspace vhost > -vhost_flags = [ "csum=off", > - "gso=off", > - "guest_tso4=off", > - "guest_tso6=off", > - "guest_ecn=off" > - ] > - > - > -############################################# > -# Find the system hugefile mount point. > -# Note: > -# if multiple hugetlbfs mount points exist > -# then the first one found will be used > -############################################# > -def find_huge_mount(): > - > - if (len(hugetlbfs_dir)): > - return hugetlbfs_dir > - > - huge_mount = "" > - > - if (os.access("/proc/mounts", os.F_OK)): > - f = open("/proc/mounts", "r") > - line = f.readline() > - while line: > - line_split = line.split(" ") > - if line_split[2] == 'hugetlbfs': > - huge_mount = line_split[1] > - break > - line = f.readline() > - else: > - print "/proc/mounts not found" > - exit (1) > - > - f.close > - if len(huge_mount) == 0: > - print "Failed to find hugetlbfs mount point" > - exit (1) > - > - return huge_mount > - > - > -############################################# > -# Get a userspace Vhost file descriptor > -############################################# > -def get_vhost_fd(): > - > - if (os.access(us_vhost_path, os.F_OK)): > - fd = os.open( us_vhost_path, os.O_RDWR) > - else: > - print ("US-Vhost file %s not found" 
%us_vhost_path) > - exit (1) > - > - return fd > - > - > -############################################# > -# Check for vhostfd. if found then replace > -# with our own vhost fd and append any vhost > -# flags onto the end > -############################################# > -def modify_netdev_arg(arg): > - > - global fd_list > - vhost_in_use = 0 > - s = '' > - new_opts = [] > - netdev_opts = arg.split(",") > - > - for opt in netdev_opts: > - #check if vhost is used > - if "vhost" == opt[:5]: > - vhost_in_use = 1 > - else: > - new_opts.append(opt) > - > - #if using vhost append vhost options > - if vhost_in_use == 1: > - #append vhost on option > - new_opts.append('vhost=on') > - #append vhostfd ption > - new_fd = get_vhost_fd() > - new_opts.append('vhostfd=' + str(new_fd)) > - fd_list.append(new_fd) > - > - #concatenate all options > - for opt in new_opts: > - if len(s) > 0: > - s+=',' > - > - s+=opt > - > - return s > - > - > -############################################# > -# Main > -############################################# > -def main(): > - > - global fd_list > - global vhost_in_use > - new_args = [] > - num_cmd_args = len(sys.argv) > - emul_call = '' > - mem_prealloc_set = 0 > - mem_path_set = 0 > - num = 0; > - > - #parse the parameters > - while (num < num_cmd_args): > - arg = sys.argv[num] > - > - #Check netdev +1 parameter for vhostfd > - if arg == '-netdev': > - num_vhost_devs = len(fd_list) > - new_args.append(arg) > - > - num+=1 > - arg = sys.argv[num] > - mod_arg = modify_netdev_arg(arg) > - new_args.append(mod_arg) > - > - #append vhost flags if this is a vhost device > - # and -device is the next arg > - # i.e -device -opt1,-opt2,...,-opt3,%vhost > - if (num_vhost_devs < len(fd_list)): > - num+=1 > - arg = sys.argv[num] > - if arg == '-device': > - new_args.append(arg) > - num+=1 > - new_arg = sys.argv[num] > - for flag in vhost_flags: > - new_arg = ''.join([new_arg,',',flag]) > - new_args.append(new_arg) > - else: > - new_args.append(arg) > - 
elif arg == '-mem-prealloc': > - mem_prealloc_set = 1 > - new_args.append(arg) > - elif arg == '-mem-path': > - mem_path_set = 1 > - new_args.append(arg) > - > - else: > - new_args.append(arg) > - > - num+=1 > - > - #Set Qemu binary location > - emul_call+=emul_path > - emul_call+=" " > - > - #Add prealloc mem options if using vhost and not already added > - if ((len(fd_list) > 0) and (mem_prealloc_set == 0)): > - emul_call += "-mem-prealloc " > - > - #Add mempath mem options if using vhost and not already added > - if ((len(fd_list) > 0) and (mem_path_set == 0)): > - #Detect and add hugetlbfs mount point > - mp = find_huge_mount() > - mp = "".join(["-mem-path ", mp]) > - emul_call += mp > - emul_call += " " > - > - > - #add user options > - for opt in emul_opts_user: > - emul_call += opt > - emul_call += " " > - > - #Add add user vhost only options > - if len(fd_list) > 0: > - for opt in emul_opts_user_vhost: > - emul_call += opt > - emul_call += " " > - > - #Add updated libvirt options > - iter_args = iter(new_args) > - #skip 1st arg i.e. call to this script > - next(iter_args) > - for arg in iter_args: > - emul_call+=str(arg) > - emul_call+= " " > - > - #Call QEMU > - subprocess.call(emul_call, shell=True) > - > - > - #Close usvhost files > - for fd in fd_list: > - os.close(fd) > - > - > -if __name__ == "__main__": > - main() > - > diff --git a/lib/librte_vhost/rte_virtio_net.h b/lib/librte_vhost/rte_virtio_net.h > index 00b1328..7a05dab 100644 > --- a/lib/librte_vhost/rte_virtio_net.h > +++ b/lib/librte_vhost/rte_virtio_net.h > @@ -34,11 +34,6 @@ > #ifndef _VIRTIO_NET_H_ > #define _VIRTIO_NET_H_ > > -/** > - * @file > - * Interface to vhost net > - */ > - > #include <stdint.h> > #include <linux/virtio_ring.h> > #include <linux/virtio_net.h> > @@ -48,66 +43,38 @@ > #include <rte_mempool.h> > #include <rte_mbuf.h> > > -/* Used to indicate that the device is running on a data core */ > -#define VIRTIO_DEV_RUNNING 1 > - > -/* Backend value set by guest. 
*/ > -#define VIRTIO_DEV_STOPPED -1 > - > +#define VIRTIO_DEV_RUNNING 1 /**< Used to indicate that the device is running on a data core. */ > +#define VIRTIO_DEV_STOPPED -1 /**< Backend value set by guest. */ > > /* Enum for virtqueue management. */ > enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM}; > > -#define BUF_VECTOR_MAX 256 > - > -/** > - * Structure contains buffer address, length and descriptor index > - * from vring to do scatter RX. > - */ > -struct buf_vector { > - uint64_t buf_addr; > - uint32_t buf_len; > - uint32_t desc_idx; > -}; > - > /** > * Structure contains variables relevant to RX/TX virtqueues. > */ > struct vhost_virtqueue { > - struct vring_desc *desc; /**< Virtqueue descriptor ring. */ > - struct vring_avail *avail; /**< Virtqueue available ring. */ > - struct vring_used *used; /**< Virtqueue used ring. */ > - uint32_t size; /**< Size of descriptor ring. */ > - uint32_t backend; /**< Backend value to determine if device should started/stopped. */ > - uint16_t vhost_hlen; /**< Vhost header length (varies depending on RX merge buffers. */ > - volatile uint16_t last_used_idx; /**< Last index used on the available ring */ > - volatile uint16_t last_used_idx_res; /**< Used for multiple devices reserving buffers. */ > - eventfd_t callfd; /**< Currently unused as polling mode is enabled. */ > - eventfd_t kickfd; /**< Used to notify the guest (trigger interrupt). */ > - struct buf_vector buf_vec[BUF_VECTOR_MAX]; /**< for scatter RX. */ > -} __rte_cache_aligned; > - > -/** > - * Device structure contains all configuration information relating to the device. > - */ > -struct virtio_net { > - struct vhost_virtqueue *virtqueue[VIRTIO_QNUM]; /**< Contains all virtqueue information. */ > - struct virtio_memory *mem; /**< QEMU memory and memory region information. */ > - uint64_t features; /**< Negotiated feature set. */ > - uint64_t device_fh; /**< device identifier. */ > - uint32_t flags; /**< Device flags. 
Only used to check if device is running on data core. */ > - void *priv; /**< private context */ > + struct vring_desc *desc; /**< descriptor ring. */ > + struct vring_avail *avail; /**< available ring. */ > + struct vring_used *used; /**< used ring. */ > + uint32_t size; /**< Size of descriptor ring. */ > + uint32_t backend; /**< Backend value to determine if device should be started/stopped. */ > + uint16_t vhost_hlen; /**< Vhost header length (varies depending on RX merge buffers. */ > + volatile uint16_t last_used_idx; /**< Last index used on the available ring. */ > + volatile uint16_t last_used_idx_res; /**< Used for multiple devices reserving buffers. */ > + eventfd_t callfd; /**< Currently unused as polling mode is enabled. */ > + eventfd_t kickfd; /**< Used to notify the guest (trigger interrupt). */ > } __rte_cache_aligned; > > /** > - * Information relating to memory regions including offsets to addresses in QEMUs memory file. > + * Information relating to memory regions including offsets to > + * addresses in QEMUs memory file. > */ > struct virtio_memory_regions { > - uint64_t guest_phys_address; /**< Base guest physical address of region. */ > - uint64_t guest_phys_address_end; /**< End guest physical address of region. */ > - uint64_t memory_size; /**< Size of region. */ > - uint64_t userspace_address; /**< Base userspace address of region. */ > - uint64_t address_offset; /**< Offset of region for address translation. */ > + uint64_t guest_phys_address; /**< Base guest physical address of region. */ > + uint64_t guest_phys_address_end; /**< End guest physical address of region. */ > + uint64_t memory_size; /**< Size of region. */ > + uint64_t userspace_address; /**< Base userspace address of region. */ > + uint64_t address_offset; /**< Offset of region for address translation. */ > }; > > > @@ -115,21 +82,34 @@ struct virtio_memory_regions { > * Memory structure includes region and mapping information. 
> */ > struct virtio_memory { > - uint64_t base_address; /**< Base QEMU userspace address of the memory file. */ > - uint64_t mapped_address; /**< Mapped address of memory file base in our applications memory space. */ > - uint64_t mapped_size; /**< Total size of memory file. */ > - uint32_t nregions; /**< Number of memory regions. */ > + uint64_t base_address; /**< Base QEMU userspace address of the memory file. */ > + uint64_t mapped_address; /**< Mapped address of memory file base in our applications memory space. */ > + uint64_t mapped_size; /**< Total size of memory file. */ > + uint32_t nregions; /**< Number of memory regions. */ > struct virtio_memory_regions regions[0]; /**< Memory region information. */ > }; > > /** > + * Device structure contains all configuration information relating to the device. > + */ > +struct virtio_net { > + struct vhost_virtqueue *virtqueue[VIRTIO_QNUM]; /**< Contains all virtqueue information. */ > + struct virtio_memory *mem; /**< QEMU memory and memory region information. */ > + uint64_t features; /**< Negotiated feature set. */ > + uint64_t device_fh; /**< Device identifier. */ > + uint32_t flags; /**< Device flags. Only used to check if device is running on data core. */ > + void *priv; > +} __rte_cache_aligned; > + > +/** > * Device operations to add/remove device. > */ > struct virtio_net_device_ops { > - int (*new_device)(struct virtio_net *); /**< Add device. */ > - void (*destroy_device)(volatile struct virtio_net *); /**< Remove device. */ > + int (*new_device)(struct virtio_net *); /**< Add device. */ > + void (*destroy_device)(struct virtio_net *); /**< Remove device. */ > }; > > + > static inline uint16_t __attribute__((always_inline)) > rte_vring_available_entries(struct virtio_net *dev, uint16_t queue_id) > { > @@ -179,7 +159,7 @@ int rte_vhost_driver_register(const char *dev_name); > > /* Register callbacks. 
*/ > int rte_vhost_driver_callback_register(struct virtio_net_device_ops const * const); > -/* Start vhost driver session blocking loop. */ > + > int rte_vhost_driver_session_start(void); > > /** > @@ -192,8 +172,8 @@ int rte_vhost_driver_session_start(void); > * @return > * num of packets enqueued > */ > -uint16_t rte_vhost_enqueue_burst(struct virtio_net *dev, uint16_t queue_id, > - struct rte_mbuf **pkts, uint16_t count); > +uint32_t rte_vhost_enqueue_burst(struct virtio_net *dev, uint16_t queue_id, > + struct rte_mbuf **pkts, uint32_t count); > > /** > * This function gets guest buffers from the virtio device TX virtqueue, > @@ -206,7 +186,7 @@ uint16_t rte_vhost_enqueue_burst(struct virtio_net *dev, uint16_t queue_id, > * @return > * num of packets dequeued > */ > -uint16_t rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id, > - struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count); > +uint32_t rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id, > + struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint32_t count); > > #endif /* _VIRTIO_NET_H_ */ > diff --git a/lib/librte_vhost/vhost-cuse/vhost-net-cdev.c b/lib/librte_vhost/vhost-cuse/vhost-net-cdev.c > new file mode 100644 > index 0000000..4671643 > --- /dev/null > +++ b/lib/librte_vhost/vhost-cuse/vhost-net-cdev.c > @@ -0,0 +1,436 @@ > +/*- > + * BSD LICENSE > + * > + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. > + * All rights reserved. > + * > + * Redistribution and use in source and binary forms, with or without > + * modification, are permitted provided that the following conditions > + * are met: > + * > + * * Redistributions of source code must retain the above copyright > + * notice, this list of conditions and the following disclaimer. 
> + * * Redistributions in binary form must reproduce the above copyright > + * notice, this list of conditions and the following disclaimer in > + * the documentation and/or other materials provided with the > + * distribution. > + * * Neither the name of Intel Corporation nor the names of its > + * contributors may be used to endorse or promote products derived > + * from this software without specific prior written permission. > + * > + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS > + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT > + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR > + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT > + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, > + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT > + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, > + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY > + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT > + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE > + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
> + */ > + > +#include <stdint.h> > +#include <fuse/cuse_lowlevel.h> > +#include <linux/limits.h> > +#include <linux/vhost.h> > +#include <linux/virtio_net.h> > +#include <string.h> > +#include <unistd.h> > +#include <sys/ioctl.h> > + > +#include <rte_ethdev.h> > +#include <rte_log.h> > +#include <rte_string_fns.h> > +#include <rte_virtio_net.h> > + > +#include "virtio-net-cdev.h" > +#include "vhost-net.h" > +#include "eventfd_link/eventfd_link.h" > + > +#define FUSE_OPT_DUMMY "\0\0" > +#define FUSE_OPT_FORE "-f\0\0" > +#define FUSE_OPT_NOMULTI "-s\0\0" > + > +static const uint32_t default_major = 231; > +static const uint32_t default_minor = 1; > +static const char cuse_device_name[] = "/dev/cuse"; > +static const char default_cdev[] = "vhost-net"; > +static const char eventfd_cdev[] = "/dev/eventfd-link"; > + > +static struct fuse_session *session; > +const struct vhost_net_device_ops const *ops; > + > +/* > + * Returns vhost_device_ctx from given fuse_req_t. The index is populated later > + * when the device is added to the device linked list. > + */ > +static struct vhost_device_ctx > +fuse_req_to_vhost_ctx(fuse_req_t req, struct fuse_file_info *fi) > +{ > + struct vhost_device_ctx ctx; > + struct fuse_ctx const *const req_ctx = fuse_req_ctx(req); > + > + ctx.pid = req_ctx->pid; > + ctx.fh = fi->fh; > + > + return ctx; > +} > + > +/* > + * When the device is created in QEMU it gets initialised here and > + * added to the device linked list. > + */ > +static void > +vhost_net_open(fuse_req_t req, struct fuse_file_info *fi) > +{ > + struct vhost_device_ctx ctx = fuse_req_to_vhost_ctx(req, fi); > + int err = 0; > + > + err = ops->new_device(ctx); > + if (err == -1) { > + fuse_reply_err(req, EPERM); > + return; > + } > + > + fi->fh = err; > + > + RTE_LOG(INFO, VHOST_CONFIG, > + "(%"PRIu64") Device configuration started\n", fi->fh); > + fuse_reply_open(req, fi); > +} > + > +/* > + * When QEMU is shutdown or killed the device gets released. 
> + */ > +static void > +vhost_net_release(fuse_req_t req, struct fuse_file_info *fi) > +{ > + int err = 0; > + struct vhost_device_ctx ctx = fuse_req_to_vhost_ctx(req, fi); > + > + ops->destroy_device(ctx); > + RTE_LOG(INFO, VHOST_CONFIG, "(%"PRIu64") Device released\n", ctx.fh); > + fuse_reply_err(req, err); > +} > + > +/* > + * Boilerplate code for CUSE IOCTL > + * Implicit arguments: ctx, req, result. > + */ > +#define VHOST_IOCTL(func) do { \ > + result = (func)(ctx); \ > + fuse_reply_ioctl(req, result, NULL, 0); \ > +} while (0) > + > +/* > + * Boilerplate IOCTL RETRY > + * Implicit arguments: req. > + */ > +#define VHOST_IOCTL_RETRY(size_r, size_w) do { \ > + struct iovec iov_r = { arg, (size_r) }; \ > + struct iovec iov_w = { arg, (size_w) }; \ > + fuse_reply_ioctl_retry(req, &iov_r, \ > + (size_r) ? 1 : 0, &iov_w, (size_w) ? 1 : 0);\ > +} while (0) > + > +/* > + * Boilerplate code for CUSE Read IOCTL > + * Implicit arguments: ctx, req, result, in_bufsz, in_buf. > + */ > +#define VHOST_IOCTL_R(type, var, func) do { \ > + if (!in_bufsz) { \ > + VHOST_IOCTL_RETRY(sizeof(type), 0);\ > + } else { \ > + (var) = *(const type*)in_buf; \ > + result = func(ctx, &(var)); \ > + fuse_reply_ioctl(req, result, NULL, 0);\ > + } \ > +} while (0) > + > +/* > + * Boilerplate code for CUSE Write IOCTL > + * Implicit arguments: ctx, req, result, out_bufsz. > + */ > +#define VHOST_IOCTL_W(type, var, func) do { \ > + if (!out_bufsz) { \ > + VHOST_IOCTL_RETRY(0, sizeof(type));\ > + } else { \ > + result = (func)(ctx, &(var));\ > + fuse_reply_ioctl(req, result, &(var), sizeof(type));\ > + } \ > +} while (0) > + > +/* > + * Boilerplate code for CUSE Read/Write IOCTL > + * Implicit arguments: ctx, req, result, in_bufsz, in_buf. 
> + */ > +#define VHOST_IOCTL_RW(type1, var1, type2, var2, func) do { \ > + if (!in_bufsz) { \ > + VHOST_IOCTL_RETRY(sizeof(type1), sizeof(type2));\ > + } else { \ > + (var1) = *(const type1*) (in_buf); \ > + result = (func)(ctx, (var1), &(var2)); \ > + fuse_reply_ioctl(req, result, &(var2), sizeof(type2));\ > + } \ > +} while (0) > + > +/* > + * This function uses the eventfd_link kernel module to copy an eventfd file > + * descriptor provided by QEMU in to our process space. > + */ > +static int > +eventfd_copy(int target_fd, int target_pid) > +{ > + int eventfd_link, ret; > + struct eventfd_copy eventfd_copy; > + int fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC); > + > + if (fd == -1) > + return -1; > + > + /* Open the character device to the kernel module. */ > + /* TODO: check this earlier rather than fail until VM boots! */ > + eventfd_link = open(eventfd_cdev, O_RDWR); > + if (eventfd_link < 0) { > + RTE_LOG(ERR, VHOST_CONFIG, > + "eventfd_link module is not loaded\n"); > + return -1; > + } > + > + eventfd_copy.source_fd = fd; > + eventfd_copy.target_fd = target_fd; > + eventfd_copy.target_pid = target_pid; > + /* Call the IOCTL to copy the eventfd. */ > + ret = ioctl(eventfd_link, EVENTFD_COPY, &eventfd_copy); > + close(eventfd_link); > + > + if (ret < 0) { > + RTE_LOG(ERR, VHOST_CONFIG, > + "EVENTFD_COPY ioctl failed\n"); > + return -1; > + } > + > + return fd; > +} > + > +/* > + * The IOCTLs are handled using CUSE/FUSE in userspace. Depending on > + * the type of IOCTL a buffer is requested to read or to write. This > + * request is handled by FUSE and the buffer is then given to CUSE. 
> + */ > +static void > +vhost_net_ioctl(fuse_req_t req, int cmd, void *arg, > + struct fuse_file_info *fi, __rte_unused unsigned flags, > + const void *in_buf, size_t in_bufsz, size_t out_bufsz) > +{ > + struct vhost_device_ctx ctx = fuse_req_to_vhost_ctx(req, fi); > + struct vhost_vring_file file; > + struct vhost_vring_state state; > + struct vhost_vring_addr addr; > + uint64_t features; > + uint32_t index; > + int result = 0; > + > + switch (cmd) { > + case VHOST_NET_SET_BACKEND: > + LOG_DEBUG(VHOST_CONFIG, > + "(%"PRIu64") IOCTL: VHOST_NET_SET_BACKEND\n", ctx.fh); > + VHOST_IOCTL_R(struct vhost_vring_file, file, ops->set_backend); > + break; > + > + case VHOST_GET_FEATURES: > + LOG_DEBUG(VHOST_CONFIG, > + "(%"PRIu64") IOCTL: VHOST_GET_FEATURES\n", ctx.fh); > + VHOST_IOCTL_W(uint64_t, features, ops->get_features); > + break; > + > + case VHOST_SET_FEATURES: > + LOG_DEBUG(VHOST_CONFIG, > + "(%"PRIu64") IOCTL: VHOST_SET_FEATURES\n", ctx.fh); > + VHOST_IOCTL_R(uint64_t, features, ops->set_features); > + break; > + > + case VHOST_RESET_OWNER: > + LOG_DEBUG(VHOST_CONFIG, > + "(%"PRIu64") IOCTL: VHOST_RESET_OWNER\n", ctx.fh); > + VHOST_IOCTL(ops->reset_owner); > + break; > + > + case VHOST_SET_OWNER: > + LOG_DEBUG(VHOST_CONFIG, > + "(%"PRIu64") IOCTL: VHOST_SET_OWNER\n", ctx.fh); > + VHOST_IOCTL(ops->set_owner); > + break; > + > + case VHOST_SET_MEM_TABLE: > + /*TODO fix race condition.*/ > + LOG_DEBUG(VHOST_CONFIG, > + "(%"PRIu64") IOCTL: VHOST_SET_MEM_TABLE\n", ctx.fh); > + static struct vhost_memory mem_temp; > + switch (in_bufsz) { > + case 0: > + VHOST_IOCTL_RETRY(sizeof(struct vhost_memory), 0); > + break; > + > + case sizeof(struct vhost_memory): > + mem_temp = *(const struct vhost_memory *) in_buf; > + > + if (mem_temp.nregions > 0) { > + VHOST_IOCTL_RETRY(sizeof(struct vhost_memory) + > + (sizeof(struct vhost_memory_region) * > + mem_temp.nregions), 0); > + } else { > + result = -1; > + fuse_reply_ioctl(req, result, NULL, 0); > + } > + break; > + > + 
default: > + result = cuse_set_mem_table(ctx, in_buf, > + mem_temp.nregions); > + if (result) > + fuse_reply_err(req, EINVAL); > + else > + fuse_reply_ioctl(req, result, NULL, 0); > + } > + break; > + > + case VHOST_SET_VRING_NUM: > + LOG_DEBUG(VHOST_CONFIG, > + "(%"PRIu64") IOCTL: VHOST_SET_VRING_NUM\n", ctx.fh); > + VHOST_IOCTL_R(struct vhost_vring_state, state, ops->set_vring_num); > + break; > + > + case VHOST_SET_VRING_BASE: > + LOG_DEBUG(VHOST_CONFIG, > + "(%"PRIu64") IOCTL: VHOST_SET_VRING_BASE\n", ctx.fh); > + VHOST_IOCTL_R(struct vhost_vring_state, state, ops->set_vring_base); > + break; > + > + case VHOST_GET_VRING_BASE: > + LOG_DEBUG(VHOST_CONFIG, > + "(%"PRIu64") IOCTL: VHOST_GET_VRING_BASE\n", ctx.fh); > + VHOST_IOCTL_RW(uint32_t, index, > + struct vhost_vring_state, state, ops->get_vring_base); > + break; > + > + case VHOST_SET_VRING_ADDR: > + LOG_DEBUG(VHOST_CONFIG, > + "(%"PRIu64") IOCTL: VHOST_SET_VRING_ADDR\n", ctx.fh); > + VHOST_IOCTL_R(struct vhost_vring_addr, addr, ops->set_vring_addr); > + break; > + > + case VHOST_SET_VRING_KICK: > + case VHOST_SET_VRING_CALL: > + if (!in_buf) { > + VHOST_IOCTL_RETRY(sizeof(struct vhost_vring_file), 0); > + } else { > + int fd; > + file = *(const struct vhost_vring_file *)in_buf; > + LOG_DEBUG(VHOST_CONFIG, > + "kick/call idx:%d fd:%d\n", file.index, file.fd); > + if ((fd = eventfd_copy(file.fd, ctx.pid)) < 0){ > + fuse_reply_ioctl(req, -1, NULL, 0); > + } > + file.fd = fd; > + if (cmd == VHOST_SET_VRING_KICK) { > + VHOST_IOCTL_R(struct vhost_vring_file, file, ops->set_vring_call); > + } > + else { > + VHOST_IOCTL_R(struct vhost_vring_file, file, ops->set_vring_kick); > + } > + } > + break; > + > + default: > + RTE_LOG(ERR, VHOST_CONFIG, > + "(%"PRIu64") IOCTL: DOESN NOT EXIST\n", ctx.fh); > + result = -1; > + fuse_reply_ioctl(req, result, NULL, 0); > + } > + > + if (result < 0) > + LOG_DEBUG(VHOST_CONFIG, > + "(%"PRIu64") IOCTL: FAIL\n", ctx.fh); > + else > + LOG_DEBUG(VHOST_CONFIG, > + "(%"PRIu64") IOCTL: 
SUCCESS\n", ctx.fh); > +} > + > +/* > + * Structure handling open, release and ioctl function pointers is populated. > + */ > +static const struct cuse_lowlevel_ops vhost_net_ops = { > + .open = vhost_net_open, > + .release = vhost_net_release, > + .ioctl = vhost_net_ioctl, > +}; > + > +/* > + * cuse_info is populated and used to register the cuse device. > + * vhost_net_device_ops are also passed when the device is registered in app. > + */ > +int > +rte_vhost_driver_register(const char *dev_name) > +{ > + struct cuse_info cuse_info; > + char device_name[PATH_MAX] = ""; > + char char_device_name[PATH_MAX] = ""; > + const char *device_argv[] = { device_name }; > + > + char fuse_opt_dummy[] = FUSE_OPT_DUMMY; > + char fuse_opt_fore[] = FUSE_OPT_FORE; > + char fuse_opt_nomulti[] = FUSE_OPT_NOMULTI; > + char *fuse_argv[] = {fuse_opt_dummy, fuse_opt_fore, fuse_opt_nomulti}; > + > + if (access(cuse_device_name, R_OK | W_OK) < 0) { > + RTE_LOG(ERR, VHOST_CONFIG, > + "char device %s can't be accessed, maybe not exist\n", > + cuse_device_name); > + return -1; > + } > + > + /* > + * The device name is created. This is passed to QEMU so that it can > + * register the device with our application. > + */ > + snprintf(device_name, PATH_MAX, "DEVNAME=%s", dev_name); > + snprintf(char_device_name, PATH_MAX, "/dev/%s", dev_name); > + > + /* Check if device already exists. 
*/ > + if (access(char_device_name, F_OK) != -1) { > + RTE_LOG(ERR, VHOST_CONFIG, > + "char device %s already exists\n", char_device_name); > + return -1; > + } > + > + memset(&cuse_info, 0, sizeof(cuse_info)); > + cuse_info.dev_major = default_major; > + cuse_info.dev_minor = default_minor; > + cuse_info.dev_info_argc = 1; > + cuse_info.dev_info_argv = device_argv; > + cuse_info.flags = CUSE_UNRESTRICTED_IOCTL; > + > + ops = get_virtio_net_callbacks(); > + > + session = cuse_lowlevel_setup(3, fuse_argv, > + &cuse_info, &vhost_net_ops, 0, NULL); > + if (session == NULL) > + return -1; > + > + return 0; > +} > + > +/** > + * The CUSE session is launched allowing the application to receive open, > + * release and ioctl calls. > + */ > +int > +rte_vhost_driver_session_start(void) > +{ > + fuse_session_loop(session); > + > + return 0; > +} > diff --git a/lib/librte_vhost/vhost-cuse/virtio-net-cdev.c b/lib/librte_vhost/vhost-cuse/virtio-net-cdev.c > new file mode 100644 > index 0000000..5c16aa5 > --- /dev/null > +++ b/lib/librte_vhost/vhost-cuse/virtio-net-cdev.c > @@ -0,0 +1,314 @@ > +/*- > + * BSD LICENSE > + * > + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. > + * All rights reserved. > + * > + * Redistribution and use in source and binary forms, with or without > + * modification, are permitted provided that the following conditions > + * are met: > + * > + * * Redistributions of source code must retain the above copyright > + * notice, this list of conditions and the following disclaimer. > + * * Redistributions in binary form must reproduce the above copyright > + * notice, this list of conditions and the following disclaimer in > + * the documentation and/or other materials provided with the > + * distribution. > + * * Neither the name of Intel Corporation nor the names of its > + * contributors may be used to endorse or promote products derived > + * from this software without specific prior written permission. 
> + * > + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS > + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT > + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR > + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT > + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, > + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT > + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, > + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY > + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT > + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE > + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. > + */ > + > +#include <stdint.h> > +#include <dirent.h> > +#include <linux/vhost.h> > +#include <linux/virtio_net.h> > +#include <fuse/cuse_lowlevel.h> > +#include <stddef.h> > +#include <string.h> > +#include <stdlib.h> > +#include <sys/eventfd.h> > +#include <sys/mman.h> > +#include <sys/types.h> > +#include <unistd.h> > +#include <errno.h> > + > +#include <rte_log.h> > + > +#include "vhost-net.h" > +#include "virtio-net-cdev.h" > + > +extern struct vhost_net_device_ops const *ops; > + > +/* Line size for reading maps file. */ > +static const uint32_t BUFSIZE = PATH_MAX; > + > +/* Size of prot char array in procmap. */ > +#define PROT_SZ 5 > + > +/* Number of elements in procmap struct. */ > +#define PROCMAP_SZ 8 > + > +/* Structure containing information gathered from maps file. */ > +struct procmap { > + uint64_t va_start; /* Start virtual address in file. */ > + uint64_t len; /* Size of file. */ > + uint64_t pgoff; /* Not used. */ > + uint32_t maj; /* Not used. */ > + uint32_t min; /* Not used. */ > + uint32_t ino; /* Not used. */ > + char prot[PROT_SZ]; /* Not used. */ > + char fname[PATH_MAX]; /* File name. 
*/ > +}; > + > +/* > + * Locate the file containing QEMU's memory space and > + * map it to our address space. > + */ > +static int > +host_memory_map(pid_t pid, uint64_t addr, > + uint64_t *mapped_address, uint64_t *mapped_size) > +{ > + struct dirent *dptr = NULL; > + struct procmap procmap; > + DIR *dp = NULL; > + int fd; > + int i; > + char memfile[PATH_MAX]; > + char mapfile[PATH_MAX]; > + char procdir[PATH_MAX]; > + char resolved_path[PATH_MAX]; > + FILE *fmap; > + void *map; > + uint8_t found = 0; > + char line[BUFSIZE]; > + char dlm[] = "- : "; > + char *str, *sp, *in[PROCMAP_SZ]; > + char *end = NULL; > + > + /* Path where mem files are located. */ > + snprintf(procdir, PATH_MAX, "/proc/%u/fd/", pid); > + /* Maps file used to locate mem file. */ > + snprintf(mapfile, PATH_MAX, "/proc/%u/maps", pid); > + > + fmap = fopen(mapfile, "r"); > + if (fmap == NULL) { > + RTE_LOG(ERR, VHOST_CONFIG, > + "Failed to open maps file for pid %d\n", pid); > + return -1; > + } > + > + /* Read through maps file until we find out base_address. */ > + while (fgets(line, BUFSIZE, fmap) != 0) { > + str = line; > + errno = 0; > + /* Split line in to fields. */ > + for (i = 0; i < PROCMAP_SZ; i++) { > + in[i] = strtok_r(str, &dlm[i], &sp); > + if ((in[i] == NULL) || (errno != 0)) { > + fclose(fmap); > + return -1; > + } > + str = NULL; > + } > + > + /* Convert/Copy each field as needed. 
*/ > + procmap.va_start = strtoull(in[0], &end, 16); > + if ((in[0] == '\0') || (end == NULL) || (*end != '\0') || > + (errno != 0)) { > + fclose(fmap); > + return -1; > + } > + > + procmap.len = strtoull(in[1], &end, 16); > + if ((in[1] == '\0') || (end == NULL) || (*end != '\0') || > + (errno != 0)) { > + fclose(fmap); > + return -1; > + } > + > + procmap.pgoff = strtoull(in[3], &end, 16); > + if ((in[3] == '\0') || (end == NULL) || (*end != '\0') || > + (errno != 0)) { > + fclose(fmap); > + return -1; > + } > + > + procmap.maj = strtoul(in[4], &end, 16); > + if ((in[4] == '\0') || (end == NULL) || (*end != '\0') || > + (errno != 0)) { > + fclose(fmap); > + return -1; > + } > + > + procmap.min = strtoul(in[5], &end, 16); > + if ((in[5] == '\0') || (end == NULL) || (*end != '\0') || > + (errno != 0)) { > + fclose(fmap); > + return -1; > + } > + > + procmap.ino = strtoul(in[6], &end, 16); > + if ((in[6] == '\0') || (end == NULL) || (*end != '\0') || > + (errno != 0)) { > + fclose(fmap); > + return -1; > + } > + > + memcpy(&procmap.prot, in[2], PROT_SZ); > + memcpy(&procmap.fname, in[7], PATH_MAX); > + > + if (procmap.va_start == addr) { > + procmap.len = procmap.len - procmap.va_start; > + found = 1; > + break; > + } > + } > + fclose(fmap); > + > + if (!found) { > + RTE_LOG(ERR, VHOST_CONFIG, > + "Failed to find memory file in pid %d maps file\n", pid); > + return -1; > + } > + > + /* Find the guest memory file among the process fds. */ > + dp = opendir(procdir); > + if (dp == NULL) { > + RTE_LOG(ERR, VHOST_CONFIG, > + "Cannot open pid %d process directory\n", > + pid); > + return -1; > + > + } > + > + found = 0; > + > + /* Read the fd directory contents. 
*/ > + while (NULL != (dptr = readdir(dp))) { > + snprintf(memfile, PATH_MAX, "/proc/%u/fd/%s", > + pid, dptr->d_name); > + realpath(memfile, resolved_path); > + if (resolved_path == NULL) { > + RTE_LOG(ERR, VHOST_CONFIG, > + "Failed to resolve fd directory\n"); > + closedir(dp); > + return -1; > + } > + if (strncmp(resolved_path, procmap.fname, > + strnlen(procmap.fname, PATH_MAX)) == 0) { > + found = 1; > + break; > + } > + } > + > + closedir(dp); > + > + if (found == 0) { > + RTE_LOG(ERR, VHOST_CONFIG, > + "Failed to find memory file for pid %d\n", > + pid); > + return -1; > + } > + /* Open the shared memory file and map the memory into this process. */ > + fd = open(memfile, O_RDWR); > + > + if (fd == -1) { > + RTE_LOG(ERR, VHOST_CONFIG, > + "Failed to open %s for pid %d\n", > + memfile, pid); > + return -1; > + } > + > + map = mmap(0, (size_t)procmap.len, PROT_READ|PROT_WRITE , > + MAP_POPULATE|MAP_SHARED, fd, 0); > + close(fd); > + > + if (map == MAP_FAILED) { > + RTE_LOG(ERR, VHOST_CONFIG, > + "Error mapping the file %s for pid %d\n", > + memfile, pid); > + return -1; > + } > + > + /* Store the memory address and size in the device data structure */ > + *mapped_address = (uint64_t)(uintptr_t)map; > + *mapped_size = procmap.len; > + > + LOG_DEBUG(VHOST_CONFIG, > + "Mem File: %s->%s - Size: %llu - VA: %p\n", > + memfile, resolved_path, > + (unsigned long long)mapped_size, map); > + > + return 0; > +} > + > +int > +cuse_set_mem_table(struct vhost_device_ctx ctx, const struct vhost_memory *mem_regions_addr, > + uint32_t nregions) > +{ > + uint64_t size = offsetof(struct vhost_memory, regions); > + uint32_t idx; > + struct virtio_memory_regions regions[8]; /* VHOST_MAX_MEMORY_REGIONS */ > + struct vhost_memory_region *mem_regions = (void *)(uintptr_t) > + ((uint64_t)(uintptr_t)mem_regions_addr + size); > + uint64_t base_address = 0, mapped_address, mapped_size; > + > + for (idx = 0; idx < nregions; idx++) { > + regions[idx].guest_phys_address = > + 
mem_regions[idx].guest_phys_addr; > + regions[idx].guest_phys_address_end = > + regions[idx].guest_phys_address + > + mem_regions[idx].memory_size; > + regions[idx].memory_size = > + mem_regions[idx].memory_size; > + regions[idx].userspace_address = > + mem_regions[idx].userspace_addr; > + > + LOG_DEBUG(VHOST_CONFIG, "REGION: %u - GPA: %p - QEMU VA: %p - SIZE (%"PRIu64")\n", > + idx, > + (void *)(uintptr_t)regions[idx].guest_phys_address, > + (void *)(uintptr_t)regions[idx].userspace_address, > + regions[idx].memory_size); > + > + /*set the base address mapping*/ > + if (regions[idx].guest_phys_address == 0x0) { > + base_address = > + regions[idx].userspace_address; > + /* Map VM memory file */ > + if (host_memory_map(ctx.pid, base_address, > + &mapped_address, &mapped_size) != 0) { > + return -1; > + } > + } > + } > + > + /* Check that we have a valid base address. */ > + if (base_address == 0) { > + RTE_LOG(ERR, VHOST_CONFIG, > + "Failed to find base address of qemu memory file.\n"); > + return -1; > + } > + > + for (idx = 0; idx < nregions; idx++) { > + regions[idx].address_offset = > + mapped_address - base_address + > + regions[idx].userspace_address - > + regions[idx].guest_phys_address; > + } > + > + ops->set_mem_table(ctx, ®ions[0], nregions); > + return 0; > +} > diff --git a/lib/librte_vhost/vhost-cuse/virtio-net-cdev.h b/lib/librte_vhost/vhost-cuse/virtio-net-cdev.h > new file mode 100644 > index 0000000..6f98ce8 > --- /dev/null > +++ b/lib/librte_vhost/vhost-cuse/virtio-net-cdev.h > @@ -0,0 +1,43 @@ > +/*- > + * BSD LICENSE > + * > + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. > + * All rights reserved. > + * > + * Redistribution and use in source and binary forms, with or without > + * modification, are permitted provided that the following conditions > + * are met: > + * > + * * Redistributions of source code must retain the above copyright > + * notice, this list of conditions and the following disclaimer. 
> + * * Redistributions in binary form must reproduce the above copyright > + * notice, this list of conditions and the following disclaimer in > + * the documentation and/or other materials provided with the > + * distribution. > + * * Neither the name of Intel Corporation nor the names of its > + * contributors may be used to endorse or promote products derived > + * from this software without specific prior written permission. > + * > + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS > + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT > + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR > + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT > + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, > + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT > + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, > + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY > + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT > + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE > + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. > + */ > +#ifndef _VIRTIO_NET_CDEV_H > +#define _VIRTIO_NET_CDEV_H > +#include <stdint.h> > + > +#include "vhost-net.h" > + > +int > +cuse_set_mem_table(struct vhost_device_ctx ctx, const struct vhost_memory *mem_regions_addr, > + uint32_t nregions); > + > +#endif > diff --git a/lib/librte_vhost/vhost-net-cdev.c b/lib/librte_vhost/vhost-net-cdev.c > deleted file mode 100644 > index 57c76cb..0000000 > --- a/lib/librte_vhost/vhost-net-cdev.c > +++ /dev/null > @@ -1,389 +0,0 @@ > -/*- > - * BSD LICENSE > - * > - * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. > - * All rights reserved. 
> - * > - * Redistribution and use in source and binary forms, with or without > - * modification, are permitted provided that the following conditions > - * are met: > - * > - * * Redistributions of source code must retain the above copyright > - * notice, this list of conditions and the following disclaimer. > - * * Redistributions in binary form must reproduce the above copyright > - * notice, this list of conditions and the following disclaimer in > - * the documentation and/or other materials provided with the > - * distribution. > - * * Neither the name of Intel Corporation nor the names of its > - * contributors may be used to endorse or promote products derived > - * from this software without specific prior written permission. > - * > - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS > - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT > - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR > - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT > - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, > - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT > - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, > - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY > - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT > - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE > - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
> - */ > - > -#include <errno.h> > -#include <fuse/cuse_lowlevel.h> > -#include <linux/limits.h> > -#include <linux/vhost.h> > -#include <stdint.h> > -#include <string.h> > -#include <unistd.h> > - > -#include <rte_ethdev.h> > -#include <rte_log.h> > -#include <rte_string_fns.h> > -#include <rte_virtio_net.h> > - > -#include "vhost-net-cdev.h" > - > -#define FUSE_OPT_DUMMY "\0\0" > -#define FUSE_OPT_FORE "-f\0\0" > -#define FUSE_OPT_NOMULTI "-s\0\0" > - > -static const uint32_t default_major = 231; > -static const uint32_t default_minor = 1; > -static const char cuse_device_name[] = "/dev/cuse"; > -static const char default_cdev[] = "vhost-net"; > - > -static struct fuse_session *session; > -static struct vhost_net_device_ops const *ops; > - > -/* > - * Returns vhost_device_ctx from given fuse_req_t. The index is populated later > - * when the device is added to the device linked list. > - */ > -static struct vhost_device_ctx > -fuse_req_to_vhost_ctx(fuse_req_t req, struct fuse_file_info *fi) > -{ > - struct vhost_device_ctx ctx; > - struct fuse_ctx const *const req_ctx = fuse_req_ctx(req); > - > - ctx.pid = req_ctx->pid; > - ctx.fh = fi->fh; > - > - return ctx; > -} > - > -/* > - * When the device is created in QEMU it gets initialised here and > - * added to the device linked list. > - */ > -static void > -vhost_net_open(fuse_req_t req, struct fuse_file_info *fi) > -{ > - struct vhost_device_ctx ctx = fuse_req_to_vhost_ctx(req, fi); > - int err = 0; > - > - err = ops->new_device(ctx); > - if (err == -1) { > - fuse_reply_err(req, EPERM); > - return; > - } > - > - fi->fh = err; > - > - RTE_LOG(INFO, VHOST_CONFIG, > - "(%"PRIu64") Device configuration started\n", fi->fh); > - fuse_reply_open(req, fi); > -} > - > -/* > - * When QEMU is shutdown or killed the device gets released. 
> - */ > -static void > -vhost_net_release(fuse_req_t req, struct fuse_file_info *fi) > -{ > - int err = 0; > - struct vhost_device_ctx ctx = fuse_req_to_vhost_ctx(req, fi); > - > - ops->destroy_device(ctx); > - RTE_LOG(INFO, VHOST_CONFIG, "(%"PRIu64") Device released\n", ctx.fh); > - fuse_reply_err(req, err); > -} > - > -/* > - * Boilerplate code for CUSE IOCTL > - * Implicit arguments: ctx, req, result. > - */ > -#define VHOST_IOCTL(func) do { \ > - result = (func)(ctx); \ > - fuse_reply_ioctl(req, result, NULL, 0); \ > -} while (0) > - > -/* > - * Boilerplate IOCTL RETRY > - * Implicit arguments: req. > - */ > -#define VHOST_IOCTL_RETRY(size_r, size_w) do { \ > - struct iovec iov_r = { arg, (size_r) }; \ > - struct iovec iov_w = { arg, (size_w) }; \ > - fuse_reply_ioctl_retry(req, &iov_r, \ > - (size_r) ? 1 : 0, &iov_w, (size_w) ? 1 : 0);\ > -} while (0) > - > -/* > - * Boilerplate code for CUSE Read IOCTL > - * Implicit arguments: ctx, req, result, in_bufsz, in_buf. > - */ > -#define VHOST_IOCTL_R(type, var, func) do { \ > - if (!in_bufsz) { \ > - VHOST_IOCTL_RETRY(sizeof(type), 0);\ > - } else { \ > - (var) = *(const type*)in_buf; \ > - result = func(ctx, &(var)); \ > - fuse_reply_ioctl(req, result, NULL, 0);\ > - } \ > -} while (0) > - > -/* > - * Boilerplate code for CUSE Write IOCTL > - * Implicit arguments: ctx, req, result, out_bufsz. > - */ > -#define VHOST_IOCTL_W(type, var, func) do { \ > - if (!out_bufsz) { \ > - VHOST_IOCTL_RETRY(0, sizeof(type));\ > - } else { \ > - result = (func)(ctx, &(var));\ > - fuse_reply_ioctl(req, result, &(var), sizeof(type));\ > - } \ > -} while (0) > - > -/* > - * Boilerplate code for CUSE Read/Write IOCTL > - * Implicit arguments: ctx, req, result, in_bufsz, in_buf. 
> - */ > -#define VHOST_IOCTL_RW(type1, var1, type2, var2, func) do { \ > - if (!in_bufsz) { \ > - VHOST_IOCTL_RETRY(sizeof(type1), sizeof(type2));\ > - } else { \ > - (var1) = *(const type1*) (in_buf); \ > - result = (func)(ctx, (var1), &(var2)); \ > - fuse_reply_ioctl(req, result, &(var2), sizeof(type2));\ > - } \ > -} while (0) > - > -/* > - * The IOCTLs are handled using CUSE/FUSE in userspace. Depending on the type > - * of IOCTL a buffer is requested to read or to write. This request is handled > - * by FUSE and the buffer is then given to CUSE. > - */ > -static void > -vhost_net_ioctl(fuse_req_t req, int cmd, void *arg, > - struct fuse_file_info *fi, __rte_unused unsigned flags, > - const void *in_buf, size_t in_bufsz, size_t out_bufsz) > -{ > - struct vhost_device_ctx ctx = fuse_req_to_vhost_ctx(req, fi); > - struct vhost_vring_file file; > - struct vhost_vring_state state; > - struct vhost_vring_addr addr; > - uint64_t features; > - uint32_t index; > - int result = 0; > - > - switch (cmd) { > - case VHOST_NET_SET_BACKEND: > - LOG_DEBUG(VHOST_CONFIG, > - "(%"PRIu64") IOCTL: VHOST_NET_SET_BACKEND\n", ctx.fh); > - VHOST_IOCTL_R(struct vhost_vring_file, file, ops->set_backend); > - break; > - > - case VHOST_GET_FEATURES: > - LOG_DEBUG(VHOST_CONFIG, > - "(%"PRIu64") IOCTL: VHOST_GET_FEATURES\n", ctx.fh); > - VHOST_IOCTL_W(uint64_t, features, ops->get_features); > - break; > - > - case VHOST_SET_FEATURES: > - LOG_DEBUG(VHOST_CONFIG, > - "(%"PRIu64") IOCTL: VHOST_SET_FEATURES\n", ctx.fh); > - VHOST_IOCTL_R(uint64_t, features, ops->set_features); > - break; > - > - case VHOST_RESET_OWNER: > - LOG_DEBUG(VHOST_CONFIG, > - "(%"PRIu64") IOCTL: VHOST_RESET_OWNER\n", ctx.fh); > - VHOST_IOCTL(ops->reset_owner); > - break; > - > - case VHOST_SET_OWNER: > - LOG_DEBUG(VHOST_CONFIG, > - "(%"PRIu64") IOCTL: VHOST_SET_OWNER\n", ctx.fh); > - VHOST_IOCTL(ops->set_owner); > - break; > - > - case VHOST_SET_MEM_TABLE: > - /*TODO fix race condition.*/ > - LOG_DEBUG(VHOST_CONFIG, > - 
"(%"PRIu64") IOCTL: VHOST_SET_MEM_TABLE\n", ctx.fh); > - static struct vhost_memory mem_temp; > - > - switch (in_bufsz) { > - case 0: > - VHOST_IOCTL_RETRY(sizeof(struct vhost_memory), 0); > - break; > - > - case sizeof(struct vhost_memory): > - mem_temp = *(const struct vhost_memory *) in_buf; > - > - if (mem_temp.nregions > 0) { > - VHOST_IOCTL_RETRY(sizeof(struct vhost_memory) + > - (sizeof(struct vhost_memory_region) * > - mem_temp.nregions), 0); > - } else { > - result = -1; > - fuse_reply_ioctl(req, result, NULL, 0); > - } > - break; > - > - default: > - result = ops->set_mem_table(ctx, > - in_buf, mem_temp.nregions); > - if (result) > - fuse_reply_err(req, EINVAL); > - else > - fuse_reply_ioctl(req, result, NULL, 0); > - } > - break; > - > - case VHOST_SET_VRING_NUM: > - LOG_DEBUG(VHOST_CONFIG, > - "(%"PRIu64") IOCTL: VHOST_SET_VRING_NUM\n", ctx.fh); > - VHOST_IOCTL_R(struct vhost_vring_state, state, > - ops->set_vring_num); > - break; > - > - case VHOST_SET_VRING_BASE: > - LOG_DEBUG(VHOST_CONFIG, > - "(%"PRIu64") IOCTL: VHOST_SET_VRING_BASE\n", ctx.fh); > - VHOST_IOCTL_R(struct vhost_vring_state, state, > - ops->set_vring_base); > - break; > - > - case VHOST_GET_VRING_BASE: > - LOG_DEBUG(VHOST_CONFIG, > - "(%"PRIu64") IOCTL: VHOST_GET_VRING_BASE\n", ctx.fh); > - VHOST_IOCTL_RW(uint32_t, index, > - struct vhost_vring_state, state, ops->get_vring_base); > - break; > - > - case VHOST_SET_VRING_ADDR: > - LOG_DEBUG(VHOST_CONFIG, > - "(%"PRIu64") IOCTL: VHOST_SET_VRING_ADDR\n", ctx.fh); > - VHOST_IOCTL_R(struct vhost_vring_addr, addr, > - ops->set_vring_addr); > - break; > - > - case VHOST_SET_VRING_KICK: > - LOG_DEBUG(VHOST_CONFIG, > - "(%"PRIu64") IOCTL: VHOST_SET_VRING_KICK\n", ctx.fh); > - VHOST_IOCTL_R(struct vhost_vring_file, file, > - ops->set_vring_kick); > - break; > - > - case VHOST_SET_VRING_CALL: > - LOG_DEBUG(VHOST_CONFIG, > - "(%"PRIu64") IOCTL: VHOST_SET_VRING_CALL\n", ctx.fh); > - VHOST_IOCTL_R(struct vhost_vring_file, file, > - 
ops->set_vring_call); > - break; > - > - default: > - RTE_LOG(ERR, VHOST_CONFIG, > - "(%"PRIu64") IOCTL: DOESN NOT EXIST\n", ctx.fh); > - result = -1; > - fuse_reply_ioctl(req, result, NULL, 0); > - } > - > - if (result < 0) > - LOG_DEBUG(VHOST_CONFIG, > - "(%"PRIu64") IOCTL: FAIL\n", ctx.fh); > - else > - LOG_DEBUG(VHOST_CONFIG, > - "(%"PRIu64") IOCTL: SUCCESS\n", ctx.fh); > -} > - > -/* > - * Structure handling open, release and ioctl function pointers is populated. > - */ > -static const struct cuse_lowlevel_ops vhost_net_ops = { > - .open = vhost_net_open, > - .release = vhost_net_release, > - .ioctl = vhost_net_ioctl, > -}; > - > -/* > - * cuse_info is populated and used to register the cuse device. > - * vhost_net_device_ops are also passed when the device is registered in app. > - */ > -int > -rte_vhost_driver_register(const char *dev_name) > -{ > - struct cuse_info cuse_info; > - char device_name[PATH_MAX] = ""; > - char char_device_name[PATH_MAX] = ""; > - const char *device_argv[] = { device_name }; > - > - char fuse_opt_dummy[] = FUSE_OPT_DUMMY; > - char fuse_opt_fore[] = FUSE_OPT_FORE; > - char fuse_opt_nomulti[] = FUSE_OPT_NOMULTI; > - char *fuse_argv[] = {fuse_opt_dummy, fuse_opt_fore, fuse_opt_nomulti}; > - > - if (access(cuse_device_name, R_OK | W_OK) < 0) { > - RTE_LOG(ERR, VHOST_CONFIG, > - "char device %s can't be accessed, maybe not exist\n", > - cuse_device_name); > - return -1; > - } > - > - /* > - * The device name is created. This is passed to QEMU so that it can > - * register the device with our application. > - */ > - snprintf(device_name, PATH_MAX, "DEVNAME=%s", dev_name); > - snprintf(char_device_name, PATH_MAX, "/dev/%s", dev_name); > - > - /* Check if device already exists. 
*/ > - if (access(char_device_name, F_OK) != -1) { > - RTE_LOG(ERR, VHOST_CONFIG, > - "char device %s already exists\n", char_device_name); > - return -1; > - } > - > - memset(&cuse_info, 0, sizeof(cuse_info)); > - cuse_info.dev_major = default_major; > - cuse_info.dev_minor = default_minor; > - cuse_info.dev_info_argc = 1; > - cuse_info.dev_info_argv = device_argv; > - cuse_info.flags = CUSE_UNRESTRICTED_IOCTL; > - > - ops = get_virtio_net_callbacks(); > - > - session = cuse_lowlevel_setup(3, fuse_argv, > - &cuse_info, &vhost_net_ops, 0, NULL); > - if (session == NULL) > - return -1; > - > - return 0; > -} > - > -/** > - * The CUSE session is launched allowing the application to receive open, > - * release and ioctl calls. > - */ > -int > -rte_vhost_driver_session_start(void) > -{ > - fuse_session_loop(session); > - > - return 0; > -} > diff --git a/lib/librte_vhost/vhost-net-cdev.h b/lib/librte_vhost/vhost-net-cdev.h > deleted file mode 100644 > index 03a5c57..0000000 > --- a/lib/librte_vhost/vhost-net-cdev.h > +++ /dev/null > @@ -1,113 +0,0 @@ > -/*- > - * BSD LICENSE > - * > - * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. > - * All rights reserved. > - * > - * Redistribution and use in source and binary forms, with or without > - * modification, are permitted provided that the following conditions > - * are met: > - * > - * * Redistributions of source code must retain the above copyright > - * notice, this list of conditions and the following disclaimer. > - * * Redistributions in binary form must reproduce the above copyright > - * notice, this list of conditions and the following disclaimer in > - * the documentation and/or other materials provided with the > - * distribution. > - * * Neither the name of Intel Corporation nor the names of its > - * contributors may be used to endorse or promote products derived > - * from this software without specific prior written permission. 
> - * > - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS > - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT > - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR > - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT > - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, > - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT > - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, > - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY > - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT > - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE > - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. > - */ > - > -#ifndef _VHOST_NET_CDEV_H_ > -#define _VHOST_NET_CDEV_H_ > -#include <stdint.h> > -#include <stdio.h> > -#include <sys/types.h> > -#include <unistd.h> > -#include <linux/vhost.h> > - > -#include <rte_log.h> > - > -/* Macros for printing using RTE_LOG */ > -#define RTE_LOGTYPE_VHOST_CONFIG RTE_LOGTYPE_USER1 > -#define RTE_LOGTYPE_VHOST_DATA RTE_LOGTYPE_USER1 > - > -#ifdef RTE_LIBRTE_VHOST_DEBUG > -#define VHOST_MAX_PRINT_BUFF 6072 > -#define LOG_LEVEL RTE_LOG_DEBUG > -#define LOG_DEBUG(log_type, fmt, args...) 
RTE_LOG(DEBUG, log_type, fmt, ##args) > -#define PRINT_PACKET(device, addr, size, header) do { \ > - char *pkt_addr = (char *)(addr); \ > - unsigned int index; \ > - char packet[VHOST_MAX_PRINT_BUFF]; \ > - \ > - if ((header)) \ > - snprintf(packet, VHOST_MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size)); \ > - else \ > - snprintf(packet, VHOST_MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size)); \ > - for (index = 0; index < (size); index++) { \ > - snprintf(packet + strnlen(packet, VHOST_MAX_PRINT_BUFF), VHOST_MAX_PRINT_BUFF - strnlen(packet, VHOST_MAX_PRINT_BUFF), \ > - "%02hhx ", pkt_addr[index]); \ > - } \ > - snprintf(packet + strnlen(packet, VHOST_MAX_PRINT_BUFF), VHOST_MAX_PRINT_BUFF - strnlen(packet, VHOST_MAX_PRINT_BUFF), "\n"); \ > - \ > - LOG_DEBUG(VHOST_DATA, "%s", packet); \ > -} while (0) > -#else > -#define LOG_LEVEL RTE_LOG_INFO > -#define LOG_DEBUG(log_type, fmt, args...) do {} while (0) > -#define PRINT_PACKET(device, addr, size, header) do {} while (0) > -#endif > - > - > -/* > - * Structure used to identify device context. > - */ > -struct vhost_device_ctx { > - pid_t pid; /* PID of process calling the IOCTL. */ > - uint64_t fh; /* Populated with fi->fh to track the device index. */ > -}; > - > -/* > - * Structure contains function pointers to be defined in virtio-net.c. These > - * functions are called in CUSE context and are used to configure devices. 
> - */ > -struct vhost_net_device_ops { > - int (*new_device)(struct vhost_device_ctx); > - void (*destroy_device)(struct vhost_device_ctx); > - > - int (*get_features)(struct vhost_device_ctx, uint64_t *); > - int (*set_features)(struct vhost_device_ctx, uint64_t *); > - > - int (*set_mem_table)(struct vhost_device_ctx, const void *, uint32_t); > - > - int (*set_vring_num)(struct vhost_device_ctx, struct vhost_vring_state *); > - int (*set_vring_addr)(struct vhost_device_ctx, struct vhost_vring_addr *); > - int (*set_vring_base)(struct vhost_device_ctx, struct vhost_vring_state *); > - int (*get_vring_base)(struct vhost_device_ctx, uint32_t, struct vhost_vring_state *); > - > - int (*set_vring_kick)(struct vhost_device_ctx, struct vhost_vring_file *); > - int (*set_vring_call)(struct vhost_device_ctx, struct vhost_vring_file *); > - > - int (*set_backend)(struct vhost_device_ctx, struct vhost_vring_file *); > - > - int (*set_owner)(struct vhost_device_ctx); > - int (*reset_owner)(struct vhost_device_ctx); > -}; > - > - > -struct vhost_net_device_ops const *get_virtio_net_callbacks(void); > -#endif /* _VHOST_NET_CDEV_H_ */ > diff --git a/lib/librte_vhost/vhost-user/fd_man.c b/lib/librte_vhost/vhost-user/fd_man.c > new file mode 100644 > index 0000000..c7fd3f2 > --- /dev/null > +++ b/lib/librte_vhost/vhost-user/fd_man.c > @@ -0,0 +1,158 @@ > +#include <stdint.h> > +#include <stdio.h> > +#include <stdlib.h> > +#include <sys/socket.h> > +#include <sys/select.h> > +#include <sys/time.h> > +#include <sys/types.h> > +#include <unistd.h> > + > +#include <rte_log.h> > + > +#include "fd_man.h" > + > +/** > + * Returns the index in the fdset for a fd. > + * If fd is -1, it means to search for a free entry. > + * @return > + * Index for the fd, or -1 if fd isn't in the fdset. > + */ > +static int > +fdset_find_fd(struct fdset *pfdset, int fd) > +{ > + int i; > + > + for (i = 0; i < pfdset->num && pfdset->fd[i].fd != fd; i++); > + > + return i == pfdset->num ? 
-1 : i; > +} > + > +static int > +fdset_find_free_slot(struct fdset *pfdset) > +{ > + return fdset_find_fd(pfdset, -1); > + > +} > + > +static void > +fdset_add_fd(struct fdset *pfdset, int idx, int fd, fd_cb rcb, > + fd_cb wcb, uint64_t dat) > +{ > + struct fdentry *pfdentry = &pfdset->fd[idx]; > + > + pfdentry->fd = fd; > + pfdentry->rcb = rcb; > + pfdentry->wcb = wcb; > + pfdentry->dat = dat; > +} > + > +/** > + * Fill the read/write fdset with the fds in the fdset. > + * @return > + * the maximum fds filled in the read/write fd_set. > + */ > +static int > +fdset_fill(fd_set *rfset, fd_set *wfset, struct fdset *pfdset) > +{ > + struct fdentry *pfdentry; > + int i, maxfds = -1; > + int num = MAX_FDS; > + > + for (i = 0; i < num ; i++) { > + pfdentry = &pfdset->fd[i]; > + if (pfdentry->fd != -1) { > + int added = 0; > + if (pfdentry->rcb && rfset) { > + FD_SET(pfdentry->fd, rfset); > + added = 1; > + } > + if (pfdentry->wcb && wfset) { > + FD_SET(pfdentry->fd, wfset); > + added = 1; > + } > + if (added) > + maxfds = pfdentry->fd < maxfds ? > + maxfds : pfdentry->fd; > + } > + } > + return maxfds; > +} > + > +void > +fdset_init(struct fdset *pfdset) > +{ > + int i; > + > + for (i = 0; i < MAX_FDS; i++) > + pfdset->fd[i].fd = -1; > + pfdset->num = MAX_FDS; > + > +} > + > +/** > + * Register the fd in the fdset with its read/write handler and context. > + */ > +int > +fdset_add(struct fdset *pfdset, int fd, fd_cb rcb, fd_cb wcb, uint64_t dat) > +{ > + int i; > + > + if (fd == -1) > + return -1; > + > + /* Find a free slot in the list. */ > + i = fdset_find_free_slot(pfdset); > + if (i == -1) > + return -2; > + > + fdset_add_fd(pfdset, i, fd, rcb, wcb, dat); > + > + return 0; > +} > + > +/** > + * Unregister the fd from the fdset. 
> + */ > +void > +fdset_del(struct fdset *pfdset, int fd) > +{ > + int i; > + > + i = fdset_find_fd(pfdset, fd); > + if (i != -1) { > + pfdset->fd[i].fd = -1; > + } > +} > + > + > +void > +fdset_event_dispatch(struct fdset *pfdset) > +{ > + fd_set rfds,wfds; > + int i, maxfds; > + struct fdentry *pfdentry; > + int num = MAX_FDS; > + > + if (pfdset == NULL) > + return; > + while (1) { > + FD_ZERO(&rfds); > + FD_ZERO(&wfds); > + maxfds = fdset_fill(&rfds, &wfds, pfdset); > + /* fd management runs in one thread */ > + if (maxfds == -1) { > + return; > + } > + > + select(maxfds + 1, &rfds, &wfds, NULL, NULL); > + > + for (i = 0; i < num; i++) { > + pfdentry = &pfdset->fd[i]; > + if (FD_ISSET(pfdentry->fd, &rfds)) > + pfdentry->rcb(pfdentry->fd, pfdentry->dat); > + if (FD_ISSET(pfdentry->fd, &wfds)) > + pfdentry->wcb(pfdentry->fd, pfdentry->dat); > + } > + > + } > +} > diff --git a/lib/librte_vhost/vhost-user/fd_man.h b/lib/librte_vhost/vhost-user/fd_man.h > new file mode 100644 > index 0000000..57cc81d > --- /dev/null > +++ b/lib/librte_vhost/vhost-user/fd_man.h > @@ -0,0 +1,31 @@ > +#ifndef _FD_MAN_H_ > +#define _FD_MAN_H_ > +#include <stdint.h> > + > +#define MAX_FDS 1024 > + > +typedef void (*fd_cb)(int fd, uint64_t dat); > + > +struct fdentry { > + int fd; /* -1 indicates this entry is empty */ > + fd_cb rcb; /* callback when this fd is readable. 
*/ > + fd_cb wcb; /* callback when this fd is writeable.*/ > + uint64_t dat; /* fd context */ > +}; > + > +struct fdset { > + struct fdentry fd[MAX_FDS]; > + int num; > +}; > + > + > +void fdset_init(struct fdset *pfdset); > + > +int fdset_add(struct fdset *pfdset, int fd, fd_cb rcb, > + fd_cb wcb, uint64_t ctx); > + > +void fdset_del(struct fdset *pfdset, int fd); > + > +void fdset_event_dispatch(struct fdset *pfdset); > + > +#endif > diff --git a/lib/librte_vhost/vhost-user/vhost-net-user.c b/lib/librte_vhost/vhost-user/vhost-net-user.c > new file mode 100644 > index 0000000..34450f4 > --- /dev/null > +++ b/lib/librte_vhost/vhost-user/vhost-net-user.c > @@ -0,0 +1,417 @@ > +/*- > + * BSD LICENSE > + * > + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. > + * All rights reserved. > + * > + * Redistribution and use in source and binary forms, with or without > + * modification, are permitted provided that the following conditions > + * are met: > + * > + * * Redistributions of source code must retain the above copyright > + * notice, this list of conditions and the following disclaimer. > + * * Redistributions in binary form must reproduce the above copyright > + * notice, this list of conditions and the following disclaimer in > + * the documentation and/or other materials provided with the > + * distribution. > + * * Neither the name of Intel Corporation nor the names of its > + * contributors may be used to endorse or promote products derived > + * from this software without specific prior written permission. > + * > + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS > + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT > + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR > + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT > + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, > + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT > + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, > + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY > + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT > + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE > + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. > + */ > + > +#include <stdint.h> > +#include <stdio.h> > +#include <limits.h> > +#include <stdlib.h> > +#include <unistd.h> > +#include <string.h> > +#include <sys/types.h> > +#include <sys/socket.h> > +#include <sys/un.h> > +#include <errno.h> > + > +#include <rte_log.h> > +#include <rte_virtio_net.h> > + > +#include "fd_man.h" > +#include "vhost-net-user.h" > +#include "vhost-net.h" > +#include "virtio-net-user.h" > + > +static void vserver_new_vq_conn(int fd, uint64_t data); > +static void vserver_message_handler(int fd, uint64_t dat); > +const struct vhost_net_device_ops *ops; > + > +static struct vhost_server *g_vhost_server; > + > +static const char *vhost_message_str[VHOST_USER_MAX] = > +{ > + [VHOST_USER_NONE] = "VHOST_USER_NONE", > + [VHOST_USER_GET_FEATURES] = "VHOST_USER_GET_FEATURES", > + [VHOST_USER_SET_FEATURES] = "VHOST_USER_SET_FEATURES", > + [VHOST_USER_SET_OWNER] = "VHOST_USER_SET_OWNER", > + [VHOST_USER_RESET_OWNER] = "VHOST_USER_RESET_OWNER", > + [VHOST_USER_SET_MEM_TABLE] = "VHOST_USER_SET_MEM_TABLE", > + [VHOST_USER_SET_LOG_BASE] = "VHOST_USER_SET_LOG_BASE", > + [VHOST_USER_SET_LOG_FD] = "VHOST_USER_SET_LOG_FD", > + [VHOST_USER_SET_VRING_NUM] = "VHOST_USER_SET_VRING_NUM", > + [VHOST_USER_SET_VRING_ADDR] = "VHOST_USER_SET_VRING_ADDR", > + [VHOST_USER_SET_VRING_BASE] = "VHOST_USER_SET_VRING_BASE", > + [VHOST_USER_GET_VRING_BASE] = "VHOST_USER_GET_VRING_BASE", > + [VHOST_USER_SET_VRING_KICK] = 
"VHOST_USER_SET_VRING_KICK", > + [VHOST_USER_SET_VRING_CALL] = "VHOST_USER_SET_VRING_CALL", > + [VHOST_USER_SET_VRING_ERR] = "VHOST_USER_SET_VRING_ERR" > +}; > + > +/** > + * Create a unix domain socket and bind to path. > + * @return > + * socket fd or -1 on failure > + */ > +static int > +uds_socket(const char *path) > +{ > + struct sockaddr_un un; > + int sockfd; > + int ret; > + > + if (path == NULL) > + return -1; > + > + sockfd = socket(AF_UNIX, SOCK_STREAM, 0); > + if (sockfd < 0) > + return -1; > + RTE_LOG(INFO, VHOST_CONFIG, "socket created, fd:%d\n", sockfd); > + > + memset(&un, 0, sizeof(un)); > + un.sun_family = AF_UNIX; > + snprintf(un.sun_path, sizeof(un.sun_path), "%s", path); > + ret = bind(sockfd, (struct sockaddr *)&un, sizeof(un)); > + if (ret == -1) > + goto err; > + RTE_LOG(INFO, VHOST_CONFIG, "bind to %s\n", path); > + > + ret = listen(sockfd, 1); > + if (ret == -1) > + goto err; > + > + return sockfd; > + > +err: > + close(sockfd); > + return -1; > +} > + > + > +/* return bytes# of read */ > +static int > +read_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num) > +{ > + > + struct iovec iov; > + struct msghdr msgh = { 0 }; > + size_t fdsize = fd_num * sizeof(int); > + char control[CMSG_SPACE(fdsize)]; > + struct cmsghdr *cmsg; > + int ret; > + > + iov.iov_base = buf; > + iov.iov_len = buflen; > + > + msgh.msg_iov = &iov; > + msgh.msg_iovlen = 1; > + msgh.msg_control = control; > + msgh.msg_controllen = sizeof(control); > + > + ret = recvmsg(sockfd, &msgh, 0); > + if (ret <= 0) { > + RTE_LOG(ERR, VHOST_CONFIG, "%s failed\n", __func__); > + return ret; > + } > + /* ret == buflen */ > + if (msgh.msg_flags & (MSG_TRUNC | MSG_CTRUNC)) { > + RTE_LOG(ERR, VHOST_CONFIG, "%s failed\n", __func__); > + return -1; > + } > + > + for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL; > + cmsg = CMSG_NXTHDR(&msgh, cmsg)) { > + if ( (cmsg->cmsg_level == SOL_SOCKET) && > + (cmsg->cmsg_type == SCM_RIGHTS)) { > + memcpy(fds, CMSG_DATA(cmsg), fdsize); 
> + break; > + } > + } > + return ret; > +} > + > +static int > +read_vhost_message(int sockfd, struct VhostUserMsg *msg) > +{ > + int ret; > + > + ret = read_fd_message(sockfd, (char *)msg, VHOST_USER_HDR_SIZE, > + msg->fds, VHOST_MEMORY_MAX_NREGIONS); > + if (ret <= 0) > + return ret; > + > + if (msg->size) { > + if (msg->size > sizeof(msg->payload)) { > + RTE_LOG(ERR, VHOST_CONFIG, > + "%s: invalid size:%d\n", __func__, msg->size); > + return -1; > + } > + ret = read(sockfd, &msg->payload, msg->size); > + if (ret == 0) > + return 0; > + if (ret != (int)msg->size) { > + printf("read control message failed\n"); > + return -1; > + } > + } > + > + return ret; > +} > + > +static int > +send_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num) > +{ > + > + struct iovec iov; > + struct msghdr msgh = { 0 }; > + size_t fdsize = fd_num * sizeof(int); > + char control[CMSG_SPACE(fdsize)]; > + struct cmsghdr *cmsg; > + int ret; > + > + iov.iov_base = buf; > + iov.iov_len = buflen; > + msgh.msg_iov = &iov; > + msgh.msg_iovlen = 1; > + > + if (fds && fd_num > 0) { > + msgh.msg_control = control; > + msgh.msg_controllen = sizeof(control); > + cmsg = CMSG_FIRSTHDR(&msgh); > + cmsg->cmsg_len = CMSG_LEN(fdsize); > + cmsg->cmsg_level = SOL_SOCKET; > + cmsg->cmsg_type = SCM_RIGHTS; > + memcpy(CMSG_DATA(cmsg), fds, fdsize); > + } else { > + msgh.msg_control = NULL; > + msgh.msg_controllen = 0; > + } > + > + do { > + ret = sendmsg(sockfd, &msgh, 0); > + } while (ret < 0 && errno == EINTR); > + > + if (ret < 0) { > + RTE_LOG(ERR, VHOST_CONFIG, "sendmsg error\n"); > + return -1; > + } > + > + return 0; > +} > + > +static int > +send_vhost_message(int sockfd, struct VhostUserMsg *msg) > +{ > + int ret; > + > + msg->flags &= ~VHOST_USER_VERSION_MASK; > + msg->flags |= VHOST_USER_VERSION; > + msg->flags |= VHOST_USER_REPLY_MASK; > + > + ret = send_fd_message(sockfd, (char *)msg, > + VHOST_USER_HDR_SIZE + msg->size, NULL, 0); > + > + return ret; > +} > + > +/* call back when 
there is new connection. */ > +static void > +vserver_new_vq_conn(int fd, uint64_t dat) > +{ > + struct vhost_server *vserver = (void *)(uintptr_t)dat; > + int conn_fd; > + uint32_t fh; > + struct vhost_device_ctx vdev_ctx = { 0 }; > + > + conn_fd = accept(fd, NULL, NULL); > + RTE_LOG(INFO, VHOST_CONFIG, > + "%s: new connection is %d\n", __func__, conn_fd); > + if (conn_fd < 0) > + return; > + > + fh = ops->new_device(vdev_ctx); > + RTE_LOG(INFO, VHOST_CONFIG, "new device, handle is %d\n", fh); > + > + fdset_add(&vserver->fdset, > + conn_fd, vserver_message_handler, NULL, fh); > +} > + > +/* callback when there is message on the connfd */ > +static void > +vserver_message_handler(int connfd, uint64_t dat) > +{ > + struct vhost_device_ctx ctx; > + uint32_t fh = (uint32_t)dat; > + struct VhostUserMsg msg; > + uint64_t features; > + int ret; > + > + ctx.fh = fh; > + ret = read_vhost_message(connfd, &msg); > + if (ret < 0) { > + printf("vhost read message failed\n"); > + > + /*TODO: cleanup */ > + close(connfd); > + fdset_del(&g_vhost_server->fdset, connfd); > + ops->destroy_device(ctx); > + > + return; > + } else if (ret == 0) { > + /*TODO: cleanup */ > + RTE_LOG(INFO, VHOST_CONFIG, > + "vhost peer closed\n"); > + close(connfd); > + fdset_del(&g_vhost_server->fdset, connfd); > + ops->destroy_device(ctx); > + > + return; > + } > + if (msg.request > VHOST_USER_MAX) { > + /*TODO: cleanup */ > + RTE_LOG(INFO, VHOST_CONFIG, > + "vhost read incorrect message\n"); > + close(connfd); > + fdset_del(&g_vhost_server->fdset, connfd); > + > + return; > + } > + > + RTE_LOG(INFO, VHOST_CONFIG, "read message %s\n", > + vhost_message_str[msg.request]); > + switch (msg.request) { > + case VHOST_USER_GET_FEATURES: > + ret = ops->get_features(ctx, &features); > + msg.payload.u64 = ret; > + msg.size = sizeof(msg.payload.u64); > + send_vhost_message(connfd, &msg); > + break; > + case VHOST_USER_SET_FEATURES: > + ops->set_features(ctx, &features); > + break; > + > + case 
VHOST_USER_SET_OWNER: > + ops->set_owner(ctx); > + break; > + case VHOST_USER_RESET_OWNER: > + ops->reset_owner(ctx); > + break; > + > + case VHOST_USER_SET_MEM_TABLE: > + user_set_mem_table(ctx, &msg); > + break; > + > + case VHOST_USER_SET_LOG_BASE: > + case VHOST_USER_SET_LOG_FD: > + RTE_LOG(INFO, VHOST_CONFIG, "not implemented.\n"); > + break; > + > + case VHOST_USER_SET_VRING_NUM: > + ops->set_vring_num(ctx, &msg.payload.state); > + break; > + case VHOST_USER_SET_VRING_ADDR: > + ops->set_vring_addr(ctx, &msg.payload.addr); > + break; > + case VHOST_USER_SET_VRING_BASE: > + ops->set_vring_base(ctx, &msg.payload.state); > + break; > + > + case VHOST_USER_GET_VRING_BASE: > + ret = ops->get_vring_base(ctx, msg.payload.state.index, > + &msg.payload.state); > + msg.size = sizeof(msg.payload.state); > + send_vhost_message(connfd, &msg); > + break; > + > + case VHOST_USER_SET_VRING_KICK: > + user_set_vring_kick(ctx, &msg); > + break; > + case VHOST_USER_SET_VRING_CALL: > + user_set_vring_call(ctx, &msg); > + break; > + > + case VHOST_USER_SET_VRING_ERR: > + RTE_LOG(INFO, VHOST_CONFIG, "not implemented\n"); > + break; > + > + default: > + break; > + > + } > +} > + > + > +/** > + * Creates and initialise the vhost server. 
> + */ > +int > +rte_vhost_driver_register(const char *path) > +{ > + > + struct vhost_server *vserver; > + > + if (g_vhost_server != NULL) > + return -1; > + > + vserver = calloc(sizeof(struct vhost_server), 1); > + /*TODO: all allocation is through DPDK memory allocation */ > + if (vserver == NULL) > + return -1; > + > + fdset_init(&vserver->fdset); > + > + unlink(path); > + > + vserver->listenfd = uds_socket(path); > + if (vserver->listenfd < 0) { > + free(vserver); > + return -1; > + } > + vserver->path = path; > + > + fdset_add(&vserver->fdset, vserver->listenfd, > + vserver_new_vq_conn, NULL, > + (uint64_t)(uintptr_t)vserver); > + > + ops = get_virtio_net_callbacks(); > + > + g_vhost_server = vserver; > + > + return 0; > +} > + > + > +int > +rte_vhost_driver_session_start(void) > +{ > + fdset_event_dispatch(&g_vhost_server->fdset); > + return 0; > +} > + > diff --git a/lib/librte_vhost/vhost-user/vhost-net-user.h b/lib/librte_vhost/vhost-user/vhost-net-user.h > new file mode 100644 > index 0000000..c9df9fa > --- /dev/null > +++ b/lib/librte_vhost/vhost-user/vhost-net-user.h > @@ -0,0 +1,74 @@ > +#ifndef _VHOST_NET_USER_H > +#define _VHOST_NET_USER_H > +#include <stdint.h> > +#include <linux/vhost.h> > + > +#include "fd_man.h" > + > +struct vhost_server { > + const char *path; /**< The path the uds is bind to. */ > + int listenfd; /**< The listener sockfd. */ > + struct fdset fdset; /**< The fd list this vhost server manages. 
*/ > +}; > + > +/*********** FROM hw/virtio/vhost-user.c *************************************/ > + > +#define VHOST_MEMORY_MAX_NREGIONS 8 > + > +typedef enum VhostUserRequest { > + VHOST_USER_NONE = 0, > + VHOST_USER_GET_FEATURES = 1, > + VHOST_USER_SET_FEATURES = 2, > + VHOST_USER_SET_OWNER = 3, > + VHOST_USER_RESET_OWNER = 4, > + VHOST_USER_SET_MEM_TABLE = 5, > + VHOST_USER_SET_LOG_BASE = 6, > + VHOST_USER_SET_LOG_FD = 7, > + VHOST_USER_SET_VRING_NUM = 8, > + VHOST_USER_SET_VRING_ADDR = 9, > + VHOST_USER_SET_VRING_BASE = 10, > + VHOST_USER_GET_VRING_BASE = 11, > + VHOST_USER_SET_VRING_KICK = 12, > + VHOST_USER_SET_VRING_CALL = 13, > + VHOST_USER_SET_VRING_ERR = 14, > + VHOST_USER_MAX > +} VhostUserRequest; > + > +typedef struct VhostUserMemoryRegion { > + uint64_t guest_phys_addr; > + uint64_t memory_size; > + uint64_t userspace_addr; > + uint64_t mmap_offset; > +} VhostUserMemoryRegion; > + > +typedef struct VhostUserMemory { > + uint32_t nregions; > + uint32_t padding; > + VhostUserMemoryRegion regions[VHOST_MEMORY_MAX_NREGIONS]; > +} VhostUserMemory; > + > +typedef struct VhostUserMsg { > + VhostUserRequest request; > + > +#define VHOST_USER_VERSION_MASK (0x3) > +#define VHOST_USER_REPLY_MASK (0x1 << 2) > + uint32_t flags; > + uint32_t size; /* the following payload size */ > + union { > +#define VHOST_USER_VRING_IDX_MASK (0xff) > +#define VHOST_USER_VRING_NOFD_MASK (0x1<<8) > + uint64_t u64; > + struct vhost_vring_state state; > + struct vhost_vring_addr addr; > + VhostUserMemory memory; > + } payload; > + int fds[VHOST_MEMORY_MAX_NREGIONS]; > +} __attribute__((packed)) VhostUserMsg; > + > +#define VHOST_USER_HDR_SIZE (intptr_t)(&((VhostUserMsg *)0)->payload.u64) > + > +/* The version of the protocol we support */ > +#define VHOST_USER_VERSION (0x1) > + > +/*****************************************************************************/ > +#endif > diff --git a/lib/librte_vhost/vhost-user/virtio-net-user.c b/lib/librte_vhost/vhost-user/virtio-net-user.c > new 
file mode 100644 > index 0000000..f38e6cc > --- /dev/null > +++ b/lib/librte_vhost/vhost-user/virtio-net-user.c > @@ -0,0 +1,208 @@ > +/*- > + * BSD LICENSE > + * > + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. > + * All rights reserved. > + * > + * Redistribution and use in source and binary forms, with or without > + * modification, are permitted provided that the following conditions > + * are met: > + * > + * * Redistributions of source code must retain the above copyright > + * notice, this list of conditions and the following disclaimer. > + * * Redistributions in binary form must reproduce the above copyright > + * notice, this list of conditions and the following disclaimer in > + * the documentation and/or other materials provided with the > + * distribution. > + * * Neither the name of Intel Corporation nor the names of its > + * contributors may be used to endorse or promote products derived > + * from this software without specific prior written permission. > + * > + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS > + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT > + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR > + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT > + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, > + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT > + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, > + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY > + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT > + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE > + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
> + */ > + > +#include <stdint.h> > +#include <stdio.h> > +#include <stdlib.h> > +#include <unistd.h> > +#include <sys/mman.h> > + > +#include <rte_log.h> > + > +#include "virtio-net-user.h" > +#include "vhost-net-user.h" > +#include "vhost-net.h" > + > +extern const struct vhost_net_device_ops *ops; > + > +#if 0 > +int > +user_set_mem_table(struct vhost_device_ctx ctx, struct VhostUserMsg *pmsg) > +{ > + unsigned int idx; > + struct VhostUserMemory memory = pmsg->payload.memory; > + struct virtio_memory_regions regions[VHOST_MEMORY_MAX_NREGIONS]; > + uint64_t mapped_address, base_address = 0, mem_size = 0; > + > + for (idx = 0; idx < memory.nregions; idx++) { > + if (memory.regions[idx].guest_phys_addr == 0) > + base_address = memory.regions[idx].userspace_addr; > + } > + if (base_address == 0) { > + RTE_LOG(ERR, VHOST_CONFIG, > + "couldn't find the mem region whose gpa is 0.\n"); > + return -1; > + } > + > + for (idx = 0; idx < memory.nregions; idx++) { > + uint64_t size = memory.regions[idx].userspace_addr - > + base_address + memory.regions[idx].memory_size; > + if (mem_size < size) > + mem_size = size; > + } > + > + /* > + * here we assume qemu will map only one file for memory allocation, > + * we only use fds[0] with offset 0. 
> + */ > + mapped_address = (uint64_t)(uintptr_t)mmap(NULL, mem_size, > + PROT_READ | PROT_WRITE, MAP_SHARED, pmsg->fds[0], 0); > + > + if (mapped_address == (uint64_t)(uintptr_t)MAP_FAILED) { > + RTE_LOG(ERR, VHOST_CONFIG, " mmap qemu guest failed.\n"); > + return -1; > + } > + > + for (idx = 0; idx < memory.nregions; idx++) { > + regions[idx].guest_phys_address = > + memory.regions[idx].guest_phys_addr; > + regions[idx].guest_phys_address_end = > + memory.regions[idx].guest_phys_addr + > + memory.regions[idx].memory_size; > + regions[idx].memory_size = memory.regions[idx].memory_size; > + regions[idx].userspace_address = > + memory.regions[idx].userspace_addr; > + > + regions[idx].address_offset = mapped_address - base_address + > + regions[idx].userspace_address - > + regions[idx].guest_phys_address; > + LOG_DEBUG(VHOST_CONFIG, > + "REGION: %u - GPA: %p - QEMU VA: %p - SIZE (%"PRIu64")\n", > + idx, > + (void *)(uintptr_t)regions[idx].guest_phys_address, > + (void *)(uintptr_t)regions[idx].userspace_address, > + regions[idx].memory_size); > + } > + ops->set_mem_table(ctx, regions, memory.nregions); > + return 0; > +} > + > +#else > + > +int > +user_set_mem_table(struct vhost_device_ctx ctx, struct VhostUserMsg *pmsg) > +{ > + unsigned int idx; > + struct VhostUserMemory memory = pmsg->payload.memory; > + struct virtio_memory_regions regions[VHOST_MEMORY_MAX_NREGIONS]; > + uint64_t mapped_address, base_address = 0; > + > + for (idx = 0; idx < memory.nregions; idx++) { > + if (memory.regions[idx].guest_phys_addr == 0) > + base_address = memory.regions[idx].userspace_addr; > + } > + if (base_address == 0) { > + RTE_LOG(ERR, VHOST_CONFIG, > + "couldn't find the mem region whose gpa is 0.\n"); > + return -1; > + } > + > + > + for (idx = 0; idx < memory.nregions; idx++) { > + regions[idx].guest_phys_address = > + memory.regions[idx].guest_phys_addr; > + regions[idx].guest_phys_address_end = > + memory.regions[idx].guest_phys_addr + > + memory.regions[idx].memory_size; 
> + regions[idx].memory_size = memory.regions[idx].memory_size; > + regions[idx].userspace_address = > + memory.regions[idx].userspace_addr; > +/* > + mapped_address = (uint64_t)(uintptr_t)mmap(NULL, > + regions[idx].memory_size, > + PROT_READ | PROT_WRITE, MAP_SHARED, > + pmsg->fds[idx], > + memory.regions[idx].mmap_offset); > +*/ > + > +/* This is ugly */ > + mapped_address = (uint64_t)(uintptr_t)mmap(NULL, > + regions[idx].memory_size + > + memory.regions[idx].mmap_offset, > + PROT_READ | PROT_WRITE, MAP_SHARED, > + pmsg->fds[idx], > + 0); > + printf("mapped to %p\n", (void *)mapped_address); > + > + if (mapped_address == (uint64_t)(uintptr_t)MAP_FAILED) { > + RTE_LOG(ERR, VHOST_CONFIG, " mmap qemu guest failed.\n"); > + return -1; > + } > + > +// printf("ret=%d\n", munmap((void *)mapped_address, (regions[idx].memory_size + memory.regions[idx].mmap_offset + 0x3FFFFFFF) & ~0x3FFFFFFF)); > +// printf("unaligned ret=%d\n", munmap((void *)mapped_address, (regions[idx].memory_size + memory.regions[idx].mmap_offset ) )); > + mapped_address += memory.regions[idx].mmap_offset; > + > + regions[idx].address_offset = mapped_address - > + regions[idx].guest_phys_address; > + LOG_DEBUG(VHOST_CONFIG, > + "REGION: %u - GPA: %p - QEMU VA: %p - SIZE (%"PRIu64")\n", > + idx, > + (void *)(uintptr_t)regions[idx].guest_phys_address, > + (void *)(uintptr_t)regions[idx].userspace_address, > + regions[idx].memory_size); > + } > + ops->set_mem_table(ctx, regions, memory.nregions); > + return 0; > +} > + > + > + > + > +#endif > + > + > +void > +user_set_vring_call(struct vhost_device_ctx ctx, struct VhostUserMsg *pmsg) > +{ > + struct vhost_vring_file file; > + > + file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; > + file.fd = pmsg->fds[0]; > + RTE_LOG(INFO, VHOST_CONFIG, > + "vring call idx:%d file:%d\n", file.index, file.fd); > + ops->set_vring_call(ctx, &file); > +} > + > + > +void > +user_set_vring_kick(struct vhost_device_ctx ctx, struct VhostUserMsg *pmsg) > +{ > + 
struct vhost_vring_file file; > + > + file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; > + file.fd = pmsg->fds[0]; > + RTE_LOG(INFO, VHOST_CONFIG, > + "vring kick idx:%d file:%d\n", file.index, file.fd); > + ops->set_vring_kick(ctx, &file); > +} > diff --git a/lib/librte_vhost/vhost-user/virtio-net-user.h b/lib/librte_vhost/vhost-user/virtio-net-user.h > new file mode 100644 > index 0000000..0969376 > --- /dev/null > +++ b/lib/librte_vhost/vhost-user/virtio-net-user.h > @@ -0,0 +1,11 @@ > +#ifndef _VIRTIO_NET_USER_H > +#define _VIRTIO_NET_USER_H > + > +#include "vhost-net.h" > +#include "vhost-net-user.h" > + > +int user_set_mem_table(struct vhost_device_ctx, struct VhostUserMsg *); > +void user_set_vring_kick(struct vhost_device_ctx, struct VhostUserMsg *); > +void user_set_vring_call(struct vhost_device_ctx, struct VhostUserMsg *); > + > +#endif > diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c > index ccfd82f..8ff0301 100644 > --- a/lib/librte_vhost/vhost_rxtx.c > +++ b/lib/librte_vhost/vhost_rxtx.c > @@ -38,19 +38,14 @@ > #include <rte_memcpy.h> > #include <rte_virtio_net.h> > > -#include "vhost-net-cdev.h" > +#include "vhost-net.h" > > -#define MAX_PKT_BURST 32 > +#define VHOST_MAX_PKT_BURST 64 > +#define VHOST_MAX_MRG_PKT_BURST 64 > > -/** > - * This function adds buffers to the virtio devices RX virtqueue. Buffers can > - * be received from the physical port or from another virtio device. A packet > - * count is returned to indicate the number of packets that are succesfully > - * added to the RX queue. This function works when mergeable is disabled. 
> - */ > -static inline uint32_t __attribute__((always_inline)) > -virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id, > - struct rte_mbuf **pkts, uint32_t count) > + > +uint32_t > +rte_vhost_enqueue_burst(struct virtio_net *dev, uint16_t queue_id, struct rte_mbuf **pkts, uint32_t count) > { > struct vhost_virtqueue *vq; > struct vring_desc *desc; > @@ -59,26 +54,23 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id, > struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0}; > uint64_t buff_addr = 0; > uint64_t buff_hdr_addr = 0; > - uint32_t head[MAX_PKT_BURST], packet_len = 0; > + uint32_t head[VHOST_MAX_PKT_BURST], packet_len = 0; > uint32_t head_idx, packet_success = 0; > + uint32_t mergeable, mrg_count = 0; > uint16_t avail_idx, res_cur_idx; > uint16_t res_base_idx, res_end_idx; > uint16_t free_entries; > uint8_t success = 0; > > - LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh); > + LOG_DEBUG(VHOST_DATA, "(%"PRIu64") %s()\n", dev->device_fh, __func__); > if (unlikely(queue_id != VIRTIO_RXQ)) { > LOG_DEBUG(VHOST_DATA, "mq isn't supported in this version.\n"); > return 0; > } > > vq = dev->virtqueue[VIRTIO_RXQ]; > - count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count; > - > - /* > - * As many data cores may want access to available buffers, > - * they need to be reserved. > - */ > + count = (count > VHOST_MAX_PKT_BURST) ? VHOST_MAX_PKT_BURST : count; > + /* As many data cores may want access to available buffers, they need to be reserved. */ > do { > res_base_idx = vq->last_used_idx_res; > avail_idx = *((volatile uint16_t *)&vq->avail->idx); > @@ -93,21 +85,25 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id, > > res_end_idx = res_base_idx + count; > /* vq->last_used_idx_res is atomically updated. */ > - /* TODO: Allow to disable cmpset if no concurrency in application. 
*/ > + /* TODO: Allow to disable cmpset if no concurrency in application */ > success = rte_atomic16_cmpset(&vq->last_used_idx_res, > res_base_idx, res_end_idx); > + /* If there is contention here and failed, try again. */ > } while (unlikely(success == 0)); > res_cur_idx = res_base_idx; > LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n", > - dev->device_fh, res_cur_idx, res_end_idx); > + dev->device_fh, > + res_cur_idx, res_end_idx); > > /* Prefetch available ring to retrieve indexes. */ > rte_prefetch0(&vq->avail->ring[res_cur_idx & (vq->size - 1)]); > > + /* Check if the VIRTIO_NET_F_MRG_RXBUF feature is enabled. */ > + mergeable = dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF); > + > /* Retrieve all of the head indexes first to avoid caching issues. */ > for (head_idx = 0; head_idx < count; head_idx++) > - head[head_idx] = vq->avail->ring[(res_cur_idx + head_idx) & > - (vq->size - 1)]; > + head[head_idx] = vq->avail->ring[(res_cur_idx + head_idx) & (vq->size - 1)]; > > /*Prefetch descriptor index. */ > rte_prefetch0(&vq->desc[head[packet_success]]); > @@ -123,46 +119,57 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id, > /* Prefetch buffer address. */ > rte_prefetch0((void *)(uintptr_t)buff_addr); > > - /* Copy virtio_hdr to packet and increment buffer address */ > - buff_hdr_addr = buff_addr; > - packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen; > - > - /* > - * If the descriptors are chained the header and data are > - * placed in separate buffers. > - */ > - if (desc->flags & VRING_DESC_F_NEXT) { > - desc->len = vq->vhost_hlen; > - desc = &vq->desc[desc->next]; > - /* Buffer address translation. 
*/ > - buff_addr = gpa_to_vva(dev, desc->addr); > - desc->len = rte_pktmbuf_data_len(buff); > + if (mergeable && (mrg_count != 0)) { > + desc->len = packet_len = rte_pktmbuf_data_len(buff); > } else { > - buff_addr += vq->vhost_hlen; > - desc->len = packet_len; > + /* Copy virtio_hdr to packet and increment buffer address */ > + buff_hdr_addr = buff_addr; > + packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen; > + > + /* > + * If the descriptors are chained the header and data are placed in > + * separate buffers. > + */ > + if (desc->flags & VRING_DESC_F_NEXT) { > + desc->len = vq->vhost_hlen; > + desc = &vq->desc[desc->next]; > + /* Buffer address translation. */ > + buff_addr = gpa_to_vva(dev, desc->addr); > + desc->len = rte_pktmbuf_data_len(buff); > + } else { > + buff_addr += vq->vhost_hlen; > + desc->len = packet_len; > + } > } > > + VHOST_PRINT_PACKET(dev, (uintptr_t)buff_addr, rte_pktmbuf_data_len(buff), 0); > + > /* Update used ring with desc information */ > - vq->used->ring[res_cur_idx & (vq->size - 1)].id = > - head[packet_success]; > + vq->used->ring[res_cur_idx & (vq->size - 1)].id = head[packet_success]; > vq->used->ring[res_cur_idx & (vq->size - 1)].len = packet_len; > > /* Copy mbuf data to buffer */ > - /* FIXME for sg mbuf and the case that desc couldn't hold the mbuf data */ > - rte_memcpy((void *)(uintptr_t)buff_addr, > - rte_pktmbuf_mtod(buff, const void *), > - rte_pktmbuf_data_len(buff)); > - PRINT_PACKET(dev, (uintptr_t)buff_addr, > - rte_pktmbuf_data_len(buff), 0); > + /* TODO fixme for sg mbuf and the case that desc couldn't hold the mbuf data */ > + rte_memcpy((void *)(uintptr_t)buff_addr, (const void *)buff->pkt.data, rte_pktmbuf_data_len(buff)); > > res_cur_idx++; > packet_success++; > > - rte_memcpy((void *)(uintptr_t)buff_hdr_addr, > - (const void *)&virtio_hdr, vq->vhost_hlen); > - > - PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1); > - > + /* If mergeable is disabled then a header is required per buffer. 
*/ > + if (!mergeable) { > + rte_memcpy((void *)(uintptr_t)buff_hdr_addr, (const void *)&virtio_hdr, vq->vhost_hlen); > + VHOST_PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1); > + } else { > + mrg_count++; > + /* Merge buffer can only handle so many buffers at a time. Tell the guest if this limit is reached. */ > + if ((mrg_count == VHOST_MAX_MRG_PKT_BURST) || (res_cur_idx == res_end_idx)) { > + virtio_hdr.num_buffers = mrg_count; > + LOG_DEBUG(VHOST_DATA, "(%"PRIu64") RX: Num merge buffers %d\n", dev->device_fh, virtio_hdr.num_buffers); > + rte_memcpy((void *)(uintptr_t)buff_hdr_addr, (const void *)&virtio_hdr, vq->vhost_hlen); > + VHOST_PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1); > + mrg_count = 0; > + } > + } > if (res_cur_idx < res_end_idx) { > /* Prefetch descriptor index. */ > rte_prefetch0(&vq->desc[head[packet_success]]); > @@ -184,357 +191,18 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id, > return count; > } > > -static inline uint32_t __attribute__((always_inline)) > -copy_from_mbuf_to_vring(struct virtio_net *dev, uint16_t res_base_idx, > - uint16_t res_end_idx, struct rte_mbuf *pkt) > -{ > - uint32_t vec_idx = 0; > - uint32_t entry_success = 0; > - struct vhost_virtqueue *vq; > - /* The virtio_hdr is initialised to 0. 
*/ > - struct virtio_net_hdr_mrg_rxbuf virtio_hdr = { > - {0, 0, 0, 0, 0, 0}, 0}; > - uint16_t cur_idx = res_base_idx; > - uint64_t vb_addr = 0; > - uint64_t vb_hdr_addr = 0; > - uint32_t seg_offset = 0; > - uint32_t vb_offset = 0; > - uint32_t seg_avail; > - uint32_t vb_avail; > - uint32_t cpy_len, entry_len; > - > - if (pkt == NULL) > - return 0; > - > - LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| " > - "End Index %d\n", > - dev->device_fh, cur_idx, res_end_idx); > - > - /* > - * Convert from gpa to vva > - * (guest physical addr -> vhost virtual addr) > - */ > - vq = dev->virtqueue[VIRTIO_RXQ]; > - vb_addr = > - gpa_to_vva(dev, vq->buf_vec[vec_idx].buf_addr); > - vb_hdr_addr = vb_addr; > - > - /* Prefetch buffer address. */ > - rte_prefetch0((void *)(uintptr_t)vb_addr); > - > - virtio_hdr.num_buffers = res_end_idx - res_base_idx; > - > - LOG_DEBUG(VHOST_DATA, "(%"PRIu64") RX: Num merge buffers %d\n", > - dev->device_fh, virtio_hdr.num_buffers); > > - rte_memcpy((void *)(uintptr_t)vb_hdr_addr, > - (const void *)&virtio_hdr, vq->vhost_hlen); > - > - PRINT_PACKET(dev, (uintptr_t)vb_hdr_addr, vq->vhost_hlen, 1); > - > - seg_avail = rte_pktmbuf_data_len(pkt); > - vb_offset = vq->vhost_hlen; > - vb_avail = > - vq->buf_vec[vec_idx].buf_len - vq->vhost_hlen; > - > - entry_len = vq->vhost_hlen; > - > - if (vb_avail == 0) { > - uint32_t desc_idx = > - vq->buf_vec[vec_idx].desc_idx; > - vq->desc[desc_idx].len = vq->vhost_hlen; > - > - if ((vq->desc[desc_idx].flags > - & VRING_DESC_F_NEXT) == 0) { > - /* Update used ring with desc information */ > - vq->used->ring[cur_idx & (vq->size - 1)].id > - = vq->buf_vec[vec_idx].desc_idx; > - vq->used->ring[cur_idx & (vq->size - 1)].len > - = entry_len; > - > - entry_len = 0; > - cur_idx++; > - entry_success++; > - } > - > - vec_idx++; > - vb_addr = > - gpa_to_vva(dev, vq->buf_vec[vec_idx].buf_addr); > - > - /* Prefetch buffer address. 
*/ > - rte_prefetch0((void *)(uintptr_t)vb_addr); > - vb_offset = 0; > - vb_avail = vq->buf_vec[vec_idx].buf_len; > - } > - > - cpy_len = RTE_MIN(vb_avail, seg_avail); > - > - while (cpy_len > 0) { > - /* Copy mbuf data to vring buffer */ > - rte_memcpy((void *)(uintptr_t)(vb_addr + vb_offset), > - (const void *)(rte_pktmbuf_mtod(pkt, char*) + seg_offset), > - cpy_len); > - > - PRINT_PACKET(dev, > - (uintptr_t)(vb_addr + vb_offset), > - cpy_len, 0); > - > - seg_offset += cpy_len; > - vb_offset += cpy_len; > - seg_avail -= cpy_len; > - vb_avail -= cpy_len; > - entry_len += cpy_len; > - > - if (seg_avail != 0) { > - /* > - * The virtio buffer in this vring > - * entry reach to its end. > - * But the segment doesn't complete. > - */ > - if ((vq->desc[vq->buf_vec[vec_idx].desc_idx].flags & > - VRING_DESC_F_NEXT) == 0) { > - /* Update used ring with desc information */ > - vq->used->ring[cur_idx & (vq->size - 1)].id > - = vq->buf_vec[vec_idx].desc_idx; > - vq->used->ring[cur_idx & (vq->size - 1)].len > - = entry_len; > - entry_len = 0; > - cur_idx++; > - entry_success++; > - } > - > - vec_idx++; > - vb_addr = gpa_to_vva(dev, > - vq->buf_vec[vec_idx].buf_addr); > - vb_offset = 0; > - vb_avail = vq->buf_vec[vec_idx].buf_len; > - cpy_len = RTE_MIN(vb_avail, seg_avail); > - } else { > - /* > - * This current segment complete, need continue to > - * check if the whole packet complete or not. > - */ > - pkt = pkt->next; > - if (pkt != NULL) { > - /* > - * There are more segments. > - */ > - if (vb_avail == 0) { > - /* > - * This current buffer from vring is > - * used up, need fetch next buffer > - * from buf_vec. 
> - */ > - uint32_t desc_idx = > - vq->buf_vec[vec_idx].desc_idx; > - vq->desc[desc_idx].len = vb_offset; > - > - if ((vq->desc[desc_idx].flags & > - VRING_DESC_F_NEXT) == 0) { > - uint16_t wrapped_idx = > - cur_idx & (vq->size - 1); > - /* > - * Update used ring with the > - * descriptor information > - */ > - vq->used->ring[wrapped_idx].id > - = desc_idx; > - vq->used->ring[wrapped_idx].len > - = entry_len; > - entry_success++; > - entry_len = 0; > - cur_idx++; > - } > - > - /* Get next buffer from buf_vec. */ > - vec_idx++; > - vb_addr = gpa_to_vva(dev, > - vq->buf_vec[vec_idx].buf_addr); > - vb_avail = > - vq->buf_vec[vec_idx].buf_len; > - vb_offset = 0; > - } > - > - seg_offset = 0; > - seg_avail = rte_pktmbuf_data_len(pkt); > - cpy_len = RTE_MIN(vb_avail, seg_avail); > - } else { > - /* > - * This whole packet completes. > - */ > - uint32_t desc_idx = > - vq->buf_vec[vec_idx].desc_idx; > - vq->desc[desc_idx].len = vb_offset; > - > - while (vq->desc[desc_idx].flags & > - VRING_DESC_F_NEXT) { > - desc_idx = vq->desc[desc_idx].next; > - vq->desc[desc_idx].len = 0; > - } > - > - /* Update used ring with desc information */ > - vq->used->ring[cur_idx & (vq->size - 1)].id > - = vq->buf_vec[vec_idx].desc_idx; > - vq->used->ring[cur_idx & (vq->size - 1)].len > - = entry_len; > - entry_len = 0; > - cur_idx++; > - entry_success++; > - seg_avail = 0; > - cpy_len = RTE_MIN(vb_avail, seg_avail); > - } > - } > - } > - > - return entry_success; > -} > - > -/* > - * This function works for mergeable RX. 
> - */ > -static inline uint32_t __attribute__((always_inline)) > -virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id, > - struct rte_mbuf **pkts, uint32_t count) > +uint32_t > +rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id, struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint32_t count) > { > - struct vhost_virtqueue *vq; > - uint32_t pkt_idx = 0, entry_success = 0; > - uint16_t avail_idx, res_cur_idx; > - uint16_t res_base_idx, res_end_idx; > - uint8_t success = 0; > - > - LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_merge_rx()\n", > - dev->device_fh); > - if (unlikely(queue_id != VIRTIO_RXQ)) { > - LOG_DEBUG(VHOST_DATA, "mq isn't supported in this version.\n"); > - } > - > - vq = dev->virtqueue[VIRTIO_RXQ]; > - count = RTE_MIN((uint32_t)MAX_PKT_BURST, count); > - > - if (count == 0) > - return 0; > - > - for (pkt_idx = 0; pkt_idx < count; pkt_idx++) { > - uint32_t secure_len = 0; > - uint16_t need_cnt; > - uint32_t vec_idx = 0; > - uint32_t pkt_len = pkts[pkt_idx]->pkt_len + vq->vhost_hlen; > - uint16_t i, id; > - > - do { > - /* > - * As many data cores may want access to available > - * buffers, they need to be reserved. > - */ > - res_base_idx = vq->last_used_idx_res; > - res_cur_idx = res_base_idx; > - > - do { > - avail_idx = *((volatile uint16_t *)&vq->avail->idx); > - if (unlikely(res_cur_idx == avail_idx)) { > - LOG_DEBUG(VHOST_DATA, > - "(%"PRIu64") Failed " > - "to get enough desc from " > - "vring\n", > - dev->device_fh); > - return pkt_idx; > - } else { > - uint16_t wrapped_idx = > - (res_cur_idx) & (vq->size - 1); > - uint32_t idx = > - vq->avail->ring[wrapped_idx]; > - uint8_t next_desc; > - > - do { > - next_desc = 0; > - secure_len += vq->desc[idx].len; > - if (vq->desc[idx].flags & > - VRING_DESC_F_NEXT) { > - idx = vq->desc[idx].next; > - next_desc = 1; > - } > - } while (next_desc); > - > - res_cur_idx++; > - } > - } while (pkt_len > secure_len); > - > - /* vq->last_used_idx_res is atomically updated. 
*/ > - success = rte_atomic16_cmpset(&vq->last_used_idx_res, > - res_base_idx, > - res_cur_idx); > - } while (success == 0); > - > - id = res_base_idx; > - need_cnt = res_cur_idx - res_base_idx; > - > - for (i = 0; i < need_cnt; i++, id++) { > - uint16_t wrapped_idx = id & (vq->size - 1); > - uint32_t idx = vq->avail->ring[wrapped_idx]; > - uint8_t next_desc; > - do { > - next_desc = 0; > - vq->buf_vec[vec_idx].buf_addr = > - vq->desc[idx].addr; > - vq->buf_vec[vec_idx].buf_len = > - vq->desc[idx].len; > - vq->buf_vec[vec_idx].desc_idx = idx; > - vec_idx++; > - > - if (vq->desc[idx].flags & VRING_DESC_F_NEXT) { > - idx = vq->desc[idx].next; > - next_desc = 1; > - } > - } while (next_desc); > - } > - > - res_end_idx = res_cur_idx; > - > - entry_success = copy_from_mbuf_to_vring(dev, res_base_idx, > - res_end_idx, pkts[pkt_idx]); > - > - rte_compiler_barrier(); > - > - /* > - * Wait until it's our turn to add our buffer > - * to the used ring. > - */ > - while (unlikely(vq->last_used_idx != res_base_idx)) > - rte_pause(); > - > - *(volatile uint16_t *)&vq->used->idx += entry_success; > - vq->last_used_idx = res_end_idx; > - > - /* Kick the guest if necessary. 
*/ > - if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) > - eventfd_write((int)vq->kickfd, 1); > - } > - > - return count; > -} > - > -uint16_t > -rte_vhost_enqueue_burst(struct virtio_net *dev, uint16_t queue_id, > - struct rte_mbuf **pkts, uint16_t count) > -{ > - if (unlikely(dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF))) > - return virtio_dev_merge_rx(dev, queue_id, pkts, count); > - else > - return virtio_dev_rx(dev, queue_id, pkts, count); > -} > - > -uint16_t > -rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id, > - struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count) > -{ > - struct rte_mbuf *m, *prev; > + struct rte_mbuf *mbuf; > struct vhost_virtqueue *vq; > struct vring_desc *desc; > - uint64_t vb_addr = 0; > - uint32_t head[MAX_PKT_BURST]; > + uint64_t buff_addr = 0; > + uint32_t head[VHOST_MAX_PKT_BURST]; > uint32_t used_idx; > uint32_t i; > - uint16_t free_entries, entry_success = 0; > + uint16_t free_entries, packet_success = 0; > uint16_t avail_idx; > > if (unlikely(queue_id != VIRTIO_TXQ)) { > @@ -549,8 +217,8 @@ rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id, > if (vq->last_used_idx == avail_idx) > return 0; > > - LOG_DEBUG(VHOST_DATA, "%s (%"PRIu64")\n", __func__, > - dev->device_fh); > + LOG_DEBUG(VHOST_DATA, "(%"PRIu64") %s(%d->%d)\n", > + dev->device_fh, __func__, vq->last_used_idx, avail_idx); > > /* Prefetch available ring to retrieve head indexes. */ > rte_prefetch0(&vq->avail->ring[vq->last_used_idx & (vq->size - 1)]); > @@ -558,173 +226,68 @@ rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id, > /*get the number of free entries in the ring*/ > free_entries = (avail_idx - vq->last_used_idx); > > - free_entries = RTE_MIN(free_entries, count); > + if (free_entries > count) > + free_entries = count; > /* Limit to MAX_PKT_BURST. 
*/ > - free_entries = RTE_MIN(free_entries, MAX_PKT_BURST); > + if (free_entries > VHOST_MAX_PKT_BURST) > + free_entries = VHOST_MAX_PKT_BURST; > > - LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n", > - dev->device_fh, free_entries); > + LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n", dev->device_fh, free_entries); > /* Retrieve all of the head indexes first to avoid caching issues. */ > for (i = 0; i < free_entries; i++) > head[i] = vq->avail->ring[(vq->last_used_idx + i) & (vq->size - 1)]; > > /* Prefetch descriptor index. */ > - rte_prefetch0(&vq->desc[head[entry_success]]); > + rte_prefetch0(&vq->desc[head[packet_success]]); > rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]); > > - while (entry_success < free_entries) { > - uint32_t vb_avail, vb_offset; > - uint32_t seg_avail, seg_offset; > - uint32_t cpy_len; > - uint32_t seg_num = 0; > - struct rte_mbuf *cur; > - uint8_t alloc_err = 0; > - > - desc = &vq->desc[head[entry_success]]; > + while (packet_success < free_entries) { > + desc = &vq->desc[head[packet_success]]; > > /* Discard first buffer as it is the virtio header */ > desc = &vq->desc[desc->next]; > > /* Buffer address translation. */ > - vb_addr = gpa_to_vva(dev, desc->addr); > + buff_addr = gpa_to_vva(dev, desc->addr); > /* Prefetch buffer address. */ > - rte_prefetch0((void *)(uintptr_t)vb_addr); > + rte_prefetch0((void *)(uintptr_t)buff_addr); > > used_idx = vq->last_used_idx & (vq->size - 1); > > - if (entry_success < (free_entries - 1)) { > + if (packet_success < (free_entries - 1)) { > /* Prefetch descriptor index. */ > - rte_prefetch0(&vq->desc[head[entry_success+1]]); > + rte_prefetch0(&vq->desc[head[packet_success+1]]); > rte_prefetch0(&vq->used->ring[(used_idx + 1) & (vq->size - 1)]); > } > > /* Update used index buffer information. 
*/ > - vq->used->ring[used_idx].id = head[entry_success]; > + vq->used->ring[used_idx].id = head[packet_success]; > vq->used->ring[used_idx].len = 0; > > - vb_offset = 0; > - vb_avail = desc->len; > - /* Allocate an mbuf and populate the structure. */ > - m = rte_pktmbuf_alloc(mbuf_pool); > - if (unlikely(m == NULL)) { > - RTE_LOG(ERR, VHOST_DATA, > - "Failed to allocate memory for mbuf.\n"); > - return entry_success; > + mbuf = rte_pktmbuf_alloc(mbuf_pool); > + if (unlikely(mbuf == NULL)) { > + RTE_LOG(ERR, VHOST_DATA, "Failed to allocate memory for mbuf.\n"); > + return packet_success; > } > - seg_offset = 0; > - seg_avail = m->buf_len - RTE_PKTMBUF_HEADROOM; > - cpy_len = RTE_MIN(vb_avail, seg_avail); > - > - PRINT_PACKET(dev, (uintptr_t)vb_addr, desc->len, 0); > - > - seg_num++; > - cur = m; > - prev = m; > - while (cpy_len != 0) { > - rte_memcpy((void *)(rte_pktmbuf_mtod(cur, char *) + seg_offset), > - (void *)((uintptr_t)(vb_addr + vb_offset)), > - cpy_len); > - > - seg_offset += cpy_len; > - vb_offset += cpy_len; > - vb_avail -= cpy_len; > - seg_avail -= cpy_len; > - > - if (vb_avail != 0) { > - /* > - * The segment reachs to its end, > - * while the virtio buffer in TX vring has > - * more data to be copied. > - */ > - cur->data_len = seg_offset; > - m->pkt_len += seg_offset; > - /* Allocate mbuf and populate the structure. */ > - cur = rte_pktmbuf_alloc(mbuf_pool); > - if (unlikely(cur == NULL)) { > - RTE_LOG(ERR, VHOST_DATA, "Failed to " > - "allocate memory for mbuf.\n"); > - rte_pktmbuf_free(m); > - alloc_err = 1; > - break; > - } > - > - seg_num++; > - prev->next = cur; > - prev = cur; > - seg_offset = 0; > - seg_avail = cur->buf_len - RTE_PKTMBUF_HEADROOM; > - } else { > - if (desc->flags & VRING_DESC_F_NEXT) { > - /* > - * There are more virtio buffers in > - * same vring entry need to be copied. > - */ > - if (seg_avail == 0) { > - /* > - * The current segment hasn't > - * room to accomodate more > - * data. 
> - */ > - cur->data_len = seg_offset; > - m->pkt_len += seg_offset; > - /* > - * Allocate an mbuf and > - * populate the structure. > - */ > - cur = rte_pktmbuf_alloc(mbuf_pool); > - if (unlikely(cur == NULL)) { > - RTE_LOG(ERR, > - VHOST_DATA, > - "Failed to " > - "allocate memory " > - "for mbuf\n"); > - rte_pktmbuf_free(m); > - alloc_err = 1; > - break; > - } > - seg_num++; > - prev->next = cur; > - prev = cur; > - seg_offset = 0; > - seg_avail = cur->buf_len - RTE_PKTMBUF_HEADROOM; > - } > - > - desc = &vq->desc[desc->next]; > - > - /* Buffer address translation. */ > - vb_addr = gpa_to_vva(dev, desc->addr); > - /* Prefetch buffer address. */ > - rte_prefetch0((void *)(uintptr_t)vb_addr); > - vb_offset = 0; > - vb_avail = desc->len; > - > - PRINT_PACKET(dev, (uintptr_t)vb_addr, > - desc->len, 0); > - } else { > - /* The whole packet completes. */ > - cur->data_len = seg_offset; > - m->pkt_len += seg_offset; > - vb_avail = 0; > - } > - } > + mbuf->pkt.data_len = desc->len; > + mbuf->pkt.pkt_len = mbuf->pkt.data_len; > > - cpy_len = RTE_MIN(vb_avail, seg_avail); > - } > + rte_memcpy((void *) mbuf->pkt.data, > + (const void *) buff_addr, mbuf->pkt.data_len); > > - if (unlikely(alloc_err == 1)) > - break; > + pkts[packet_success] = mbuf; > > - m->nb_segs = seg_num; > + VHOST_PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0); > > - pkts[entry_success] = m; > vq->last_used_idx++; > - entry_success++; > + packet_success++; > } > > rte_compiler_barrier(); > - vq->used->idx += entry_success; > + vq->used->idx += packet_success; > /* Kick guest if required. 
*/ > if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) > eventfd_write((int)vq->kickfd, 1); > - return entry_success; > + > + return packet_success; > } > diff --git a/lib/librte_vhost/virtio-net.c b/lib/librte_vhost/virtio-net.c > index 852b6d1..516e743 100644 > --- a/lib/librte_vhost/virtio-net.c > +++ b/lib/librte_vhost/virtio-net.c > @@ -31,17 +31,14 @@ > * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. > */ > > -#include <dirent.h> > -#include <fuse/cuse_lowlevel.h> > #include <linux/vhost.h> > #include <linux/virtio_net.h> > #include <stddef.h> > #include <stdint.h> > #include <stdlib.h> > -#include <sys/eventfd.h> > -#include <sys/ioctl.h> > #include <sys/mman.h> > #include <unistd.h> > +#include <assert.h> > > #include <rte_ethdev.h> > #include <rte_log.h> > @@ -49,10 +46,8 @@ > #include <rte_memory.h> > #include <rte_virtio_net.h> > > -#include "vhost-net-cdev.h" > -#include "eventfd_link/eventfd_link.h" > - > -/* > +#include "vhost-net.h" > +/** > * Device linked list structure for configuration. > */ > struct virtio_net_config_ll { > @@ -60,38 +55,15 @@ struct virtio_net_config_ll { > struct virtio_net_config_ll *next; /* Next dev on linked list.*/ > }; > > -const char eventfd_cdev[] = "/dev/eventfd-link"; > - > -/* device ops to add/remove device to/from data core. */ > +/* device ops to add/remove device to data core. */ > static struct virtio_net_device_ops const *notify_ops; > -/* root address of the linked list of managed virtio devices */ > +/* root address of the linked list in the configuration core. */ > static struct virtio_net_config_ll *ll_root; > > /* Features supported by this lib. */ > -#define VHOST_SUPPORTED_FEATURES ((1ULL << VIRTIO_NET_F_MRG_RXBUF) | \ > - (1ULL << VIRTIO_NET_F_CTRL_RX)) > +#define VHOST_SUPPORTED_FEATURES (1ULL << VIRTIO_NET_F_MRG_RXBUF) > static uint64_t VHOST_FEATURES = VHOST_SUPPORTED_FEATURES; > > -/* Line size for reading maps file. 
*/ > -static const uint32_t BUFSIZE = PATH_MAX; > - > -/* Size of prot char array in procmap. */ > -#define PROT_SZ 5 > - > -/* Number of elements in procmap struct. */ > -#define PROCMAP_SZ 8 > - > -/* Structure containing information gathered from maps file. */ > -struct procmap { > - uint64_t va_start; /* Start virtual address in file. */ > - uint64_t len; /* Size of file. */ > - uint64_t pgoff; /* Not used. */ > - uint32_t maj; /* Not used. */ > - uint32_t min; /* Not used. */ > - uint32_t ino; /* Not used. */ > - char prot[PROT_SZ]; /* Not used. */ > - char fname[PATH_MAX]; /* File name. */ > -}; > > /* > * Converts QEMU virtual address to Vhost virtual address. This function is > @@ -110,199 +82,15 @@ qva_to_vva(struct virtio_net *dev, uint64_t qemu_va) > if ((qemu_va >= region->userspace_address) && > (qemu_va <= region->userspace_address + > region->memory_size)) { > - vhost_va = dev->mem->mapped_address + qemu_va - > - dev->mem->base_address; > + vhost_va = qemu_va + region->guest_phys_address + > + region->address_offset - > + region->userspace_address; > break; > } > } > return vhost_va; > } > > -/* > - * Locate the file containing QEMU's memory space and > - * map it to our address space. > - */ > -static int > -host_memory_map(struct virtio_net *dev, struct virtio_memory *mem, > - pid_t pid, uint64_t addr) > -{ > - struct dirent *dptr = NULL; > - struct procmap procmap; > - DIR *dp = NULL; > - int fd; > - int i; > - char memfile[PATH_MAX]; > - char mapfile[PATH_MAX]; > - char procdir[PATH_MAX]; > - char resolved_path[PATH_MAX]; > - char *path = NULL; > - FILE *fmap; > - void *map; > - uint8_t found = 0; > - char line[BUFSIZE]; > - char dlm[] = "- : "; > - char *str, *sp, *in[PROCMAP_SZ]; > - char *end = NULL; > - > - /* Path where mem files are located. */ > - snprintf(procdir, PATH_MAX, "/proc/%u/fd/", pid); > - /* Maps file used to locate mem file. 
*/ > - snprintf(mapfile, PATH_MAX, "/proc/%u/maps", pid); > - > - fmap = fopen(mapfile, "r"); > - if (fmap == NULL) { > - RTE_LOG(ERR, VHOST_CONFIG, > - "(%"PRIu64") Failed to open maps file for pid %d\n", > - dev->device_fh, pid); > - return -1; > - } > - > - /* Read through maps file until we find out base_address. */ > - while (fgets(line, BUFSIZE, fmap) != 0) { > - str = line; > - errno = 0; > - /* Split line into fields. */ > - for (i = 0; i < PROCMAP_SZ; i++) { > - in[i] = strtok_r(str, &dlm[i], &sp); > - if ((in[i] == NULL) || (errno != 0)) { > - fclose(fmap); > - return -1; > - } > - str = NULL; > - } > - > - /* Convert/Copy each field as needed. */ > - procmap.va_start = strtoull(in[0], &end, 16); > - if ((in[0] == '\0') || (end == NULL) || (*end != '\0') || > - (errno != 0)) { > - fclose(fmap); > - return -1; > - } > - > - procmap.len = strtoull(in[1], &end, 16); > - if ((in[1] == '\0') || (end == NULL) || (*end != '\0') || > - (errno != 0)) { > - fclose(fmap); > - return -1; > - } > - > - procmap.pgoff = strtoull(in[3], &end, 16); > - if ((in[3] == '\0') || (end == NULL) || (*end != '\0') || > - (errno != 0)) { > - fclose(fmap); > - return -1; > - } > - > - procmap.maj = strtoul(in[4], &end, 16); > - if ((in[4] == '\0') || (end == NULL) || (*end != '\0') || > - (errno != 0)) { > - fclose(fmap); > - return -1; > - } > - > - procmap.min = strtoul(in[5], &end, 16); > - if ((in[5] == '\0') || (end == NULL) || (*end != '\0') || > - (errno != 0)) { > - fclose(fmap); > - return -1; > - } > - > - procmap.ino = strtoul(in[6], &end, 16); > - if ((in[6] == '\0') || (end == NULL) || (*end != '\0') || > - (errno != 0)) { > - fclose(fmap); > - return -1; > - } > - > - memcpy(&procmap.prot, in[2], PROT_SZ); > - memcpy(&procmap.fname, in[7], PATH_MAX); > - > - if (procmap.va_start == addr) { > - procmap.len = procmap.len - procmap.va_start; > - found = 1; > - break; > - } > - } > - fclose(fmap); > - > - if (!found) { > - RTE_LOG(ERR, VHOST_CONFIG, > - "(%"PRIu64") 
Failed to find memory file in pid %d maps file\n", > - dev->device_fh, pid); > - return -1; > - } > - > - /* Find the guest memory file among the process fds. */ > - dp = opendir(procdir); > - if (dp == NULL) { > - RTE_LOG(ERR, VHOST_CONFIG, > - "(%"PRIu64") Cannot open pid %d process directory\n", > - dev->device_fh, pid); > - return -1; > - } > - > - found = 0; > - > - /* Read the fd directory contents. */ > - while (NULL != (dptr = readdir(dp))) { > - snprintf(memfile, PATH_MAX, "/proc/%u/fd/%s", > - pid, dptr->d_name); > - path = realpath(memfile, resolved_path); > - if ((path == NULL) && (strlen(resolved_path) == 0)) { > - RTE_LOG(ERR, VHOST_CONFIG, > - "(%"PRIu64") Failed to resolve fd directory\n", > - dev->device_fh); > - closedir(dp); > - return -1; > - } > - if (strncmp(resolved_path, procmap.fname, > - strnlen(procmap.fname, PATH_MAX)) == 0) { > - found = 1; > - break; > - } > - } > - > - closedir(dp); > - > - if (found == 0) { > - RTE_LOG(ERR, VHOST_CONFIG, > - "(%"PRIu64") Failed to find memory file for pid %d\n", > - dev->device_fh, pid); > - return -1; > - } > - /* Open the shared memory file and map the memory into this process. 
*/ > - fd = open(memfile, O_RDWR); > - > - if (fd == -1) { > - RTE_LOG(ERR, VHOST_CONFIG, > - "(%"PRIu64") Failed to open %s for pid %d\n", > - dev->device_fh, memfile, pid); > - return -1; > - } > - > - map = mmap(0, (size_t)procmap.len, PROT_READ|PROT_WRITE, > - MAP_POPULATE|MAP_SHARED, fd, 0); > - close(fd); > - > - if (map == MAP_FAILED) { > - RTE_LOG(ERR, VHOST_CONFIG, > - "(%"PRIu64") Error mapping the file %s for pid %d\n", > - dev->device_fh, memfile, pid); > - return -1; > - } > - > - /* Store the memory address and size in the device data structure */ > - mem->mapped_address = (uint64_t)(uintptr_t)map; > - mem->mapped_size = procmap.len; > - > - LOG_DEBUG(VHOST_CONFIG, > - "(%"PRIu64") Mem File: %s->%s - Size: %llu - VA: %p\n", > - dev->device_fh, > - memfile, resolved_path, > - (unsigned long long)mem->mapped_size, map); > - > - return 0; > -} > > /* > * Retrieves an entry from the devices configuration linked list. > @@ -376,7 +164,7 @@ add_config_ll_entry(struct virtio_net_config_ll *new_ll_dev) > } > > } > - > +/*TODO dpdk alloc/free if possible */ > /* > * Unmap any memory, close any file descriptors and > * free any memory owned by a device. > @@ -389,16 +177,17 @@ cleanup_device(struct virtio_net *dev) > munmap((void *)(uintptr_t)dev->mem->mapped_address, > (size_t)dev->mem->mapped_size); > free(dev->mem); > + dev->mem = NULL; > } > > /* Close any event notifiers opened by device. 
*/ > - if (dev->virtqueue[VIRTIO_RXQ]->callfd) > + if (dev->virtqueue[VIRTIO_RXQ]->callfd > 0) > close((int)dev->virtqueue[VIRTIO_RXQ]->callfd); > - if (dev->virtqueue[VIRTIO_RXQ]->kickfd) > + if (dev->virtqueue[VIRTIO_RXQ]->kickfd > 0) > close((int)dev->virtqueue[VIRTIO_RXQ]->kickfd); > - if (dev->virtqueue[VIRTIO_TXQ]->callfd) > + if (dev->virtqueue[VIRTIO_TXQ]->callfd > 0) > close((int)dev->virtqueue[VIRTIO_TXQ]->callfd); > - if (dev->virtqueue[VIRTIO_TXQ]->kickfd) > + if (dev->virtqueue[VIRTIO_TXQ]->kickfd > 0) > close((int)dev->virtqueue[VIRTIO_TXQ]->kickfd); > } > > @@ -522,8 +311,8 @@ new_device(struct vhost_device_ctx ctx) > } > > /* > - * Function is called from the CUSE release function. This function will > - * cleanup the device and remove it from device configuration linked list. > + * Function is called from the CUSE release function. This function will cleanup > + * the device and remove it from device configuration linked list. > */ > static void > destroy_device(struct vhost_device_ctx ctx) > @@ -569,6 +358,7 @@ set_owner(struct vhost_device_ctx ctx) > return -1; > > return 0; > + /* TODO check ctx.fh is meaningfull here */ > } > > /* > @@ -651,14 +441,12 @@ set_features(struct vhost_device_ctx ctx, uint64_t *pu) > * This includes storing offsets used to translate buffer addresses. 
> */ > static int > -set_mem_table(struct vhost_device_ctx ctx, const void *mem_regions_addr, > - uint32_t nregions) > +set_mem_table(struct vhost_device_ctx ctx, > + const struct virtio_memory_regions *regions, uint32_t nregions) > { > struct virtio_net *dev; > - struct vhost_memory_region *mem_regions; > struct virtio_memory *mem; > - uint64_t size = offsetof(struct vhost_memory, regions); > - uint32_t regionidx, valid_regions; > + uint32_t regionidx; > > dev = get_device(ctx); > if (dev == NULL) > @@ -682,107 +470,24 @@ set_mem_table(struct vhost_device_ctx ctx, const void *mem_regions_addr, > > mem->nregions = nregions; > > - mem_regions = (void *)(uintptr_t) > - ((uint64_t)(uintptr_t)mem_regions_addr + size); > - > for (regionidx = 0; regionidx < mem->nregions; regionidx++) { > /* Populate the region structure for each region. */ > - mem->regions[regionidx].guest_phys_address = > - mem_regions[regionidx].guest_phys_addr; > - mem->regions[regionidx].guest_phys_address_end = > - mem->regions[regionidx].guest_phys_address + > - mem_regions[regionidx].memory_size; > - mem->regions[regionidx].memory_size = > - mem_regions[regionidx].memory_size; > - mem->regions[regionidx].userspace_address = > - mem_regions[regionidx].userspace_addr; > - > - LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") REGION: %u - GPA: %p - QEMU VA: %p - SIZE (%"PRIu64")\n", dev->device_fh, > - regionidx, > - (void *)(uintptr_t)mem->regions[regionidx].guest_phys_address, > - (void *)(uintptr_t)mem->regions[regionidx].userspace_address, > - mem->regions[regionidx].memory_size); > - > - /*set the base address mapping*/ > + mem->regions[regionidx] = regions[regionidx]; > if (mem->regions[regionidx].guest_phys_address == 0x0) { > mem->base_address = > mem->regions[regionidx].userspace_address; > - /* Map VM memory file */ > - if (host_memory_map(dev, mem, ctx.pid, > - mem->base_address) != 0) { > - free(mem); > - return -1; > - } > + mem->mapped_address = > + mem->regions[regionidx].address_offset; > } > } > 
> - /* Check that we have a valid base address. */ > - if (mem->base_address == 0) { > - RTE_LOG(ERR, VHOST_CONFIG, "(%"PRIu64") Failed to find base address of qemu memory file.\n", dev->device_fh); > - free(mem); > - return -1; > - } > - > - /* > - * Check if all of our regions have valid mappings. > - * Usually one does not exist in the QEMU memory file. > - */ > - valid_regions = mem->nregions; > - for (regionidx = 0; regionidx < mem->nregions; regionidx++) { > - if ((mem->regions[regionidx].userspace_address < > - mem->base_address) || > - (mem->regions[regionidx].userspace_address > > - (mem->base_address + mem->mapped_size))) > - valid_regions--; > - } > - > - /* > - * If a region does not have a valid mapping, > - * we rebuild our memory struct to contain only valid entries. > - */ > - if (valid_regions != mem->nregions) { > - LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") Not all memory regions exist in the QEMU mem file. Re-populating mem structure\n", > - dev->device_fh); > - > - /* > - * Re-populate the memory structure with only valid regions. > - * Invalid regions are over-written with memmove. > - */ > - valid_regions = 0; > - > - for (regionidx = mem->nregions; 0 != regionidx--;) { > - if ((mem->regions[regionidx].userspace_address < > - mem->base_address) || > - (mem->regions[regionidx].userspace_address > > - (mem->base_address + mem->mapped_size))) { > - memmove(&mem->regions[regionidx], > - &mem->regions[regionidx + 1], > - sizeof(struct virtio_memory_regions) * > - valid_regions); > - } else { > - valid_regions++; > - } > - } > - } > - mem->nregions = valid_regions; > + /*TODO addback the logic that remove invalid memory regions */ > dev->mem = mem; > > - /* > - * Calculate the address offset for each region. > - * This offset is used to identify the vhost virtual address > - * corresponding to a QEMU guest physical address. 
> - */ > - for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) { > - dev->mem->regions[regionidx].address_offset = > - dev->mem->regions[regionidx].userspace_address - > - dev->mem->base_address + > - dev->mem->mapped_address - > - dev->mem->regions[regionidx].guest_phys_address; > - > - } > return 0; > } > > + > /* > * Called from CUSE IOCTL: VHOST_SET_VRING_NUM > * The virtio device sends us the size of the descriptor ring. > @@ -896,38 +601,62 @@ get_vring_base(struct vhost_device_ctx ctx, uint32_t index, > /* State->index refers to the queue index. The txq is 1, rxq is 0. */ > state->num = dev->virtqueue[state->index]->last_used_idx; > > - return 0; > -} > + if (dev->flags & VIRTIO_DEV_RUNNING) { > + RTE_LOG(INFO, VHOST_CONFIG, > + "get_vring_base message is for release\n"); > + notify_ops->destroy_device(dev); > + /* > + * sync call. > + * when it returns, it means it si removed from data core. > + */ > + } > + /* TODO fix all munmap */ > + if (dev->mem) { > + munmap((void *)(uintptr_t)dev->mem->mapped_address, > + (size_t)dev->mem->mapped_size); > + free(dev->mem); > + dev->mem = NULL; > + } > > -/* > - * This function uses the eventfd_link kernel module to copy an eventfd file > - * descriptor provided by QEMU in to our process space. > - */ > -static int > -eventfd_copy(struct virtio_net *dev, struct eventfd_copy *eventfd_copy) > -{ > - int eventfd_link, ret; > > - /* Open the character device to the kernel module. 
*/ > - eventfd_link = open(eventfd_cdev, O_RDWR); > - if (eventfd_link < 0) { > - RTE_LOG(ERR, VHOST_CONFIG, > - "(%"PRIu64") eventfd_link module is not loaded\n", > - dev->device_fh); > - return -1; > - } > + if (dev->virtqueue[VIRTIO_RXQ]->callfd > 0) > + close((int)dev->virtqueue[VIRTIO_RXQ]->callfd); > + dev->virtqueue[VIRTIO_RXQ]->callfd = -1; > + if (dev->virtqueue[VIRTIO_TXQ]->callfd > 0) > + close((int)dev->virtqueue[VIRTIO_TXQ]->callfd); > + dev->virtqueue[VIRTIO_TXQ]->callfd = -1; > + /* We don't cleanup callfd here as we willn't get CALLFD again */ > + > + dev->virtqueue[VIRTIO_RXQ]->desc = NULL; > + dev->virtqueue[VIRTIO_RXQ]->avail = NULL; > + dev->virtqueue[VIRTIO_RXQ]->used = NULL; > + dev->virtqueue[VIRTIO_RXQ]->last_used_idx = 0; > + dev->virtqueue[VIRTIO_RXQ]->last_used_idx_res = 0; > + > + dev->virtqueue[VIRTIO_TXQ]->desc = NULL; > + dev->virtqueue[VIRTIO_TXQ]->avail = NULL; > + dev->virtqueue[VIRTIO_TXQ]->used = NULL; > + dev->virtqueue[VIRTIO_TXQ]->last_used_idx = 0; > + dev->virtqueue[VIRTIO_TXQ]->last_used_idx_res = 0; > > - /* Call the IOCTL to copy the eventfd. 
*/ > - ret = ioctl(eventfd_link, EVENTFD_COPY, eventfd_copy); > - close(eventfd_link); > > - if (ret < 0) { > - RTE_LOG(ERR, VHOST_CONFIG, > - "(%"PRIu64") EVENTFD_COPY ioctl failed\n", > - dev->device_fh); > - return -1; > - } > + return 0; > +} > > +static int > +virtio_is_ready(struct virtio_net *dev, int index) > +{ > + struct vhost_virtqueue *vq1, *vq2; > + /* mq support in future.*/ > + vq1 = dev->virtqueue[index]; > + vq2 = dev->virtqueue[index ^ 1]; > + if (vq1 && vq2 && vq1->desc && vq2->desc && > + (vq1->kickfd > 0) && (vq1->callfd > 0) && > + (vq2->kickfd > 0) && (vq2->callfd > 0)) { > + LOG_DEBUG(VHOST_CONFIG, "virtio is ready for processing.\n"); > + return 1; > + } > + LOG_DEBUG(VHOST_CONFIG, "virtio isn't ready for processing.\n"); > return 0; > } > > @@ -940,7 +669,6 @@ static int > set_vring_call(struct vhost_device_ctx ctx, struct vhost_vring_file *file) > { > struct virtio_net *dev; > - struct eventfd_copy eventfd_kick; > struct vhost_virtqueue *vq; > > dev = get_device(ctx); > @@ -953,14 +681,7 @@ set_vring_call(struct vhost_device_ctx ctx, struct vhost_vring_file *file) > if (vq->kickfd) > close((int)vq->kickfd); > > - /* Populate the eventfd_copy structure and call eventfd_copy. */ > - vq->kickfd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC); > - eventfd_kick.source_fd = vq->kickfd; > - eventfd_kick.target_fd = file->fd; > - eventfd_kick.target_pid = ctx.pid; > - > - if (eventfd_copy(dev, &eventfd_kick)) > - return -1; > + vq->kickfd = file->fd; > > return 0; > } > @@ -974,7 +695,6 @@ static int > set_vring_kick(struct vhost_device_ctx ctx, struct vhost_vring_file *file) > { > struct virtio_net *dev; > - struct eventfd_copy eventfd_call; > struct vhost_virtqueue *vq; > > dev = get_device(ctx); > @@ -986,16 +706,11 @@ set_vring_kick(struct vhost_device_ctx ctx, struct vhost_vring_file *file) > > if (vq->callfd) > close((int)vq->callfd); > + vq->callfd = file->fd; > > - /* Populate the eventfd_copy structure and call eventfd_copy. 
*/ > - vq->callfd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC); > - eventfd_call.source_fd = vq->callfd; > - eventfd_call.target_fd = file->fd; > - eventfd_call.target_pid = ctx.pid; > - > - if (eventfd_copy(dev, &eventfd_call)) > - return -1; > - > + if (virtio_is_ready(dev, file->index) && > + !(dev->flags & VIRTIO_DEV_RUNNING)) > + notify_ops->new_device(dev); > return 0; > } > > @@ -1024,6 +739,7 @@ set_backend(struct vhost_device_ctx ctx, struct vhost_vring_file *file) > * If the device isn't already running and both backend fds are set, > * we add the device. > */ > + LOG_DEBUG(VHOST_CONFIG, "%s %d\n", __func__, file->fd); > if (!(dev->flags & VIRTIO_DEV_RUNNING)) { > if (((int)dev->virtqueue[VIRTIO_TXQ]->backend != VIRTIO_DEV_STOPPED) && > ((int)dev->virtqueue[VIRTIO_RXQ]->backend != VIRTIO_DEV_STOPPED)) ^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [dpdk-dev] [PATCH RFC] lib/librte_vhost: vhost-user 2014-11-17 6:04 ` Tetsuya Mukawa @ 2014-11-17 6:11 ` Tetsuya Mukawa 0 siblings, 0 replies; 6+ messages in thread From: Tetsuya Mukawa @ 2014-11-17 6:11 UTC (permalink / raw) To: Huawei Xie, dev Hi Xie, (2014/11/17 15:04), Tetsuya Mukawa wrote: > Hi Xie, > > > (2014/11/15 10:14), Huawei Xie wrote: >> implement socket server >> fd event dispatch mechanism >> vhost sock message handling >> memory map for each region >> VHOST_USER_SET_VRING_KICK_FD as the indicator that vring is available >> VHOST_USER_GET_VRING_BASE as the message that vring should be released >> >> The message flow between vhost-user and vhost-cuse is kindof different, >> which makes virtio-net common message handler layer difficult and complicated to handle >> both cases in new_device/destroy_device/memory map/resource cleanup. >> >> Will only leave the most common messag handling in virtio-net, and move the >> control logic to cuse/fuse layer. >> >> >> Signed-off-by: Huawei Xie <huawei.xie@intel.com> > Great patch! > I guess we can start from this patch to implement vhost-user and > abstraction layer. > > I've checked patch. > > 1. White space, tab and indent patch. > I will send patch that clears white space, tab and indent. Could you > please check it? > It might be difficult to see the difference, if your editor doesn't show > a space or tab. > > 2. Some files are based on old codes. > At least, following patch is not included. > - vhost: fix build without unused result > Also vhost_rxtx.c isn't probably based on latest code. > > 3. Device abstraction layer code > I will send the device abstraction layer code after this email. > Anyway, I guess we need to decide whether, or not we still keep > vhost-cuse code Additionally, the above patches are based on your RFC patch. Tetsuya > > 4. Multiple devices operation. 
> For example, when thread1 opens vhost-user device1 and thread2 opens > vhost-user device2, > each thread may want to register own callbacks. > Current implementation may not allow this. > I guess we need to eliminate global variables in librte_vhost as much as > possible. > > Thanks, > Tetsuya > >> --- >> lib/librte_vhost/Makefile | 14 +- >> lib/librte_vhost/eventfd_link/eventfd_link.c | 27 +- >> lib/librte_vhost/eventfd_link/eventfd_link.h | 48 +- >> lib/librte_vhost/libvirt/qemu-wrap.py | 367 --------------- >> lib/librte_vhost/rte_virtio_net.h | 106 ++--- >> lib/librte_vhost/vhost-cuse/vhost-net-cdev.c | 436 ++++++++++++++++++ >> lib/librte_vhost/vhost-cuse/virtio-net-cdev.c | 314 +++++++++++++ >> lib/librte_vhost/vhost-cuse/virtio-net-cdev.h | 43 ++ >> lib/librte_vhost/vhost-net-cdev.c | 389 ---------------- >> lib/librte_vhost/vhost-net-cdev.h | 113 ----- >> lib/librte_vhost/vhost-user/fd_man.c | 158 +++++++ >> lib/librte_vhost/vhost-user/fd_man.h | 31 ++ >> lib/librte_vhost/vhost-user/vhost-net-user.c | 417 +++++++++++++++++ >> lib/librte_vhost/vhost-user/vhost-net-user.h | 74 +++ >> lib/librte_vhost/vhost-user/virtio-net-user.c | 208 +++++++++ >> lib/librte_vhost/vhost-user/virtio-net-user.h | 11 + >> lib/librte_vhost/vhost_rxtx.c | 625 ++++---------------------- >> lib/librte_vhost/virtio-net.c | 450 ++++--------------- >> 18 files changed, 1939 insertions(+), 1892 deletions(-) >> delete mode 100755 lib/librte_vhost/libvirt/qemu-wrap.py >> create mode 100644 lib/librte_vhost/vhost-cuse/vhost-net-cdev.c >> create mode 100644 lib/librte_vhost/vhost-cuse/virtio-net-cdev.c >> create mode 100644 lib/librte_vhost/vhost-cuse/virtio-net-cdev.h >> delete mode 100644 lib/librte_vhost/vhost-net-cdev.c >> delete mode 100644 lib/librte_vhost/vhost-net-cdev.h >> create mode 100644 lib/librte_vhost/vhost-user/fd_man.c >> create mode 100644 lib/librte_vhost/vhost-user/fd_man.h >> create mode 100644 lib/librte_vhost/vhost-user/vhost-net-user.c >> create mode 100644 
lib/librte_vhost/vhost-user/vhost-net-user.h >> create mode 100644 lib/librte_vhost/vhost-user/virtio-net-user.c >> create mode 100644 lib/librte_vhost/vhost-user/virtio-net-user.h >> >> diff --git a/lib/librte_vhost/Makefile b/lib/librte_vhost/Makefile >> index c008d64..cb4e172 100644 >> --- a/lib/librte_vhost/Makefile >> +++ b/lib/librte_vhost/Makefile >> @@ -34,17 +34,19 @@ include $(RTE_SDK)/mk/rte.vars.mk >> # library name >> LIB = librte_vhost.a >> >> -CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -O3 -D_FILE_OFFSET_BITS=64 -lfuse >> +CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -I. -I vhost-user -I vhost-cuse -O3 -D_FILE_OFFSET_BITS=64 -lfuse >> LDFLAGS += -lfuse >> # all source are stored in SRCS-y >> -SRCS-$(CONFIG_RTE_LIBRTE_VHOST) := vhost-net-cdev.c virtio-net.c vhost_rxtx.c >> +#SRCS-$(CONFIG_RTE_LIBRTE_VHOST) := vhost-cuse/vhost-net-cdev.c vhost-cuse/virtio-net-cdev.c >> + >> +SRCS-$(CONFIG_RTE_LIBRTE_VHOST) := vhost-user/fd_man.c vhost-user/vhost-net-user.c vhost-user/virtio-net-user.c >> + >> +SRCS-$(CONFIG_RTE_LIBRTE_VHOST) += virtio-net.c vhost_rxtx.c >> >> # install includes >> SYMLINK-$(CONFIG_RTE_LIBRTE_VHOST)-include += rte_virtio_net.h >> >> -# dependencies >> -DEPDIRS-$(CONFIG_RTE_LIBRTE_VHOST) += lib/librte_eal >> -DEPDIRS-$(CONFIG_RTE_LIBRTE_VHOST) += lib/librte_ether >> -DEPDIRS-$(CONFIG_RTE_LIBRTE_VHOST) += lib/librte_mbuf >> +# this lib needs eal >> +DEPDIRS-$(CONFIG_RTE_LIBRTE_VHOST) += lib/librte_eal lib/librte_mbuf >> >> include $(RTE_SDK)/mk/rte.lib.mk >> diff --git a/lib/librte_vhost/eventfd_link/eventfd_link.c b/lib/librte_vhost/eventfd_link/eventfd_link.c >> index 7755dd6..4c9b628 100644 >> --- a/lib/librte_vhost/eventfd_link/eventfd_link.c >> +++ b/lib/librte_vhost/eventfd_link/eventfd_link.c >> @@ -13,8 +13,7 @@ >> * General Public License for more details. 
>> * >> * You should have received a copy of the GNU General Public License >> - * along with this program; if not, write to the Free Software >> - * Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. >> + * along with this program; If not, see <http://www.gnu.org/licenses/>. >> * The full GNU General Public License is included in this distribution >> * in the file called LICENSE.GPL. >> * >> @@ -78,8 +77,7 @@ eventfd_link_ioctl(struct file *f, unsigned int ioctl, unsigned long arg) >> >> switch (ioctl) { >> case EVENTFD_COPY: >> - if (copy_from_user(&eventfd_copy, argp, >> - sizeof(struct eventfd_copy))) >> + if (copy_from_user(&eventfd_copy, argp, sizeof(struct eventfd_copy))) >> return -EFAULT; >> >> /* >> @@ -88,28 +86,28 @@ eventfd_link_ioctl(struct file *f, unsigned int ioctl, unsigned long arg) >> task_target = >> pid_task(find_vpid(eventfd_copy.target_pid), PIDTYPE_PID); >> if (task_target == NULL) { >> - pr_debug("Failed to get mem ctx for target pid\n"); >> + printk(KERN_DEBUG "Failed to get mem ctx for target pid\n"); >> return -EFAULT; >> } >> >> files = get_files_struct(current); >> if (files == NULL) { >> - pr_debug("Failed to get files struct\n"); >> + printk(KERN_DEBUG "Failed to get files struct\n"); >> return -EFAULT; >> } >> >> rcu_read_lock(); >> file = fcheck_files(files, eventfd_copy.source_fd); >> if (file) { >> - if (file->f_mode & FMODE_PATH || >> - !atomic_long_inc_not_zero(&file->f_count)) >> + if (file->f_mode & FMODE_PATH >> + || !atomic_long_inc_not_zero(&file->f_count)) >> file = NULL; >> } >> rcu_read_unlock(); >> put_files_struct(files); >> >> if (file == NULL) { >> - pr_debug("Failed to get file from source pid\n"); >> + printk(KERN_DEBUG "Failed to get file from source pid\n"); >> return 0; >> } >> >> @@ -128,25 +126,26 @@ eventfd_link_ioctl(struct file *f, unsigned int ioctl, unsigned long arg) >> >> files = get_files_struct(task_target); >> if (files == NULL) { >> - pr_debug("Failed to get files 
struct\n"); >> + printk(KERN_DEBUG "Failed to get files struct\n"); >> return -EFAULT; >> } >> >> rcu_read_lock(); >> file = fcheck_files(files, eventfd_copy.target_fd); >> if (file) { >> - if (file->f_mode & FMODE_PATH || >> - !atomic_long_inc_not_zero(&file->f_count)) >> - file = NULL; >> + if (file->f_mode & FMODE_PATH >> + || !atomic_long_inc_not_zero(&file->f_count)) >> + file = NULL; >> } >> rcu_read_unlock(); >> put_files_struct(files); >> >> if (file == NULL) { >> - pr_debug("Failed to get file from target pid\n"); >> + printk(KERN_DEBUG "Failed to get file from target pid\n"); >> return 0; >> } >> >> + >> /* >> * Install the file struct from the target process into the >> * file desciptor of the source process, >> diff --git a/lib/librte_vhost/eventfd_link/eventfd_link.h b/lib/librte_vhost/eventfd_link/eventfd_link.h >> index ea619ec..38052e2 100644 >> --- a/lib/librte_vhost/eventfd_link/eventfd_link.h >> +++ b/lib/librte_vhost/eventfd_link/eventfd_link.h >> @@ -1,7 +1,4 @@ >> /*- >> - * This file is provided under a dual BSD/GPLv2 license. When using or >> - * redistributing this file, you may do so under either license. >> - * >> * GPL LICENSE SUMMARY >> * >> * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. >> @@ -16,61 +13,28 @@ >> * General Public License for more details. >> * >> * You should have received a copy of the GNU General Public License >> - * along with this program; if not, write to the Free Software >> - * Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. >> + * along with this program; If not, see <http://www.gnu.org/licenses/>. >> * The full GNU General Public License is included in this distribution >> * in the file called LICENSE.GPL. >> * >> * Contact Information: >> * Intel Corporation >> - * >> - * BSD LICENSE >> - * >> - * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. >> - * All rights reserved. 
>> - * >> - * Redistribution and use in source and binary forms, with or without >> - * modification, are permitted provided that the following conditions >> - * are met: >> - * >> - * Redistributions of source code must retain the above copyright >> - * notice, this list of conditions and the following disclaimer. >> - * Redistributions in binary form must reproduce the above copyright >> - * notice, this list of conditions and the following disclaimer in >> - * the documentation and/or other materials provided with the >> - * distribution. >> - * Neither the name of Intel Corporation nor the names of its >> - * contributors may be used to endorse or promote products derived >> - * from this software without specific prior written permission. >> - * >> - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS >> - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT >> - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR >> - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT >> - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, >> - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT >> - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, >> - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY >> - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT >> - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE >> - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
>> - * >> */ >> >> #ifndef _EVENTFD_LINK_H_ >> #define _EVENTFD_LINK_H_ >> >> /* >> - * ioctl to copy an fd entry in calling process to an fd in a target process >> + * ioctl to copy an fd entry in calling process to an fd in a target process >> */ >> #define EVENTFD_COPY 1 >> >> /* >> - * arguements for the EVENTFD_COPY ioctl >> + * arguements for the EVENTFD_COPY ioctl >> */ >> struct eventfd_copy { >> - unsigned target_fd; /* fd in the target pid */ >> - unsigned source_fd; /* fd in the calling pid */ >> - pid_t target_pid; /* pid of the target pid */ >> + unsigned target_fd; /**< fd in the target pid */ >> + unsigned source_fd; /**< fd in the calling pid */ >> + pid_t target_pid; /**< pid of the target pid */ >> }; >> #endif /* _EVENTFD_LINK_H_ */ >> diff --git a/lib/librte_vhost/libvirt/qemu-wrap.py b/lib/librte_vhost/libvirt/qemu-wrap.py >> deleted file mode 100755 >> index e2d68a0..0000000 >> --- a/lib/librte_vhost/libvirt/qemu-wrap.py >> +++ /dev/null >> @@ -1,367 +0,0 @@ >> -#!/usr/bin/python >> -#/* >> -# * BSD LICENSE >> -# * >> -# * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. >> -# * All rights reserved. >> -# * >> -# * Redistribution and use in source and binary forms, with or without >> -# * modification, are permitted provided that the following conditions >> -# * are met: >> -# * >> -# * * Redistributions of source code must retain the above copyright >> -# * notice, this list of conditions and the following disclaimer. >> -# * * Redistributions in binary form must reproduce the above copyright >> -# * notice, this list of conditions and the following disclaimer in >> -# * the documentation and/or other materials provided with the >> -# * distribution. >> -# * * Neither the name of Intel Corporation nor the names of its >> -# * contributors may be used to endorse or promote products derived >> -# * from this software without specific prior written permission. 
>> -# * >> -# * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS >> -# * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT >> -# * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR >> -# * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT >> -# * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, >> -# * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT >> -# * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, >> -# * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY >> -# * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT >> -# * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE >> -# * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. >> -# */ >> - >> -##################################################################### >> -# This script is designed to modify the call to the QEMU emulator >> -# to support userspace vhost when starting a guest machine through >> -# libvirt with vhost enabled. The steps to enable this are as follows >> -# and should be run as root: >> -# >> -# 1. Place this script in a libvirtd's binary search PATH ($PATH) >> -# A good location would be in the same directory that the QEMU >> -# binary is located >> -# >> -# 2. Ensure that the script has the same owner/group and file >> -# permissions as the QEMU binary >> -# >> -# 3. Update the VM xml file using "virsh edit VM.xml" >> -# >> -# 3.a) Set the VM to use the launch script >> -# >> -# Set the emulator path contained in the >> -# <emulator><emulator/> tags >> -# >> -# e.g replace <emulator>/usr/bin/qemu-kvm<emulator/> >> -# with <emulator>/usr/bin/qemu-wrap.py<emulator/> >> -# >> -# 3.b) Set the VM's device's to use vhost-net offload >> -# >> -# <interface type="network"> >> -# <model type="virtio"/> >> -# <driver name="vhost"/> >> -# <interface/> >> -# >> -# 4. 
Enable libvirt to access our userpace device file by adding it to >> -# controllers cgroup for libvirtd using the following steps >> -# >> -# 4.a) In /etc/libvirt/qemu.conf add/edit the following lines: >> -# 1) cgroup_controllers = [ ... "devices", ... ] >> -# 2) clear_emulator_capabilities = 0 >> -# 3) user = "root" >> -# 4) group = "root" >> -# 5) cgroup_device_acl = [ >> -# "/dev/null", "/dev/full", "/dev/zero", >> -# "/dev/random", "/dev/urandom", >> -# "/dev/ptmx", "/dev/kvm", "/dev/kqemu", >> -# "/dev/rtc", "/dev/hpet", "/dev/net/tun", >> -# "/dev/<devbase-name>-<index>", >> -# ] >> -# >> -# 4.b) Disable SELinux or set to permissive mode >> -# >> -# 4.c) Mount cgroup device controller >> -# "mkdir /dev/cgroup" >> -# "mount -t cgroup none /dev/cgroup -o devices" >> -# >> -# 4.d) Set hugetlbfs_mount variable - ( Optional ) >> -# VMs using userspace vhost must use hugepage backed >> -# memory. This can be enabled in the libvirt XML >> -# config by adding a memory backing section to the >> -# XML config e.g. >> -# <memoryBacking> >> -# <hugepages/> >> -# </memoryBacking> >> -# This memory backing section should be added after the >> -# <memory> and <currentMemory> sections. This will add >> -# flags "-mem-prealloc -mem-path <path>" to the QEMU >> -# command line. The hugetlbfs_mount variable can be used >> -# to override the default <path> passed through by libvirt. >> -# >> -# if "-mem-prealloc" or "-mem-path <path>" are not passed >> -# through and a vhost device is detected then these options will >> -# be automatically added by this script. This script will detect >> -# the system hugetlbfs mount point to be used for <path>. The >> -# default <path> for this script can be overidden by the >> -# hugetlbfs_dir variable in the configuration section of this script. >> -# >> -# >> -# 4.e) Restart the libvirtd system process >> -# e.g. 
on Fedora "systemctl restart libvirtd.service" >> -# >> -# >> -# 4.f) Edit the Configuration Parameters section of this script >> -# to point to the correct emulator location and set any >> -# addition options >> -# >> -# The script modifies the libvirtd Qemu call by modifying/adding >> -# options based on the configuration parameters below. >> -# NOTE: >> -# emul_path and us_vhost_path must be set >> -# All other parameters are optional >> -##################################################################### >> - >> - >> -############################################# >> -# Configuration Parameters >> -############################################# >> -#Path to QEMU binary >> -emul_path = "/usr/local/bin/qemu-system-x86_64" >> - >> -#Path to userspace vhost device file >> -# This filename should match the --dev-basename --dev-index parameters of >> -# the command used to launch the userspace vhost sample application e.g. >> -# if the sample app lauch command is: >> -# ./build/vhost-switch ..... --dev-basename usvhost --dev-index 1 >> -# then this variable should be set to: >> -# us_vhost_path = "/dev/usvhost-1" >> -us_vhost_path = "/dev/usvhost-1" >> - >> -#List of additional user defined emulation options. These options will >> -#be added to all Qemu calls >> -emul_opts_user = [] >> - >> -#List of additional user defined emulation options for vhost only. 
>> -#These options will only be added to vhost enabled guests >> -emul_opts_user_vhost = [] >> - >> -#For all VHOST enabled VMs, the VM memory is preallocated from hugetlbfs >> -# Set this variable to one to enable this option for all VMs >> -use_huge_all = 0 >> - >> -#Instead of autodetecting, override the hugetlbfs directory by setting >> -#this variable >> -hugetlbfs_dir = "" >> - >> -############################################# >> - >> - >> -############################################# >> -# ****** Do Not Modify Below this Line ****** >> -############################################# >> - >> -import sys, os, subprocess >> - >> - >> -#List of open userspace vhost file descriptors >> -fd_list = [] >> - >> -#additional virtio device flags when using userspace vhost >> -vhost_flags = [ "csum=off", >> - "gso=off", >> - "guest_tso4=off", >> - "guest_tso6=off", >> - "guest_ecn=off" >> - ] >> - >> - >> -############################################# >> -# Find the system hugefile mount point. 
>> -# Note: >> -# if multiple hugetlbfs mount points exist >> -# then the first one found will be used >> -############################################# >> -def find_huge_mount(): >> - >> - if (len(hugetlbfs_dir)): >> - return hugetlbfs_dir >> - >> - huge_mount = "" >> - >> - if (os.access("/proc/mounts", os.F_OK)): >> - f = open("/proc/mounts", "r") >> - line = f.readline() >> - while line: >> - line_split = line.split(" ") >> - if line_split[2] == 'hugetlbfs': >> - huge_mount = line_split[1] >> - break >> - line = f.readline() >> - else: >> - print "/proc/mounts not found" >> - exit (1) >> - >> - f.close >> - if len(huge_mount) == 0: >> - print "Failed to find hugetlbfs mount point" >> - exit (1) >> - >> - return huge_mount >> - >> - >> -############################################# >> -# Get a userspace Vhost file descriptor >> -############################################# >> -def get_vhost_fd(): >> - >> - if (os.access(us_vhost_path, os.F_OK)): >> - fd = os.open( us_vhost_path, os.O_RDWR) >> - else: >> - print ("US-Vhost file %s not found" %us_vhost_path) >> - exit (1) >> - >> - return fd >> - >> - >> -############################################# >> -# Check for vhostfd. 
if found then replace >> -# with our own vhost fd and append any vhost >> -# flags onto the end >> -############################################# >> -def modify_netdev_arg(arg): >> - >> - global fd_list >> - vhost_in_use = 0 >> - s = '' >> - new_opts = [] >> - netdev_opts = arg.split(",") >> - >> - for opt in netdev_opts: >> - #check if vhost is used >> - if "vhost" == opt[:5]: >> - vhost_in_use = 1 >> - else: >> - new_opts.append(opt) >> - >> - #if using vhost append vhost options >> - if vhost_in_use == 1: >> - #append vhost on option >> - new_opts.append('vhost=on') >> - #append vhostfd ption >> - new_fd = get_vhost_fd() >> - new_opts.append('vhostfd=' + str(new_fd)) >> - fd_list.append(new_fd) >> - >> - #concatenate all options >> - for opt in new_opts: >> - if len(s) > 0: >> - s+=',' >> - >> - s+=opt >> - >> - return s >> - >> - >> -############################################# >> -# Main >> -############################################# >> -def main(): >> - >> - global fd_list >> - global vhost_in_use >> - new_args = [] >> - num_cmd_args = len(sys.argv) >> - emul_call = '' >> - mem_prealloc_set = 0 >> - mem_path_set = 0 >> - num = 0; >> - >> - #parse the parameters >> - while (num < num_cmd_args): >> - arg = sys.argv[num] >> - >> - #Check netdev +1 parameter for vhostfd >> - if arg == '-netdev': >> - num_vhost_devs = len(fd_list) >> - new_args.append(arg) >> - >> - num+=1 >> - arg = sys.argv[num] >> - mod_arg = modify_netdev_arg(arg) >> - new_args.append(mod_arg) >> - >> - #append vhost flags if this is a vhost device >> - # and -device is the next arg >> - # i.e -device -opt1,-opt2,...,-opt3,%vhost >> - if (num_vhost_devs < len(fd_list)): >> - num+=1 >> - arg = sys.argv[num] >> - if arg == '-device': >> - new_args.append(arg) >> - num+=1 >> - new_arg = sys.argv[num] >> - for flag in vhost_flags: >> - new_arg = ''.join([new_arg,',',flag]) >> - new_args.append(new_arg) >> - else: >> - new_args.append(arg) >> - elif arg == '-mem-prealloc': >> - mem_prealloc_set 
= 1 >> - new_args.append(arg) >> - elif arg == '-mem-path': >> - mem_path_set = 1 >> - new_args.append(arg) >> - >> - else: >> - new_args.append(arg) >> - >> - num+=1 >> - >> - #Set Qemu binary location >> - emul_call+=emul_path >> - emul_call+=" " >> - >> - #Add prealloc mem options if using vhost and not already added >> - if ((len(fd_list) > 0) and (mem_prealloc_set == 0)): >> - emul_call += "-mem-prealloc " >> - >> - #Add mempath mem options if using vhost and not already added >> - if ((len(fd_list) > 0) and (mem_path_set == 0)): >> - #Detect and add hugetlbfs mount point >> - mp = find_huge_mount() >> - mp = "".join(["-mem-path ", mp]) >> - emul_call += mp >> - emul_call += " " >> - >> - >> - #add user options >> - for opt in emul_opts_user: >> - emul_call += opt >> - emul_call += " " >> - >> - #Add add user vhost only options >> - if len(fd_list) > 0: >> - for opt in emul_opts_user_vhost: >> - emul_call += opt >> - emul_call += " " >> - >> - #Add updated libvirt options >> - iter_args = iter(new_args) >> - #skip 1st arg i.e. 
call to this script >> - next(iter_args) >> - for arg in iter_args: >> - emul_call+=str(arg) >> - emul_call+= " " >> - >> - #Call QEMU >> - subprocess.call(emul_call, shell=True) >> - >> - >> - #Close usvhost files >> - for fd in fd_list: >> - os.close(fd) >> - >> - >> -if __name__ == "__main__": >> - main() >> - >> diff --git a/lib/librte_vhost/rte_virtio_net.h b/lib/librte_vhost/rte_virtio_net.h >> index 00b1328..7a05dab 100644 >> --- a/lib/librte_vhost/rte_virtio_net.h >> +++ b/lib/librte_vhost/rte_virtio_net.h >> @@ -34,11 +34,6 @@ >> #ifndef _VIRTIO_NET_H_ >> #define _VIRTIO_NET_H_ >> >> -/** >> - * @file >> - * Interface to vhost net >> - */ >> - >> #include <stdint.h> >> #include <linux/virtio_ring.h> >> #include <linux/virtio_net.h> >> @@ -48,66 +43,38 @@ >> #include <rte_mempool.h> >> #include <rte_mbuf.h> >> >> -/* Used to indicate that the device is running on a data core */ >> -#define VIRTIO_DEV_RUNNING 1 >> - >> -/* Backend value set by guest. */ >> -#define VIRTIO_DEV_STOPPED -1 >> - >> +#define VIRTIO_DEV_RUNNING 1 /**< Used to indicate that the device is running on a data core. */ >> +#define VIRTIO_DEV_STOPPED -1 /**< Backend value set by guest. */ >> >> /* Enum for virtqueue management. */ >> enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM}; >> >> -#define BUF_VECTOR_MAX 256 >> - >> -/** >> - * Structure contains buffer address, length and descriptor index >> - * from vring to do scatter RX. >> - */ >> -struct buf_vector { >> - uint64_t buf_addr; >> - uint32_t buf_len; >> - uint32_t desc_idx; >> -}; >> - >> /** >> * Structure contains variables relevant to RX/TX virtqueues. >> */ >> struct vhost_virtqueue { >> - struct vring_desc *desc; /**< Virtqueue descriptor ring. */ >> - struct vring_avail *avail; /**< Virtqueue available ring. */ >> - struct vring_used *used; /**< Virtqueue used ring. */ >> - uint32_t size; /**< Size of descriptor ring. */ >> - uint32_t backend; /**< Backend value to determine if device should started/stopped. 
*/ >> - uint16_t vhost_hlen; /**< Vhost header length (varies depending on RX merge buffers. */ >> - volatile uint16_t last_used_idx; /**< Last index used on the available ring */ >> - volatile uint16_t last_used_idx_res; /**< Used for multiple devices reserving buffers. */ >> - eventfd_t callfd; /**< Currently unused as polling mode is enabled. */ >> - eventfd_t kickfd; /**< Used to notify the guest (trigger interrupt). */ >> - struct buf_vector buf_vec[BUF_VECTOR_MAX]; /**< for scatter RX. */ >> -} __rte_cache_aligned; >> - >> -/** >> - * Device structure contains all configuration information relating to the device. >> - */ >> -struct virtio_net { >> - struct vhost_virtqueue *virtqueue[VIRTIO_QNUM]; /**< Contains all virtqueue information. */ >> - struct virtio_memory *mem; /**< QEMU memory and memory region information. */ >> - uint64_t features; /**< Negotiated feature set. */ >> - uint64_t device_fh; /**< device identifier. */ >> - uint32_t flags; /**< Device flags. Only used to check if device is running on data core. */ >> - void *priv; /**< private context */ >> + struct vring_desc *desc; /**< descriptor ring. */ >> + struct vring_avail *avail; /**< available ring. */ >> + struct vring_used *used; /**< used ring. */ >> + uint32_t size; /**< Size of descriptor ring. */ >> + uint32_t backend; /**< Backend value to determine if device should be started/stopped. */ >> + uint16_t vhost_hlen; /**< Vhost header length (varies depending on RX merge buffers. */ >> + volatile uint16_t last_used_idx; /**< Last index used on the available ring. */ >> + volatile uint16_t last_used_idx_res; /**< Used for multiple devices reserving buffers. */ >> + eventfd_t callfd; /**< Currently unused as polling mode is enabled. */ >> + eventfd_t kickfd; /**< Used to notify the guest (trigger interrupt). */ >> } __rte_cache_aligned; >> >> /** >> - * Information relating to memory regions including offsets to addresses in QEMUs memory file. 
>> + * Information relating to memory regions including offsets to >> + * addresses in QEMUs memory file. >> */ >> struct virtio_memory_regions { >> - uint64_t guest_phys_address; /**< Base guest physical address of region. */ >> - uint64_t guest_phys_address_end; /**< End guest physical address of region. */ >> - uint64_t memory_size; /**< Size of region. */ >> - uint64_t userspace_address; /**< Base userspace address of region. */ >> - uint64_t address_offset; /**< Offset of region for address translation. */ >> + uint64_t guest_phys_address; /**< Base guest physical address of region. */ >> + uint64_t guest_phys_address_end; /**< End guest physical address of region. */ >> + uint64_t memory_size; /**< Size of region. */ >> + uint64_t userspace_address; /**< Base userspace address of region. */ >> + uint64_t address_offset; /**< Offset of region for address translation. */ >> }; >> >> >> @@ -115,21 +82,34 @@ struct virtio_memory_regions { >> * Memory structure includes region and mapping information. >> */ >> struct virtio_memory { >> - uint64_t base_address; /**< Base QEMU userspace address of the memory file. */ >> - uint64_t mapped_address; /**< Mapped address of memory file base in our applications memory space. */ >> - uint64_t mapped_size; /**< Total size of memory file. */ >> - uint32_t nregions; /**< Number of memory regions. */ >> + uint64_t base_address; /**< Base QEMU userspace address of the memory file. */ >> + uint64_t mapped_address; /**< Mapped address of memory file base in our applications memory space. */ >> + uint64_t mapped_size; /**< Total size of memory file. */ >> + uint32_t nregions; /**< Number of memory regions. */ >> struct virtio_memory_regions regions[0]; /**< Memory region information. */ >> }; >> >> /** >> + * Device structure contains all configuration information relating to the device. >> + */ >> +struct virtio_net { >> + struct vhost_virtqueue *virtqueue[VIRTIO_QNUM]; /**< Contains all virtqueue information. 
*/ >> + struct virtio_memory *mem; /**< QEMU memory and memory region information. */ >> + uint64_t features; /**< Negotiated feature set. */ >> + uint64_t device_fh; /**< Device identifier. */ >> + uint32_t flags; /**< Device flags. Only used to check if device is running on data core. */ >> + void *priv; >> +} __rte_cache_aligned; >> + >> +/** >> * Device operations to add/remove device. >> */ >> struct virtio_net_device_ops { >> - int (*new_device)(struct virtio_net *); /**< Add device. */ >> - void (*destroy_device)(volatile struct virtio_net *); /**< Remove device. */ >> + int (*new_device)(struct virtio_net *); /**< Add device. */ >> + void (*destroy_device)(struct virtio_net *); /**< Remove device. */ >> }; >> >> + >> static inline uint16_t __attribute__((always_inline)) >> rte_vring_available_entries(struct virtio_net *dev, uint16_t queue_id) >> { >> @@ -179,7 +159,7 @@ int rte_vhost_driver_register(const char *dev_name); >> >> /* Register callbacks. */ >> int rte_vhost_driver_callback_register(struct virtio_net_device_ops const * const); >> -/* Start vhost driver session blocking loop. 
*/ >> + >> int rte_vhost_driver_session_start(void); >> >> /** >> @@ -192,8 +172,8 @@ int rte_vhost_driver_session_start(void); >> * @return >> * num of packets enqueued >> */ >> -uint16_t rte_vhost_enqueue_burst(struct virtio_net *dev, uint16_t queue_id, >> - struct rte_mbuf **pkts, uint16_t count); >> +uint32_t rte_vhost_enqueue_burst(struct virtio_net *dev, uint16_t queue_id, >> + struct rte_mbuf **pkts, uint32_t count); >> >> /** >> * This function gets guest buffers from the virtio device TX virtqueue, >> @@ -206,7 +186,7 @@ uint16_t rte_vhost_enqueue_burst(struct virtio_net *dev, uint16_t queue_id, >> * @return >> * num of packets dequeued >> */ >> -uint16_t rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id, >> - struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count); >> +uint32_t rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id, >> + struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint32_t count); >> >> #endif /* _VIRTIO_NET_H_ */ >> diff --git a/lib/librte_vhost/vhost-cuse/vhost-net-cdev.c b/lib/librte_vhost/vhost-cuse/vhost-net-cdev.c >> new file mode 100644 >> index 0000000..4671643 >> --- /dev/null >> +++ b/lib/librte_vhost/vhost-cuse/vhost-net-cdev.c >> @@ -0,0 +1,436 @@ >> +/*- >> + * BSD LICENSE >> + * >> + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. >> + * All rights reserved. >> + * >> + * Redistribution and use in source and binary forms, with or without >> + * modification, are permitted provided that the following conditions >> + * are met: >> + * >> + * * Redistributions of source code must retain the above copyright >> + * notice, this list of conditions and the following disclaimer. >> + * * Redistributions in binary form must reproduce the above copyright >> + * notice, this list of conditions and the following disclaimer in >> + * the documentation and/or other materials provided with the >> + * distribution. 
>> + * * Neither the name of Intel Corporation nor the names of its >> + * contributors may be used to endorse or promote products derived >> + * from this software without specific prior written permission. >> + * >> + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS >> + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT >> + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR >> + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT >> + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, >> + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT >> + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, >> + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY >> + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT >> + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE >> + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
>> + */ >> + >> +#include <stdint.h> >> +#include <fuse/cuse_lowlevel.h> >> +#include <linux/limits.h> >> +#include <linux/vhost.h> >> +#include <linux/virtio_net.h> >> +#include <string.h> >> +#include <unistd.h> >> +#include <sys/ioctl.h> >> + >> +#include <rte_ethdev.h> >> +#include <rte_log.h> >> +#include <rte_string_fns.h> >> +#include <rte_virtio_net.h> >> + >> +#include "virtio-net-cdev.h" >> +#include "vhost-net.h" >> +#include "eventfd_link/eventfd_link.h" >> + >> +#define FUSE_OPT_DUMMY "\0\0" >> +#define FUSE_OPT_FORE "-f\0\0" >> +#define FUSE_OPT_NOMULTI "-s\0\0" >> + >> +static const uint32_t default_major = 231; >> +static const uint32_t default_minor = 1; >> +static const char cuse_device_name[] = "/dev/cuse"; >> +static const char default_cdev[] = "vhost-net"; >> +static const char eventfd_cdev[] = "/dev/eventfd-link"; >> + >> +static struct fuse_session *session; >> +const struct vhost_net_device_ops const *ops; >> + >> +/* >> + * Returns vhost_device_ctx from given fuse_req_t. The index is populated later >> + * when the device is added to the device linked list. >> + */ >> +static struct vhost_device_ctx >> +fuse_req_to_vhost_ctx(fuse_req_t req, struct fuse_file_info *fi) >> +{ >> + struct vhost_device_ctx ctx; >> + struct fuse_ctx const *const req_ctx = fuse_req_ctx(req); >> + >> + ctx.pid = req_ctx->pid; >> + ctx.fh = fi->fh; >> + >> + return ctx; >> +} >> + >> +/* >> + * When the device is created in QEMU it gets initialised here and >> + * added to the device linked list. 
>> + */ >> +static void >> +vhost_net_open(fuse_req_t req, struct fuse_file_info *fi) >> +{ >> + struct vhost_device_ctx ctx = fuse_req_to_vhost_ctx(req, fi); >> + int err = 0; >> + >> + err = ops->new_device(ctx); >> + if (err == -1) { >> + fuse_reply_err(req, EPERM); >> + return; >> + } >> + >> + fi->fh = err; >> + >> + RTE_LOG(INFO, VHOST_CONFIG, >> + "(%"PRIu64") Device configuration started\n", fi->fh); >> + fuse_reply_open(req, fi); >> +} >> + >> +/* >> + * When QEMU is shutdown or killed the device gets released. >> + */ >> +static void >> +vhost_net_release(fuse_req_t req, struct fuse_file_info *fi) >> +{ >> + int err = 0; >> + struct vhost_device_ctx ctx = fuse_req_to_vhost_ctx(req, fi); >> + >> + ops->destroy_device(ctx); >> + RTE_LOG(INFO, VHOST_CONFIG, "(%"PRIu64") Device released\n", ctx.fh); >> + fuse_reply_err(req, err); >> +} >> + >> +/* >> + * Boilerplate code for CUSE IOCTL >> + * Implicit arguments: ctx, req, result. >> + */ >> +#define VHOST_IOCTL(func) do { \ >> + result = (func)(ctx); \ >> + fuse_reply_ioctl(req, result, NULL, 0); \ >> +} while (0) >> + >> +/* >> + * Boilerplate IOCTL RETRY >> + * Implicit arguments: req. >> + */ >> +#define VHOST_IOCTL_RETRY(size_r, size_w) do { \ >> + struct iovec iov_r = { arg, (size_r) }; \ >> + struct iovec iov_w = { arg, (size_w) }; \ >> + fuse_reply_ioctl_retry(req, &iov_r, \ >> + (size_r) ? 1 : 0, &iov_w, (size_w) ? 1 : 0);\ >> +} while (0) >> + >> +/* >> + * Boilerplate code for CUSE Read IOCTL >> + * Implicit arguments: ctx, req, result, in_bufsz, in_buf. >> + */ >> +#define VHOST_IOCTL_R(type, var, func) do { \ >> + if (!in_bufsz) { \ >> + VHOST_IOCTL_RETRY(sizeof(type), 0);\ >> + } else { \ >> + (var) = *(const type*)in_buf; \ >> + result = func(ctx, &(var)); \ >> + fuse_reply_ioctl(req, result, NULL, 0);\ >> + } \ >> +} while (0) >> + >> +/* >> + * Boilerplate code for CUSE Write IOCTL >> + * Implicit arguments: ctx, req, result, out_bufsz. 
>> + */ >> +#define VHOST_IOCTL_W(type, var, func) do { \ >> + if (!out_bufsz) { \ >> + VHOST_IOCTL_RETRY(0, sizeof(type));\ >> + } else { \ >> + result = (func)(ctx, &(var));\ >> + fuse_reply_ioctl(req, result, &(var), sizeof(type));\ >> + } \ >> +} while (0) >> + >> +/* >> + * Boilerplate code for CUSE Read/Write IOCTL >> + * Implicit arguments: ctx, req, result, in_bufsz, in_buf. >> + */ >> +#define VHOST_IOCTL_RW(type1, var1, type2, var2, func) do { \ >> + if (!in_bufsz) { \ >> + VHOST_IOCTL_RETRY(sizeof(type1), sizeof(type2));\ >> + } else { \ >> + (var1) = *(const type1*) (in_buf); \ >> + result = (func)(ctx, (var1), &(var2)); \ >> + fuse_reply_ioctl(req, result, &(var2), sizeof(type2));\ >> + } \ >> +} while (0) >> + >> +/* >> + * This function uses the eventfd_link kernel module to copy an eventfd file >> + * descriptor provided by QEMU in to our process space. >> + */ >> +static int >> +eventfd_copy(int target_fd, int target_pid) >> +{ >> + int eventfd_link, ret; >> + struct eventfd_copy eventfd_copy; >> + int fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC); >> + >> + if (fd == -1) >> + return -1; >> + >> + /* Open the character device to the kernel module. */ >> + /* TODO: check this earlier rather than fail until VM boots! */ >> + eventfd_link = open(eventfd_cdev, O_RDWR); >> + if (eventfd_link < 0) { >> + RTE_LOG(ERR, VHOST_CONFIG, >> + "eventfd_link module is not loaded\n"); >> + return -1; >> + } >> + >> + eventfd_copy.source_fd = fd; >> + eventfd_copy.target_fd = target_fd; >> + eventfd_copy.target_pid = target_pid; >> + /* Call the IOCTL to copy the eventfd. */ >> + ret = ioctl(eventfd_link, EVENTFD_COPY, &eventfd_copy); >> + close(eventfd_link); >> + >> + if (ret < 0) { >> + RTE_LOG(ERR, VHOST_CONFIG, >> + "EVENTFD_COPY ioctl failed\n"); >> + return -1; >> + } >> + >> + return fd; >> +} >> + >> +/* >> + * The IOCTLs are handled using CUSE/FUSE in userspace. Depending on >> + * the type of IOCTL a buffer is requested to read or to write. 
This >> + * request is handled by FUSE and the buffer is then given to CUSE. >> + */ >> +static void >> +vhost_net_ioctl(fuse_req_t req, int cmd, void *arg, >> + struct fuse_file_info *fi, __rte_unused unsigned flags, >> + const void *in_buf, size_t in_bufsz, size_t out_bufsz) >> +{ >> + struct vhost_device_ctx ctx = fuse_req_to_vhost_ctx(req, fi); >> + struct vhost_vring_file file; >> + struct vhost_vring_state state; >> + struct vhost_vring_addr addr; >> + uint64_t features; >> + uint32_t index; >> + int result = 0; >> + >> + switch (cmd) { >> + case VHOST_NET_SET_BACKEND: >> + LOG_DEBUG(VHOST_CONFIG, >> + "(%"PRIu64") IOCTL: VHOST_NET_SET_BACKEND\n", ctx.fh); >> + VHOST_IOCTL_R(struct vhost_vring_file, file, ops->set_backend); >> + break; >> + >> + case VHOST_GET_FEATURES: >> + LOG_DEBUG(VHOST_CONFIG, >> + "(%"PRIu64") IOCTL: VHOST_GET_FEATURES\n", ctx.fh); >> + VHOST_IOCTL_W(uint64_t, features, ops->get_features); >> + break; >> + >> + case VHOST_SET_FEATURES: >> + LOG_DEBUG(VHOST_CONFIG, >> + "(%"PRIu64") IOCTL: VHOST_SET_FEATURES\n", ctx.fh); >> + VHOST_IOCTL_R(uint64_t, features, ops->set_features); >> + break; >> + >> + case VHOST_RESET_OWNER: >> + LOG_DEBUG(VHOST_CONFIG, >> + "(%"PRIu64") IOCTL: VHOST_RESET_OWNER\n", ctx.fh); >> + VHOST_IOCTL(ops->reset_owner); >> + break; >> + >> + case VHOST_SET_OWNER: >> + LOG_DEBUG(VHOST_CONFIG, >> + "(%"PRIu64") IOCTL: VHOST_SET_OWNER\n", ctx.fh); >> + VHOST_IOCTL(ops->set_owner); >> + break; >> + >> + case VHOST_SET_MEM_TABLE: >> + /*TODO fix race condition.*/ >> + LOG_DEBUG(VHOST_CONFIG, >> + "(%"PRIu64") IOCTL: VHOST_SET_MEM_TABLE\n", ctx.fh); >> + static struct vhost_memory mem_temp; >> + switch (in_bufsz) { >> + case 0: >> + VHOST_IOCTL_RETRY(sizeof(struct vhost_memory), 0); >> + break; >> + >> + case sizeof(struct vhost_memory): >> + mem_temp = *(const struct vhost_memory *) in_buf; >> + >> + if (mem_temp.nregions > 0) { >> + VHOST_IOCTL_RETRY(sizeof(struct vhost_memory) + >> + (sizeof(struct 
vhost_memory_region) * >> + mem_temp.nregions), 0); >> + } else { >> + result = -1; >> + fuse_reply_ioctl(req, result, NULL, 0); >> + } >> + break; >> + >> + default: >> + result = cuse_set_mem_table(ctx, in_buf, >> + mem_temp.nregions); >> + if (result) >> + fuse_reply_err(req, EINVAL); >> + else >> + fuse_reply_ioctl(req, result, NULL, 0); >> + } >> + break; >> + >> + case VHOST_SET_VRING_NUM: >> + LOG_DEBUG(VHOST_CONFIG, >> + "(%"PRIu64") IOCTL: VHOST_SET_VRING_NUM\n", ctx.fh); >> + VHOST_IOCTL_R(struct vhost_vring_state, state, ops->set_vring_num); >> + break; >> + >> + case VHOST_SET_VRING_BASE: >> + LOG_DEBUG(VHOST_CONFIG, >> + "(%"PRIu64") IOCTL: VHOST_SET_VRING_BASE\n", ctx.fh); >> + VHOST_IOCTL_R(struct vhost_vring_state, state, ops->set_vring_base); >> + break; >> + >> + case VHOST_GET_VRING_BASE: >> + LOG_DEBUG(VHOST_CONFIG, >> + "(%"PRIu64") IOCTL: VHOST_GET_VRING_BASE\n", ctx.fh); >> + VHOST_IOCTL_RW(uint32_t, index, >> + struct vhost_vring_state, state, ops->get_vring_base); >> + break; >> + >> + case VHOST_SET_VRING_ADDR: >> + LOG_DEBUG(VHOST_CONFIG, >> + "(%"PRIu64") IOCTL: VHOST_SET_VRING_ADDR\n", ctx.fh); >> + VHOST_IOCTL_R(struct vhost_vring_addr, addr, ops->set_vring_addr); >> + break; >> + >> + case VHOST_SET_VRING_KICK: >> + case VHOST_SET_VRING_CALL: >> + if (!in_buf) { >> + VHOST_IOCTL_RETRY(sizeof(struct vhost_vring_file), 0); >> + } else { >> + int fd; >> + file = *(const struct vhost_vring_file *)in_buf; >> + LOG_DEBUG(VHOST_CONFIG, >> + "kick/call idx:%d fd:%d\n", file.index, file.fd); >> + if ((fd = eventfd_copy(file.fd, ctx.pid)) < 0){ >> + fuse_reply_ioctl(req, -1, NULL, 0); >> + } >> + file.fd = fd; >> + if (cmd == VHOST_SET_VRING_KICK) { >> + VHOST_IOCTL_R(struct vhost_vring_file, file, ops->set_vring_call); >> + } >> + else { >> + VHOST_IOCTL_R(struct vhost_vring_file, file, ops->set_vring_kick); >> + } >> + } >> + break; >> + >> + default: >> + RTE_LOG(ERR, VHOST_CONFIG, >> + "(%"PRIu64") IOCTL: DOESN NOT EXIST\n", ctx.fh); >> + 
result = -1; >> + fuse_reply_ioctl(req, result, NULL, 0); >> + } >> + >> + if (result < 0) >> + LOG_DEBUG(VHOST_CONFIG, >> + "(%"PRIu64") IOCTL: FAIL\n", ctx.fh); >> + else >> + LOG_DEBUG(VHOST_CONFIG, >> + "(%"PRIu64") IOCTL: SUCCESS\n", ctx.fh); >> +} >> + >> +/* >> + * Structure handling open, release and ioctl function pointers is populated. >> + */ >> +static const struct cuse_lowlevel_ops vhost_net_ops = { >> + .open = vhost_net_open, >> + .release = vhost_net_release, >> + .ioctl = vhost_net_ioctl, >> +}; >> + >> +/* >> + * cuse_info is populated and used to register the cuse device. >> + * vhost_net_device_ops are also passed when the device is registered in app. >> + */ >> +int >> +rte_vhost_driver_register(const char *dev_name) >> +{ >> + struct cuse_info cuse_info; >> + char device_name[PATH_MAX] = ""; >> + char char_device_name[PATH_MAX] = ""; >> + const char *device_argv[] = { device_name }; >> + >> + char fuse_opt_dummy[] = FUSE_OPT_DUMMY; >> + char fuse_opt_fore[] = FUSE_OPT_FORE; >> + char fuse_opt_nomulti[] = FUSE_OPT_NOMULTI; >> + char *fuse_argv[] = {fuse_opt_dummy, fuse_opt_fore, fuse_opt_nomulti}; >> + >> + if (access(cuse_device_name, R_OK | W_OK) < 0) { >> + RTE_LOG(ERR, VHOST_CONFIG, >> + "char device %s can't be accessed, maybe not exist\n", >> + cuse_device_name); >> + return -1; >> + } >> + >> + /* >> + * The device name is created. This is passed to QEMU so that it can >> + * register the device with our application. >> + */ >> + snprintf(device_name, PATH_MAX, "DEVNAME=%s", dev_name); >> + snprintf(char_device_name, PATH_MAX, "/dev/%s", dev_name); >> + >> + /* Check if device already exists. 
*/ >> + if (access(char_device_name, F_OK) != -1) { >> + RTE_LOG(ERR, VHOST_CONFIG, >> + "char device %s already exists\n", char_device_name); >> + return -1; >> + } >> + >> + memset(&cuse_info, 0, sizeof(cuse_info)); >> + cuse_info.dev_major = default_major; >> + cuse_info.dev_minor = default_minor; >> + cuse_info.dev_info_argc = 1; >> + cuse_info.dev_info_argv = device_argv; >> + cuse_info.flags = CUSE_UNRESTRICTED_IOCTL; >> + >> + ops = get_virtio_net_callbacks(); >> + >> + session = cuse_lowlevel_setup(3, fuse_argv, >> + &cuse_info, &vhost_net_ops, 0, NULL); >> + if (session == NULL) >> + return -1; >> + >> + return 0; >> +} >> + >> +/** >> + * The CUSE session is launched allowing the application to receive open, >> + * release and ioctl calls. >> + */ >> +int >> +rte_vhost_driver_session_start(void) >> +{ >> + fuse_session_loop(session); >> + >> + return 0; >> +} >> diff --git a/lib/librte_vhost/vhost-cuse/virtio-net-cdev.c b/lib/librte_vhost/vhost-cuse/virtio-net-cdev.c >> new file mode 100644 >> index 0000000..5c16aa5 >> --- /dev/null >> +++ b/lib/librte_vhost/vhost-cuse/virtio-net-cdev.c >> @@ -0,0 +1,314 @@ >> +/*- >> + * BSD LICENSE >> + * >> + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. >> + * All rights reserved. >> + * >> + * Redistribution and use in source and binary forms, with or without >> + * modification, are permitted provided that the following conditions >> + * are met: >> + * >> + * * Redistributions of source code must retain the above copyright >> + * notice, this list of conditions and the following disclaimer. >> + * * Redistributions in binary form must reproduce the above copyright >> + * notice, this list of conditions and the following disclaimer in >> + * the documentation and/or other materials provided with the >> + * distribution. 
>> + * * Neither the name of Intel Corporation nor the names of its >> + * contributors may be used to endorse or promote products derived >> + * from this software without specific prior written permission. >> + * >> + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS >> + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT >> + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR >> + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT >> + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, >> + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT >> + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, >> + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY >> + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT >> + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE >> + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. >> + */ >> + >> +#include <stdint.h> >> +#include <dirent.h> >> +#include <linux/vhost.h> >> +#include <linux/virtio_net.h> >> +#include <fuse/cuse_lowlevel.h> >> +#include <stddef.h> >> +#include <string.h> >> +#include <stdlib.h> >> +#include <sys/eventfd.h> >> +#include <sys/mman.h> >> +#include <sys/types.h> >> +#include <unistd.h> >> +#include <errno.h> >> + >> +#include <rte_log.h> >> + >> +#include "vhost-net.h" >> +#include "virtio-net-cdev.h" >> + >> +extern struct vhost_net_device_ops const *ops; >> + >> +/* Line size for reading maps file. */ >> +static const uint32_t BUFSIZE = PATH_MAX; >> + >> +/* Size of prot char array in procmap. */ >> +#define PROT_SZ 5 >> + >> +/* Number of elements in procmap struct. */ >> +#define PROCMAP_SZ 8 >> + >> +/* Structure containing information gathered from maps file. */ >> +struct procmap { >> + uint64_t va_start; /* Start virtual address in file. */ >> + uint64_t len; /* Size of file. 
*/ >> + uint64_t pgoff; /* Not used. */ >> + uint32_t maj; /* Not used. */ >> + uint32_t min; /* Not used. */ >> + uint32_t ino; /* Not used. */ >> + char prot[PROT_SZ]; /* Not used. */ >> + char fname[PATH_MAX]; /* File name. */ >> +}; >> + >> +/* >> + * Locate the file containing QEMU's memory space and >> + * map it to our address space. >> + */ >> +static int >> +host_memory_map(pid_t pid, uint64_t addr, >> + uint64_t *mapped_address, uint64_t *mapped_size) >> +{ >> + struct dirent *dptr = NULL; >> + struct procmap procmap; >> + DIR *dp = NULL; >> + int fd; >> + int i; >> + char memfile[PATH_MAX]; >> + char mapfile[PATH_MAX]; >> + char procdir[PATH_MAX]; >> + char resolved_path[PATH_MAX]; >> + FILE *fmap; >> + void *map; >> + uint8_t found = 0; >> + char line[BUFSIZE]; >> + char dlm[] = "- : "; >> + char *str, *sp, *in[PROCMAP_SZ]; >> + char *end = NULL; >> + >> + /* Path where mem files are located. */ >> + snprintf(procdir, PATH_MAX, "/proc/%u/fd/", pid); >> + /* Maps file used to locate mem file. */ >> + snprintf(mapfile, PATH_MAX, "/proc/%u/maps", pid); >> + >> + fmap = fopen(mapfile, "r"); >> + if (fmap == NULL) { >> + RTE_LOG(ERR, VHOST_CONFIG, >> + "Failed to open maps file for pid %d\n", pid); >> + return -1; >> + } >> + >> + /* Read through maps file until we find out base_address. */ >> + while (fgets(line, BUFSIZE, fmap) != 0) { >> + str = line; >> + errno = 0; >> + /* Split line in to fields. */ >> + for (i = 0; i < PROCMAP_SZ; i++) { >> + in[i] = strtok_r(str, &dlm[i], &sp); >> + if ((in[i] == NULL) || (errno != 0)) { >> + fclose(fmap); >> + return -1; >> + } >> + str = NULL; >> + } >> + >> + /* Convert/Copy each field as needed. 
*/ >> + procmap.va_start = strtoull(in[0], &end, 16); >> + if ((in[0] == '\0') || (end == NULL) || (*end != '\0') || >> + (errno != 0)) { >> + fclose(fmap); >> + return -1; >> + } >> + >> + procmap.len = strtoull(in[1], &end, 16); >> + if ((in[1] == '\0') || (end == NULL) || (*end != '\0') || >> + (errno != 0)) { >> + fclose(fmap); >> + return -1; >> + } >> + >> + procmap.pgoff = strtoull(in[3], &end, 16); >> + if ((in[3] == '\0') || (end == NULL) || (*end != '\0') || >> + (errno != 0)) { >> + fclose(fmap); >> + return -1; >> + } >> + >> + procmap.maj = strtoul(in[4], &end, 16); >> + if ((in[4] == '\0') || (end == NULL) || (*end != '\0') || >> + (errno != 0)) { >> + fclose(fmap); >> + return -1; >> + } >> + >> + procmap.min = strtoul(in[5], &end, 16); >> + if ((in[5] == '\0') || (end == NULL) || (*end != '\0') || >> + (errno != 0)) { >> + fclose(fmap); >> + return -1; >> + } >> + >> + procmap.ino = strtoul(in[6], &end, 16); >> + if ((in[6] == '\0') || (end == NULL) || (*end != '\0') || >> + (errno != 0)) { >> + fclose(fmap); >> + return -1; >> + } >> + >> + memcpy(&procmap.prot, in[2], PROT_SZ); >> + memcpy(&procmap.fname, in[7], PATH_MAX); >> + >> + if (procmap.va_start == addr) { >> + procmap.len = procmap.len - procmap.va_start; >> + found = 1; >> + break; >> + } >> + } >> + fclose(fmap); >> + >> + if (!found) { >> + RTE_LOG(ERR, VHOST_CONFIG, >> + "Failed to find memory file in pid %d maps file\n", pid); >> + return -1; >> + } >> + >> + /* Find the guest memory file among the process fds. */ >> + dp = opendir(procdir); >> + if (dp == NULL) { >> + RTE_LOG(ERR, VHOST_CONFIG, >> + "Cannot open pid %d process directory\n", >> + pid); >> + return -1; >> + >> + } >> + >> + found = 0; >> + >> + /* Read the fd directory contents. 
*/ >> + while (NULL != (dptr = readdir(dp))) { >> + snprintf(memfile, PATH_MAX, "/proc/%u/fd/%s", >> + pid, dptr->d_name); >> + realpath(memfile, resolved_path); >> + if (resolved_path == NULL) { >> + RTE_LOG(ERR, VHOST_CONFIG, >> + "Failed to resolve fd directory\n"); >> + closedir(dp); >> + return -1; >> + } >> + if (strncmp(resolved_path, procmap.fname, >> + strnlen(procmap.fname, PATH_MAX)) == 0) { >> + found = 1; >> + break; >> + } >> + } >> + >> + closedir(dp); >> + >> + if (found == 0) { >> + RTE_LOG(ERR, VHOST_CONFIG, >> + "Failed to find memory file for pid %d\n", >> + pid); >> + return -1; >> + } >> + /* Open the shared memory file and map the memory into this process. */ >> + fd = open(memfile, O_RDWR); >> + >> + if (fd == -1) { >> + RTE_LOG(ERR, VHOST_CONFIG, >> + "Failed to open %s for pid %d\n", >> + memfile, pid); >> + return -1; >> + } >> + >> + map = mmap(0, (size_t)procmap.len, PROT_READ|PROT_WRITE , >> + MAP_POPULATE|MAP_SHARED, fd, 0); >> + close(fd); >> + >> + if (map == MAP_FAILED) { >> + RTE_LOG(ERR, VHOST_CONFIG, >> + "Error mapping the file %s for pid %d\n", >> + memfile, pid); >> + return -1; >> + } >> + >> + /* Store the memory address and size in the device data structure */ >> + *mapped_address = (uint64_t)(uintptr_t)map; >> + *mapped_size = procmap.len; >> + >> + LOG_DEBUG(VHOST_CONFIG, >> + "Mem File: %s->%s - Size: %llu - VA: %p\n", >> + memfile, resolved_path, >> + (unsigned long long)mapped_size, map); >> + >> + return 0; >> +} >> + >> +int >> +cuse_set_mem_table(struct vhost_device_ctx ctx, const struct vhost_memory *mem_regions_addr, >> + uint32_t nregions) >> +{ >> + uint64_t size = offsetof(struct vhost_memory, regions); >> + uint32_t idx; >> + struct virtio_memory_regions regions[8]; /* VHOST_MAX_MEMORY_REGIONS */ >> + struct vhost_memory_region *mem_regions = (void *)(uintptr_t) >> + ((uint64_t)(uintptr_t)mem_regions_addr + size); >> + uint64_t base_address = 0, mapped_address, mapped_size; >> + >> + for (idx = 0; idx < 
nregions; idx++) { >> + regions[idx].guest_phys_address = >> + mem_regions[idx].guest_phys_addr; >> + regions[idx].guest_phys_address_end = >> + regions[idx].guest_phys_address + >> + mem_regions[idx].memory_size; >> + regions[idx].memory_size = >> + mem_regions[idx].memory_size; >> + regions[idx].userspace_address = >> + mem_regions[idx].userspace_addr; >> + >> + LOG_DEBUG(VHOST_CONFIG, "REGION: %u - GPA: %p - QEMU VA: %p - SIZE (%"PRIu64")\n", >> + idx, >> + (void *)(uintptr_t)regions[idx].guest_phys_address, >> + (void *)(uintptr_t)regions[idx].userspace_address, >> + regions[idx].memory_size); >> + >> + /*set the base address mapping*/ >> + if (regions[idx].guest_phys_address == 0x0) { >> + base_address = >> + regions[idx].userspace_address; >> + /* Map VM memory file */ >> + if (host_memory_map(ctx.pid, base_address, >> + &mapped_address, &mapped_size) != 0) { >> + return -1; >> + } >> + } >> + } >> + >> + /* Check that we have a valid base address. */ >> + if (base_address == 0) { >> + RTE_LOG(ERR, VHOST_CONFIG, >> + "Failed to find base address of qemu memory file.\n"); >> + return -1; >> + } >> + >> + for (idx = 0; idx < nregions; idx++) { >> + regions[idx].address_offset = >> + mapped_address - base_address + >> + regions[idx].userspace_address - >> + regions[idx].guest_phys_address; >> + } >> + >> + ops->set_mem_table(ctx, ®ions[0], nregions); >> + return 0; >> +} >> diff --git a/lib/librte_vhost/vhost-cuse/virtio-net-cdev.h b/lib/librte_vhost/vhost-cuse/virtio-net-cdev.h >> new file mode 100644 >> index 0000000..6f98ce8 >> --- /dev/null >> +++ b/lib/librte_vhost/vhost-cuse/virtio-net-cdev.h >> @@ -0,0 +1,43 @@ >> +/*- >> + * BSD LICENSE >> + * >> + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. >> + * All rights reserved. 
>> + * >> + * Redistribution and use in source and binary forms, with or without >> + * modification, are permitted provided that the following conditions >> + * are met: >> + * >> + * * Redistributions of source code must retain the above copyright >> + * notice, this list of conditions and the following disclaimer. >> + * * Redistributions in binary form must reproduce the above copyright >> + * notice, this list of conditions and the following disclaimer in >> + * the documentation and/or other materials provided with the >> + * distribution. >> + * * Neither the name of Intel Corporation nor the names of its >> + * contributors may be used to endorse or promote products derived >> + * from this software without specific prior written permission. >> + * >> + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS >> + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT >> + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR >> + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT >> + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, >> + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT >> + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, >> + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY >> + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT >> + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE >> + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
>> + */ >> +#ifndef _VIRTIO_NET_CDEV_H >> +#define _VIRTIO_NET_CDEV_H >> +#include <stdint.h> >> + >> +#include "vhost-net.h" >> + >> +int >> +cuse_set_mem_table(struct vhost_device_ctx ctx, const struct vhost_memory *mem_regions_addr, >> + uint32_t nregions); >> + >> +#endif >> diff --git a/lib/librte_vhost/vhost-net-cdev.c b/lib/librte_vhost/vhost-net-cdev.c >> deleted file mode 100644 >> index 57c76cb..0000000 >> --- a/lib/librte_vhost/vhost-net-cdev.c >> +++ /dev/null >> @@ -1,389 +0,0 @@ >> -/*- >> - * BSD LICENSE >> - * >> - * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. >> - * All rights reserved. >> - * >> - * Redistribution and use in source and binary forms, with or without >> - * modification, are permitted provided that the following conditions >> - * are met: >> - * >> - * * Redistributions of source code must retain the above copyright >> - * notice, this list of conditions and the following disclaimer. >> - * * Redistributions in binary form must reproduce the above copyright >> - * notice, this list of conditions and the following disclaimer in >> - * the documentation and/or other materials provided with the >> - * distribution. >> - * * Neither the name of Intel Corporation nor the names of its >> - * contributors may be used to endorse or promote products derived >> - * from this software without specific prior written permission. >> - * >> - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS >> - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT >> - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR >> - * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT >> - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, >> - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT >> - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, >> - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY >> - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT >> - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE >> - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. >> - */ >> - >> -#include <errno.h> >> -#include <fuse/cuse_lowlevel.h> >> -#include <linux/limits.h> >> -#include <linux/vhost.h> >> -#include <stdint.h> >> -#include <string.h> >> -#include <unistd.h> >> - >> -#include <rte_ethdev.h> >> -#include <rte_log.h> >> -#include <rte_string_fns.h> >> -#include <rte_virtio_net.h> >> - >> -#include "vhost-net-cdev.h" >> - >> -#define FUSE_OPT_DUMMY "\0\0" >> -#define FUSE_OPT_FORE "-f\0\0" >> -#define FUSE_OPT_NOMULTI "-s\0\0" >> - >> -static const uint32_t default_major = 231; >> -static const uint32_t default_minor = 1; >> -static const char cuse_device_name[] = "/dev/cuse"; >> -static const char default_cdev[] = "vhost-net"; >> - >> -static struct fuse_session *session; >> -static struct vhost_net_device_ops const *ops; >> - >> -/* >> - * Returns vhost_device_ctx from given fuse_req_t. The index is populated later >> - * when the device is added to the device linked list. >> - */ >> -static struct vhost_device_ctx >> -fuse_req_to_vhost_ctx(fuse_req_t req, struct fuse_file_info *fi) >> -{ >> - struct vhost_device_ctx ctx; >> - struct fuse_ctx const *const req_ctx = fuse_req_ctx(req); >> - >> - ctx.pid = req_ctx->pid; >> - ctx.fh = fi->fh; >> - >> - return ctx; >> -} >> - >> -/* >> - * When the device is created in QEMU it gets initialised here and >> - * added to the device linked list. 
>> - */ >> -static void >> -vhost_net_open(fuse_req_t req, struct fuse_file_info *fi) >> -{ >> - struct vhost_device_ctx ctx = fuse_req_to_vhost_ctx(req, fi); >> - int err = 0; >> - >> - err = ops->new_device(ctx); >> - if (err == -1) { >> - fuse_reply_err(req, EPERM); >> - return; >> - } >> - >> - fi->fh = err; >> - >> - RTE_LOG(INFO, VHOST_CONFIG, >> - "(%"PRIu64") Device configuration started\n", fi->fh); >> - fuse_reply_open(req, fi); >> -} >> - >> -/* >> - * When QEMU is shutdown or killed the device gets released. >> - */ >> -static void >> -vhost_net_release(fuse_req_t req, struct fuse_file_info *fi) >> -{ >> - int err = 0; >> - struct vhost_device_ctx ctx = fuse_req_to_vhost_ctx(req, fi); >> - >> - ops->destroy_device(ctx); >> - RTE_LOG(INFO, VHOST_CONFIG, "(%"PRIu64") Device released\n", ctx.fh); >> - fuse_reply_err(req, err); >> -} >> - >> -/* >> - * Boilerplate code for CUSE IOCTL >> - * Implicit arguments: ctx, req, result. >> - */ >> -#define VHOST_IOCTL(func) do { \ >> - result = (func)(ctx); \ >> - fuse_reply_ioctl(req, result, NULL, 0); \ >> -} while (0) >> - >> -/* >> - * Boilerplate IOCTL RETRY >> - * Implicit arguments: req. >> - */ >> -#define VHOST_IOCTL_RETRY(size_r, size_w) do { \ >> - struct iovec iov_r = { arg, (size_r) }; \ >> - struct iovec iov_w = { arg, (size_w) }; \ >> - fuse_reply_ioctl_retry(req, &iov_r, \ >> - (size_r) ? 1 : 0, &iov_w, (size_w) ? 1 : 0);\ >> -} while (0) >> - >> -/* >> - * Boilerplate code for CUSE Read IOCTL >> - * Implicit arguments: ctx, req, result, in_bufsz, in_buf. >> - */ >> -#define VHOST_IOCTL_R(type, var, func) do { \ >> - if (!in_bufsz) { \ >> - VHOST_IOCTL_RETRY(sizeof(type), 0);\ >> - } else { \ >> - (var) = *(const type*)in_buf; \ >> - result = func(ctx, &(var)); \ >> - fuse_reply_ioctl(req, result, NULL, 0);\ >> - } \ >> -} while (0) >> - >> -/* >> - * Boilerplate code for CUSE Write IOCTL >> - * Implicit arguments: ctx, req, result, out_bufsz. 
>> - */ >> -#define VHOST_IOCTL_W(type, var, func) do { \ >> - if (!out_bufsz) { \ >> - VHOST_IOCTL_RETRY(0, sizeof(type));\ >> - } else { \ >> - result = (func)(ctx, &(var));\ >> - fuse_reply_ioctl(req, result, &(var), sizeof(type));\ >> - } \ >> -} while (0) >> - >> -/* >> - * Boilerplate code for CUSE Read/Write IOCTL >> - * Implicit arguments: ctx, req, result, in_bufsz, in_buf. >> - */ >> -#define VHOST_IOCTL_RW(type1, var1, type2, var2, func) do { \ >> - if (!in_bufsz) { \ >> - VHOST_IOCTL_RETRY(sizeof(type1), sizeof(type2));\ >> - } else { \ >> - (var1) = *(const type1*) (in_buf); \ >> - result = (func)(ctx, (var1), &(var2)); \ >> - fuse_reply_ioctl(req, result, &(var2), sizeof(type2));\ >> - } \ >> -} while (0) >> - >> -/* >> - * The IOCTLs are handled using CUSE/FUSE in userspace. Depending on the type >> - * of IOCTL a buffer is requested to read or to write. This request is handled >> - * by FUSE and the buffer is then given to CUSE. >> - */ >> -static void >> -vhost_net_ioctl(fuse_req_t req, int cmd, void *arg, >> - struct fuse_file_info *fi, __rte_unused unsigned flags, >> - const void *in_buf, size_t in_bufsz, size_t out_bufsz) >> -{ >> - struct vhost_device_ctx ctx = fuse_req_to_vhost_ctx(req, fi); >> - struct vhost_vring_file file; >> - struct vhost_vring_state state; >> - struct vhost_vring_addr addr; >> - uint64_t features; >> - uint32_t index; >> - int result = 0; >> - >> - switch (cmd) { >> - case VHOST_NET_SET_BACKEND: >> - LOG_DEBUG(VHOST_CONFIG, >> - "(%"PRIu64") IOCTL: VHOST_NET_SET_BACKEND\n", ctx.fh); >> - VHOST_IOCTL_R(struct vhost_vring_file, file, ops->set_backend); >> - break; >> - >> - case VHOST_GET_FEATURES: >> - LOG_DEBUG(VHOST_CONFIG, >> - "(%"PRIu64") IOCTL: VHOST_GET_FEATURES\n", ctx.fh); >> - VHOST_IOCTL_W(uint64_t, features, ops->get_features); >> - break; >> - >> - case VHOST_SET_FEATURES: >> - LOG_DEBUG(VHOST_CONFIG, >> - "(%"PRIu64") IOCTL: VHOST_SET_FEATURES\n", ctx.fh); >> - VHOST_IOCTL_R(uint64_t, features, 
ops->set_features); >> - break; >> - >> - case VHOST_RESET_OWNER: >> - LOG_DEBUG(VHOST_CONFIG, >> - "(%"PRIu64") IOCTL: VHOST_RESET_OWNER\n", ctx.fh); >> - VHOST_IOCTL(ops->reset_owner); >> - break; >> - >> - case VHOST_SET_OWNER: >> - LOG_DEBUG(VHOST_CONFIG, >> - "(%"PRIu64") IOCTL: VHOST_SET_OWNER\n", ctx.fh); >> - VHOST_IOCTL(ops->set_owner); >> - break; >> - >> - case VHOST_SET_MEM_TABLE: >> - /*TODO fix race condition.*/ >> - LOG_DEBUG(VHOST_CONFIG, >> - "(%"PRIu64") IOCTL: VHOST_SET_MEM_TABLE\n", ctx.fh); >> - static struct vhost_memory mem_temp; >> - >> - switch (in_bufsz) { >> - case 0: >> - VHOST_IOCTL_RETRY(sizeof(struct vhost_memory), 0); >> - break; >> - >> - case sizeof(struct vhost_memory): >> - mem_temp = *(const struct vhost_memory *) in_buf; >> - >> - if (mem_temp.nregions > 0) { >> - VHOST_IOCTL_RETRY(sizeof(struct vhost_memory) + >> - (sizeof(struct vhost_memory_region) * >> - mem_temp.nregions), 0); >> - } else { >> - result = -1; >> - fuse_reply_ioctl(req, result, NULL, 0); >> - } >> - break; >> - >> - default: >> - result = ops->set_mem_table(ctx, >> - in_buf, mem_temp.nregions); >> - if (result) >> - fuse_reply_err(req, EINVAL); >> - else >> - fuse_reply_ioctl(req, result, NULL, 0); >> - } >> - break; >> - >> - case VHOST_SET_VRING_NUM: >> - LOG_DEBUG(VHOST_CONFIG, >> - "(%"PRIu64") IOCTL: VHOST_SET_VRING_NUM\n", ctx.fh); >> - VHOST_IOCTL_R(struct vhost_vring_state, state, >> - ops->set_vring_num); >> - break; >> - >> - case VHOST_SET_VRING_BASE: >> - LOG_DEBUG(VHOST_CONFIG, >> - "(%"PRIu64") IOCTL: VHOST_SET_VRING_BASE\n", ctx.fh); >> - VHOST_IOCTL_R(struct vhost_vring_state, state, >> - ops->set_vring_base); >> - break; >> - >> - case VHOST_GET_VRING_BASE: >> - LOG_DEBUG(VHOST_CONFIG, >> - "(%"PRIu64") IOCTL: VHOST_GET_VRING_BASE\n", ctx.fh); >> - VHOST_IOCTL_RW(uint32_t, index, >> - struct vhost_vring_state, state, ops->get_vring_base); >> - break; >> - >> - case VHOST_SET_VRING_ADDR: >> - LOG_DEBUG(VHOST_CONFIG, >> - "(%"PRIu64") IOCTL: 
VHOST_SET_VRING_ADDR\n", ctx.fh); >> - VHOST_IOCTL_R(struct vhost_vring_addr, addr, >> - ops->set_vring_addr); >> - break; >> - >> - case VHOST_SET_VRING_KICK: >> - LOG_DEBUG(VHOST_CONFIG, >> - "(%"PRIu64") IOCTL: VHOST_SET_VRING_KICK\n", ctx.fh); >> - VHOST_IOCTL_R(struct vhost_vring_file, file, >> - ops->set_vring_kick); >> - break; >> - >> - case VHOST_SET_VRING_CALL: >> - LOG_DEBUG(VHOST_CONFIG, >> - "(%"PRIu64") IOCTL: VHOST_SET_VRING_CALL\n", ctx.fh); >> - VHOST_IOCTL_R(struct vhost_vring_file, file, >> - ops->set_vring_call); >> - break; >> - >> - default: >> - RTE_LOG(ERR, VHOST_CONFIG, >> - "(%"PRIu64") IOCTL: DOESN NOT EXIST\n", ctx.fh); >> - result = -1; >> - fuse_reply_ioctl(req, result, NULL, 0); >> - } >> - >> - if (result < 0) >> - LOG_DEBUG(VHOST_CONFIG, >> - "(%"PRIu64") IOCTL: FAIL\n", ctx.fh); >> - else >> - LOG_DEBUG(VHOST_CONFIG, >> - "(%"PRIu64") IOCTL: SUCCESS\n", ctx.fh); >> -} >> - >> -/* >> - * Structure handling open, release and ioctl function pointers is populated. >> - */ >> -static const struct cuse_lowlevel_ops vhost_net_ops = { >> - .open = vhost_net_open, >> - .release = vhost_net_release, >> - .ioctl = vhost_net_ioctl, >> -}; >> - >> -/* >> - * cuse_info is populated and used to register the cuse device. >> - * vhost_net_device_ops are also passed when the device is registered in app. 
>> - */ >> -int >> -rte_vhost_driver_register(const char *dev_name) >> -{ >> - struct cuse_info cuse_info; >> - char device_name[PATH_MAX] = ""; >> - char char_device_name[PATH_MAX] = ""; >> - const char *device_argv[] = { device_name }; >> - >> - char fuse_opt_dummy[] = FUSE_OPT_DUMMY; >> - char fuse_opt_fore[] = FUSE_OPT_FORE; >> - char fuse_opt_nomulti[] = FUSE_OPT_NOMULTI; >> - char *fuse_argv[] = {fuse_opt_dummy, fuse_opt_fore, fuse_opt_nomulti}; >> - >> - if (access(cuse_device_name, R_OK | W_OK) < 0) { >> - RTE_LOG(ERR, VHOST_CONFIG, >> - "char device %s can't be accessed, maybe not exist\n", >> - cuse_device_name); >> - return -1; >> - } >> - >> - /* >> - * The device name is created. This is passed to QEMU so that it can >> - * register the device with our application. >> - */ >> - snprintf(device_name, PATH_MAX, "DEVNAME=%s", dev_name); >> - snprintf(char_device_name, PATH_MAX, "/dev/%s", dev_name); >> - >> - /* Check if device already exists. */ >> - if (access(char_device_name, F_OK) != -1) { >> - RTE_LOG(ERR, VHOST_CONFIG, >> - "char device %s already exists\n", char_device_name); >> - return -1; >> - } >> - >> - memset(&cuse_info, 0, sizeof(cuse_info)); >> - cuse_info.dev_major = default_major; >> - cuse_info.dev_minor = default_minor; >> - cuse_info.dev_info_argc = 1; >> - cuse_info.dev_info_argv = device_argv; >> - cuse_info.flags = CUSE_UNRESTRICTED_IOCTL; >> - >> - ops = get_virtio_net_callbacks(); >> - >> - session = cuse_lowlevel_setup(3, fuse_argv, >> - &cuse_info, &vhost_net_ops, 0, NULL); >> - if (session == NULL) >> - return -1; >> - >> - return 0; >> -} >> - >> -/** >> - * The CUSE session is launched allowing the application to receive open, >> - * release and ioctl calls. 
>> - */ >> -int >> -rte_vhost_driver_session_start(void) >> -{ >> - fuse_session_loop(session); >> - >> - return 0; >> -} >> diff --git a/lib/librte_vhost/vhost-net-cdev.h b/lib/librte_vhost/vhost-net-cdev.h >> deleted file mode 100644 >> index 03a5c57..0000000 >> --- a/lib/librte_vhost/vhost-net-cdev.h >> +++ /dev/null >> @@ -1,113 +0,0 @@ >> -/*- >> - * BSD LICENSE >> - * >> - * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. >> - * All rights reserved. >> - * >> - * Redistribution and use in source and binary forms, with or without >> - * modification, are permitted provided that the following conditions >> - * are met: >> - * >> - * * Redistributions of source code must retain the above copyright >> - * notice, this list of conditions and the following disclaimer. >> - * * Redistributions in binary form must reproduce the above copyright >> - * notice, this list of conditions and the following disclaimer in >> - * the documentation and/or other materials provided with the >> - * distribution. >> - * * Neither the name of Intel Corporation nor the names of its >> - * contributors may be used to endorse or promote products derived >> - * from this software without specific prior written permission. >> - * >> - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS >> - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT >> - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR >> - * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT >> - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, >> - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT >> - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, >> - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY >> - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT >> - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE >> - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. >> - */ >> - >> -#ifndef _VHOST_NET_CDEV_H_ >> -#define _VHOST_NET_CDEV_H_ >> -#include <stdint.h> >> -#include <stdio.h> >> -#include <sys/types.h> >> -#include <unistd.h> >> -#include <linux/vhost.h> >> - >> -#include <rte_log.h> >> - >> -/* Macros for printing using RTE_LOG */ >> -#define RTE_LOGTYPE_VHOST_CONFIG RTE_LOGTYPE_USER1 >> -#define RTE_LOGTYPE_VHOST_DATA RTE_LOGTYPE_USER1 >> - >> -#ifdef RTE_LIBRTE_VHOST_DEBUG >> -#define VHOST_MAX_PRINT_BUFF 6072 >> -#define LOG_LEVEL RTE_LOG_DEBUG >> -#define LOG_DEBUG(log_type, fmt, args...) 
RTE_LOG(DEBUG, log_type, fmt, ##args) >> -#define PRINT_PACKET(device, addr, size, header) do { \ >> - char *pkt_addr = (char *)(addr); \ >> - unsigned int index; \ >> - char packet[VHOST_MAX_PRINT_BUFF]; \ >> - \ >> - if ((header)) \ >> - snprintf(packet, VHOST_MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size)); \ >> - else \ >> - snprintf(packet, VHOST_MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size)); \ >> - for (index = 0; index < (size); index++) { \ >> - snprintf(packet + strnlen(packet, VHOST_MAX_PRINT_BUFF), VHOST_MAX_PRINT_BUFF - strnlen(packet, VHOST_MAX_PRINT_BUFF), \ >> - "%02hhx ", pkt_addr[index]); \ >> - } \ >> - snprintf(packet + strnlen(packet, VHOST_MAX_PRINT_BUFF), VHOST_MAX_PRINT_BUFF - strnlen(packet, VHOST_MAX_PRINT_BUFF), "\n"); \ >> - \ >> - LOG_DEBUG(VHOST_DATA, "%s", packet); \ >> -} while (0) >> -#else >> -#define LOG_LEVEL RTE_LOG_INFO >> -#define LOG_DEBUG(log_type, fmt, args...) do {} while (0) >> -#define PRINT_PACKET(device, addr, size, header) do {} while (0) >> -#endif >> - >> - >> -/* >> - * Structure used to identify device context. >> - */ >> -struct vhost_device_ctx { >> - pid_t pid; /* PID of process calling the IOCTL. */ >> - uint64_t fh; /* Populated with fi->fh to track the device index. */ >> -}; >> - >> -/* >> - * Structure contains function pointers to be defined in virtio-net.c. These >> - * functions are called in CUSE context and are used to configure devices. 
>> - */ >> -struct vhost_net_device_ops { >> - int (*new_device)(struct vhost_device_ctx); >> - void (*destroy_device)(struct vhost_device_ctx); >> - >> - int (*get_features)(struct vhost_device_ctx, uint64_t *); >> - int (*set_features)(struct vhost_device_ctx, uint64_t *); >> - >> - int (*set_mem_table)(struct vhost_device_ctx, const void *, uint32_t); >> - >> - int (*set_vring_num)(struct vhost_device_ctx, struct vhost_vring_state *); >> - int (*set_vring_addr)(struct vhost_device_ctx, struct vhost_vring_addr *); >> - int (*set_vring_base)(struct vhost_device_ctx, struct vhost_vring_state *); >> - int (*get_vring_base)(struct vhost_device_ctx, uint32_t, struct vhost_vring_state *); >> - >> - int (*set_vring_kick)(struct vhost_device_ctx, struct vhost_vring_file *); >> - int (*set_vring_call)(struct vhost_device_ctx, struct vhost_vring_file *); >> - >> - int (*set_backend)(struct vhost_device_ctx, struct vhost_vring_file *); >> - >> - int (*set_owner)(struct vhost_device_ctx); >> - int (*reset_owner)(struct vhost_device_ctx); >> -}; >> - >> - >> -struct vhost_net_device_ops const *get_virtio_net_callbacks(void); >> -#endif /* _VHOST_NET_CDEV_H_ */ >> diff --git a/lib/librte_vhost/vhost-user/fd_man.c b/lib/librte_vhost/vhost-user/fd_man.c >> new file mode 100644 >> index 0000000..c7fd3f2 >> --- /dev/null >> +++ b/lib/librte_vhost/vhost-user/fd_man.c >> @@ -0,0 +1,158 @@ >> +#include <stdint.h> >> +#include <stdio.h> >> +#include <stdlib.h> >> +#include <sys/socket.h> >> +#include <sys/select.h> >> +#include <sys/time.h> >> +#include <sys/types.h> >> +#include <unistd.h> >> + >> +#include <rte_log.h> >> + >> +#include "fd_man.h" >> + >> +/** >> + * Returns the index in the fdset for a fd. >> + * If fd is -1, it means to search for a free entry. >> + * @return >> + * Index for the fd, or -1 if fd isn't in the fdset. 
>> + */ >> +static int >> +fdset_find_fd(struct fdset *pfdset, int fd) >> +{ >> + int i; >> + >> + for (i = 0; i < pfdset->num && pfdset->fd[i].fd != fd; i++); >> + >> + return i == pfdset->num ? -1 : i; >> +} >> + >> +static int >> +fdset_find_free_slot(struct fdset *pfdset) >> +{ >> + return fdset_find_fd(pfdset, -1); >> + >> +} >> + >> +static void >> +fdset_add_fd(struct fdset *pfdset, int idx, int fd, fd_cb rcb, >> + fd_cb wcb, uint64_t dat) >> +{ >> + struct fdentry *pfdentry = &pfdset->fd[idx]; >> + >> + pfdentry->fd = fd; >> + pfdentry->rcb = rcb; >> + pfdentry->wcb = wcb; >> + pfdentry->dat = dat; >> +} >> + >> +/** >> + * Fill the read/write fdset with the fds in the fdset. >> + * @return >> + * the maximum fds filled in the read/write fd_set. >> + */ >> +static int >> +fdset_fill(fd_set *rfset, fd_set *wfset, struct fdset *pfdset) >> +{ >> + struct fdentry *pfdentry; >> + int i, maxfds = -1; >> + int num = MAX_FDS; >> + >> + for (i = 0; i < num ; i++) { >> + pfdentry = &pfdset->fd[i]; >> + if (pfdentry->fd != -1) { >> + int added = 0; >> + if (pfdentry->rcb && rfset) { >> + FD_SET(pfdentry->fd, rfset); >> + added = 1; >> + } >> + if (pfdentry->wcb && wfset) { >> + FD_SET(pfdentry->fd, wfset); >> + added = 1; >> + } >> + if (added) >> + maxfds = pfdentry->fd < maxfds ? >> + maxfds : pfdentry->fd; >> + } >> + } >> + return maxfds; >> +} >> + >> +void >> +fdset_init(struct fdset *pfdset) >> +{ >> + int i; >> + >> + for (i = 0; i < MAX_FDS; i++) >> + pfdset->fd[i].fd = -1; >> + pfdset->num = MAX_FDS; >> + >> +} >> + >> +/** >> + * Register the fd in the fdset with its read/write handler and context. >> + */ >> +int >> +fdset_add(struct fdset *pfdset, int fd, fd_cb rcb, fd_cb wcb, uint64_t dat) >> +{ >> + int i; >> + >> + if (fd == -1) >> + return -1; >> + >> + /* Find a free slot in the list. 
*/ >> + i = fdset_find_free_slot(pfdset); >> + if (i == -1) >> + return -2; >> + >> + fdset_add_fd(pfdset, i, fd, rcb, wcb, dat); >> + >> + return 0; >> +} >> + >> +/** >> + * Unregister the fd from the fdset. >> + */ >> +void >> +fdset_del(struct fdset *pfdset, int fd) >> +{ >> + int i; >> + >> + i = fdset_find_fd(pfdset, fd); >> + if (i != -1) { >> + pfdset->fd[i].fd = -1; >> + } >> +} >> + >> + >> +void >> +fdset_event_dispatch(struct fdset *pfdset) >> +{ >> + fd_set rfds,wfds; >> + int i, maxfds; >> + struct fdentry *pfdentry; >> + int num = MAX_FDS; >> + >> + if (pfdset == NULL) >> + return; >> + while (1) { >> + FD_ZERO(&rfds); >> + FD_ZERO(&wfds); >> + maxfds = fdset_fill(&rfds, &wfds, pfdset); >> + /* fd management runs in one thread */ >> + if (maxfds == -1) { >> + return; >> + } >> + >> + select(maxfds + 1, &rfds, &wfds, NULL, NULL); >> + >> + for (i = 0; i < num; i++) { >> + pfdentry = &pfdset->fd[i]; >> + if (FD_ISSET(pfdentry->fd, &rfds)) >> + pfdentry->rcb(pfdentry->fd, pfdentry->dat); >> + if (FD_ISSET(pfdentry->fd, &wfds)) >> + pfdentry->wcb(pfdentry->fd, pfdentry->dat); >> + } >> + >> + } >> +} >> diff --git a/lib/librte_vhost/vhost-user/fd_man.h b/lib/librte_vhost/vhost-user/fd_man.h >> new file mode 100644 >> index 0000000..57cc81d >> --- /dev/null >> +++ b/lib/librte_vhost/vhost-user/fd_man.h >> @@ -0,0 +1,31 @@ >> +#ifndef _FD_MAN_H_ >> +#define _FD_MAN_H_ >> +#include <stdint.h> >> + >> +#define MAX_FDS 1024 >> + >> +typedef void (*fd_cb)(int fd, uint64_t dat); >> + >> +struct fdentry { >> + int fd; /* -1 indicates this entry is empty */ >> + fd_cb rcb; /* callback when this fd is readable. 
*/ >> + fd_cb wcb; /* callback when this fd is writeable.*/ >> + uint64_t dat; /* fd context */ >> +}; >> + >> +struct fdset { >> + struct fdentry fd[MAX_FDS]; >> + int num; >> +}; >> + >> + >> +void fdset_init(struct fdset *pfdset); >> + >> +int fdset_add(struct fdset *pfdset, int fd, fd_cb rcb, >> + fd_cb wcb, uint64_t ctx); >> + >> +void fdset_del(struct fdset *pfdset, int fd); >> + >> +void fdset_event_dispatch(struct fdset *pfdset); >> + >> +#endif >> diff --git a/lib/librte_vhost/vhost-user/vhost-net-user.c b/lib/librte_vhost/vhost-user/vhost-net-user.c >> new file mode 100644 >> index 0000000..34450f4 >> --- /dev/null >> +++ b/lib/librte_vhost/vhost-user/vhost-net-user.c >> @@ -0,0 +1,417 @@ >> +/*- >> + * BSD LICENSE >> + * >> + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. >> + * All rights reserved. >> + * >> + * Redistribution and use in source and binary forms, with or without >> + * modification, are permitted provided that the following conditions >> + * are met: >> + * >> + * * Redistributions of source code must retain the above copyright >> + * notice, this list of conditions and the following disclaimer. >> + * * Redistributions in binary form must reproduce the above copyright >> + * notice, this list of conditions and the following disclaimer in >> + * the documentation and/or other materials provided with the >> + * distribution. >> + * * Neither the name of Intel Corporation nor the names of its >> + * contributors may be used to endorse or promote products derived >> + * from this software without specific prior written permission. >> + * >> + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS >> + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT >> + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR >> + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT >> + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, >> + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT >> + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, >> + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY >> + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT >> + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE >> + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. >> + */ >> + >> +#include <stdint.h> >> +#include <stdio.h> >> +#include <limits.h> >> +#include <stdlib.h> >> +#include <unistd.h> >> +#include <string.h> >> +#include <sys/types.h> >> +#include <sys/socket.h> >> +#include <sys/un.h> >> +#include <errno.h> >> + >> +#include <rte_log.h> >> +#include <rte_virtio_net.h> >> + >> +#include "fd_man.h" >> +#include "vhost-net-user.h" >> +#include "vhost-net.h" >> +#include "virtio-net-user.h" >> + >> +static void vserver_new_vq_conn(int fd, uint64_t data); >> +static void vserver_message_handler(int fd, uint64_t dat); >> +const struct vhost_net_device_ops *ops; >> + >> +static struct vhost_server *g_vhost_server; >> + >> +static const char *vhost_message_str[VHOST_USER_MAX] = >> +{ >> + [VHOST_USER_NONE] = "VHOST_USER_NONE", >> + [VHOST_USER_GET_FEATURES] = "VHOST_USER_GET_FEATURES", >> + [VHOST_USER_SET_FEATURES] = "VHOST_USER_SET_FEATURES", >> + [VHOST_USER_SET_OWNER] = "VHOST_USER_SET_OWNER", >> + [VHOST_USER_RESET_OWNER] = "VHOST_USER_RESET_OWNER", >> + [VHOST_USER_SET_MEM_TABLE] = "VHOST_USER_SET_MEM_TABLE", >> + [VHOST_USER_SET_LOG_BASE] = "VHOST_USER_SET_LOG_BASE", >> + [VHOST_USER_SET_LOG_FD] = "VHOST_USER_SET_LOG_FD", >> + [VHOST_USER_SET_VRING_NUM] = "VHOST_USER_SET_VRING_NUM", >> + [VHOST_USER_SET_VRING_ADDR] = "VHOST_USER_SET_VRING_ADDR", >> + [VHOST_USER_SET_VRING_BASE] = "VHOST_USER_SET_VRING_BASE", >> + [VHOST_USER_GET_VRING_BASE] = 
"VHOST_USER_GET_VRING_BASE", >> + [VHOST_USER_SET_VRING_KICK] = "VHOST_USER_SET_VRING_KICK", >> + [VHOST_USER_SET_VRING_CALL] = "VHOST_USER_SET_VRING_CALL", >> + [VHOST_USER_SET_VRING_ERR] = "VHOST_USER_SET_VRING_ERR" >> +}; >> + >> +/** >> + * Create a unix domain socket and bind to path. >> + * @return >> + * socket fd or -1 on failure >> + */ >> +static int >> +uds_socket(const char *path) >> +{ >> + struct sockaddr_un un; >> + int sockfd; >> + int ret; >> + >> + if (path == NULL) >> + return -1; >> + >> + sockfd = socket(AF_UNIX, SOCK_STREAM, 0); >> + if (sockfd < 0) >> + return -1; >> + RTE_LOG(INFO, VHOST_CONFIG, "socket created, fd:%d\n", sockfd); >> + >> + memset(&un, 0, sizeof(un)); >> + un.sun_family = AF_UNIX; >> + snprintf(un.sun_path, sizeof(un.sun_path), "%s", path); >> + ret = bind(sockfd, (struct sockaddr *)&un, sizeof(un)); >> + if (ret == -1) >> + goto err; >> + RTE_LOG(INFO, VHOST_CONFIG, "bind to %s\n", path); >> + >> + ret = listen(sockfd, 1); >> + if (ret == -1) >> + goto err; >> + >> + return sockfd; >> + >> +err: >> + close(sockfd); >> + return -1; >> +} >> + >> + >> +/* return bytes# of read */ >> +static int >> +read_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num) >> +{ >> + >> + struct iovec iov; >> + struct msghdr msgh = { 0 }; >> + size_t fdsize = fd_num * sizeof(int); >> + char control[CMSG_SPACE(fdsize)]; >> + struct cmsghdr *cmsg; >> + int ret; >> + >> + iov.iov_base = buf; >> + iov.iov_len = buflen; >> + >> + msgh.msg_iov = &iov; >> + msgh.msg_iovlen = 1; >> + msgh.msg_control = control; >> + msgh.msg_controllen = sizeof(control); >> + >> + ret = recvmsg(sockfd, &msgh, 0); >> + if (ret <= 0) { >> + RTE_LOG(ERR, VHOST_CONFIG, "%s failed\n", __func__); >> + return ret; >> + } >> + /* ret == buflen */ >> + if (msgh.msg_flags & (MSG_TRUNC | MSG_CTRUNC)) { >> + RTE_LOG(ERR, VHOST_CONFIG, "%s failed\n", __func__); >> + return -1; >> + } >> + >> + for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL; >> + cmsg = 
CMSG_NXTHDR(&msgh, cmsg)) { >> + if ( (cmsg->cmsg_level == SOL_SOCKET) && >> + (cmsg->cmsg_type == SCM_RIGHTS)) { >> + memcpy(fds, CMSG_DATA(cmsg), fdsize); >> + break; >> + } >> + } >> + return ret; >> +} >> + >> +static int >> +read_vhost_message(int sockfd, struct VhostUserMsg *msg) >> +{ >> + int ret; >> + >> + ret = read_fd_message(sockfd, (char *)msg, VHOST_USER_HDR_SIZE, >> + msg->fds, VHOST_MEMORY_MAX_NREGIONS); >> + if (ret <= 0) >> + return ret; >> + >> + if (msg->size) { >> + if (msg->size > sizeof(msg->payload)) { >> + RTE_LOG(ERR, VHOST_CONFIG, >> + "%s: invalid size:%d\n", __func__, msg->size); >> + return -1; >> + } >> + ret = read(sockfd, &msg->payload, msg->size); >> + if (ret == 0) >> + return 0; >> + if (ret != (int)msg->size) { >> + printf("read control message failed\n"); >> + return -1; >> + } >> + } >> + >> + return ret; >> +} >> + >> +static int >> +send_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num) >> +{ >> + >> + struct iovec iov; >> + struct msghdr msgh = { 0 }; >> + size_t fdsize = fd_num * sizeof(int); >> + char control[CMSG_SPACE(fdsize)]; >> + struct cmsghdr *cmsg; >> + int ret; >> + >> + iov.iov_base = buf; >> + iov.iov_len = buflen; >> + msgh.msg_iov = &iov; >> + msgh.msg_iovlen = 1; >> + >> + if (fds && fd_num > 0) { >> + msgh.msg_control = control; >> + msgh.msg_controllen = sizeof(control); >> + cmsg = CMSG_FIRSTHDR(&msgh); >> + cmsg->cmsg_len = CMSG_LEN(fdsize); >> + cmsg->cmsg_level = SOL_SOCKET; >> + cmsg->cmsg_type = SCM_RIGHTS; >> + memcpy(CMSG_DATA(cmsg), fds, fdsize); >> + } else { >> + msgh.msg_control = NULL; >> + msgh.msg_controllen = 0; >> + } >> + >> + do { >> + ret = sendmsg(sockfd, &msgh, 0); >> + } while (ret < 0 && errno == EINTR); >> + >> + if (ret < 0) { >> + RTE_LOG(ERR, VHOST_CONFIG, "sendmsg error\n"); >> + return -1; >> + } >> + >> + return 0; >> +} >> + >> +static int >> +send_vhost_message(int sockfd, struct VhostUserMsg *msg) >> +{ >> + int ret; >> + >> + msg->flags &= 
~VHOST_USER_VERSION_MASK; >> + msg->flags |= VHOST_USER_VERSION; >> + msg->flags |= VHOST_USER_REPLY_MASK; >> + >> + ret = send_fd_message(sockfd, (char *)msg, >> + VHOST_USER_HDR_SIZE + msg->size, NULL, 0); >> + >> + return ret; >> +} >> + >> +/* call back when there is new connection. */ >> +static void >> +vserver_new_vq_conn(int fd, uint64_t dat) >> +{ >> + struct vhost_server *vserver = (void *)(uintptr_t)dat; >> + int conn_fd; >> + uint32_t fh; >> + struct vhost_device_ctx vdev_ctx = { 0 }; >> + >> + conn_fd = accept(fd, NULL, NULL); >> + RTE_LOG(INFO, VHOST_CONFIG, >> + "%s: new connection is %d\n", __func__, conn_fd); >> + if (conn_fd < 0) >> + return; >> + >> + fh = ops->new_device(vdev_ctx); >> + RTE_LOG(INFO, VHOST_CONFIG, "new device, handle is %d\n", fh); >> + >> + fdset_add(&vserver->fdset, >> + conn_fd, vserver_message_handler, NULL, fh); >> +} >> + >> +/* callback when there is message on the connfd */ >> +static void >> +vserver_message_handler(int connfd, uint64_t dat) >> +{ >> + struct vhost_device_ctx ctx; >> + uint32_t fh = (uint32_t)dat; >> + struct VhostUserMsg msg; >> + uint64_t features; >> + int ret; >> + >> + ctx.fh = fh; >> + ret = read_vhost_message(connfd, &msg); >> + if (ret < 0) { >> + printf("vhost read message failed\n"); >> + >> + /*TODO: cleanup */ >> + close(connfd); >> + fdset_del(&g_vhost_server->fdset, connfd); >> + ops->destroy_device(ctx); >> + >> + return; >> + } else if (ret == 0) { >> + /*TODO: cleanup */ >> + RTE_LOG(INFO, VHOST_CONFIG, >> + "vhost peer closed\n"); >> + close(connfd); >> + fdset_del(&g_vhost_server->fdset, connfd); >> + ops->destroy_device(ctx); >> + >> + return; >> + } >> + if (msg.request > VHOST_USER_MAX) { >> + /*TODO: cleanup */ >> + RTE_LOG(INFO, VHOST_CONFIG, >> + "vhost read incorrect message\n"); >> + close(connfd); >> + fdset_del(&g_vhost_server->fdset, connfd); >> + >> + return; >> + } >> + >> + RTE_LOG(INFO, VHOST_CONFIG, "read message %s\n", >> + vhost_message_str[msg.request]); >> + switch 
(msg.request) { >> + case VHOST_USER_GET_FEATURES: >> + ret = ops->get_features(ctx, &features); >> + msg.payload.u64 = ret; >> + msg.size = sizeof(msg.payload.u64); >> + send_vhost_message(connfd, &msg); >> + break; >> + case VHOST_USER_SET_FEATURES: >> + ops->set_features(ctx, &features); >> + break; >> + >> + case VHOST_USER_SET_OWNER: >> + ops->set_owner(ctx); >> + break; >> + case VHOST_USER_RESET_OWNER: >> + ops->reset_owner(ctx); >> + break; >> + >> + case VHOST_USER_SET_MEM_TABLE: >> + user_set_mem_table(ctx, &msg); >> + break; >> + >> + case VHOST_USER_SET_LOG_BASE: >> + case VHOST_USER_SET_LOG_FD: >> + RTE_LOG(INFO, VHOST_CONFIG, "not implemented.\n"); >> + break; >> + >> + case VHOST_USER_SET_VRING_NUM: >> + ops->set_vring_num(ctx, &msg.payload.state); >> + break; >> + case VHOST_USER_SET_VRING_ADDR: >> + ops->set_vring_addr(ctx, &msg.payload.addr); >> + break; >> + case VHOST_USER_SET_VRING_BASE: >> + ops->set_vring_base(ctx, &msg.payload.state); >> + break; >> + >> + case VHOST_USER_GET_VRING_BASE: >> + ret = ops->get_vring_base(ctx, msg.payload.state.index, >> + &msg.payload.state); >> + msg.size = sizeof(msg.payload.state); >> + send_vhost_message(connfd, &msg); >> + break; >> + >> + case VHOST_USER_SET_VRING_KICK: >> + user_set_vring_kick(ctx, &msg); >> + break; >> + case VHOST_USER_SET_VRING_CALL: >> + user_set_vring_call(ctx, &msg); >> + break; >> + >> + case VHOST_USER_SET_VRING_ERR: >> + RTE_LOG(INFO, VHOST_CONFIG, "not implemented\n"); >> + break; >> + >> + default: >> + break; >> + >> + } >> +} >> + >> + >> +/** >> + * Creates and initialises the vhost server. 
>> + */ >> +int >> +rte_vhost_driver_register(const char *path) >> +{ >> + >> + struct vhost_server *vserver; >> + >> + if (g_vhost_server != NULL) >> + return -1; >> + >> + vserver = calloc(sizeof(struct vhost_server), 1); >> + /*TODO: all allocation is through DPDK memory allocation */ >> + if (vserver == NULL) >> + return -1; >> + >> + fdset_init(&vserver->fdset); >> + >> + unlink(path); >> + >> + vserver->listenfd = uds_socket(path); >> + if (vserver->listenfd < 0) { >> + free(vserver); >> + return -1; >> + } >> + vserver->path = path; >> + >> + fdset_add(&vserver->fdset, vserver->listenfd, >> + vserver_new_vq_conn, NULL, >> + (uint64_t)(uintptr_t)vserver); >> + >> + ops = get_virtio_net_callbacks(); >> + >> + g_vhost_server = vserver; >> + >> + return 0; >> +} >> + >> + >> +int >> +rte_vhost_driver_session_start(void) >> +{ >> + fdset_event_dispatch(&g_vhost_server->fdset); >> + return 0; >> +} >> + >> diff --git a/lib/librte_vhost/vhost-user/vhost-net-user.h b/lib/librte_vhost/vhost-user/vhost-net-user.h >> new file mode 100644 >> index 0000000..c9df9fa >> --- /dev/null >> +++ b/lib/librte_vhost/vhost-user/vhost-net-user.h >> @@ -0,0 +1,74 @@ >> +#ifndef _VHOST_NET_USER_H >> +#define _VHOST_NET_USER_H >> +#include <stdint.h> >> +#include <linux/vhost.h> >> + >> +#include "fd_man.h" >> + >> +struct vhost_server { >> + const char *path; /**< The path the uds is bind to. */ >> + int listenfd; /**< The listener sockfd. */ >> + struct fdset fdset; /**< The fd list this vhost server manages. 
*/ >> +}; >> + >> +/*********** FROM hw/virtio/vhost-user.c *************************************/ >> + >> +#define VHOST_MEMORY_MAX_NREGIONS 8 >> + >> +typedef enum VhostUserRequest { >> + VHOST_USER_NONE = 0, >> + VHOST_USER_GET_FEATURES = 1, >> + VHOST_USER_SET_FEATURES = 2, >> + VHOST_USER_SET_OWNER = 3, >> + VHOST_USER_RESET_OWNER = 4, >> + VHOST_USER_SET_MEM_TABLE = 5, >> + VHOST_USER_SET_LOG_BASE = 6, >> + VHOST_USER_SET_LOG_FD = 7, >> + VHOST_USER_SET_VRING_NUM = 8, >> + VHOST_USER_SET_VRING_ADDR = 9, >> + VHOST_USER_SET_VRING_BASE = 10, >> + VHOST_USER_GET_VRING_BASE = 11, >> + VHOST_USER_SET_VRING_KICK = 12, >> + VHOST_USER_SET_VRING_CALL = 13, >> + VHOST_USER_SET_VRING_ERR = 14, >> + VHOST_USER_MAX >> +} VhostUserRequest; >> + >> +typedef struct VhostUserMemoryRegion { >> + uint64_t guest_phys_addr; >> + uint64_t memory_size; >> + uint64_t userspace_addr; >> + uint64_t mmap_offset; >> +} VhostUserMemoryRegion; >> + >> +typedef struct VhostUserMemory { >> + uint32_t nregions; >> + uint32_t padding; >> + VhostUserMemoryRegion regions[VHOST_MEMORY_MAX_NREGIONS]; >> +} VhostUserMemory; >> + >> +typedef struct VhostUserMsg { >> + VhostUserRequest request; >> + >> +#define VHOST_USER_VERSION_MASK (0x3) >> +#define VHOST_USER_REPLY_MASK (0x1 << 2) >> + uint32_t flags; >> + uint32_t size; /* the following payload size */ >> + union { >> +#define VHOST_USER_VRING_IDX_MASK (0xff) >> +#define VHOST_USER_VRING_NOFD_MASK (0x1<<8) >> + uint64_t u64; >> + struct vhost_vring_state state; >> + struct vhost_vring_addr addr; >> + VhostUserMemory memory; >> + } payload; >> + int fds[VHOST_MEMORY_MAX_NREGIONS]; >> +} __attribute__((packed)) VhostUserMsg; >> + >> +#define VHOST_USER_HDR_SIZE (intptr_t)(&((VhostUserMsg *)0)->payload.u64) >> + >> +/* The version of the protocol we support */ >> +#define VHOST_USER_VERSION (0x1) >> + >> +/*****************************************************************************/ >> +#endif >> diff --git 
a/lib/librte_vhost/vhost-user/virtio-net-user.c b/lib/librte_vhost/vhost-user/virtio-net-user.c >> new file mode 100644 >> index 0000000..f38e6cc >> --- /dev/null >> +++ b/lib/librte_vhost/vhost-user/virtio-net-user.c >> @@ -0,0 +1,208 @@ >> +/*- >> + * BSD LICENSE >> + * >> + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. >> + * All rights reserved. >> + * >> + * Redistribution and use in source and binary forms, with or without >> + * modification, are permitted provided that the following conditions >> + * are met: >> + * >> + * * Redistributions of source code must retain the above copyright >> + * notice, this list of conditions and the following disclaimer. >> + * * Redistributions in binary form must reproduce the above copyright >> + * notice, this list of conditions and the following disclaimer in >> + * the documentation and/or other materials provided with the >> + * distribution. >> + * * Neither the name of Intel Corporation nor the names of its >> + * contributors may be used to endorse or promote products derived >> + * from this software without specific prior written permission. >> + * >> + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS >> + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT >> + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR >> + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT >> + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, >> + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT >> + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, >> + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY >> + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT >> + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE >> + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
>> + */ >> + >> +#include <stdint.h> >> +#include <stdio.h> >> +#include <stdlib.h> >> +#include <unistd.h> >> +#include <sys/mman.h> >> + >> +#include <rte_log.h> >> + >> +#include "virtio-net-user.h" >> +#include "vhost-net-user.h" >> +#include "vhost-net.h" >> + >> +extern const struct vhost_net_device_ops *ops; >> + >> +#if 0 >> +int >> +user_set_mem_table(struct vhost_device_ctx ctx, struct VhostUserMsg *pmsg) >> +{ >> + unsigned int idx; >> + struct VhostUserMemory memory = pmsg->payload.memory; >> + struct virtio_memory_regions regions[VHOST_MEMORY_MAX_NREGIONS]; >> + uint64_t mapped_address, base_address = 0, mem_size = 0; >> + >> + for (idx = 0; idx < memory.nregions; idx++) { >> + if (memory.regions[idx].guest_phys_addr == 0) >> + base_address = memory.regions[idx].userspace_addr; >> + } >> + if (base_address == 0) { >> + RTE_LOG(ERR, VHOST_CONFIG, >> + "couldn't find the mem region whose gpa is 0.\n"); >> + return -1; >> + } >> + >> + for (idx = 0; idx < memory.nregions; idx++) { >> + uint64_t size = memory.regions[idx].userspace_addr - >> + base_address + memory.regions[idx].memory_size; >> + if (mem_size < size) >> + mem_size = size; >> + } >> + >> + /* >> + * here we assume qemu will map only one file for memory allocation, >> + * we only use fds[0] with offset 0. 
>> + */ >> + mapped_address = (uint64_t)(uintptr_t)mmap(NULL, mem_size, >> + PROT_READ | PROT_WRITE, MAP_SHARED, pmsg->fds[0], 0); >> + >> + if (mapped_address == (uint64_t)(uintptr_t)MAP_FAILED) { >> + RTE_LOG(ERR, VHOST_CONFIG, " mmap qemu guest failed.\n"); >> + return -1; >> + } >> + >> + for (idx = 0; idx < memory.nregions; idx++) { >> + regions[idx].guest_phys_address = >> + memory.regions[idx].guest_phys_addr; >> + regions[idx].guest_phys_address_end = >> + memory.regions[idx].guest_phys_addr + >> + memory.regions[idx].memory_size; >> + regions[idx].memory_size = memory.regions[idx].memory_size; >> + regions[idx].userspace_address = >> + memory.regions[idx].userspace_addr; >> + >> + regions[idx].address_offset = mapped_address - base_address + >> + regions[idx].userspace_address - >> + regions[idx].guest_phys_address; >> + LOG_DEBUG(VHOST_CONFIG, >> + "REGION: %u - GPA: %p - QEMU VA: %p - SIZE (%"PRIu64")\n", >> + idx, >> + (void *)(uintptr_t)regions[idx].guest_phys_address, >> + (void *)(uintptr_t)regions[idx].userspace_address, >> + regions[idx].memory_size); >> + } >> + ops->set_mem_table(ctx, regions, memory.nregions); >> + return 0; >> +} >> + >> +#else >> + >> +int >> +user_set_mem_table(struct vhost_device_ctx ctx, struct VhostUserMsg *pmsg) >> +{ >> + unsigned int idx; >> + struct VhostUserMemory memory = pmsg->payload.memory; >> + struct virtio_memory_regions regions[VHOST_MEMORY_MAX_NREGIONS]; >> + uint64_t mapped_address, base_address = 0; >> + >> + for (idx = 0; idx < memory.nregions; idx++) { >> + if (memory.regions[idx].guest_phys_addr == 0) >> + base_address = memory.regions[idx].userspace_addr; >> + } >> + if (base_address == 0) { >> + RTE_LOG(ERR, VHOST_CONFIG, >> + "couldn't find the mem region whose gpa is 0.\n"); >> + return -1; >> + } >> + >> + >> + for (idx = 0; idx < memory.nregions; idx++) { >> + regions[idx].guest_phys_address = >> + memory.regions[idx].guest_phys_addr; >> + regions[idx].guest_phys_address_end = >> + 
memory.regions[idx].guest_phys_addr + >> + memory.regions[idx].memory_size; >> + regions[idx].memory_size = memory.regions[idx].memory_size; >> + regions[idx].userspace_address = >> + memory.regions[idx].userspace_addr; >> +/* >> + mapped_address = (uint64_t)(uintptr_t)mmap(NULL, >> + regions[idx].memory_size, >> + PROT_READ | PROT_WRITE, MAP_SHARED, >> + pmsg->fds[idx], >> + memory.regions[idx].mmap_offset); >> +*/ >> + >> +/* This is ugly */ >> + mapped_address = (uint64_t)(uintptr_t)mmap(NULL, >> + regions[idx].memory_size + >> + memory.regions[idx].mmap_offset, >> + PROT_READ | PROT_WRITE, MAP_SHARED, >> + pmsg->fds[idx], >> + 0); >> + printf("mapped to %p\n", (void *)mapped_address); >> + >> + if (mapped_address == (uint64_t)(uintptr_t)MAP_FAILED) { >> + RTE_LOG(ERR, VHOST_CONFIG, " mmap qemu guest failed.\n"); >> + return -1; >> + } >> + >> +// printf("ret=%d\n", munmap((void *)mapped_address, (regions[idx].memory_size + memory.regions[idx].mmap_offset + 0x3FFFFFFF) & ~0x3FFFFFFF)); >> +// printf("unaligned ret=%d\n", munmap((void *)mapped_address, (regions[idx].memory_size + memory.regions[idx].mmap_offset ) )); >> + mapped_address += memory.regions[idx].mmap_offset; >> + >> + regions[idx].address_offset = mapped_address - >> + regions[idx].guest_phys_address; >> + LOG_DEBUG(VHOST_CONFIG, >> + "REGION: %u - GPA: %p - QEMU VA: %p - SIZE (%"PRIu64")\n", >> + idx, >> + (void *)(uintptr_t)regions[idx].guest_phys_address, >> + (void *)(uintptr_t)regions[idx].userspace_address, >> + regions[idx].memory_size); >> + } >> + ops->set_mem_table(ctx, regions, memory.nregions); >> + return 0; >> +} >> + >> + >> + >> + >> +#endif >> + >> + >> +void >> +user_set_vring_call(struct vhost_device_ctx ctx, struct VhostUserMsg *pmsg) >> +{ >> + struct vhost_vring_file file; >> + >> + file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; >> + file.fd = pmsg->fds[0]; >> + RTE_LOG(INFO, VHOST_CONFIG, >> + "vring call idx:%d file:%d\n", file.index, file.fd); >> + 
ops->set_vring_call(ctx, &file); >> +} >> + >> + >> +void >> +user_set_vring_kick(struct vhost_device_ctx ctx, struct VhostUserMsg *pmsg) >> +{ >> + struct vhost_vring_file file; >> + >> + file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; >> + file.fd = pmsg->fds[0]; >> + RTE_LOG(INFO, VHOST_CONFIG, >> + "vring kick idx:%d file:%d\n", file.index, file.fd); >> + ops->set_vring_kick(ctx, &file); >> +} >> diff --git a/lib/librte_vhost/vhost-user/virtio-net-user.h b/lib/librte_vhost/vhost-user/virtio-net-user.h >> new file mode 100644 >> index 0000000..0969376 >> --- /dev/null >> +++ b/lib/librte_vhost/vhost-user/virtio-net-user.h >> @@ -0,0 +1,11 @@ >> +#ifndef _VIRTIO_NET_USER_H >> +#define _VIRTIO_NET_USER_H >> + >> +#include "vhost-net.h" >> +#include "vhost-net-user.h" >> + >> +int user_set_mem_table(struct vhost_device_ctx, struct VhostUserMsg *); >> +void user_set_vring_kick(struct vhost_device_ctx, struct VhostUserMsg *); >> +void user_set_vring_call(struct vhost_device_ctx, struct VhostUserMsg *); >> + >> +#endif >> diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c >> index ccfd82f..8ff0301 100644 >> --- a/lib/librte_vhost/vhost_rxtx.c >> +++ b/lib/librte_vhost/vhost_rxtx.c >> @@ -38,19 +38,14 @@ >> #include <rte_memcpy.h> >> #include <rte_virtio_net.h> >> >> -#include "vhost-net-cdev.h" >> +#include "vhost-net.h" >> >> -#define MAX_PKT_BURST 32 >> +#define VHOST_MAX_PKT_BURST 64 >> +#define VHOST_MAX_MRG_PKT_BURST 64 >> >> -/** >> - * This function adds buffers to the virtio devices RX virtqueue. Buffers can >> - * be received from the physical port or from another virtio device. A packet >> - * count is returned to indicate the number of packets that are succesfully >> - * added to the RX queue. This function works when mergeable is disabled. 
>> - */ >> -static inline uint32_t __attribute__((always_inline)) >> -virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id, >> - struct rte_mbuf **pkts, uint32_t count) >> + >> +uint32_t >> +rte_vhost_enqueue_burst(struct virtio_net *dev, uint16_t queue_id, struct rte_mbuf **pkts, uint32_t count) >> { >> struct vhost_virtqueue *vq; >> struct vring_desc *desc; >> @@ -59,26 +54,23 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id, >> struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0}; >> uint64_t buff_addr = 0; >> uint64_t buff_hdr_addr = 0; >> - uint32_t head[MAX_PKT_BURST], packet_len = 0; >> + uint32_t head[VHOST_MAX_PKT_BURST], packet_len = 0; >> uint32_t head_idx, packet_success = 0; >> + uint32_t mergeable, mrg_count = 0; >> uint16_t avail_idx, res_cur_idx; >> uint16_t res_base_idx, res_end_idx; >> uint16_t free_entries; >> uint8_t success = 0; >> >> - LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh); >> + LOG_DEBUG(VHOST_DATA, "(%"PRIu64") %s()\n", dev->device_fh, __func__); >> if (unlikely(queue_id != VIRTIO_RXQ)) { >> LOG_DEBUG(VHOST_DATA, "mq isn't supported in this version.\n"); >> return 0; >> } >> >> vq = dev->virtqueue[VIRTIO_RXQ]; >> - count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count; >> - >> - /* >> - * As many data cores may want access to available buffers, >> - * they need to be reserved. >> - */ >> + count = (count > VHOST_MAX_PKT_BURST) ? VHOST_MAX_PKT_BURST : count; >> + /* As many data cores may want access to available buffers, they need to be reserved. */ >> do { >> res_base_idx = vq->last_used_idx_res; >> avail_idx = *((volatile uint16_t *)&vq->avail->idx); >> @@ -93,21 +85,25 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id, >> >> res_end_idx = res_base_idx + count; >> /* vq->last_used_idx_res is atomically updated. */ >> - /* TODO: Allow to disable cmpset if no concurrency in application. 
*/ >> + /* TODO: Allow to disable cmpset if no concurrency in application */ >> success = rte_atomic16_cmpset(&vq->last_used_idx_res, >> res_base_idx, res_end_idx); >> + /* If there is contention here and failed, try again. */ >> } while (unlikely(success == 0)); >> res_cur_idx = res_base_idx; >> LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n", >> - dev->device_fh, res_cur_idx, res_end_idx); >> + dev->device_fh, >> + res_cur_idx, res_end_idx); >> >> /* Prefetch available ring to retrieve indexes. */ >> rte_prefetch0(&vq->avail->ring[res_cur_idx & (vq->size - 1)]); >> >> + /* Check if the VIRTIO_NET_F_MRG_RXBUF feature is enabled. */ >> + mergeable = dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF); >> + >> /* Retrieve all of the head indexes first to avoid caching issues. */ >> for (head_idx = 0; head_idx < count; head_idx++) >> - head[head_idx] = vq->avail->ring[(res_cur_idx + head_idx) & >> - (vq->size - 1)]; >> + head[head_idx] = vq->avail->ring[(res_cur_idx + head_idx) & (vq->size - 1)]; >> >> /*Prefetch descriptor index. */ >> rte_prefetch0(&vq->desc[head[packet_success]]); >> @@ -123,46 +119,57 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id, >> /* Prefetch buffer address. */ >> rte_prefetch0((void *)(uintptr_t)buff_addr); >> >> - /* Copy virtio_hdr to packet and increment buffer address */ >> - buff_hdr_addr = buff_addr; >> - packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen; >> - >> - /* >> - * If the descriptors are chained the header and data are >> - * placed in separate buffers. >> - */ >> - if (desc->flags & VRING_DESC_F_NEXT) { >> - desc->len = vq->vhost_hlen; >> - desc = &vq->desc[desc->next]; >> - /* Buffer address translation. 
*/ >> - buff_addr = gpa_to_vva(dev, desc->addr); >> - desc->len = rte_pktmbuf_data_len(buff); >> + if (mergeable && (mrg_count != 0)) { >> + desc->len = packet_len = rte_pktmbuf_data_len(buff); >> } else { >> - buff_addr += vq->vhost_hlen; >> - desc->len = packet_len; >> + /* Copy virtio_hdr to packet and increment buffer address */ >> + buff_hdr_addr = buff_addr; >> + packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen; >> + >> + /* >> + * If the descriptors are chained the header and data are placed in >> + * separate buffers. >> + */ >> + if (desc->flags & VRING_DESC_F_NEXT) { >> + desc->len = vq->vhost_hlen; >> + desc = &vq->desc[desc->next]; >> + /* Buffer address translation. */ >> + buff_addr = gpa_to_vva(dev, desc->addr); >> + desc->len = rte_pktmbuf_data_len(buff); >> + } else { >> + buff_addr += vq->vhost_hlen; >> + desc->len = packet_len; >> + } >> } >> >> + VHOST_PRINT_PACKET(dev, (uintptr_t)buff_addr, rte_pktmbuf_data_len(buff), 0); >> + >> /* Update used ring with desc information */ >> - vq->used->ring[res_cur_idx & (vq->size - 1)].id = >> - head[packet_success]; >> + vq->used->ring[res_cur_idx & (vq->size - 1)].id = head[packet_success]; >> vq->used->ring[res_cur_idx & (vq->size - 1)].len = packet_len; >> >> /* Copy mbuf data to buffer */ >> - /* FIXME for sg mbuf and the case that desc couldn't hold the mbuf data */ >> - rte_memcpy((void *)(uintptr_t)buff_addr, >> - rte_pktmbuf_mtod(buff, const void *), >> - rte_pktmbuf_data_len(buff)); >> - PRINT_PACKET(dev, (uintptr_t)buff_addr, >> - rte_pktmbuf_data_len(buff), 0); >> + /* TODO fixme for sg mbuf and the case that desc couldn't hold the mbuf data */ >> + rte_memcpy((void *)(uintptr_t)buff_addr, (const void *)buff->pkt.data, rte_pktmbuf_data_len(buff)); >> >> res_cur_idx++; >> packet_success++; >> >> - rte_memcpy((void *)(uintptr_t)buff_hdr_addr, >> - (const void *)&virtio_hdr, vq->vhost_hlen); >> - >> - PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1); >> - >> + /* If mergeable 
is disabled then a header is required per buffer. */ >> + if (!mergeable) { >> + rte_memcpy((void *)(uintptr_t)buff_hdr_addr, (const void *)&virtio_hdr, vq->vhost_hlen); >> + VHOST_PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1); >> + } else { >> + mrg_count++; >> + /* Merge buffer can only handle so many buffers at a time. Tell the guest if this limit is reached. */ >> + if ((mrg_count == VHOST_MAX_MRG_PKT_BURST) || (res_cur_idx == res_end_idx)) { >> + virtio_hdr.num_buffers = mrg_count; >> + LOG_DEBUG(VHOST_DATA, "(%"PRIu64") RX: Num merge buffers %d\n", dev->device_fh, virtio_hdr.num_buffers); >> + rte_memcpy((void *)(uintptr_t)buff_hdr_addr, (const void *)&virtio_hdr, vq->vhost_hlen); >> + VHOST_PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1); >> + mrg_count = 0; >> + } >> + } >> if (res_cur_idx < res_end_idx) { >> /* Prefetch descriptor index. */ >> rte_prefetch0(&vq->desc[head[packet_success]]); >> @@ -184,357 +191,18 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id, >> return count; >> } >> >> -static inline uint32_t __attribute__((always_inline)) >> -copy_from_mbuf_to_vring(struct virtio_net *dev, uint16_t res_base_idx, >> - uint16_t res_end_idx, struct rte_mbuf *pkt) >> -{ >> - uint32_t vec_idx = 0; >> - uint32_t entry_success = 0; >> - struct vhost_virtqueue *vq; >> - /* The virtio_hdr is initialised to 0. 
*/ >> - struct virtio_net_hdr_mrg_rxbuf virtio_hdr = { >> - {0, 0, 0, 0, 0, 0}, 0}; >> - uint16_t cur_idx = res_base_idx; >> - uint64_t vb_addr = 0; >> - uint64_t vb_hdr_addr = 0; >> - uint32_t seg_offset = 0; >> - uint32_t vb_offset = 0; >> - uint32_t seg_avail; >> - uint32_t vb_avail; >> - uint32_t cpy_len, entry_len; >> - >> - if (pkt == NULL) >> - return 0; >> - >> - LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| " >> - "End Index %d\n", >> - dev->device_fh, cur_idx, res_end_idx); >> - >> - /* >> - * Convert from gpa to vva >> - * (guest physical addr -> vhost virtual addr) >> - */ >> - vq = dev->virtqueue[VIRTIO_RXQ]; >> - vb_addr = >> - gpa_to_vva(dev, vq->buf_vec[vec_idx].buf_addr); >> - vb_hdr_addr = vb_addr; >> - >> - /* Prefetch buffer address. */ >> - rte_prefetch0((void *)(uintptr_t)vb_addr); >> - >> - virtio_hdr.num_buffers = res_end_idx - res_base_idx; >> - >> - LOG_DEBUG(VHOST_DATA, "(%"PRIu64") RX: Num merge buffers %d\n", >> - dev->device_fh, virtio_hdr.num_buffers); >> >> - rte_memcpy((void *)(uintptr_t)vb_hdr_addr, >> - (const void *)&virtio_hdr, vq->vhost_hlen); >> - >> - PRINT_PACKET(dev, (uintptr_t)vb_hdr_addr, vq->vhost_hlen, 1); >> - >> - seg_avail = rte_pktmbuf_data_len(pkt); >> - vb_offset = vq->vhost_hlen; >> - vb_avail = >> - vq->buf_vec[vec_idx].buf_len - vq->vhost_hlen; >> - >> - entry_len = vq->vhost_hlen; >> - >> - if (vb_avail == 0) { >> - uint32_t desc_idx = >> - vq->buf_vec[vec_idx].desc_idx; >> - vq->desc[desc_idx].len = vq->vhost_hlen; >> - >> - if ((vq->desc[desc_idx].flags >> - & VRING_DESC_F_NEXT) == 0) { >> - /* Update used ring with desc information */ >> - vq->used->ring[cur_idx & (vq->size - 1)].id >> - = vq->buf_vec[vec_idx].desc_idx; >> - vq->used->ring[cur_idx & (vq->size - 1)].len >> - = entry_len; >> - >> - entry_len = 0; >> - cur_idx++; >> - entry_success++; >> - } >> - >> - vec_idx++; >> - vb_addr = >> - gpa_to_vva(dev, vq->buf_vec[vec_idx].buf_addr); >> - >> - /* Prefetch buffer address. 
*/ >> - rte_prefetch0((void *)(uintptr_t)vb_addr); >> - vb_offset = 0; >> - vb_avail = vq->buf_vec[vec_idx].buf_len; >> - } >> - >> - cpy_len = RTE_MIN(vb_avail, seg_avail); >> - >> - while (cpy_len > 0) { >> - /* Copy mbuf data to vring buffer */ >> - rte_memcpy((void *)(uintptr_t)(vb_addr + vb_offset), >> - (const void *)(rte_pktmbuf_mtod(pkt, char*) + seg_offset), >> - cpy_len); >> - >> - PRINT_PACKET(dev, >> - (uintptr_t)(vb_addr + vb_offset), >> - cpy_len, 0); >> - >> - seg_offset += cpy_len; >> - vb_offset += cpy_len; >> - seg_avail -= cpy_len; >> - vb_avail -= cpy_len; >> - entry_len += cpy_len; >> - >> - if (seg_avail != 0) { >> - /* >> - * The virtio buffer in this vring >> - * entry reach to its end. >> - * But the segment doesn't complete. >> - */ >> - if ((vq->desc[vq->buf_vec[vec_idx].desc_idx].flags & >> - VRING_DESC_F_NEXT) == 0) { >> - /* Update used ring with desc information */ >> - vq->used->ring[cur_idx & (vq->size - 1)].id >> - = vq->buf_vec[vec_idx].desc_idx; >> - vq->used->ring[cur_idx & (vq->size - 1)].len >> - = entry_len; >> - entry_len = 0; >> - cur_idx++; >> - entry_success++; >> - } >> - >> - vec_idx++; >> - vb_addr = gpa_to_vva(dev, >> - vq->buf_vec[vec_idx].buf_addr); >> - vb_offset = 0; >> - vb_avail = vq->buf_vec[vec_idx].buf_len; >> - cpy_len = RTE_MIN(vb_avail, seg_avail); >> - } else { >> - /* >> - * This current segment complete, need continue to >> - * check if the whole packet complete or not. >> - */ >> - pkt = pkt->next; >> - if (pkt != NULL) { >> - /* >> - * There are more segments. >> - */ >> - if (vb_avail == 0) { >> - /* >> - * This current buffer from vring is >> - * used up, need fetch next buffer >> - * from buf_vec. 
>> - */ >> - uint32_t desc_idx = >> - vq->buf_vec[vec_idx].desc_idx; >> - vq->desc[desc_idx].len = vb_offset; >> - >> - if ((vq->desc[desc_idx].flags & >> - VRING_DESC_F_NEXT) == 0) { >> - uint16_t wrapped_idx = >> - cur_idx & (vq->size - 1); >> - /* >> - * Update used ring with the >> - * descriptor information >> - */ >> - vq->used->ring[wrapped_idx].id >> - = desc_idx; >> - vq->used->ring[wrapped_idx].len >> - = entry_len; >> - entry_success++; >> - entry_len = 0; >> - cur_idx++; >> - } >> - >> - /* Get next buffer from buf_vec. */ >> - vec_idx++; >> - vb_addr = gpa_to_vva(dev, >> - vq->buf_vec[vec_idx].buf_addr); >> - vb_avail = >> - vq->buf_vec[vec_idx].buf_len; >> - vb_offset = 0; >> - } >> - >> - seg_offset = 0; >> - seg_avail = rte_pktmbuf_data_len(pkt); >> - cpy_len = RTE_MIN(vb_avail, seg_avail); >> - } else { >> - /* >> - * This whole packet completes. >> - */ >> - uint32_t desc_idx = >> - vq->buf_vec[vec_idx].desc_idx; >> - vq->desc[desc_idx].len = vb_offset; >> - >> - while (vq->desc[desc_idx].flags & >> - VRING_DESC_F_NEXT) { >> - desc_idx = vq->desc[desc_idx].next; >> - vq->desc[desc_idx].len = 0; >> - } >> - >> - /* Update used ring with desc information */ >> - vq->used->ring[cur_idx & (vq->size - 1)].id >> - = vq->buf_vec[vec_idx].desc_idx; >> - vq->used->ring[cur_idx & (vq->size - 1)].len >> - = entry_len; >> - entry_len = 0; >> - cur_idx++; >> - entry_success++; >> - seg_avail = 0; >> - cpy_len = RTE_MIN(vb_avail, seg_avail); >> - } >> - } >> - } >> - >> - return entry_success; >> -} >> - >> -/* >> - * This function works for mergeable RX. 
>> - */ >> -static inline uint32_t __attribute__((always_inline)) >> -virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id, >> - struct rte_mbuf **pkts, uint32_t count) >> +uint32_t >> +rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id, struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint32_t count) >> { >> - struct vhost_virtqueue *vq; >> - uint32_t pkt_idx = 0, entry_success = 0; >> - uint16_t avail_idx, res_cur_idx; >> - uint16_t res_base_idx, res_end_idx; >> - uint8_t success = 0; >> - >> - LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_merge_rx()\n", >> - dev->device_fh); >> - if (unlikely(queue_id != VIRTIO_RXQ)) { >> - LOG_DEBUG(VHOST_DATA, "mq isn't supported in this version.\n"); >> - } >> - >> - vq = dev->virtqueue[VIRTIO_RXQ]; >> - count = RTE_MIN((uint32_t)MAX_PKT_BURST, count); >> - >> - if (count == 0) >> - return 0; >> - >> - for (pkt_idx = 0; pkt_idx < count; pkt_idx++) { >> - uint32_t secure_len = 0; >> - uint16_t need_cnt; >> - uint32_t vec_idx = 0; >> - uint32_t pkt_len = pkts[pkt_idx]->pkt_len + vq->vhost_hlen; >> - uint16_t i, id; >> - >> - do { >> - /* >> - * As many data cores may want access to available >> - * buffers, they need to be reserved. 
>> - */ >> - res_base_idx = vq->last_used_idx_res; >> - res_cur_idx = res_base_idx; >> - >> - do { >> - avail_idx = *((volatile uint16_t *)&vq->avail->idx); >> - if (unlikely(res_cur_idx == avail_idx)) { >> - LOG_DEBUG(VHOST_DATA, >> - "(%"PRIu64") Failed " >> - "to get enough desc from " >> - "vring\n", >> - dev->device_fh); >> - return pkt_idx; >> - } else { >> - uint16_t wrapped_idx = >> - (res_cur_idx) & (vq->size - 1); >> - uint32_t idx = >> - vq->avail->ring[wrapped_idx]; >> - uint8_t next_desc; >> - >> - do { >> - next_desc = 0; >> - secure_len += vq->desc[idx].len; >> - if (vq->desc[idx].flags & >> - VRING_DESC_F_NEXT) { >> - idx = vq->desc[idx].next; >> - next_desc = 1; >> - } >> - } while (next_desc); >> - >> - res_cur_idx++; >> - } >> - } while (pkt_len > secure_len); >> - >> - /* vq->last_used_idx_res is atomically updated. */ >> - success = rte_atomic16_cmpset(&vq->last_used_idx_res, >> - res_base_idx, >> - res_cur_idx); >> - } while (success == 0); >> - >> - id = res_base_idx; >> - need_cnt = res_cur_idx - res_base_idx; >> - >> - for (i = 0; i < need_cnt; i++, id++) { >> - uint16_t wrapped_idx = id & (vq->size - 1); >> - uint32_t idx = vq->avail->ring[wrapped_idx]; >> - uint8_t next_desc; >> - do { >> - next_desc = 0; >> - vq->buf_vec[vec_idx].buf_addr = >> - vq->desc[idx].addr; >> - vq->buf_vec[vec_idx].buf_len = >> - vq->desc[idx].len; >> - vq->buf_vec[vec_idx].desc_idx = idx; >> - vec_idx++; >> - >> - if (vq->desc[idx].flags & VRING_DESC_F_NEXT) { >> - idx = vq->desc[idx].next; >> - next_desc = 1; >> - } >> - } while (next_desc); >> - } >> - >> - res_end_idx = res_cur_idx; >> - >> - entry_success = copy_from_mbuf_to_vring(dev, res_base_idx, >> - res_end_idx, pkts[pkt_idx]); >> - >> - rte_compiler_barrier(); >> - >> - /* >> - * Wait until it's our turn to add our buffer >> - * to the used ring. 
>> - */ >> - while (unlikely(vq->last_used_idx != res_base_idx)) >> - rte_pause(); >> - >> - *(volatile uint16_t *)&vq->used->idx += entry_success; >> - vq->last_used_idx = res_end_idx; >> - >> - /* Kick the guest if necessary. */ >> - if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) >> - eventfd_write((int)vq->kickfd, 1); >> - } >> - >> - return count; >> -} >> - >> -uint16_t >> -rte_vhost_enqueue_burst(struct virtio_net *dev, uint16_t queue_id, >> - struct rte_mbuf **pkts, uint16_t count) >> -{ >> - if (unlikely(dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF))) >> - return virtio_dev_merge_rx(dev, queue_id, pkts, count); >> - else >> - return virtio_dev_rx(dev, queue_id, pkts, count); >> -} >> - >> -uint16_t >> -rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id, >> - struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count) >> -{ >> - struct rte_mbuf *m, *prev; >> + struct rte_mbuf *mbuf; >> struct vhost_virtqueue *vq; >> struct vring_desc *desc; >> - uint64_t vb_addr = 0; >> - uint32_t head[MAX_PKT_BURST]; >> + uint64_t buff_addr = 0; >> + uint32_t head[VHOST_MAX_PKT_BURST]; >> uint32_t used_idx; >> uint32_t i; >> - uint16_t free_entries, entry_success = 0; >> + uint16_t free_entries, packet_success = 0; >> uint16_t avail_idx; >> >> if (unlikely(queue_id != VIRTIO_TXQ)) { >> @@ -549,8 +217,8 @@ rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id, >> if (vq->last_used_idx == avail_idx) >> return 0; >> >> - LOG_DEBUG(VHOST_DATA, "%s (%"PRIu64")\n", __func__, >> - dev->device_fh); >> + LOG_DEBUG(VHOST_DATA, "(%"PRIu64") %s(%d->%d)\n", >> + dev->device_fh, __func__, vq->last_used_idx, avail_idx); >> >> /* Prefetch available ring to retrieve head indexes. 
*/ >> rte_prefetch0(&vq->avail->ring[vq->last_used_idx & (vq->size - 1)]); >> @@ -558,173 +226,68 @@ rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id, >> /*get the number of free entries in the ring*/ >> free_entries = (avail_idx - vq->last_used_idx); >> >> - free_entries = RTE_MIN(free_entries, count); >> + if (free_entries > count) >> + free_entries = count; >> /* Limit to MAX_PKT_BURST. */ >> - free_entries = RTE_MIN(free_entries, MAX_PKT_BURST); >> + if (free_entries > VHOST_MAX_PKT_BURST) >> + free_entries = VHOST_MAX_PKT_BURST; >> >> - LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n", >> - dev->device_fh, free_entries); >> + LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n", dev->device_fh, free_entries); >> /* Retrieve all of the head indexes first to avoid caching issues. */ >> for (i = 0; i < free_entries; i++) >> head[i] = vq->avail->ring[(vq->last_used_idx + i) & (vq->size - 1)]; >> >> /* Prefetch descriptor index. */ >> - rte_prefetch0(&vq->desc[head[entry_success]]); >> + rte_prefetch0(&vq->desc[head[packet_success]]); >> rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]); >> >> - while (entry_success < free_entries) { >> - uint32_t vb_avail, vb_offset; >> - uint32_t seg_avail, seg_offset; >> - uint32_t cpy_len; >> - uint32_t seg_num = 0; >> - struct rte_mbuf *cur; >> - uint8_t alloc_err = 0; >> - >> - desc = &vq->desc[head[entry_success]]; >> + while (packet_success < free_entries) { >> + desc = &vq->desc[head[packet_success]]; >> >> /* Discard first buffer as it is the virtio header */ >> desc = &vq->desc[desc->next]; >> >> /* Buffer address translation. */ >> - vb_addr = gpa_to_vva(dev, desc->addr); >> + buff_addr = gpa_to_vva(dev, desc->addr); >> /* Prefetch buffer address. 
*/ >> - rte_prefetch0((void *)(uintptr_t)vb_addr); >> + rte_prefetch0((void *)(uintptr_t)buff_addr); >> >> used_idx = vq->last_used_idx & (vq->size - 1); >> >> - if (entry_success < (free_entries - 1)) { >> + if (packet_success < (free_entries - 1)) { >> /* Prefetch descriptor index. */ >> - rte_prefetch0(&vq->desc[head[entry_success+1]]); >> + rte_prefetch0(&vq->desc[head[packet_success+1]]); >> rte_prefetch0(&vq->used->ring[(used_idx + 1) & (vq->size - 1)]); >> } >> >> /* Update used index buffer information. */ >> - vq->used->ring[used_idx].id = head[entry_success]; >> + vq->used->ring[used_idx].id = head[packet_success]; >> vq->used->ring[used_idx].len = 0; >> >> - vb_offset = 0; >> - vb_avail = desc->len; >> - /* Allocate an mbuf and populate the structure. */ >> - m = rte_pktmbuf_alloc(mbuf_pool); >> - if (unlikely(m == NULL)) { >> - RTE_LOG(ERR, VHOST_DATA, >> - "Failed to allocate memory for mbuf.\n"); >> - return entry_success; >> + mbuf = rte_pktmbuf_alloc(mbuf_pool); >> + if (unlikely(mbuf == NULL)) { >> + RTE_LOG(ERR, VHOST_DATA, "Failed to allocate memory for mbuf.\n"); >> + return packet_success; >> } >> - seg_offset = 0; >> - seg_avail = m->buf_len - RTE_PKTMBUF_HEADROOM; >> - cpy_len = RTE_MIN(vb_avail, seg_avail); >> - >> - PRINT_PACKET(dev, (uintptr_t)vb_addr, desc->len, 0); >> - >> - seg_num++; >> - cur = m; >> - prev = m; >> - while (cpy_len != 0) { >> - rte_memcpy((void *)(rte_pktmbuf_mtod(cur, char *) + seg_offset), >> - (void *)((uintptr_t)(vb_addr + vb_offset)), >> - cpy_len); >> - >> - seg_offset += cpy_len; >> - vb_offset += cpy_len; >> - vb_avail -= cpy_len; >> - seg_avail -= cpy_len; >> - >> - if (vb_avail != 0) { >> - /* >> - * The segment reachs to its end, >> - * while the virtio buffer in TX vring has >> - * more data to be copied. >> - */ >> - cur->data_len = seg_offset; >> - m->pkt_len += seg_offset; >> - /* Allocate mbuf and populate the structure. 
*/ >> - cur = rte_pktmbuf_alloc(mbuf_pool); >> - if (unlikely(cur == NULL)) { >> - RTE_LOG(ERR, VHOST_DATA, "Failed to " >> - "allocate memory for mbuf.\n"); >> - rte_pktmbuf_free(m); >> - alloc_err = 1; >> - break; >> - } >> - >> - seg_num++; >> - prev->next = cur; >> - prev = cur; >> - seg_offset = 0; >> - seg_avail = cur->buf_len - RTE_PKTMBUF_HEADROOM; >> - } else { >> - if (desc->flags & VRING_DESC_F_NEXT) { >> - /* >> - * There are more virtio buffers in >> - * same vring entry need to be copied. >> - */ >> - if (seg_avail == 0) { >> - /* >> - * The current segment hasn't >> - * room to accomodate more >> - * data. >> - */ >> - cur->data_len = seg_offset; >> - m->pkt_len += seg_offset; >> - /* >> - * Allocate an mbuf and >> - * populate the structure. >> - */ >> - cur = rte_pktmbuf_alloc(mbuf_pool); >> - if (unlikely(cur == NULL)) { >> - RTE_LOG(ERR, >> - VHOST_DATA, >> - "Failed to " >> - "allocate memory " >> - "for mbuf\n"); >> - rte_pktmbuf_free(m); >> - alloc_err = 1; >> - break; >> - } >> - seg_num++; >> - prev->next = cur; >> - prev = cur; >> - seg_offset = 0; >> - seg_avail = cur->buf_len - RTE_PKTMBUF_HEADROOM; >> - } >> - >> - desc = &vq->desc[desc->next]; >> - >> - /* Buffer address translation. */ >> - vb_addr = gpa_to_vva(dev, desc->addr); >> - /* Prefetch buffer address. */ >> - rte_prefetch0((void *)(uintptr_t)vb_addr); >> - vb_offset = 0; >> - vb_avail = desc->len; >> - >> - PRINT_PACKET(dev, (uintptr_t)vb_addr, >> - desc->len, 0); >> - } else { >> - /* The whole packet completes. 
*/ >> - cur->data_len = seg_offset; >> - m->pkt_len += seg_offset; >> - vb_avail = 0; >> - } >> - } >> + mbuf->pkt.data_len = desc->len; >> + mbuf->pkt.pkt_len = mbuf->pkt.data_len; >> >> - cpy_len = RTE_MIN(vb_avail, seg_avail); >> - } >> + rte_memcpy((void *) mbuf->pkt.data, >> + (const void *) buff_addr, mbuf->pkt.data_len); >> >> - if (unlikely(alloc_err == 1)) >> - break; >> + pkts[packet_success] = mbuf; >> >> - m->nb_segs = seg_num; >> + VHOST_PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0); >> >> - pkts[entry_success] = m; >> vq->last_used_idx++; >> - entry_success++; >> + packet_success++; >> } >> >> rte_compiler_barrier(); >> - vq->used->idx += entry_success; >> + vq->used->idx += packet_success; >> /* Kick guest if required. */ >> if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) >> eventfd_write((int)vq->kickfd, 1); >> - return entry_success; >> + >> + return packet_success; >> } >> diff --git a/lib/librte_vhost/virtio-net.c b/lib/librte_vhost/virtio-net.c >> index 852b6d1..516e743 100644 >> --- a/lib/librte_vhost/virtio-net.c >> +++ b/lib/librte_vhost/virtio-net.c >> @@ -31,17 +31,14 @@ >> * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. >> */ >> >> -#include <dirent.h> >> -#include <fuse/cuse_lowlevel.h> >> #include <linux/vhost.h> >> #include <linux/virtio_net.h> >> #include <stddef.h> >> #include <stdint.h> >> #include <stdlib.h> >> -#include <sys/eventfd.h> >> -#include <sys/ioctl.h> >> #include <sys/mman.h> >> #include <unistd.h> >> +#include <assert.h> >> >> #include <rte_ethdev.h> >> #include <rte_log.h> >> @@ -49,10 +46,8 @@ >> #include <rte_memory.h> >> #include <rte_virtio_net.h> >> >> -#include "vhost-net-cdev.h" >> -#include "eventfd_link/eventfd_link.h" >> - >> -/* >> +#include "vhost-net.h" >> +/** >> * Device linked list structure for configuration. 
>> */ >> struct virtio_net_config_ll { >> @@ -60,38 +55,15 @@ struct virtio_net_config_ll { >> struct virtio_net_config_ll *next; /* Next dev on linked list.*/ >> }; >> >> -const char eventfd_cdev[] = "/dev/eventfd-link"; >> - >> -/* device ops to add/remove device to/from data core. */ >> +/* device ops to add/remove device to data core. */ >> static struct virtio_net_device_ops const *notify_ops; >> -/* root address of the linked list of managed virtio devices */ >> +/* root address of the linked list in the configuration core. */ >> static struct virtio_net_config_ll *ll_root; >> >> /* Features supported by this lib. */ >> -#define VHOST_SUPPORTED_FEATURES ((1ULL << VIRTIO_NET_F_MRG_RXBUF) | \ >> - (1ULL << VIRTIO_NET_F_CTRL_RX)) >> +#define VHOST_SUPPORTED_FEATURES (1ULL << VIRTIO_NET_F_MRG_RXBUF) >> static uint64_t VHOST_FEATURES = VHOST_SUPPORTED_FEATURES; >> >> -/* Line size for reading maps file. */ >> -static const uint32_t BUFSIZE = PATH_MAX; >> - >> -/* Size of prot char array in procmap. */ >> -#define PROT_SZ 5 >> - >> -/* Number of elements in procmap struct. */ >> -#define PROCMAP_SZ 8 >> - >> -/* Structure containing information gathered from maps file. */ >> -struct procmap { >> - uint64_t va_start; /* Start virtual address in file. */ >> - uint64_t len; /* Size of file. */ >> - uint64_t pgoff; /* Not used. */ >> - uint32_t maj; /* Not used. */ >> - uint32_t min; /* Not used. */ >> - uint32_t ino; /* Not used. */ >> - char prot[PROT_SZ]; /* Not used. */ >> - char fname[PATH_MAX]; /* File name. */ >> -}; >> >> /* >> * Converts QEMU virtual address to Vhost virtual address. 
This function is >> @@ -110,199 +82,15 @@ qva_to_vva(struct virtio_net *dev, uint64_t qemu_va) >> if ((qemu_va >= region->userspace_address) && >> (qemu_va <= region->userspace_address + >> region->memory_size)) { >> - vhost_va = dev->mem->mapped_address + qemu_va - >> - dev->mem->base_address; >> + vhost_va = qemu_va + region->guest_phys_address + >> + region->address_offset - >> + region->userspace_address; >> break; >> } >> } >> return vhost_va; >> } >> >> -/* >> - * Locate the file containing QEMU's memory space and >> - * map it to our address space. >> - */ >> -static int >> -host_memory_map(struct virtio_net *dev, struct virtio_memory *mem, >> - pid_t pid, uint64_t addr) >> -{ >> - struct dirent *dptr = NULL; >> - struct procmap procmap; >> - DIR *dp = NULL; >> - int fd; >> - int i; >> - char memfile[PATH_MAX]; >> - char mapfile[PATH_MAX]; >> - char procdir[PATH_MAX]; >> - char resolved_path[PATH_MAX]; >> - char *path = NULL; >> - FILE *fmap; >> - void *map; >> - uint8_t found = 0; >> - char line[BUFSIZE]; >> - char dlm[] = "- : "; >> - char *str, *sp, *in[PROCMAP_SZ]; >> - char *end = NULL; >> - >> - /* Path where mem files are located. */ >> - snprintf(procdir, PATH_MAX, "/proc/%u/fd/", pid); >> - /* Maps file used to locate mem file. */ >> - snprintf(mapfile, PATH_MAX, "/proc/%u/maps", pid); >> - >> - fmap = fopen(mapfile, "r"); >> - if (fmap == NULL) { >> - RTE_LOG(ERR, VHOST_CONFIG, >> - "(%"PRIu64") Failed to open maps file for pid %d\n", >> - dev->device_fh, pid); >> - return -1; >> - } >> - >> - /* Read through maps file until we find out base_address. */ >> - while (fgets(line, BUFSIZE, fmap) != 0) { >> - str = line; >> - errno = 0; >> - /* Split line into fields. */ >> - for (i = 0; i < PROCMAP_SZ; i++) { >> - in[i] = strtok_r(str, &dlm[i], &sp); >> - if ((in[i] == NULL) || (errno != 0)) { >> - fclose(fmap); >> - return -1; >> - } >> - str = NULL; >> - } >> - >> - /* Convert/Copy each field as needed. 
*/ >> - procmap.va_start = strtoull(in[0], &end, 16); >> - if ((in[0] == '\0') || (end == NULL) || (*end != '\0') || >> - (errno != 0)) { >> - fclose(fmap); >> - return -1; >> - } >> - >> - procmap.len = strtoull(in[1], &end, 16); >> - if ((in[1] == '\0') || (end == NULL) || (*end != '\0') || >> - (errno != 0)) { >> - fclose(fmap); >> - return -1; >> - } >> - >> - procmap.pgoff = strtoull(in[3], &end, 16); >> - if ((in[3] == '\0') || (end == NULL) || (*end != '\0') || >> - (errno != 0)) { >> - fclose(fmap); >> - return -1; >> - } >> - >> - procmap.maj = strtoul(in[4], &end, 16); >> - if ((in[4] == '\0') || (end == NULL) || (*end != '\0') || >> - (errno != 0)) { >> - fclose(fmap); >> - return -1; >> - } >> - >> - procmap.min = strtoul(in[5], &end, 16); >> - if ((in[5] == '\0') || (end == NULL) || (*end != '\0') || >> - (errno != 0)) { >> - fclose(fmap); >> - return -1; >> - } >> - >> - procmap.ino = strtoul(in[6], &end, 16); >> - if ((in[6] == '\0') || (end == NULL) || (*end != '\0') || >> - (errno != 0)) { >> - fclose(fmap); >> - return -1; >> - } >> - >> - memcpy(&procmap.prot, in[2], PROT_SZ); >> - memcpy(&procmap.fname, in[7], PATH_MAX); >> - >> - if (procmap.va_start == addr) { >> - procmap.len = procmap.len - procmap.va_start; >> - found = 1; >> - break; >> - } >> - } >> - fclose(fmap); >> - >> - if (!found) { >> - RTE_LOG(ERR, VHOST_CONFIG, >> - "(%"PRIu64") Failed to find memory file in pid %d maps file\n", >> - dev->device_fh, pid); >> - return -1; >> - } >> - >> - /* Find the guest memory file among the process fds. */ >> - dp = opendir(procdir); >> - if (dp == NULL) { >> - RTE_LOG(ERR, VHOST_CONFIG, >> - "(%"PRIu64") Cannot open pid %d process directory\n", >> - dev->device_fh, pid); >> - return -1; >> - } >> - >> - found = 0; >> - >> - /* Read the fd directory contents. 
*/ >> - while (NULL != (dptr = readdir(dp))) { >> - snprintf(memfile, PATH_MAX, "/proc/%u/fd/%s", >> - pid, dptr->d_name); >> - path = realpath(memfile, resolved_path); >> - if ((path == NULL) && (strlen(resolved_path) == 0)) { >> - RTE_LOG(ERR, VHOST_CONFIG, >> - "(%"PRIu64") Failed to resolve fd directory\n", >> - dev->device_fh); >> - closedir(dp); >> - return -1; >> - } >> - if (strncmp(resolved_path, procmap.fname, >> - strnlen(procmap.fname, PATH_MAX)) == 0) { >> - found = 1; >> - break; >> - } >> - } >> - >> - closedir(dp); >> - >> - if (found == 0) { >> - RTE_LOG(ERR, VHOST_CONFIG, >> - "(%"PRIu64") Failed to find memory file for pid %d\n", >> - dev->device_fh, pid); >> - return -1; >> - } >> - /* Open the shared memory file and map the memory into this process. */ >> - fd = open(memfile, O_RDWR); >> - >> - if (fd == -1) { >> - RTE_LOG(ERR, VHOST_CONFIG, >> - "(%"PRIu64") Failed to open %s for pid %d\n", >> - dev->device_fh, memfile, pid); >> - return -1; >> - } >> - >> - map = mmap(0, (size_t)procmap.len, PROT_READ|PROT_WRITE, >> - MAP_POPULATE|MAP_SHARED, fd, 0); >> - close(fd); >> - >> - if (map == MAP_FAILED) { >> - RTE_LOG(ERR, VHOST_CONFIG, >> - "(%"PRIu64") Error mapping the file %s for pid %d\n", >> - dev->device_fh, memfile, pid); >> - return -1; >> - } >> - >> - /* Store the memory address and size in the device data structure */ >> - mem->mapped_address = (uint64_t)(uintptr_t)map; >> - mem->mapped_size = procmap.len; >> - >> - LOG_DEBUG(VHOST_CONFIG, >> - "(%"PRIu64") Mem File: %s->%s - Size: %llu - VA: %p\n", >> - dev->device_fh, >> - memfile, resolved_path, >> - (unsigned long long)mem->mapped_size, map); >> - >> - return 0; >> -} >> >> /* >> * Retrieves an entry from the devices configuration linked list. 
>> @@ -376,7 +164,7 @@ add_config_ll_entry(struct virtio_net_config_ll *new_ll_dev) >> } >> >> } >> - >> +/*TODO dpdk alloc/free if possible */ >> /* >> * Unmap any memory, close any file descriptors and >> * free any memory owned by a device. >> @@ -389,16 +177,17 @@ cleanup_device(struct virtio_net *dev) >> munmap((void *)(uintptr_t)dev->mem->mapped_address, >> (size_t)dev->mem->mapped_size); >> free(dev->mem); >> + dev->mem = NULL; >> } >> >> /* Close any event notifiers opened by device. */ >> - if (dev->virtqueue[VIRTIO_RXQ]->callfd) >> + if (dev->virtqueue[VIRTIO_RXQ]->callfd > 0) >> close((int)dev->virtqueue[VIRTIO_RXQ]->callfd); >> - if (dev->virtqueue[VIRTIO_RXQ]->kickfd) >> + if (dev->virtqueue[VIRTIO_RXQ]->kickfd > 0) >> close((int)dev->virtqueue[VIRTIO_RXQ]->kickfd); >> - if (dev->virtqueue[VIRTIO_TXQ]->callfd) >> + if (dev->virtqueue[VIRTIO_TXQ]->callfd > 0) >> close((int)dev->virtqueue[VIRTIO_TXQ]->callfd); >> - if (dev->virtqueue[VIRTIO_TXQ]->kickfd) >> + if (dev->virtqueue[VIRTIO_TXQ]->kickfd > 0) >> close((int)dev->virtqueue[VIRTIO_TXQ]->kickfd); >> } >> >> @@ -522,8 +311,8 @@ new_device(struct vhost_device_ctx ctx) >> } >> >> /* >> - * Function is called from the CUSE release function. This function will >> - * cleanup the device and remove it from device configuration linked list. >> + * Function is called from the CUSE release function. This function will cleanup >> + * the device and remove it from device configuration linked list. >> */ >> static void >> destroy_device(struct vhost_device_ctx ctx) >> @@ -569,6 +358,7 @@ set_owner(struct vhost_device_ctx ctx) >> return -1; >> >> return 0; >> + /* TODO check ctx.fh is meaningfull here */ >> } >> >> /* >> @@ -651,14 +441,12 @@ set_features(struct vhost_device_ctx ctx, uint64_t *pu) >> * This includes storing offsets used to translate buffer addresses. 
>> */ >> static int >> -set_mem_table(struct vhost_device_ctx ctx, const void *mem_regions_addr, >> - uint32_t nregions) >> +set_mem_table(struct vhost_device_ctx ctx, >> + const struct virtio_memory_regions *regions, uint32_t nregions) >> { >> struct virtio_net *dev; >> - struct vhost_memory_region *mem_regions; >> struct virtio_memory *mem; >> - uint64_t size = offsetof(struct vhost_memory, regions); >> - uint32_t regionidx, valid_regions; >> + uint32_t regionidx; >> >> dev = get_device(ctx); >> if (dev == NULL) >> @@ -682,107 +470,24 @@ set_mem_table(struct vhost_device_ctx ctx, const void *mem_regions_addr, >> >> mem->nregions = nregions; >> >> - mem_regions = (void *)(uintptr_t) >> - ((uint64_t)(uintptr_t)mem_regions_addr + size); >> - >> for (regionidx = 0; regionidx < mem->nregions; regionidx++) { >> /* Populate the region structure for each region. */ >> - mem->regions[regionidx].guest_phys_address = >> - mem_regions[regionidx].guest_phys_addr; >> - mem->regions[regionidx].guest_phys_address_end = >> - mem->regions[regionidx].guest_phys_address + >> - mem_regions[regionidx].memory_size; >> - mem->regions[regionidx].memory_size = >> - mem_regions[regionidx].memory_size; >> - mem->regions[regionidx].userspace_address = >> - mem_regions[regionidx].userspace_addr; >> - >> - LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") REGION: %u - GPA: %p - QEMU VA: %p - SIZE (%"PRIu64")\n", dev->device_fh, >> - regionidx, >> - (void *)(uintptr_t)mem->regions[regionidx].guest_phys_address, >> - (void *)(uintptr_t)mem->regions[regionidx].userspace_address, >> - mem->regions[regionidx].memory_size); >> - >> - /*set the base address mapping*/ >> + mem->regions[regionidx] = regions[regionidx]; >> if (mem->regions[regionidx].guest_phys_address == 0x0) { >> mem->base_address = >> mem->regions[regionidx].userspace_address; >> - /* Map VM memory file */ >> - if (host_memory_map(dev, mem, ctx.pid, >> - mem->base_address) != 0) { >> - free(mem); >> - return -1; >> - } >> + mem->mapped_address = 
>> + mem->regions[regionidx].address_offset; >> } >> } >> >> - /* Check that we have a valid base address. */ >> - if (mem->base_address == 0) { >> - RTE_LOG(ERR, VHOST_CONFIG, "(%"PRIu64") Failed to find base address of qemu memory file.\n", dev->device_fh); >> - free(mem); >> - return -1; >> - } >> - >> - /* >> - * Check if all of our regions have valid mappings. >> - * Usually one does not exist in the QEMU memory file. >> - */ >> - valid_regions = mem->nregions; >> - for (regionidx = 0; regionidx < mem->nregions; regionidx++) { >> - if ((mem->regions[regionidx].userspace_address < >> - mem->base_address) || >> - (mem->regions[regionidx].userspace_address > >> - (mem->base_address + mem->mapped_size))) >> - valid_regions--; >> - } >> - >> - /* >> - * If a region does not have a valid mapping, >> - * we rebuild our memory struct to contain only valid entries. >> - */ >> - if (valid_regions != mem->nregions) { >> - LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") Not all memory regions exist in the QEMU mem file. Re-populating mem structure\n", >> - dev->device_fh); >> - >> - /* >> - * Re-populate the memory structure with only valid regions. >> - * Invalid regions are over-written with memmove. >> - */ >> - valid_regions = 0; >> - >> - for (regionidx = mem->nregions; 0 != regionidx--;) { >> - if ((mem->regions[regionidx].userspace_address < >> - mem->base_address) || >> - (mem->regions[regionidx].userspace_address > >> - (mem->base_address + mem->mapped_size))) { >> - memmove(&mem->regions[regionidx], >> - &mem->regions[regionidx + 1], >> - sizeof(struct virtio_memory_regions) * >> - valid_regions); >> - } else { >> - valid_regions++; >> - } >> - } >> - } >> - mem->nregions = valid_regions; >> + /*TODO addback the logic that remove invalid memory regions */ >> dev->mem = mem; >> >> - /* >> - * Calculate the address offset for each region. >> - * This offset is used to identify the vhost virtual address >> - * corresponding to a QEMU guest physical address. 
>> - */ >> - for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) { >> - dev->mem->regions[regionidx].address_offset = >> - dev->mem->regions[regionidx].userspace_address - >> - dev->mem->base_address + >> - dev->mem->mapped_address - >> - dev->mem->regions[regionidx].guest_phys_address; >> - >> - } >> return 0; >> } >> >> + >> /* >> * Called from CUSE IOCTL: VHOST_SET_VRING_NUM >> * The virtio device sends us the size of the descriptor ring. >> @@ -896,38 +601,62 @@ get_vring_base(struct vhost_device_ctx ctx, uint32_t index, >> /* State->index refers to the queue index. The txq is 1, rxq is 0. */ >> state->num = dev->virtqueue[state->index]->last_used_idx; >> >> - return 0; >> -} >> + if (dev->flags & VIRTIO_DEV_RUNNING) { >> + RTE_LOG(INFO, VHOST_CONFIG, >> + "get_vring_base message is for release\n"); >> + notify_ops->destroy_device(dev); >> + /* >> + * sync call. >> + * when it returns, it means it si removed from data core. >> + */ >> + } >> + /* TODO fix all munmap */ >> + if (dev->mem) { >> + munmap((void *)(uintptr_t)dev->mem->mapped_address, >> + (size_t)dev->mem->mapped_size); >> + free(dev->mem); >> + dev->mem = NULL; >> + } >> >> -/* >> - * This function uses the eventfd_link kernel module to copy an eventfd file >> - * descriptor provided by QEMU in to our process space. >> - */ >> -static int >> -eventfd_copy(struct virtio_net *dev, struct eventfd_copy *eventfd_copy) >> -{ >> - int eventfd_link, ret; >> >> - /* Open the character device to the kernel module. 
*/ >> - eventfd_link = open(eventfd_cdev, O_RDWR); >> - if (eventfd_link < 0) { >> - RTE_LOG(ERR, VHOST_CONFIG, >> - "(%"PRIu64") eventfd_link module is not loaded\n", >> - dev->device_fh); >> - return -1; >> - } >> + if (dev->virtqueue[VIRTIO_RXQ]->callfd > 0) >> + close((int)dev->virtqueue[VIRTIO_RXQ]->callfd); >> + dev->virtqueue[VIRTIO_RXQ]->callfd = -1; >> + if (dev->virtqueue[VIRTIO_TXQ]->callfd > 0) >> + close((int)dev->virtqueue[VIRTIO_TXQ]->callfd); >> + dev->virtqueue[VIRTIO_TXQ]->callfd = -1; >> + /* We don't cleanup callfd here as we willn't get CALLFD again */ >> + >> + dev->virtqueue[VIRTIO_RXQ]->desc = NULL; >> + dev->virtqueue[VIRTIO_RXQ]->avail = NULL; >> + dev->virtqueue[VIRTIO_RXQ]->used = NULL; >> + dev->virtqueue[VIRTIO_RXQ]->last_used_idx = 0; >> + dev->virtqueue[VIRTIO_RXQ]->last_used_idx_res = 0; >> + >> + dev->virtqueue[VIRTIO_TXQ]->desc = NULL; >> + dev->virtqueue[VIRTIO_TXQ]->avail = NULL; >> + dev->virtqueue[VIRTIO_TXQ]->used = NULL; >> + dev->virtqueue[VIRTIO_TXQ]->last_used_idx = 0; >> + dev->virtqueue[VIRTIO_TXQ]->last_used_idx_res = 0; >> >> - /* Call the IOCTL to copy the eventfd. 
*/ >> - ret = ioctl(eventfd_link, EVENTFD_COPY, eventfd_copy); >> - close(eventfd_link); >> >> - if (ret < 0) { >> - RTE_LOG(ERR, VHOST_CONFIG, >> - "(%"PRIu64") EVENTFD_COPY ioctl failed\n", >> - dev->device_fh); >> - return -1; >> - } >> + return 0; >> +} >> >> +static int >> +virtio_is_ready(struct virtio_net *dev, int index) >> +{ >> + struct vhost_virtqueue *vq1, *vq2; >> + /* mq support in future.*/ >> + vq1 = dev->virtqueue[index]; >> + vq2 = dev->virtqueue[index ^ 1]; >> + if (vq1 && vq2 && vq1->desc && vq2->desc && >> + (vq1->kickfd > 0) && (vq1->callfd > 0) && >> + (vq2->kickfd > 0) && (vq2->callfd > 0)) { >> + LOG_DEBUG(VHOST_CONFIG, "virtio is ready for processing.\n"); >> + return 1; >> + } >> + LOG_DEBUG(VHOST_CONFIG, "virtio isn't ready for processing.\n"); >> return 0; >> } >> >> @@ -940,7 +669,6 @@ static int >> set_vring_call(struct vhost_device_ctx ctx, struct vhost_vring_file *file) >> { >> struct virtio_net *dev; >> - struct eventfd_copy eventfd_kick; >> struct vhost_virtqueue *vq; >> >> dev = get_device(ctx); >> @@ -953,14 +681,7 @@ set_vring_call(struct vhost_device_ctx ctx, struct vhost_vring_file *file) >> if (vq->kickfd) >> close((int)vq->kickfd); >> >> - /* Populate the eventfd_copy structure and call eventfd_copy. 
*/ >> - vq->kickfd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC); >> - eventfd_kick.source_fd = vq->kickfd; >> - eventfd_kick.target_fd = file->fd; >> - eventfd_kick.target_pid = ctx.pid; >> - >> - if (eventfd_copy(dev, &eventfd_kick)) >> - return -1; >> + vq->kickfd = file->fd; >> >> return 0; >> } >> @@ -974,7 +695,6 @@ static int >> set_vring_kick(struct vhost_device_ctx ctx, struct vhost_vring_file *file) >> { >> struct virtio_net *dev; >> - struct eventfd_copy eventfd_call; >> struct vhost_virtqueue *vq; >> >> dev = get_device(ctx); >> @@ -986,16 +706,11 @@ set_vring_kick(struct vhost_device_ctx ctx, struct vhost_vring_file *file) >> >> if (vq->callfd) >> close((int)vq->callfd); >> + vq->callfd = file->fd; >> >> - /* Populate the eventfd_copy structure and call eventfd_copy. */ >> - vq->callfd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC); >> - eventfd_call.source_fd = vq->callfd; >> - eventfd_call.target_fd = file->fd; >> - eventfd_call.target_pid = ctx.pid; >> - >> - if (eventfd_copy(dev, &eventfd_call)) >> - return -1; >> - >> + if (virtio_is_ready(dev, file->index) && >> + !(dev->flags & VIRTIO_DEV_RUNNING)) >> + notify_ops->new_device(dev); >> return 0; >> } >> >> @@ -1024,6 +739,7 @@ set_backend(struct vhost_device_ctx ctx, struct vhost_vring_file *file) >> * If the device isn't already running and both backend fds are set, >> * we add the device. >> */ >> + LOG_DEBUG(VHOST_CONFIG, "%s %d\n", __func__, file->fd); >> if (!(dev->flags & VIRTIO_DEV_RUNNING)) { >> if (((int)dev->virtqueue[VIRTIO_TXQ]->backend != VIRTIO_DEV_STOPPED) && >> ((int)dev->virtqueue[VIRTIO_RXQ]->backend != VIRTIO_DEV_STOPPED)) ^ permalink raw reply [flat|nested] 6+ messages in thread
* [dpdk-dev] [RFC PATCH] lib/librte_vhost: cleanup white spaces, tabs and indents 2014-11-15 1:14 [dpdk-dev] [PATCH RFC] lib/librte_vhost: vhost-user Huawei Xie 2014-11-17 6:04 ` Tetsuya Mukawa @ 2014-11-17 6:06 ` Tetsuya Mukawa 2014-11-17 6:07 ` [dpdk-dev] [RFC PATCH 1/2] lib/librte_vhost: change macro name of include guard Tetsuya Mukawa 2 siblings, 0 replies; 6+ messages in thread From: Tetsuya Mukawa @ 2014-11-17 6:06 UTC (permalink / raw) To: dev --- lib/librte_vhost/rte_virtio_net.h | 4 +-- lib/librte_vhost/vhost-cuse/vhost-net-cdev.c | 4 +-- lib/librte_vhost/vhost-cuse/virtio-net-cdev.c | 8 ++--- lib/librte_vhost/vhost-user/fd_man.c | 13 ++++---- lib/librte_vhost/vhost-user/fd_man.h | 2 +- lib/librte_vhost/vhost-user/vhost-net-user.c | 37 +++++++++++----------- lib/librte_vhost/vhost-user/virtio-net-user.c | 44 +++++++++++++-------------- lib/librte_vhost/vhost_rxtx.c | 2 +- lib/librte_vhost/virtio-net.c | 10 +++--- 9 files changed, 61 insertions(+), 63 deletions(-) diff --git a/lib/librte_vhost/rte_virtio_net.h b/lib/librte_vhost/rte_virtio_net.h index 7a05dab..7d7d001 100644 --- a/lib/librte_vhost/rte_virtio_net.h +++ b/lib/librte_vhost/rte_virtio_net.h @@ -140,12 +140,12 @@ gpa_to_vva(struct virtio_net *dev, uint64_t guest_pa) } /** - * Disable features in feature_mask. Returns 0 on success. + * Disable features in feature_mask. Returns 0 on success. */ int rte_vhost_feature_disable(uint64_t feature_mask); /** - * Enable features in feature_mask. Returns 0 on success. + * Enable features in feature_mask. Returns 0 on success. 
*/ int rte_vhost_feature_enable(uint64_t feature_mask); diff --git a/lib/librte_vhost/vhost-cuse/vhost-net-cdev.c b/lib/librte_vhost/vhost-cuse/vhost-net-cdev.c index 4671643..688ec00 100644 --- a/lib/librte_vhost/vhost-cuse/vhost-net-cdev.c +++ b/lib/librte_vhost/vhost-cuse/vhost-net-cdev.c @@ -329,7 +329,7 @@ vhost_net_ioctl(fuse_req_t req, int cmd, void *arg, } else { int fd; file = *(const struct vhost_vring_file *)in_buf; - LOG_DEBUG(VHOST_CONFIG, + LOG_DEBUG(VHOST_CONFIG, "kick/call idx:%d fd:%d\n", file.index, file.fd); if ((fd = eventfd_copy(file.fd, ctx.pid)) < 0){ fuse_reply_ioctl(req, -1, NULL, 0); @@ -338,7 +338,7 @@ vhost_net_ioctl(fuse_req_t req, int cmd, void *arg, if (cmd == VHOST_SET_VRING_KICK) { VHOST_IOCTL_R(struct vhost_vring_file, file, ops->set_vring_call); } - else { + else { VHOST_IOCTL_R(struct vhost_vring_file, file, ops->set_vring_kick); } } diff --git a/lib/librte_vhost/vhost-cuse/virtio-net-cdev.c b/lib/librte_vhost/vhost-cuse/virtio-net-cdev.c index 5c16aa5..7381140 100644 --- a/lib/librte_vhost/vhost-cuse/virtio-net-cdev.c +++ b/lib/librte_vhost/vhost-cuse/virtio-net-cdev.c @@ -288,7 +288,7 @@ cuse_set_mem_table(struct vhost_device_ctx ctx, const struct vhost_memory *mem_r base_address = regions[idx].userspace_address; /* Map VM memory file */ - if (host_memory_map(ctx.pid, base_address, + if (host_memory_map(ctx.pid, base_address, &mapped_address, &mapped_size) != 0) { return -1; } @@ -297,18 +297,18 @@ cuse_set_mem_table(struct vhost_device_ctx ctx, const struct vhost_memory *mem_r /* Check that we have a valid base address. 
*/ if (base_address == 0) { - RTE_LOG(ERR, VHOST_CONFIG, + RTE_LOG(ERR, VHOST_CONFIG, "Failed to find base address of qemu memory file.\n"); return -1; } for (idx = 0; idx < nregions; idx++) { - regions[idx].address_offset = + regions[idx].address_offset = mapped_address - base_address + regions[idx].userspace_address - regions[idx].guest_phys_address; } - + ops->set_mem_table(ctx, ®ions[0], nregions); return 0; } diff --git a/lib/librte_vhost/vhost-user/fd_man.c b/lib/librte_vhost/vhost-user/fd_man.c index c7fd3f2..cbc656b 100644 --- a/lib/librte_vhost/vhost-user/fd_man.c +++ b/lib/librte_vhost/vhost-user/fd_man.c @@ -15,7 +15,7 @@ * Returns the index in the fdset for a fd. * If fd is -1, it means to search for a free entry. * @return - * Index for the fd, or -1 if fd isn't in the fdset. + * Index for the fd, or -1 if fd isn't in the fdset. */ static int fdset_find_fd(struct fdset *pfdset, int fd) @@ -23,8 +23,8 @@ fdset_find_fd(struct fdset *pfdset, int fd) int i; for (i = 0; i < pfdset->num && pfdset->fd[i].fd != fd; i++); - - return i == pfdset->num ? -1 : i; + + return i == pfdset->num ? -1 : i; } static int @@ -35,7 +35,7 @@ fdset_find_free_slot(struct fdset *pfdset) } static void -fdset_add_fd(struct fdset *pfdset, int idx, int fd, fd_cb rcb, +fdset_add_fd(struct fdset *pfdset, int idx, int fd, fd_cb rcb, fd_cb wcb, uint64_t dat) { struct fdentry *pfdentry = &pfdset->fd[idx]; @@ -111,7 +111,7 @@ fdset_add(struct fdset *pfdset, int fd, fd_cb rcb, fd_cb wcb, uint64_t dat) } /** - * Unregister the fd from the fdset. + * Unregister the fd from the fdset. 
*/ void fdset_del(struct fdset *pfdset, int fd) @@ -148,11 +148,10 @@ fdset_event_dispatch(struct fdset *pfdset) for (i = 0; i < num; i++) { pfdentry = &pfdset->fd[i]; - if (FD_ISSET(pfdentry->fd, &rfds)) + if (FD_ISSET(pfdentry->fd, &rfds)) pfdentry->rcb(pfdentry->fd, pfdentry->dat); if (FD_ISSET(pfdentry->fd, &wfds)) pfdentry->wcb(pfdentry->fd, pfdentry->dat); } - } } diff --git a/lib/librte_vhost/vhost-user/fd_man.h b/lib/librte_vhost/vhost-user/fd_man.h index 57cc81d..8df17b4 100644 --- a/lib/librte_vhost/vhost-user/fd_man.h +++ b/lib/librte_vhost/vhost-user/fd_man.h @@ -15,7 +15,7 @@ struct fdentry { struct fdset { struct fdentry fd[MAX_FDS]; - int num; + int num; }; diff --git a/lib/librte_vhost/vhost-user/vhost-net-user.c b/lib/librte_vhost/vhost-user/vhost-net-user.c index 34450f4..0b100ba 100644 --- a/lib/librte_vhost/vhost-user/vhost-net-user.c +++ b/lib/librte_vhost/vhost-user/vhost-net-user.c @@ -106,7 +106,7 @@ uds_socket(const char *path) ret = listen(sockfd, 1); if (ret == -1) goto err; - + return sockfd; err: @@ -129,7 +129,7 @@ read_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num) iov.iov_base = buf; iov.iov_len = buflen; - + msgh.msg_iov = &iov; msgh.msg_iovlen = 1; msgh.msg_control = control; @@ -148,7 +148,7 @@ read_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num) for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL; cmsg = CMSG_NXTHDR(&msgh, cmsg)) { - if ( (cmsg->cmsg_level == SOL_SOCKET) && + if ( (cmsg->cmsg_level == SOL_SOCKET) && (cmsg->cmsg_type == SCM_RIGHTS)) { memcpy(fds, CMSG_DATA(cmsg), fdsize); break; @@ -162,14 +162,14 @@ read_vhost_message(int sockfd, struct VhostUserMsg *msg) { int ret; - ret = read_fd_message(sockfd, (char *)msg, VHOST_USER_HDR_SIZE, + ret = read_fd_message(sockfd, (char *)msg, VHOST_USER_HDR_SIZE, msg->fds, VHOST_MEMORY_MAX_NREGIONS); if (ret <= 0) return ret; if (msg->size) { if (msg->size > sizeof(msg->payload)) { - RTE_LOG(ERR, VHOST_CONFIG, + RTE_LOG(ERR, VHOST_CONFIG, "%s: 
invalid size:%d\n", __func__, msg->size); return -1; } @@ -182,7 +182,7 @@ read_vhost_message(int sockfd, struct VhostUserMsg *msg) } } - return ret; + return ret; } static int @@ -200,7 +200,7 @@ send_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num) iov.iov_len = buflen; msgh.msg_iov = &iov; msgh.msg_iovlen = 1; - + if (fds && fd_num > 0) { msgh.msg_control = control; msgh.msg_controllen = sizeof(control); @@ -222,7 +222,7 @@ send_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num) RTE_LOG(ERR, VHOST_CONFIG, "sendmsg error\n"); return -1; } - + return 0; } @@ -233,15 +233,15 @@ send_vhost_message(int sockfd, struct VhostUserMsg *msg) msg->flags &= ~VHOST_USER_VERSION_MASK; msg->flags |= VHOST_USER_VERSION; - msg->flags |= VHOST_USER_REPLY_MASK; + msg->flags |= VHOST_USER_REPLY_MASK; - ret = send_fd_message(sockfd, (char *)msg, + ret = send_fd_message(sockfd, (char *)msg, VHOST_USER_HDR_SIZE + msg->size, NULL, 0); - + return ret; } -/* call back when there is new connection. */ +/* call back when there is new connection. 
*/ static void vserver_new_vq_conn(int fd, uint64_t dat) { @@ -251,7 +251,7 @@ vserver_new_vq_conn(int fd, uint64_t dat) struct vhost_device_ctx vdev_ctx = { 0 }; conn_fd = accept(fd, NULL, NULL); - RTE_LOG(INFO, VHOST_CONFIG, + RTE_LOG(INFO, VHOST_CONFIG, "%s: new connection is %d\n", __func__, conn_fd); if (conn_fd < 0) return; @@ -259,8 +259,8 @@ vserver_new_vq_conn(int fd, uint64_t dat) fh = ops->new_device(vdev_ctx); RTE_LOG(INFO, VHOST_CONFIG, "new device, handle is %d\n", fh); - fdset_add(&vserver->fdset, - conn_fd, vserver_message_handler, NULL, fh); + fdset_add(&vserver->fdset, + conn_fd, vserver_message_handler, NULL, fh); } /* callback when there is message on the connfd */ @@ -277,7 +277,7 @@ vserver_message_handler(int connfd, uint64_t dat) ret = read_vhost_message(connfd, &msg); if (ret < 0) { printf("vhost read message failed\n"); - + /*TODO: cleanup */ close(connfd); fdset_del(&g_vhost_server->fdset, connfd); @@ -286,7 +286,7 @@ vserver_message_handler(int connfd, uint64_t dat) return; } else if (ret == 0) { /*TODO: cleanup */ - RTE_LOG(INFO, VHOST_CONFIG, + RTE_LOG(INFO, VHOST_CONFIG, "vhost peer closed\n"); close(connfd); fdset_del(&g_vhost_server->fdset, connfd); @@ -296,7 +296,7 @@ vserver_message_handler(int connfd, uint64_t dat) } if (msg.request > VHOST_USER_MAX) { /*TODO: cleanup */ - RTE_LOG(INFO, VHOST_CONFIG, + RTE_LOG(INFO, VHOST_CONFIG, "vhost read incorrect message\n"); close(connfd); fdset_del(&g_vhost_server->fdset, connfd); @@ -363,7 +363,6 @@ vserver_message_handler(int connfd, uint64_t dat) default: break; - } } diff --git a/lib/librte_vhost/vhost-user/virtio-net-user.c b/lib/librte_vhost/vhost-user/virtio-net-user.c index f38e6cc..4103977 100644 --- a/lib/librte_vhost/vhost-user/virtio-net-user.c +++ b/lib/librte_vhost/vhost-user/virtio-net-user.c @@ -65,7 +65,7 @@ user_set_mem_table(struct vhost_device_ctx ctx, struct VhostUserMsg *pmsg) } for (idx = 0; idx < memory.nregions; idx++) { - uint64_t size = 
memory.regions[idx].userspace_addr - + uint64_t size = memory.regions[idx].userspace_addr - base_address + memory.regions[idx].memory_size; if (mem_size < size) mem_size = size; @@ -75,28 +75,28 @@ user_set_mem_table(struct vhost_device_ctx ctx, struct VhostUserMsg *pmsg) * here we assume qemu will map only one file for memory allocation, * we only use fds[0] with offset 0. */ - mapped_address = (uint64_t)(uintptr_t)mmap(NULL, mem_size, + mapped_address = (uint64_t)(uintptr_t)mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_SHARED, pmsg->fds[0], 0); if (mapped_address == (uint64_t)(uintptr_t)MAP_FAILED) { RTE_LOG(ERR, VHOST_CONFIG, " mmap qemu guest failed.\n"); return -1; } - + for (idx = 0; idx < memory.nregions; idx++) { - regions[idx].guest_phys_address = + regions[idx].guest_phys_address = memory.regions[idx].guest_phys_addr; - regions[idx].guest_phys_address_end = + regions[idx].guest_phys_address_end = memory.regions[idx].guest_phys_addr + memory.regions[idx].memory_size; regions[idx].memory_size = memory.regions[idx].memory_size; - regions[idx].userspace_address = + regions[idx].userspace_address = memory.regions[idx].userspace_addr; - regions[idx].address_offset = mapped_address - base_address + + regions[idx].address_offset = mapped_address - base_address + regions[idx].userspace_address - regions[idx].guest_phys_address; - LOG_DEBUG(VHOST_CONFIG, + LOG_DEBUG(VHOST_CONFIG, "REGION: %u - GPA: %p - QEMU VA: %p - SIZE (%"PRIu64")\n", idx, (void *)(uintptr_t)regions[idx].guest_phys_address, @@ -129,28 +129,28 @@ user_set_mem_table(struct vhost_device_ctx ctx, struct VhostUserMsg *pmsg) for (idx = 0; idx < memory.nregions; idx++) { - regions[idx].guest_phys_address = + regions[idx].guest_phys_address = memory.regions[idx].guest_phys_addr; - regions[idx].guest_phys_address_end = + regions[idx].guest_phys_address_end = memory.regions[idx].guest_phys_addr + memory.regions[idx].memory_size; regions[idx].memory_size = memory.regions[idx].memory_size; - 
regions[idx].userspace_address = + regions[idx].userspace_address = memory.regions[idx].userspace_addr; /* - mapped_address = (uint64_t)(uintptr_t)mmap(NULL, - regions[idx].memory_size, - PROT_READ | PROT_WRITE, MAP_SHARED, - pmsg->fds[idx], + mapped_address = (uint64_t)(uintptr_t)mmap(NULL, + regions[idx].memory_size, + PROT_READ | PROT_WRITE, MAP_SHARED, + pmsg->fds[idx], memory.regions[idx].mmap_offset); */ /* This is ugly */ - mapped_address = (uint64_t)(uintptr_t)mmap(NULL, + mapped_address = (uint64_t)(uintptr_t)mmap(NULL, regions[idx].memory_size + - memory.regions[idx].mmap_offset, - PROT_READ | PROT_WRITE, MAP_SHARED, - pmsg->fds[idx], + memory.regions[idx].mmap_offset, + PROT_READ | PROT_WRITE, MAP_SHARED, + pmsg->fds[idx], 0); printf("mapped to %p\n", (void *)mapped_address); @@ -165,7 +165,7 @@ user_set_mem_table(struct vhost_device_ctx ctx, struct VhostUserMsg *pmsg) regions[idx].address_offset = mapped_address - regions[idx].guest_phys_address; - LOG_DEBUG(VHOST_CONFIG, + LOG_DEBUG(VHOST_CONFIG, "REGION: %u - GPA: %p - QEMU VA: %p - SIZE (%"PRIu64")\n", idx, (void *)(uintptr_t)regions[idx].guest_phys_address, @@ -189,7 +189,7 @@ user_set_vring_call(struct vhost_device_ctx ctx, struct VhostUserMsg *pmsg) file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; file.fd = pmsg->fds[0]; - RTE_LOG(INFO, VHOST_CONFIG, + RTE_LOG(INFO, VHOST_CONFIG, "vring call idx:%d file:%d\n", file.index, file.fd); ops->set_vring_call(ctx, &file); } @@ -202,7 +202,7 @@ user_set_vring_kick(struct vhost_device_ctx ctx, struct VhostUserMsg *pmsg) file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; file.fd = pmsg->fds[0]; - RTE_LOG(INFO, VHOST_CONFIG, + RTE_LOG(INFO, VHOST_CONFIG, "vring kick idx:%d file:%d\n", file.index, file.fd); ops->set_vring_kick(ctx, &file); } diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c index 8ff0301..3a33eb0 100644 --- a/lib/librte_vhost/vhost_rxtx.c +++ b/lib/librte_vhost/vhost_rxtx.c @@ -217,7 +217,7 @@ 
rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id, struct rte_me if (vq->last_used_idx == avail_idx) return 0; - LOG_DEBUG(VHOST_DATA, "(%"PRIu64") %s(%d->%d)\n", + LOG_DEBUG(VHOST_DATA, "(%"PRIu64") %s(%d->%d)\n", dev->device_fh, __func__, vq->last_used_idx, avail_idx); /* Prefetch available ring to retrieve head indexes. */ diff --git a/lib/librte_vhost/virtio-net.c b/lib/librte_vhost/virtio-net.c index 516e743..30661e3 100644 --- a/lib/librte_vhost/virtio-net.c +++ b/lib/librte_vhost/virtio-net.c @@ -82,7 +82,7 @@ qva_to_vva(struct virtio_net *dev, uint64_t qemu_va) if ((qemu_va >= region->userspace_address) && (qemu_va <= region->userspace_address + region->memory_size)) { - vhost_va = qemu_va + region->guest_phys_address + + vhost_va = qemu_va + region->guest_phys_address + region->address_offset - region->userspace_address; break; @@ -476,7 +476,7 @@ set_mem_table(struct vhost_device_ctx ctx, if (mem->regions[regionidx].guest_phys_address == 0x0) { mem->base_address = mem->regions[regionidx].userspace_address; - mem->mapped_address = + mem->mapped_address = mem->regions[regionidx].address_offset; } } @@ -602,7 +602,7 @@ get_vring_base(struct vhost_device_ctx ctx, uint32_t index, state->num = dev->virtqueue[state->index]->last_used_idx; if (dev->flags & VIRTIO_DEV_RUNNING) { - RTE_LOG(INFO, VHOST_CONFIG, + RTE_LOG(INFO, VHOST_CONFIG, "get_vring_base message is for release\n"); notify_ops->destroy_device(dev); /* @@ -626,7 +626,7 @@ get_vring_base(struct vhost_device_ctx ctx, uint32_t index, close((int)dev->virtqueue[VIRTIO_TXQ]->callfd); dev->virtqueue[VIRTIO_TXQ]->callfd = -1; /* We don't cleanup callfd here as we willn't get CALLFD again */ - + dev->virtqueue[VIRTIO_RXQ]->desc = NULL; dev->virtqueue[VIRTIO_RXQ]->avail = NULL; dev->virtqueue[VIRTIO_RXQ]->used = NULL; @@ -650,7 +650,7 @@ virtio_is_ready(struct virtio_net *dev, int index) /* mq support in future.*/ vq1 = dev->virtqueue[index]; vq2 = dev->virtqueue[index ^ 1]; - if (vq1 && vq2 && 
vq1->desc && vq2->desc && + if (vq1 && vq2 && vq1->desc && vq2->desc && (vq1->kickfd > 0) && (vq1->callfd > 0) && (vq2->kickfd > 0) && (vq2->callfd > 0)) { LOG_DEBUG(VHOST_CONFIG, "virtio is ready for processing.\n"); -- 1.9.1 ^ permalink raw reply [flat|nested] 6+ messages in thread
* [dpdk-dev] [RFC PATCH 1/2] lib/librte_vhost: change macro name of include guard. 2014-11-15 1:14 [dpdk-dev] [PATCH RFC] lib/librte_vhost: vhost-user Huawei Xie 2014-11-17 6:04 ` Tetsuya Mukawa 2014-11-17 6:06 ` [dpdk-dev] [RFC PATCH] lib/librte_vhost: cleanup white spaces, tabs and indents Tetsuya Mukawa @ 2014-11-17 6:07 ` Tetsuya Mukawa 2014-11-17 6:07 ` [dpdk-dev] [RFC PATCH 2/2] lib/librte_vhost: Add device abstraction layer Tetsuya Mukawa 2 siblings, 1 reply; 6+ messages in thread From: Tetsuya Mukawa @ 2014-11-17 6:07 UTC (permalink / raw) To: dev This patch changes include macro name like following. - "_VIRTIO_NET_H_" > "_RTE_VIRTIO_NET_H_" --- lib/librte_vhost/rte_virtio_net.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/librte_vhost/rte_virtio_net.h b/lib/librte_vhost/rte_virtio_net.h index 7d7d001..a09533d 100644 --- a/lib/librte_vhost/rte_virtio_net.h +++ b/lib/librte_vhost/rte_virtio_net.h @@ -31,8 +31,8 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef _VIRTIO_NET_H_ -#define _VIRTIO_NET_H_ +#ifndef _RTE_VIRTIO_NET_H_ +#define _RTE_VIRTIO_NET_H_ #include <stdint.h> #include <linux/virtio_ring.h> @@ -189,4 +189,4 @@ uint32_t rte_vhost_enqueue_burst(struct virtio_net *dev, uint16_t queue_id, uint32_t rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id, struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint32_t count); -#endif /* _VIRTIO_NET_H_ */ +#endif /* _RTE_VIRTIO_NET_H_ */ -- 1.9.1 ^ permalink raw reply [flat|nested] 6+ messages in thread
* [dpdk-dev] [RFC PATCH 2/2] lib/librte_vhost: Add device abstraction layer 2014-11-17 6:07 ` [dpdk-dev] [RFC PATCH 1/2] lib/librte_vhost: change macro name of include guard Tetsuya Mukawa @ 2014-11-17 6:07 ` Tetsuya Mukawa 0 siblings, 0 replies; 6+ messages in thread From: Tetsuya Mukawa @ 2014-11-17 6:07 UTC (permalink / raw) To: dev --- lib/librte_vhost/Makefile | 6 +- lib/librte_vhost/rte_virtio_net.h | 22 ++++- lib/librte_vhost/vhost-cuse/vhost-net-cdev.c | 6 +- lib/librte_vhost/vhost-cuse/vhost-net-cdev.h | 40 +++++++++ lib/librte_vhost/vhost-cuse/virtio-net-cdev.c | 1 + lib/librte_vhost/vhost-net.c | 101 +++++++++++++++++++++++ lib/librte_vhost/vhost-net.h | 114 ++++++++++++++++++++++++++ lib/librte_vhost/vhost-user/vhost-net-user.c | 6 +- lib/librte_vhost/vhost-user/vhost-net-user.h | 3 + lib/librte_vhost/vhost-user/virtio-net-user.c | 1 + 10 files changed, 290 insertions(+), 10 deletions(-) create mode 100644 lib/librte_vhost/vhost-cuse/vhost-net-cdev.h create mode 100644 lib/librte_vhost/vhost-net.c create mode 100644 lib/librte_vhost/vhost-net.h diff --git a/lib/librte_vhost/Makefile b/lib/librte_vhost/Makefile index cb4e172..4363a14 100644 --- a/lib/librte_vhost/Makefile +++ b/lib/librte_vhost/Makefile @@ -37,11 +37,11 @@ LIB = librte_vhost.a CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -I. 
-I vhost-user -I vhost-cuse -O3 -D_FILE_OFFSET_BITS=64 -lfuse LDFLAGS += -lfuse # all source are stored in SRCS-y -#SRCS-$(CONFIG_RTE_LIBRTE_VHOST) := vhost-cuse/vhost-net-cdev.c vhost-cuse/virtio-net-cdev.c +SRCS-$(CONFIG_RTE_LIBRTE_VHOST) := vhost-cuse/vhost-net-cdev.c vhost-cuse/virtio-net-cdev.c -SRCS-$(CONFIG_RTE_LIBRTE_VHOST) := vhost-user/fd_man.c vhost-user/vhost-net-user.c vhost-user/virtio-net-user.c +SRCS-$(CONFIG_RTE_LIBRTE_VHOST) += vhost-user/fd_man.c vhost-user/vhost-net-user.c vhost-user/virtio-net-user.c -SRCS-$(CONFIG_RTE_LIBRTE_VHOST) += virtio-net.c vhost_rxtx.c +SRCS-$(CONFIG_RTE_LIBRTE_VHOST) += virtio-net.c vhost_rxtx.c vhost-net.c # install includes SYMLINK-$(CONFIG_RTE_LIBRTE_VHOST)-include += rte_virtio_net.h diff --git a/lib/librte_vhost/rte_virtio_net.h b/lib/librte_vhost/rte_virtio_net.h index a09533d..116c7e9 100644 --- a/lib/librte_vhost/rte_virtio_net.h +++ b/lib/librte_vhost/rte_virtio_net.h @@ -140,6 +140,23 @@ gpa_to_vva(struct virtio_net *dev, uint64_t guest_pa) } /** + * Enum for vhost driver types. + */ +enum rte_vhost_driver_t { + VHOST_DRV_CUSE, /* vhost-cuse driver */ + VHOST_DRV_USER, /* vhost-user driver */ + VHOST_DRV_NUM /* the number of vhost driver types */ +}; + +/** + * Structure contains information relating vhost driver. + */ +struct rte_vhost_driver { + enum rte_vhost_driver_t type; /**< driver type. */ + const char *dev_name; /**< accessing device name. */ +}; + +/** * Disable features in feature_mask. Returns 0 on success. */ int rte_vhost_feature_disable(uint64_t feature_mask); @@ -155,12 +172,13 @@ uint64_t rte_vhost_feature_get(void); int rte_vhost_enable_guest_notification(struct virtio_net *dev, uint16_t queue_id, int enable); /* Register vhost driver. dev_name could be different for multiple instance support. */ -int rte_vhost_driver_register(const char *dev_name); +struct rte_vhost_driver *rte_vhost_driver_register( + const char *dev_name, enum rte_vhost_driver_t type); /* Register callbacks. 
*/ int rte_vhost_driver_callback_register(struct virtio_net_device_ops const * const); -int rte_vhost_driver_session_start(void); +int rte_vhost_driver_session_start(struct rte_vhost_driver *drv); /** * This function adds buffers to the virtio devices RX virtqueue. Buffers can diff --git a/lib/librte_vhost/vhost-cuse/vhost-net-cdev.c b/lib/librte_vhost/vhost-cuse/vhost-net-cdev.c index 688ec00..6ea54ee 100644 --- a/lib/librte_vhost/vhost-cuse/vhost-net-cdev.c +++ b/lib/librte_vhost/vhost-cuse/vhost-net-cdev.c @@ -47,6 +47,7 @@ #include "virtio-net-cdev.h" #include "vhost-net.h" +#include "vhost-net-cdev.h" #include "eventfd_link/eventfd_link.h" #define FUSE_OPT_DUMMY "\0\0" @@ -373,8 +374,9 @@ static const struct cuse_lowlevel_ops vhost_net_ops = { * vhost_net_device_ops are also passed when the device is registered in app. */ int -rte_vhost_driver_register(const char *dev_name) +vhost_cuse_driver_register(struct rte_vhost_driver *drv) { + const char *dev_name = drv->dev_name; struct cuse_info cuse_info; char device_name[PATH_MAX] = ""; char char_device_name[PATH_MAX] = ""; @@ -428,7 +430,7 @@ rte_vhost_driver_register(const char *dev_name) * release and ioctl calls. */ int -rte_vhost_driver_session_start(void) +vhost_cuse_driver_session_start(void) { fuse_session_loop(session); diff --git a/lib/librte_vhost/vhost-cuse/vhost-net-cdev.h b/lib/librte_vhost/vhost-cuse/vhost-net-cdev.h new file mode 100644 index 0000000..cb094ee --- /dev/null +++ b/lib/librte_vhost/vhost-cuse/vhost-net-cdev.h @@ -0,0 +1,40 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2014 IGEL Co.,Ltd. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of IGEL nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef _VHOST_NET_CDEV_H +#define _VHOST_NET_CDEV_H + +int vhost_cuse_driver_register(struct rte_vhost_driver *drv); +int vhost_cuse_driver_session_start(void); + +#endif diff --git a/lib/librte_vhost/vhost-cuse/virtio-net-cdev.c b/lib/librte_vhost/vhost-cuse/virtio-net-cdev.c index 7381140..42a6b24 100644 --- a/lib/librte_vhost/vhost-cuse/virtio-net-cdev.c +++ b/lib/librte_vhost/vhost-cuse/virtio-net-cdev.c @@ -46,6 +46,7 @@ #include <errno.h> #include <rte_log.h> +#include <rte_virtio_net.h> #include "vhost-net.h" #include "virtio-net-cdev.h" diff --git a/lib/librte_vhost/vhost-net.c b/lib/librte_vhost/vhost-net.c new file mode 100644 index 0000000..7a4537d --- /dev/null +++ b/lib/librte_vhost/vhost-net.c @@ -0,0 +1,101 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2014 IGEL Co.,Ltd. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of IGEL nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <errno.h> +#include <rte_malloc.h> +#include <rte_virtio_net.h> + +#include "vhost-cuse/vhost-net-cdev.h" +#include "vhost-user/vhost-net-user.h" + +/** + * This function abstracts cuse and vhost-user driver registration. + */ +struct rte_vhost_driver * +rte_vhost_driver_register(const char *dev_name, enum rte_vhost_driver_t type) +{ + int ret; + struct rte_vhost_driver *drv; + + drv = rte_zmalloc(dev_name, sizeof(struct rte_vhost_driver), + CACHE_LINE_SIZE); + if (drv == NULL) + return NULL; + + drv->dev_name = dev_name; + drv->type = type; + + switch (type) { + case VHOST_DRV_CUSE: + ret = vhost_cuse_driver_register(drv); + if (ret != 0) + goto err; + break; + case VHOST_DRV_USER: + ret = vhost_user_driver_register(drv); + if (ret != 0) + goto err; + break; + default: + break; + } + + return drv; +err: + free(drv); + return NULL; +} + +/** + * The session is launched allowing the application to + * receive open, release and ioctl calls. 
+ */ +int +rte_vhost_driver_session_start(struct rte_vhost_driver *drv) +{ + if (drv == NULL) + return -ENODEV; + + switch (drv->type) { + case VHOST_DRV_CUSE: + vhost_cuse_driver_session_start(); + break; + case VHOST_DRV_USER: + vhost_user_driver_session_start(); + break; + default: + break; + } + + return 0; +} diff --git a/lib/librte_vhost/vhost-net.h b/lib/librte_vhost/vhost-net.h new file mode 100644 index 0000000..881a45f --- /dev/null +++ b/lib/librte_vhost/vhost-net.h @@ -0,0 +1,114 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _VHOST_NET_H_ +#define _VHOST_NET_H_ +#include <stdint.h> +#include <stdio.h> +#include <sys/types.h> +#include <unistd.h> +#include <linux/vhost.h> + +#include <rte_log.h> + +/* Macros for printing using RTE_LOG */ +#define RTE_LOGTYPE_VHOST_CONFIG RTE_LOGTYPE_USER1 +#define RTE_LOGTYPE_VHOST_DATA RTE_LOGTYPE_USER1 + +#ifdef RTE_LIBRTE_VHOST_DEBUG +#define VHOST_MAX_PRINT_BUFF 6072 +#define LOG_LEVEL RTE_LOG_DEBUG +#define LOG_DEBUG(log_type, fmt, args...) RTE_LOG(DEBUG, log_type, fmt, ##args) +#define VHOST_PRINT_PACKET(device, addr, size, header) do { \ + char *pkt_addr = (char *)(addr); \ + unsigned int index; \ + char packet[VHOST_MAX_PRINT_BUFF]; \ + \ + if ((header)) \ + snprintf(packet, VHOST_MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size)); \ + else \ + snprintf(packet, VHOST_MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size)); \ + for (index = 0; index < (size); index++) { \ + snprintf(packet + strnlen(packet, VHOST_MAX_PRINT_BUFF), VHOST_MAX_PRINT_BUFF - strnlen(packet, VHOST_MAX_PRINT_BUFF), \ + "%02hhx ", pkt_addr[index]); \ + } \ + snprintf(packet + strnlen(packet, VHOST_MAX_PRINT_BUFF), VHOST_MAX_PRINT_BUFF - strnlen(packet, VHOST_MAX_PRINT_BUFF), "\n"); \ + \ + LOG_DEBUG(VHOST_DATA, "%s", packet); \ +} while (0) +#else +#define LOG_LEVEL RTE_LOG_INFO +#define LOG_DEBUG(log_type, fmt, args...) 
do {} while (0) +#define VHOST_PRINT_PACKET(device, addr, size, header) do {} while (0) +#endif + + +/* + * Structure used to identify device context. + */ +struct vhost_device_ctx { + pid_t pid; /* PID of process calling the IOCTL. */ + uint64_t fh; /* Populated with fi->fh to track the device index. */ +}; + +/* + * Structure contains function pointers to be defined in virtio-net.c. These + * functions are called in CUSE context and are used to configure devices. + */ +struct vhost_net_device_ops { + int (*new_device)(struct vhost_device_ctx); + void (*destroy_device)(struct vhost_device_ctx); + + int (*get_features)(struct vhost_device_ctx, uint64_t *); + int (*set_features)(struct vhost_device_ctx, uint64_t *); + + int (*set_mem_table)(struct vhost_device_ctx, + const struct virtio_memory_regions *, uint32_t); + + int (*set_vring_num)(struct vhost_device_ctx, struct vhost_vring_state *); + int (*set_vring_addr)(struct vhost_device_ctx, struct vhost_vring_addr *); + int (*set_vring_base)(struct vhost_device_ctx, struct vhost_vring_state *); + int (*get_vring_base)(struct vhost_device_ctx, uint32_t, struct vhost_vring_state *); + + int (*set_vring_kick)(struct vhost_device_ctx, struct vhost_vring_file *); + int (*set_vring_call)(struct vhost_device_ctx, struct vhost_vring_file *); + + int (*set_backend)(struct vhost_device_ctx, struct vhost_vring_file *); + + int (*set_owner)(struct vhost_device_ctx); + int (*reset_owner)(struct vhost_device_ctx); +}; + + +struct vhost_net_device_ops const *get_virtio_net_callbacks(void); +#endif /* _VHOST_NET_H_ */ diff --git a/lib/librte_vhost/vhost-user/vhost-net-user.c b/lib/librte_vhost/vhost-user/vhost-net-user.c index 0b100ba..837f840 100644 --- a/lib/librte_vhost/vhost-user/vhost-net-user.c +++ b/lib/librte_vhost/vhost-user/vhost-net-user.c @@ -371,9 +371,9 @@ vserver_message_handler(int connfd, uint64_t dat) * Creates and initialise the vhost server. 
*/ int -rte_vhost_driver_register(const char *path) +vhost_user_driver_register(struct rte_vhost_driver *drv) { - + const char *path = drv->dev_name; struct vhost_server *vserver; if (g_vhost_server != NULL) @@ -408,7 +408,7 @@ rte_vhost_driver_register(const char *path) int -rte_vhost_driver_session_start(void) +vhost_user_driver_session_start(void) { fdset_event_dispatch(&g_vhost_server->fdset); return 0; diff --git a/lib/librte_vhost/vhost-user/vhost-net-user.h b/lib/librte_vhost/vhost-user/vhost-net-user.h index c9df9fa..d90c147 100644 --- a/lib/librte_vhost/vhost-user/vhost-net-user.h +++ b/lib/librte_vhost/vhost-user/vhost-net-user.h @@ -71,4 +71,7 @@ typedef struct VhostUserMsg { #define VHOST_USER_VERSION (0x1) /*****************************************************************************/ +int vhost_user_driver_register(struct rte_vhost_driver *drv); +int vhost_user_driver_session_start(void); + #endif diff --git a/lib/librte_vhost/vhost-user/virtio-net-user.c b/lib/librte_vhost/vhost-user/virtio-net-user.c index 4103977..f839219 100644 --- a/lib/librte_vhost/vhost-user/virtio-net-user.c +++ b/lib/librte_vhost/vhost-user/virtio-net-user.c @@ -38,6 +38,7 @@ #include <sys/mman.h> #include <rte_log.h> +#include <rte_virtio_net.h> #include "virtio-net-user.h" #include "vhost-net-user.h" -- 1.9.1 ^ permalink raw reply [flat|nested] 6+ messages in thread
end of thread, other threads:[~2014-11-17 6:01 UTC | newest] Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2014-11-15 1:14 [dpdk-dev] [PATCH RFC] lib/librte_vhost: vhost-user Huawei Xie 2014-11-17 6:04 ` Tetsuya Mukawa 2014-11-17 6:11 ` Tetsuya Mukawa 2014-11-17 6:06 ` [dpdk-dev] [RFC PATCH] lib/librte_vhost: cleanup white spaces, tabs and indents Tetsuya Mukawa 2014-11-17 6:07 ` [dpdk-dev] [RFC PATCH 1/2] lib/librte_vhost: change macro name of include guard Tetsuya Mukawa 2014-11-17 6:07 ` [dpdk-dev] [RFC PATCH 2/2] lib/librte_vhost: Add device abstraction layer Tetsuya Mukawa
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for NNTP newsgroup(s).