From: Stefan Hajnoczi <stefanha@redhat.com>
To: dev@dpdk.org
Cc: maxime.coquelin@redhat.com, Yuanhan Liu <yliu@fridaylinux.org>,
wei.w.wang@intel.com, mst@redhat.com, zhiyong.yang@intel.com,
jasowang@redhat.com, Stefan Hajnoczi <stefanha@redhat.com>
Subject: [dpdk-dev] [RFC 17/24] vhost: add virtio-vhost-user transport
Date: Fri, 19 Jan 2018 13:44:37 +0000 [thread overview]
Message-ID: <20180119134444.24927-18-stefanha@redhat.com> (raw)
In-Reply-To: <20180119134444.24927-1-stefanha@redhat.com>
This patch adds a new transport to librte_vhost for the
virtio-vhost-user device. This device replaces the AF_UNIX socket used
by traditional vhost-user with a virtio device that tunnels vhost-user
protocol messages. This allows a guest to act as a vhost device backend
for other guests.
The intended use case is for running DPDK inside a guest. Other guests
can communicate via DPDK's "vhost" vdev driver.
For more information on virtio-vhost-user, see
https://wiki.qemu.org/Features/VirtioVhostUser.
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
drivers/librte_vhost/Makefile | 1 +
drivers/librte_vhost/vhost.h | 3 +
drivers/librte_vhost/virtio_vhost_user.h | 18 +
drivers/librte_vhost/trans_virtio_vhost_user.c | 1050 ++++++++++++++++++++++++
4 files changed, 1072 insertions(+)
create mode 100644 drivers/librte_vhost/virtio_vhost_user.h
create mode 100644 drivers/librte_vhost/trans_virtio_vhost_user.c
diff --git a/drivers/librte_vhost/Makefile b/drivers/librte_vhost/Makefile
index 8a56c32af..c8cce6fac 100644
--- a/drivers/librte_vhost/Makefile
+++ b/drivers/librte_vhost/Makefile
@@ -23,6 +23,7 @@ LDLIBS += -lrte_eal -lrte_mempool -lrte_mbuf -lrte_ethdev -lrte_net
SRCS-$(CONFIG_RTE_LIBRTE_VHOST) := fd_man.c iotlb.c socket.c vhost.c \
vhost_user.c virtio_net.c \
trans_af_unix.c \
+ trans_virtio_vhost_user.c \
virtio_pci.c
# install includes
diff --git a/drivers/librte_vhost/vhost.h b/drivers/librte_vhost/vhost.h
index 08ad874ef..1d4d1d139 100644
--- a/drivers/librte_vhost/vhost.h
+++ b/drivers/librte_vhost/vhost.h
@@ -338,6 +338,9 @@ struct vhost_transport_ops {
/** The traditional AF_UNIX vhost-user protocol transport. */
extern const struct vhost_transport_ops af_unix_trans_ops;
+/** The virtio-vhost-user PCI vhost-user protocol transport. */
+extern const struct vhost_transport_ops virtio_vhost_user_trans_ops;
+
/**
* Device structure contains all configuration information relating
* to the device.
diff --git a/drivers/librte_vhost/virtio_vhost_user.h b/drivers/librte_vhost/virtio_vhost_user.h
new file mode 100644
index 000000000..baeaa7494
--- /dev/null
+++ b/drivers/librte_vhost/virtio_vhost_user.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright (C) 2018 Red Hat, Inc.
+ */
+
+#ifndef _LINUX_VIRTIO_VHOST_USER_H
+#define _LINUX_VIRTIO_VHOST_USER_H
+
+#include <stdint.h>
+
+struct virtio_vhost_user_config {
+ uint32_t status;
+#define VIRTIO_VHOST_USER_STATUS_SLAVE_UP 0
+#define VIRTIO_VHOST_USER_STATUS_MASTER_UP 1
+ uint32_t max_vhost_queues;
+ uint8_t uuid[16];
+};
+
+#endif /* _LINUX_VIRTIO_VHOST_USER_H */
diff --git a/drivers/librte_vhost/trans_virtio_vhost_user.c b/drivers/librte_vhost/trans_virtio_vhost_user.c
new file mode 100644
index 000000000..df654eb71
--- /dev/null
+++ b/drivers/librte_vhost/trans_virtio_vhost_user.c
@@ -0,0 +1,1050 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright (C) 2018 Red Hat, Inc.
+ */
+
+/*
+ * @file
+ * virtio-vhost-user PCI transport driver
+ *
+ * This vhost-user transport communicates with the vhost-user master process
+ * over the virtio-vhost-user PCI device.
+ *
+ * Interrupts are used since this is the control path, not the data path. This
+ * way the vhost-user command processing doesn't interfere with packet
+ * processing. This is similar to the AF_UNIX transport's fdman thread that
+ * processes socket I/O separately.
+ *
+ * This transport replaces the usual vhost-user file descriptor passing with a
+ * PCI BAR that contains doorbell registers for callfd and logfd, and shared
+ * memory for the memory table regions.
+ *
+ * VIRTIO device specification:
+ * https://stefanha.github.io/virtio/vhost-user-slave.html#x1-2830007
+ */
+
+#include <rte_log.h>
+#include <rte_malloc.h>
+#include <rte_bus_pci.h>
+#include <rte_io.h>
+
+#include "vhost.h"
+#include "virtio_pci.h"
+#include "virtqueue.h"
+#include "virtio_vhost_user.h"
+#include "vhost_user.h"
+
+/*
+ * Data structures:
+ *
+ * Successfully probed virtio-vhost-user PCI adapters are added to
+ * vvu_pci_device_list as struct vvu_pci_device elements.
+ *
+ * When rte_vhost_driver_register() is called, a struct vvu_socket is created
+ * as the endpoint for future vhost-user connections. The struct vvu_socket is
+ * associated with the struct vvu_pci_device that will be used for
+ * communication.
+ *
+ * When a vhost-user protocol connection is established, a struct
+ * vvu_connection is created and the application's new_device(int vid) callback
+ * is invoked.
+ */
+
+/** Probed PCI devices for lookup by rte_vhost_driver_register() */
+TAILQ_HEAD(, vvu_pci_device) vvu_pci_device_list =
+ TAILQ_HEAD_INITIALIZER(vvu_pci_device_list);
+
+struct vvu_socket;
+struct vvu_connection;
+
+/** A virtio-vhost-user PCI adapter */
+struct vvu_pci_device {
+ struct virtio_hw hw;
+ struct rte_pci_device *pci_dev;
+ struct vvu_socket *s;
+ TAILQ_ENTRY(vvu_pci_device) next;
+};
+
+/** A vhost-user endpoint (aka per-path state) */
+struct vvu_socket {
+ struct vhost_user_socket socket; /* must be first field! */
+ struct vvu_pci_device *pdev;
+ struct vvu_connection *conn;
+
+ /** Doorbell registers */
+ uint16_t *doorbells;
+
+ /** This struct virtio_vhost_user_config field determines the number of
+ * doorbells available so we keep it saved.
+ */
+ uint32_t max_vhost_queues;
+
+ /** Receive buffers */
+ const struct rte_memzone *rxbuf_mz;
+
+ /** Transmit buffers. It is assumed that the device completes them
+ * in-order so a single wrapping index can be used to select the next
+ * free buffer.
+ */
+ const struct rte_memzone *txbuf_mz;
+ unsigned int txbuf_idx;
+};
+
+/** A vhost-user protocol session (aka per-vid state) */
+struct vvu_connection {
+ struct virtio_net device; /* must be first field! */
+ struct vvu_socket *s;
+};
+
+/** Virtio feature bits that we support */
+#define VVU_VIRTIO_FEATURES ((1ULL << VIRTIO_F_NOTIFY_ON_EMPTY) | \
+ (1ULL << VIRTIO_F_ANY_LAYOUT) | \
+ (1ULL << VIRTIO_F_VERSION_1) | \
+ (1ULL << VIRTIO_F_IOMMU_PLATFORM))
+
+/** Virtqueue indices */
+enum {
+ VVU_VQ_RX,
+ VVU_VQ_TX,
+ VVU_VQ_MAX,
+};
+
+enum {
+ /** Receive buffer size, in bytes */
+ VVU_RXBUF_SIZE = 1024,
+
+ /** Transmit buffer size, in bytes */
+ VVU_TXBUF_SIZE = 1024,
+};
+
+/** Look up a struct vvu_pci_device from a DomBDF string */
+static struct vvu_pci_device *
+vvu_pci_by_name(const char *name)
+{
+ struct vvu_pci_device *pdev;
+
+ TAILQ_FOREACH(pdev, &vvu_pci_device_list, next) {
+ if (!strcmp(pdev->pci_dev->device.name, name))
+ return pdev;
+ }
+ return NULL;
+}
+
+/** Start connection establishment */
+static void
+vvu_connect(struct vvu_socket *s)
+{
+ struct virtio_hw *hw = &s->pdev->hw;
+ uint32_t status;
+
+ virtio_pci_read_dev_config(hw,
+ offsetof(struct virtio_vhost_user_config, status),
+ &status, sizeof(status));
+ status |= RTE_LE32(1u << VIRTIO_VHOST_USER_STATUS_SLAVE_UP);
+ virtio_pci_write_dev_config(hw,
+ offsetof(struct virtio_vhost_user_config, status),
+ &status, sizeof(status));
+}
+
+static void
+vvu_disconnect(struct vvu_socket *s)
+{
+ struct vhost_user_socket *vsocket = &s->socket;
+ struct vvu_connection *conn = s->conn;
+ struct virtio_hw *hw = &s->pdev->hw;
+ uint32_t status;
+
+ if (conn) {
+ if (vsocket->notify_ops->destroy_connection)
+ vsocket->notify_ops->destroy_connection(conn->device.vid);
+
+ vhost_destroy_device(conn->device.vid);
+ }
+
+ /* Make sure we're disconnected */
+ virtio_pci_read_dev_config(hw,
+ offsetof(struct virtio_vhost_user_config, status),
+ &status, sizeof(status));
+ status &= ~RTE_LE32(1u << VIRTIO_VHOST_USER_STATUS_SLAVE_UP);
+ virtio_pci_write_dev_config(hw,
+ offsetof(struct virtio_vhost_user_config, status),
+ &status, sizeof(status));
+}
+
+static void
+vvu_reconnect(struct vvu_socket *s)
+{
+ vvu_disconnect(s);
+ vvu_connect(s);
+}
+
+static void vvu_process_rxq(struct vvu_socket *s);
+
+static void
+vvu_cleanup_device(struct virtio_net *dev, int destroy __rte_unused)
+{
+ struct vvu_connection *conn =
+ container_of(dev, struct vvu_connection, device);
+ struct vvu_socket *s = conn->s;
+
+ s->conn = NULL;
+ vvu_process_rxq(s); /* discard old replies from master */
+ vvu_reconnect(s);
+}
+
+static int
+vvu_vring_call(struct virtio_net *dev, struct vhost_virtqueue *vq)
+{
+ struct vvu_connection *conn =
+ container_of(dev, struct vvu_connection, device);
+ struct vvu_socket *s = conn->s;
+ uint16_t vq_idx = vq->vring_idx;
+
+ RTE_LOG(DEBUG, VHOST_CONFIG, "%s vq_idx %u\n", __func__, vq_idx);
+
+ rte_write16(rte_cpu_to_le_16(vq_idx), &s->doorbells[vq_idx]);
+ return 0;
+}
+
+static int
+vvu_send_reply(struct virtio_net *dev, struct VhostUserMsg *reply)
+{
+ struct vvu_connection *conn =
+ container_of(dev, struct vvu_connection, device);
+ struct vvu_socket *s = conn->s;
+ struct virtqueue *vq = s->pdev->hw.vqs[VVU_VQ_TX];
+ struct vring_desc *desc;
+ struct vq_desc_extra *descx;
+ unsigned int i;
+ void *buf;
+ size_t len;
+
+ RTE_LOG(DEBUG, VHOST_CONFIG,
+ "%s request %u flags %#x size %u\n",
+ __func__, reply->request.master,
+ reply->flags, reply->size);
+
+ /* TODO convert reply to little-endian */
+
+ if (virtqueue_full(vq)) {
+ RTE_LOG(ERR, VHOST_CONFIG, "Out of tx buffers\n");
+ return -1;
+ }
+
+ i = s->txbuf_idx;
+ len = VHOST_USER_HDR_SIZE + reply->size;
+ buf = (uint8_t *)s->txbuf_mz->addr + i * VVU_TXBUF_SIZE;
+
+ memcpy(buf, reply, len);
+
+ desc = &vq->vq_ring.desc[i];
+ descx = &vq->vq_descx[i];
+
+ desc->addr = rte_cpu_to_le_64(s->txbuf_mz->iova + i * VVU_TXBUF_SIZE);
+ desc->len = rte_cpu_to_le_32(len);
+ desc->flags = 0;
+
+ descx->cookie = buf;
+ descx->ndescs = 1;
+
+ vq->vq_free_cnt--;
+ s->txbuf_idx = (s->txbuf_idx + 1) & (vq->vq_nentries - 1);
+
+ vq_update_avail_ring(vq, i);
+ vq_update_avail_idx(vq);
+
+ if (virtqueue_kick_prepare(vq))
+ virtqueue_notify(vq);
+
+ return 0;
+}
+
+static int
+vvu_map_mem_regions(struct virtio_net *dev)
+{
+ struct vvu_connection *conn =
+ container_of(dev, struct vvu_connection, device);
+ struct vvu_socket *s = conn->s;
+ struct rte_pci_device *pci_dev = s->pdev->pci_dev;
+ uint8_t *mmap_addr;
+ uint32_t i;
+
+ /* Memory regions start after the doorbell registers */
+ mmap_addr = (uint8_t *)pci_dev->mem_resource[2].addr +
+ RTE_ALIGN_CEIL((s->max_vhost_queues + 1 /* log fd */) *
+ sizeof(uint16_t), 4096);
+
+ for (i = 0; i < dev->mem->nregions; i++) {
+ struct rte_vhost_mem_region *reg = &dev->mem->regions[i];
+
+ reg->mmap_addr = mmap_addr;
+ reg->host_user_addr = (uint64_t)(uintptr_t)reg->mmap_addr +
+ reg->mmap_size - reg->size;
+
+ mmap_addr += reg->mmap_size;
+ }
+
+ return 0;
+}
+
+static void
+vvu_unmap_mem_regions(struct virtio_net *dev)
+{
+ uint32_t i;
+
+ for (i = 0; i < dev->mem->nregions; i++) {
+ struct rte_vhost_mem_region *reg = &dev->mem->regions[i];
+
+ /* Just clear the pointers, the PCI BAR stays there */
+ reg->mmap_addr = NULL;
+ reg->host_user_addr = 0;
+ }
+}
+
+static void vvu_process_new_connection(struct vvu_socket *s)
+{
+ struct vhost_user_socket *vsocket = &s->socket;
+ struct vvu_connection *conn;
+ struct virtio_net *dev;
+ size_t size;
+
+ dev = vhost_new_device(vsocket->trans_ops);
+ if (!dev) {
+ vvu_reconnect(s);
+ return;
+ }
+
+ conn = container_of(dev, struct vvu_connection, device);
+ conn->s = s;
+
+ size = strnlen(vsocket->path, PATH_MAX);
+ vhost_set_ifname(dev->vid, vsocket->path, size);
+
+ RTE_LOG(INFO, VHOST_CONFIG, "new device, handle is %d\n", dev->vid);
+
+ if (vsocket->notify_ops->new_connection) {
+ int ret = vsocket->notify_ops->new_connection(dev->vid);
+ if (ret < 0) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "failed to add vhost user connection\n");
+ vhost_destroy_device(dev->vid);
+ vvu_reconnect(s);
+ return;
+ }
+ }
+
+ s->conn = conn;
+ return;
+}
+
+static void vvu_process_status_change(struct vvu_socket *s, bool slave_up,
+ bool master_up)
+{
+ RTE_LOG(DEBUG, VHOST_CONFIG, "%s slave_up %d master_up %d\n",
+ __func__, slave_up, master_up);
+
+ /* Disconnected from the master, try reconnecting */
+ if (!slave_up) {
+ vvu_reconnect(s);
+ return;
+ }
+
+ if (master_up && !s->conn) {
+ vvu_process_new_connection(s);
+ return;
+ }
+}
+
+static void
+vvu_process_txq(struct vvu_socket *s)
+{
+ struct virtio_hw *hw = &s->pdev->hw;
+ struct virtqueue *vq = hw->vqs[VVU_VQ_TX];
+ uint16_t n = VIRTQUEUE_NUSED(vq);
+
+ virtio_rmb();
+
+ /* Just mark the buffers complete */
+ vq->vq_used_cons_idx += n;
+ vq->vq_free_cnt += n;
+}
+
+static void
+vvu_process_rxq(struct vvu_socket *s)
+{
+ struct virtio_hw *hw = &s->pdev->hw;
+ struct virtqueue *vq = hw->vqs[VVU_VQ_RX];
+ bool refilled = false;
+
+ while (VIRTQUEUE_NUSED(vq)) {
+ struct vring_used_elem *uep;
+ VhostUserMsg *msg;
+ uint32_t len;
+ uint32_t desc_idx;
+ uint16_t used_idx;
+ size_t i;
+
+ virtio_rmb();
+
+ used_idx = (uint16_t)(vq->vq_used_cons_idx & (vq->vq_nentries - 1));
+ uep = &vq->vq_ring.used->ring[used_idx];
+ desc_idx = rte_le_to_cpu_32(uep->id);
+
+ msg = vq->vq_descx[desc_idx].cookie;
+ len = rte_le_to_cpu_32(uep->len);
+
+ if (msg->size > sizeof(VhostUserMsg) ||
+ len != VHOST_USER_HDR_SIZE + msg->size) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "Invalid vhost-user message size %u, got %u bytes\n",
+ msg->size, len);
+ /* TODO reconnect */
+ abort();
+ }
+
+ RTE_LOG(DEBUG, VHOST_CONFIG,
+ "%s request %u flags %#x size %u\n",
+ __func__, msg->request.master,
+ msg->flags, msg->size);
+
+ /* Mark file descriptors invalid */
+ for (i = 0; i < RTE_DIM(msg->fds); i++)
+ msg->fds[i] = VIRTIO_INVALID_EVENTFD;
+
+ /* Only process messages while connected */
+ if (s->conn) {
+ if (vhost_user_msg_handler(s->conn->device.vid,
+ msg) < 0) {
+ /* TODO reconnect */
+ abort();
+ }
+ }
+
+ vq->vq_used_cons_idx++;
+
+ /* Refill rxq */
+ vq_update_avail_ring(vq, desc_idx);
+ vq_update_avail_idx(vq);
+ refilled = true;
+ }
+
+ if (!refilled)
+ return;
+ if (virtqueue_kick_prepare(vq))
+ virtqueue_notify(vq);
+}
+
+/* TODO Audit thread safety. There are 3 threads involved:
+ * 1. The main process thread that calls librte_vhost APIs during startup.
+ * 2. The interrupt thread that calls vvu_interrupt_handler().
+ * 3. Packet processing threads (lcores) calling librte_vhost APIs.
+ *
+ * It may be necessary to use locks if any of these code paths can race. The
+ * librte_vhost API entry points already do some locking but this needs to be
+ * checked.
+ */
+static void
+vvu_interrupt_handler(void *cb_arg)
+{
+ struct vvu_socket *s = cb_arg;
+ struct virtio_hw *hw = &s->pdev->hw;
+ struct rte_intr_handle *intr_handle = &s->pdev->pci_dev->intr_handle;
+ uint8_t isr;
+
+ /* Read Interrupt Status Register (which also clears it) */
+ isr = VTPCI_OPS(hw)->get_isr(hw);
+
+ if (isr & VIRTIO_PCI_ISR_CONFIG) {
+ uint32_t status;
+ bool slave_up;
+ bool master_up;
+
+ virtio_pci_read_dev_config(hw,
+ offsetof(struct virtio_vhost_user_config, status),
+ &status, sizeof(status));
+ status = rte_le_to_cpu_32(status);
+
+ RTE_LOG(DEBUG, VHOST_CONFIG, "%s isr %#x status %#x\n", __func__, isr, status);
+
+ slave_up = status & (1u << VIRTIO_VHOST_USER_STATUS_SLAVE_UP);
+ master_up = status & (1u << VIRTIO_VHOST_USER_STATUS_MASTER_UP);
+ vvu_process_status_change(s, slave_up, master_up);
+ } else
+ RTE_LOG(DEBUG, VHOST_CONFIG, "%s isr %#x\n", __func__, isr);
+
+ /* Re-arm before processing virtqueues so no interrupts are lost */
+ rte_intr_enable(intr_handle);
+
+ vvu_process_txq(s);
+ vvu_process_rxq(s);
+}
+
+static int
+vvu_virtio_pci_init_rxq(struct vvu_socket *s)
+{
+ char name[sizeof("0000:00:00.00 vq 0 rxbufs")];
+ struct virtqueue *vq;
+ size_t size;
+ size_t align;
+ int i;
+
+ vq = s->pdev->hw.vqs[VVU_VQ_RX];
+
+ snprintf(name, sizeof(name), "%s vq %u rxbufs",
+ s->pdev->pci_dev->device.name, VVU_VQ_RX);
+
+ /* Allocate more than sizeof(VhostUserMsg) so there is room to grow */
+ size = vq->vq_nentries * VVU_RXBUF_SIZE;
+ align = 1024;
+ s->rxbuf_mz = rte_memzone_reserve_aligned(name, size, SOCKET_ID_ANY,
+ 0, align);
+ if (!s->rxbuf_mz) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "Failed to allocate rxbuf memzone\n");
+ return -1;
+ }
+
+ for (i = 0; i < vq->vq_nentries; i++) {
+ struct vring_desc *desc = &vq->vq_ring.desc[i];
+ struct vq_desc_extra *descx = &vq->vq_descx[i];
+
+ desc->addr = rte_cpu_to_le_64(s->rxbuf_mz->iova +
+ i * VVU_RXBUF_SIZE);
+ desc->len = RTE_LE32(VVU_RXBUF_SIZE);
+ desc->flags = RTE_LE16(VRING_DESC_F_WRITE);
+
+ descx->cookie = (uint8_t *)s->rxbuf_mz->addr + i * VVU_RXBUF_SIZE;
+ descx->ndescs = 1;
+
+ vq_update_avail_ring(vq, i);
+ vq->vq_free_cnt--;
+ }
+
+ vq_update_avail_idx(vq);
+ virtqueue_notify(vq);
+ return 0;
+}
+
+static int
+vvu_virtio_pci_init_txq(struct vvu_socket *s)
+{
+ char name[sizeof("0000:00:00.00 vq 0 txbufs")];
+ struct virtqueue *vq;
+ size_t size;
+ size_t align;
+
+ vq = s->pdev->hw.vqs[VVU_VQ_TX];
+
+ snprintf(name, sizeof(name), "%s vq %u txbufs",
+ s->pdev->pci_dev->device.name, VVU_VQ_TX);
+
+ /* Allocate more than sizeof(VhostUserMsg) so there is room to grow */
+ size = vq->vq_nentries * VVU_TXBUF_SIZE;
+ align = 1024;
+ s->txbuf_mz = rte_memzone_reserve_aligned(name, size, SOCKET_ID_ANY,
+ 0, align);
+ if (!s->txbuf_mz) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "Failed to allocate txbuf memzone\n");
+ return -1;
+ }
+
+ s->txbuf_idx = 0;
+ return 0;
+}
+
+static void
+virtio_init_vring(struct virtqueue *vq)
+{
+ int size = vq->vq_nentries;
+ struct vring *vr = &vq->vq_ring;
+ uint8_t *ring_mem = vq->vq_ring_virt_mem;
+
+ memset(ring_mem, 0, vq->vq_ring_size);
+ vring_init(vr, size, ring_mem, VIRTIO_PCI_VRING_ALIGN);
+ vq->vq_used_cons_idx = 0;
+ vq->vq_desc_head_idx = 0;
+ vq->vq_avail_idx = 0;
+ vq->vq_desc_tail_idx = (uint16_t)(vq->vq_nentries - 1);
+ vq->vq_free_cnt = vq->vq_nentries;
+ memset(vq->vq_descx, 0, sizeof(struct vq_desc_extra) * vq->vq_nentries);
+
+ vring_desc_init(vr->desc, size);
+ virtqueue_enable_intr(vq);
+}
+
+static int
+vvu_virtio_pci_init_vq(struct vvu_socket *s, int vq_idx)
+{
+ char vq_name[sizeof("0000:00:00.00 vq 0")];
+ struct virtio_hw *hw = &s->pdev->hw;
+ const struct rte_memzone *mz;
+ struct virtqueue *vq;
+ uint16_t q_num;
+ size_t size;
+
+ q_num = VTPCI_OPS(hw)->get_queue_num(hw, vq_idx);
+ RTE_LOG(DEBUG, VHOST_CONFIG, "vq %d q_num: %u\n", vq_idx, q_num);
+ if (q_num == 0) {
+ RTE_LOG(ERR, VHOST_CONFIG, "virtqueue %d does not exist\n",
+ vq_idx);
+ return -1;
+ }
+
+ if (!rte_is_power_of_2(q_num)) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "virtqueue %d has non-power of 2 size (%u)\n",
+ vq_idx, q_num);
+ return -1;
+ }
+
+ snprintf(vq_name, sizeof(vq_name), "%s vq %u",
+ s->pdev->pci_dev->device.name, vq_idx);
+
+ size = RTE_ALIGN_CEIL(sizeof(*vq) +
+ q_num * sizeof(struct vq_desc_extra),
+ RTE_CACHE_LINE_SIZE);
+ vq = rte_zmalloc(vq_name, size, RTE_CACHE_LINE_SIZE);
+ if (!vq) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+			"Failed to allocate virtqueue %d\n", vq_idx);
+ return -1;
+ }
+ hw->vqs[vq_idx] = vq;
+
+ vq->hw = hw;
+ vq->vq_queue_index = vq_idx;
+ vq->vq_nentries = q_num;
+
+ size = vring_size(q_num, VIRTIO_PCI_VRING_ALIGN);
+ vq->vq_ring_size = RTE_ALIGN_CEIL(size, VIRTIO_PCI_VRING_ALIGN);
+
+ mz = rte_memzone_reserve_aligned(vq_name, vq->vq_ring_size,
+ SOCKET_ID_ANY, 0,
+ VIRTIO_PCI_VRING_ALIGN);
+ if (mz == NULL) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "Failed to reserve memzone for virtqueue %d\n",
+ vq_idx);
+ goto err_vq;
+ }
+
+ memset(mz->addr, 0, mz->len);
+
+ vq->mz = mz;
+ vq->vq_ring_mem = mz->iova;
+ vq->vq_ring_virt_mem = mz->addr;
+ virtio_init_vring(vq);
+
+ if (VTPCI_OPS(hw)->setup_queue(hw, vq) < 0)
+ goto err_mz;
+
+ return 0;
+
+err_mz:
+ rte_memzone_free(mz);
+
+err_vq:
+ hw->vqs[vq_idx] = NULL;
+ rte_free(vq);
+ return -1;
+}
+
+static void
+vvu_virtio_pci_free_virtqueues(struct vvu_socket *s)
+{
+ struct virtio_hw *hw = &s->pdev->hw;
+ int i;
+
+ if (s->rxbuf_mz) {
+ rte_memzone_free(s->rxbuf_mz);
+ s->rxbuf_mz = NULL;
+ }
+ if (s->txbuf_mz) {
+ rte_memzone_free(s->txbuf_mz);
+ s->txbuf_mz = NULL;
+ }
+
+ for (i = 0; i < VVU_VQ_MAX; i++) {
+ struct virtqueue *vq = hw->vqs[i];
+
+ if (!vq)
+ continue;
+
+ rte_memzone_free(vq->mz);
+ rte_free(vq);
+ hw->vqs[i] = NULL;
+ }
+
+ rte_free(hw->vqs);
+ hw->vqs = NULL;
+}
+
+static void
+vvu_virtio_pci_intr_cleanup(struct vvu_socket *s)
+{
+ struct virtio_hw *hw = &s->pdev->hw;
+ struct rte_intr_handle *intr_handle = &s->pdev->pci_dev->intr_handle;
+ int i;
+
+ for (i = 0; i < VVU_VQ_MAX; i++)
+ VTPCI_OPS(hw)->set_queue_irq(hw, hw->vqs[i],
+ VIRTIO_MSI_NO_VECTOR);
+ VTPCI_OPS(hw)->set_config_irq(hw, VIRTIO_MSI_NO_VECTOR);
+ rte_intr_disable(intr_handle);
+ rte_intr_callback_unregister(intr_handle, vvu_interrupt_handler, s);
+ rte_intr_efd_disable(intr_handle);
+}
+
+static int
+vvu_virtio_pci_init_intr(struct vvu_socket *s)
+{
+ struct virtio_hw *hw = &s->pdev->hw;
+ struct rte_intr_handle *intr_handle = &s->pdev->pci_dev->intr_handle;
+ int i;
+
+ if (!rte_intr_cap_multiple(intr_handle)) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "Multiple intr vector not supported\n");
+ return -1;
+ }
+
+ if (rte_intr_efd_enable(intr_handle, VVU_VQ_MAX) < 0) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "Failed to create eventfds\n");
+ return -1;
+ }
+
+ if (rte_intr_callback_register(intr_handle, vvu_interrupt_handler, s) < 0) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "Failed to register interrupt callback\n");
+ goto err_efd;
+ }
+
+ if (rte_intr_enable(intr_handle) < 0)
+ goto err_callback;
+
+ if (VTPCI_OPS(hw)->set_config_irq(hw, 0) == VIRTIO_MSI_NO_VECTOR) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "Failed to set config MSI-X vector\n");
+ goto err_enable;
+ }
+
+ /* TODO use separate vectors and interrupt handler functions. It seems
+ * <rte_interrupts.h> doesn't allow efds to have interrupt_handler
+ * functions and it just clears efds when they are raised. As a
+ * workaround we use the configuration change interrupt for virtqueue
+ * interrupts!
+ */
+ for (i = 0; i < VVU_VQ_MAX; i++) {
+ if (VTPCI_OPS(hw)->set_queue_irq(hw, hw->vqs[i], 0) ==
+ VIRTIO_MSI_NO_VECTOR) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "Failed to set virtqueue MSI-X vector\n");
+ goto err_vq;
+ }
+ }
+
+ return 0;
+
+err_vq:
+ for (i = 0; i < VVU_VQ_MAX; i++)
+ VTPCI_OPS(hw)->set_queue_irq(hw, hw->vqs[i],
+ VIRTIO_MSI_NO_VECTOR);
+ VTPCI_OPS(hw)->set_config_irq(hw, VIRTIO_MSI_NO_VECTOR);
+err_enable:
+ rte_intr_disable(intr_handle);
+err_callback:
+ rte_intr_callback_unregister(intr_handle, vvu_interrupt_handler, s);
+err_efd:
+ rte_intr_efd_disable(intr_handle);
+ return -1;
+}
+
+static int
+vvu_virtio_pci_init_bar(struct vvu_socket *s)
+{
+ struct rte_pci_device *pci_dev = s->pdev->pci_dev;
+ struct virtio_net *dev = NULL; /* just for sizeof() */
+
+ s->doorbells = pci_dev->mem_resource[2].addr;
+ if (!s->doorbells) {
+		RTE_LOG(ERR, VHOST_CONFIG, "BAR 2 not available\n");
+ return -1;
+ }
+
+ /* The number of doorbells is max_vhost_queues + 1 */
+ virtio_pci_read_dev_config(&s->pdev->hw,
+ offsetof(struct virtio_vhost_user_config,
+ max_vhost_queues),
+ &s->max_vhost_queues,
+ sizeof(s->max_vhost_queues));
+ s->max_vhost_queues = rte_le_to_cpu_32(s->max_vhost_queues);
+ if (s->max_vhost_queues < RTE_DIM(dev->virtqueue)) {
+ /* We could support devices with a smaller max number of
+ * virtqueues than dev->virtqueue[] in the future. Fail early
+ * for now since the current assumption is that all of
+ * dev->virtqueue[] can be used.
+ */
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "Device supports fewer virtqueues than driver!\n");
+ return -1;
+ }
+
+ return 0;
+}
+
+static int
+vvu_virtio_pci_init(struct vvu_socket *s)
+{
+ uint64_t host_features;
+ struct virtio_hw *hw = &s->pdev->hw;
+ int i;
+
+ virtio_pci_set_status(hw, VIRTIO_CONFIG_STATUS_ACK);
+ virtio_pci_set_status(hw, VIRTIO_CONFIG_STATUS_DRIVER);
+
+ hw->guest_features = VVU_VIRTIO_FEATURES;
+ host_features = VTPCI_OPS(hw)->get_features(hw);
+ hw->guest_features = virtio_pci_negotiate_features(hw, host_features);
+
+ if (!virtio_pci_with_feature(hw, VIRTIO_F_VERSION_1)) {
+ RTE_LOG(ERR, VHOST_CONFIG, "Missing VIRTIO 1 feature bit\n");
+ goto err;
+ }
+
+ virtio_pci_set_status(hw, VIRTIO_CONFIG_STATUS_FEATURES_OK);
+ if (!(virtio_pci_get_status(hw) & VIRTIO_CONFIG_STATUS_FEATURES_OK)) {
+ RTE_LOG(ERR, VHOST_CONFIG, "Failed to set FEATURES_OK\n");
+ goto err;
+ }
+
+ if (vvu_virtio_pci_init_bar(s) < 0)
+ goto err;
+
+ hw->vqs = rte_zmalloc(NULL, sizeof(struct virtqueue *) * VVU_VQ_MAX, 0);
+ if (!hw->vqs)
+ goto err;
+
+ for (i = 0; i < VVU_VQ_MAX; i++) {
+ if (vvu_virtio_pci_init_vq(s, i) < 0) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "virtqueue %u init failed\n", i);
+ goto err_init_vq;
+ }
+ }
+
+ if (vvu_virtio_pci_init_rxq(s) < 0)
+ goto err_init_vq;
+
+ if (vvu_virtio_pci_init_txq(s) < 0)
+ goto err_init_vq;
+
+ if (vvu_virtio_pci_init_intr(s) < 0)
+ goto err_init_vq;
+
+ virtio_pci_set_status(hw, VIRTIO_CONFIG_STATUS_DRIVER_OK);
+
+ return 0;
+
+err_init_vq:
+ vvu_virtio_pci_free_virtqueues(s);
+
+err:
+ virtio_pci_reset(hw);
+ RTE_LOG(DEBUG, VHOST_CONFIG, "%s failed\n", __func__);
+ return -1;
+}
+
+static int
+vvu_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
+ struct rte_pci_device *pci_dev)
+{
+ struct vvu_pci_device *pdev;
+
+ /* TODO support multi-process applications */
+ if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+			"virtio-vhost-user does not support multi-process "
+ "applications\n");
+ return -1;
+ }
+
+ pdev = rte_zmalloc_socket(pci_dev->device.name, sizeof(*pdev),
+ RTE_CACHE_LINE_SIZE,
+ pci_dev->device.numa_node);
+ if (!pdev)
+ return -1;
+
+ pdev->pci_dev = pci_dev;
+
+ if (virtio_pci_init(pci_dev, &pdev->hw) != 0) {
+ rte_free(pdev);
+ return -1;
+ }
+
+ /* Reset the device now, the rest is done in vvu_socket_init() */
+ virtio_pci_reset(&pdev->hw);
+
+ if (pdev->hw.use_msix == VIRTIO_MSIX_NONE) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "MSI-X is required for PCI device at %s\n",
+ pci_dev->device.name);
+ rte_free(pdev);
+ rte_pci_unmap_device(pci_dev);
+ return -1;
+ }
+
+ TAILQ_INSERT_TAIL(&vvu_pci_device_list, pdev, next);
+
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "Added virtio-vhost-user device at %s\n",
+ pci_dev->device.name);
+
+ return 0;
+}
+
+static int
+vvu_pci_remove(struct rte_pci_device *pci_dev)
+{
+ struct vvu_pci_device *pdev;
+
+ TAILQ_FOREACH(pdev, &vvu_pci_device_list, next)
+ if (pdev->pci_dev == pci_dev)
+ break;
+ if (!pdev)
+ return -1;
+
+ if (pdev->s) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "Cannot remove PCI device at %s with vhost still attached\n",
+ pci_dev->device.name);
+ return -1;
+ }
+
+ TAILQ_REMOVE(&vvu_pci_device_list, pdev, next);
+ rte_free(pdev);
+ rte_pci_unmap_device(pci_dev);
+ return 0;
+}
+
+static const struct rte_pci_id pci_id_vvu_map[] = {
+ { RTE_PCI_DEVICE(VIRTIO_PCI_VENDORID,
+ VIRTIO_PCI_LEGACY_DEVICEID_VHOST_USER) },
+ { RTE_PCI_DEVICE(VIRTIO_PCI_VENDORID,
+ VIRTIO_PCI_MODERN_DEVICEID_VHOST_USER) },
+ { .vendor_id = 0, /* sentinel */ },
+};
+
+static struct rte_pci_driver vvu_pci_driver = {
+ .driver = {
+ .name = "virtio_vhost_user",
+ },
+ .id_table = pci_id_vvu_map,
+ .drv_flags = 0,
+ .probe = vvu_pci_probe,
+ .remove = vvu_pci_remove,
+};
+
+RTE_INIT(vvu_pci_init);
+static void
+vvu_pci_init(void)
+{
+ if (rte_eal_iopl_init() != 0) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "IOPL call failed - cannot use virtio-vhost-user\n");
+ return;
+ }
+
+ rte_pci_register(&vvu_pci_driver);
+}
+
+static int
+vvu_socket_init(struct vhost_user_socket *vsocket, uint64_t flags)
+{
+ struct vvu_socket *s =
+ container_of(vsocket, struct vvu_socket, socket);
+ struct vvu_pci_device *pdev;
+
+ if (flags & RTE_VHOST_USER_NO_RECONNECT) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "error: reconnect cannot be disabled for virtio-vhost-user\n");
+ return -1;
+ }
+ if (flags & RTE_VHOST_USER_CLIENT) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "error: virtio-vhost-user does not support client mode\n");
+ return -1;
+ }
+ if (flags & RTE_VHOST_USER_DEQUEUE_ZERO_COPY) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "error: virtio-vhost-user does not support dequeue-zero-copy\n");
+ return -1;
+ }
+
+ pdev = vvu_pci_by_name(vsocket->path);
+ if (!pdev) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "Cannot find virtio-vhost-user PCI device at %s\n",
+ vsocket->path);
+ return -1;
+ }
+
+ if (pdev->s) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "Device at %s is already in use\n",
+ vsocket->path);
+ return -1;
+ }
+
+ s->pdev = pdev;
+ pdev->s = s;
+
+ if (vvu_virtio_pci_init(s) < 0) {
+ s->pdev = NULL;
+ pdev->s = NULL;
+ return -1;
+ }
+
+ RTE_LOG(INFO, VHOST_CONFIG, "%s at %s\n", __func__, vsocket->path);
+ return 0;
+}
+
+static void
+vvu_socket_cleanup(struct vhost_user_socket *vsocket)
+{
+ struct vvu_socket *s =
+ container_of(vsocket, struct vvu_socket, socket);
+
+ if (s->conn)
+ vhost_destroy_device(s->conn->device.vid);
+
+ vvu_virtio_pci_intr_cleanup(s);
+ virtio_pci_reset(&s->pdev->hw);
+ vvu_virtio_pci_free_virtqueues(s);
+
+ s->pdev->s = NULL;
+ s->pdev = NULL;
+}
+
+static int
+vvu_socket_start(struct vhost_user_socket *vsocket)
+{
+ struct vvu_socket *s =
+ container_of(vsocket, struct vvu_socket, socket);
+
+ vvu_connect(s);
+ return 0;
+}
+
+const struct vhost_transport_ops virtio_vhost_user_trans_ops = {
+ .socket_size = sizeof(struct vvu_socket),
+ .device_size = sizeof(struct vvu_connection),
+ .socket_init = vvu_socket_init,
+ .socket_cleanup = vvu_socket_cleanup,
+ .socket_start = vvu_socket_start,
+ .cleanup_device = vvu_cleanup_device,
+ .vring_call = vvu_vring_call,
+ .send_reply = vvu_send_reply,
+ .map_mem_regions = vvu_map_mem_regions,
+ .unmap_mem_regions = vvu_unmap_mem_regions,
+};
--
2.14.3
next prev parent reply other threads:[~2018-01-19 13:47 UTC|newest]
Thread overview: 29+ messages / expand[flat|nested] mbox.gz Atom feed top
2018-01-19 13:44 [dpdk-dev] [RFC 00/24] " Stefan Hajnoczi
2018-01-19 13:44 ` [dpdk-dev] [RFC 01/24] vhost: move vring_call() into trans_af_unix.c Stefan Hajnoczi
2018-01-19 13:44 ` [dpdk-dev] [RFC 02/24] vhost: move AF_UNIX code from socket.c to trans_af_unix.c Stefan Hajnoczi
2018-01-19 13:44 ` [dpdk-dev] [RFC 03/24] vhost: allocate per-socket transport state Stefan Hajnoczi
2018-01-19 13:44 ` [dpdk-dev] [RFC 04/24] vhost: move socket_fd and un sockaddr into trans_af_unix.c Stefan Hajnoczi
2018-01-19 13:44 ` [dpdk-dev] [RFC 05/24] vhost: move start_server/client() calls to trans_af_unix.c Stefan Hajnoczi
2018-01-19 13:44 ` [dpdk-dev] [RFC 06/24] vhost: move vhost_user_connection " Stefan Hajnoczi
2018-01-19 13:44 ` [dpdk-dev] [RFC 07/24] vhost: move vhost_user_reconnect_init() into trans_af_unix.c Stefan Hajnoczi
2018-01-19 13:44 ` [dpdk-dev] [RFC 08/24] vhost: move vhost_user.fdset to trans_af_unix.c Stefan Hajnoczi
2018-01-19 13:44 ` [dpdk-dev] [RFC 09/24] vhost: pass vhost_transport_ops through vhost_new_device() Stefan Hajnoczi
2018-01-19 13:44 ` [dpdk-dev] [RFC 10/24] vhost: embed struct virtio_net inside struct vhost_user_connection Stefan Hajnoczi
2018-01-19 13:44 ` [dpdk-dev] [RFC 11/24] vhost: extract vhost_user.c socket I/O into transport Stefan Hajnoczi
2018-01-19 13:44 ` [dpdk-dev] [RFC 12/24] vhost: move slave_req_fd field to AF_UNIX transport Stefan Hajnoczi
2018-01-19 13:44 ` [dpdk-dev] [RFC 13/24] vhost: move mmap/munmap " Stefan Hajnoczi
2018-01-19 13:44 ` [dpdk-dev] [RFC 14/24] vhost: move librte_vhost to drivers/ Stefan Hajnoczi
2018-01-19 13:44 ` [dpdk-dev] [RFC 15/24] vhost: add virtio pci framework Stefan Hajnoczi
2018-01-19 13:44 ` [dpdk-dev] [RFC 16/24] vhost: remember a vhost_virtqueue's queue index Stefan Hajnoczi
2018-01-19 13:44 ` Stefan Hajnoczi [this message]
2018-01-19 13:44 ` [dpdk-dev] [RFC 18/24] vhost: add RTE_VHOST_USER_VIRTIO_TRANSPORT flag Stefan Hajnoczi
2018-01-19 13:44 ` [dpdk-dev] [RFC 19/24] net/vhost: add virtio-vhost-user support Stefan Hajnoczi
2018-01-19 13:44 ` [dpdk-dev] [RFC 20/24] examples/vhost_scsi: add --socket-file argument Stefan Hajnoczi
2018-01-19 13:44 ` [dpdk-dev] [RFC 21/24] examples/vhost_scsi: add virtio-vhost-user support Stefan Hajnoczi
2018-01-19 13:44 ` [dpdk-dev] [RFC 22/24] usertools: add virtio-vhost-user devices to dpdk-devbind.py Stefan Hajnoczi
2018-01-19 13:44 ` [dpdk-dev] [RFC 23/24] WORKAROUND revert virtio-net mq vring deletion Stefan Hajnoczi
2018-01-30 17:52 ` Maxime Coquelin
2018-01-19 13:44 ` [dpdk-dev] [RFC 24/24] WORKAROUND examples/vhost_scsi: avoid broken EVENT_IDX Stefan Hajnoczi
2018-01-19 19:31 ` Michael S. Tsirkin
2018-01-31 10:02 ` [dpdk-dev] [RFC 00/24] vhost: add virtio-vhost-user transport Maxime Coquelin
[not found] ` <20180410093847.GA22081@stefanha-x1.localdomain>
2018-04-10 14:56 ` Wang, Wei W
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20180119134444.24927-18-stefanha@redhat.com \
--to=stefanha@redhat.com \
--cc=dev@dpdk.org \
--cc=jasowang@redhat.com \
--cc=maxime.coquelin@redhat.com \
--cc=mst@redhat.com \
--cc=wei.w.wang@intel.com \
--cc=yliu@fridaylinux.org \
--cc=zhiyong.yang@intel.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).