From mboxrd@z Thu Jan 1 00:00:00 1970
Return-Path:
Received: from mx1.redhat.com (mx1.redhat.com [209.132.183.28]) by dpdk.org (Postfix) with ESMTP id 4019A2F42 for ; Sun, 16 Dec 2018 10:35:13 +0100 (CET)
Received: from smtp.corp.redhat.com (int-mx05.intmail.prod.int.phx2.redhat.com [10.5.11.15]) (using TLSv1.2 with cipher AECDH-AES256-SHA (256/256 bits)) (No client certificate requested) by mx1.redhat.com (Postfix) with ESMTPS id 8171BC052512; Sun, 16 Dec 2018 09:35:12 +0000 (UTC)
Received: from [10.36.112.14] (unknown [10.36.112.14]) by smtp.corp.redhat.com (Postfix) with ESMTPS id 739085D756; Sun, 16 Dec 2018 09:35:10 +0000 (UTC)
To: Xiao Wang , tiwei.bie@intel.com
Cc: alejandro.lucero@netronome.com, dev@dpdk.org, zhihong.wang@intel.com, xiaolong.ye@intel.com
References: <20181213100910.13087-2-xiao.w.wang@intel.com> <20181214211612.167681-1-xiao.w.wang@intel.com> <20181214211612.167681-10-xiao.w.wang@intel.com>
From: Maxime Coquelin
Message-ID:
Date: Sun, 16 Dec 2018 10:35:07 +0100
User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:60.0) Gecko/20100101 Thunderbird/60.2.1
MIME-Version: 1.0
In-Reply-To: <20181214211612.167681-10-xiao.w.wang@intel.com>
Content-Type: text/plain; charset=utf-8; format=flowed
Content-Language: en-US
Content-Transfer-Encoding: 7bit
X-Scanned-By: MIMEDefang 2.79 on 10.5.11.15
X-Greylist: Sender IP whitelisted, not delayed by milter-greylist-4.5.16 (mx1.redhat.com [10.5.110.31]); Sun, 16 Dec 2018 09:35:12 +0000 (UTC)
Subject: Re: [dpdk-dev] [PATCH v4 09/10] net/ifc: support SW assisted VDPA live migration
X-BeenThere: dev@dpdk.org
X-Mailman-Version: 2.1.15
Precedence: list
List-Id: DPDK patches and discussions
List-Unsubscribe: ,
List-Archive:
List-Post:
List-Help:
List-Subscribe: ,
X-List-Received-Date: Sun, 16 Dec 2018 09:35:13 -0000

On 12/14/18 10:16 PM, Xiao Wang wrote:
> In SW assisted live migration mode, driver will stop the device and
> setup a mediate virtio ring to relay the communication between the
> virtio driver and the VDPA device.
>
> This data path intervention will allow SW to help on guest dirty page
> logging for live migration.
>
> This SW fallback is event driven relay thread, so when the network
> throughput is low, this SW fallback will take little CPU resource, but
> when the throughput goes up, the relay thread's CPU usage will goes up
> accordinly.

s/accordinly/accordingly/

>
> User needs to take all the factors including CPU usage, guest perf
> degradation, etc. into consideration when selecting the live migration
> support mode.
>
> Signed-off-by: Xiao Wang
> ---
>  drivers/net/ifc/base/ifcvf.h | 1 +
>  drivers/net/ifc/ifcvf_vdpa.c | 346 ++++++++++++++++++++++++++++++++++++++++++-
>  2 files changed, 344 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/net/ifc/base/ifcvf.h b/drivers/net/ifc/base/ifcvf.h
> index c15c69107..e8a30d2c6 100644
> --- a/drivers/net/ifc/base/ifcvf.h
> +++ b/drivers/net/ifc/base/ifcvf.h
> @@ -50,6 +50,7 @@
>  #define IFCVF_LM_ENABLE_VF 0x1
>  #define IFCVF_LM_ENABLE_PF 0x3
>  #define IFCVF_LOG_BASE 0x100000000000
> +#define IFCVF_MEDIATE_VRING 0x200000000000

MEDIATED?
>
>  #define IFCVF_32_BIT_MASK 0xffffffff
>
> diff --git a/drivers/net/ifc/ifcvf_vdpa.c b/drivers/net/ifc/ifcvf_vdpa.c
> index f181c5a6e..61757d0b4 100644
> --- a/drivers/net/ifc/ifcvf_vdpa.c
> +++ b/drivers/net/ifc/ifcvf_vdpa.c
> @@ -63,6 +63,9 @@ struct ifcvf_internal {
>  	rte_atomic32_t running;
>  	rte_spinlock_t lock;
>  	bool sw_lm;
> +	bool sw_fallback_running;
> +	/* mediated vring for sw fallback */
> +	struct vring m_vring[IFCVF_MAX_QUEUES * 2];
>  };
>
>  struct internal_list {
> @@ -308,6 +311,9 @@ vdpa_ifcvf_stop(struct ifcvf_internal *internal)
>  		rte_vhost_set_vring_base(vid, i, hw->vring[i].last_avail_idx,
>  				hw->vring[i].last_used_idx);
>
> +	if (internal->sw_lm)
> +		return;
> +
>  	rte_vhost_get_negotiated_features(vid, &features);
>  	if (RTE_VHOST_NEED_LOG(features)) {
>  		ifcvf_disable_logging(hw);
> @@ -539,6 +545,318 @@ update_datapath(struct ifcvf_internal *internal)
>  	return ret;
>  }
>
> +static int
> +m_ifcvf_start(struct ifcvf_internal *internal)
> +{
> +	struct ifcvf_hw *hw = &internal->hw;
> +	uint32_t i, nr_vring;
> +	int vid, ret;
> +	struct rte_vhost_vring vq;
> +	void *vring_buf;
> +	uint64_t m_vring_iova = IFCVF_MEDIATE_VRING;
> +	uint64_t size;
> +	uint64_t gpa;
> +
> +	vid = internal->vid;
> +	nr_vring = rte_vhost_get_vring_num(vid);
> +	rte_vhost_get_negotiated_features(vid, &hw->req_features);
> +
> +	for (i = 0; i < nr_vring; i++) {
> +		rte_vhost_get_vhost_vring(vid, i, &vq);
> +
> +		size = RTE_ALIGN_CEIL(vring_size(vq.size, PAGE_SIZE),
> +				PAGE_SIZE);
> +		vring_buf = rte_zmalloc("ifcvf", size, PAGE_SIZE);
> +		vring_init(&internal->m_vring[i], vq.size, vring_buf,
> +				PAGE_SIZE);
> +
> +		ret = rte_vfio_container_dma_map(internal->vfio_container_fd,
> +			(uint64_t)(uintptr_t)vring_buf, m_vring_iova, size);
> +		if (ret < 0) {
> +			DRV_LOG(ERR, "mediate vring DMA map failed.");
> +			goto error;
> +		}
> +
> +		gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.desc);
> +		if (gpa == 0) {
> +			DRV_LOG(ERR, "Fail to get GPA for descriptor ring.");
> +			return -1;
> +		}
> +		hw->vring[i].desc = gpa;
> +
> +		hw->vring[i].avail = m_vring_iova +
> +			(char *)internal->m_vring[i].avail -
> +			(char *)internal->m_vring[i].desc;
> +
> +		hw->vring[i].used = m_vring_iova +
> +			(char *)internal->m_vring[i].used -
> +			(char *)internal->m_vring[i].desc;
> +
> +		hw->vring[i].size = vq.size;
> +
> +		rte_vhost_get_vring_base(vid, i, &hw->vring[i].last_avail_idx,
> +				&hw->vring[i].last_used_idx);
> +
> +		m_vring_iova += size;
> +	}
> +	hw->nr_vring = nr_vring;
> +
> +	return ifcvf_start_hw(&internal->hw);
> +
> +error:
> +	for (i = 0; i < nr_vring; i++)
> +		if (internal->m_vring[i].desc)
> +			rte_free(internal->m_vring[i].desc);
> +
> +	return -1;
> +}
> +
> +static int
> +m_ifcvf_stop(struct ifcvf_internal *internal)
> +{
> +	int vid;
> +	uint32_t i;
> +	struct rte_vhost_vring vq;
> +	struct ifcvf_hw *hw = &internal->hw;
> +	uint64_t m_vring_iova = IFCVF_MEDIATE_VRING;
> +	uint64_t size, len;
> +
> +	vid = internal->vid;
> +	ifcvf_stop_hw(hw);
> +
> +	for (i = 0; i < hw->nr_vring; i++) {
> +		rte_vhost_get_vhost_vring(vid, i, &vq);
> +		len = IFCVF_USED_RING_LEN(vq.size);
> +		rte_vhost_log_used_vring(vid, i, 0, len);
> +
> +		size = RTE_ALIGN_CEIL(vring_size(vq.size, PAGE_SIZE),
> +				PAGE_SIZE);
> +		rte_vfio_container_dma_unmap(internal->vfio_container_fd,
> +			(uint64_t)(uintptr_t)internal->m_vring[i].desc,
> +			m_vring_iova, size);
> +
> +		rte_vhost_set_vring_base(vid, i, hw->vring[i].last_avail_idx,
> +				hw->vring[i].last_used_idx);
> +		rte_free(internal->m_vring[i].desc);
> +		m_vring_iova += size;
> +	}
> +
> +	return 0;
> +}
> +
> +static int
> +m_enable_vfio_intr(struct ifcvf_internal *internal)
> +{
> +	uint32_t nr_vring;
> +	struct rte_intr_handle *intr_handle = &internal->pdev->intr_handle;
> +	int ret;
> +
> +	nr_vring = rte_vhost_get_vring_num(internal->vid);
> +
> +	ret = rte_intr_efd_enable(intr_handle, nr_vring);
> +	if (ret)
> +		return -1;
> +
> +	ret = rte_intr_enable(intr_handle);
> +	if (ret)
> +		return -1;
> +
> +	return 0;
> +}
> +
> +static void
> +m_disable_vfio_intr(struct ifcvf_internal *internal)
> +{
> +	struct rte_intr_handle *intr_handle = &internal->pdev->intr_handle;
> +
> +	rte_intr_efd_disable(intr_handle);
> +	rte_intr_disable(intr_handle);
> +}
> +
> +static void
> +update_avail_ring(struct ifcvf_internal *internal, uint16_t qid)
> +{
> +	rte_vdpa_relay_vring_avail(internal->vid, qid, &internal->m_vring[qid]);
> +	ifcvf_notify_queue(&internal->hw, qid);
> +}
> +
> +static void
> +update_used_ring(struct ifcvf_internal *internal, uint16_t qid)
> +{
> +	rte_vdpa_relay_vring_used(internal->vid, qid, &internal->m_vring[qid]);
> +	rte_vhost_vring_call(internal->vid, qid);
> +}
> +
> +static void *
> +vring_relay(void *arg)
> +{
> +	int i, vid, epfd, fd, nfds;
> +	struct ifcvf_internal *internal = (struct ifcvf_internal *)arg;
> +	struct rte_vhost_vring vring;
> +	struct rte_intr_handle *intr_handle;
> +	uint16_t qid, q_num;
> +	struct epoll_event events[IFCVF_MAX_QUEUES * 4];
> +	struct epoll_event ev;
> +	int nbytes;
> +	uint64_t buf;
> +
> +	vid = internal->vid;
> +	q_num = rte_vhost_get_vring_num(vid);
> +	/* prepare the mediate vring */
> +	for (qid = 0; qid < q_num; qid++) {
> +		rte_vhost_get_vring_base(vid, qid,
> +				&internal->m_vring[qid].avail->idx,
> +				&internal->m_vring[qid].used->idx);
> +		rte_vdpa_relay_vring_avail(vid, qid, &internal->m_vring[qid]);
> +	}
> +
> +	/* add notify fd and interrupt fd to epoll */
> +	epfd = epoll_create(IFCVF_MAX_QUEUES * 2);
> +	if (epfd < 0) {
> +		DRV_LOG(ERR, "failed to create epoll instance.");
> +		return NULL;
> +	}
> +	internal->epfd = epfd;
> +
> +	for (qid = 0; qid < q_num; qid++) {
> +		ev.events = EPOLLIN | EPOLLPRI;
> +		rte_vhost_get_vhost_vring(vid, qid, &vring);
> +		ev.data.u64 = qid << 1 | (uint64_t)vring.kickfd << 32;
> +		if (epoll_ctl(epfd, EPOLL_CTL_ADD, vring.kickfd, &ev) < 0) {
> +			DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
> +			return NULL;
> +		}
> +	}
> +
> +	intr_handle = &internal->pdev->intr_handle;
> +	for (qid = 0; qid < q_num; qid++) {
> +		ev.events = EPOLLIN | EPOLLPRI;
> +		ev.data.u64 = 1 | qid << 1 |
> +			(uint64_t)intr_handle->efds[qid] << 32;
> +		if (epoll_ctl(epfd, EPOLL_CTL_ADD, intr_handle->efds[qid], &ev)
> +				< 0) {
> +			DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
> +			return NULL;
> +		}
> +	}
> +
> +	/* start relay with a first kick */
> +	for (qid = 0; qid < q_num; qid++)
> +		ifcvf_notify_queue(&internal->hw, qid);
> +
> +	/* listen to the events and react accordingly */
> +	for (;;) {
> +		nfds = epoll_wait(epfd, events, q_num * 2, -1);
> +		if (nfds < 0) {
> +			if (errno == EINTR)
> +				continue;
> +			DRV_LOG(ERR, "epoll_wait return fail\n");
> +			return NULL;
> +		}
> +
> +		for (i = 0; i < nfds; i++) {
> +			fd = (uint32_t)(events[i].data.u64 >> 32);
> +			do {
> +				nbytes = read(fd, &buf, 8);
> +				if (nbytes < 0) {
> +					if (errno == EINTR ||
> +						errno == EWOULDBLOCK ||
> +						errno == EAGAIN)
> +						continue;
> +					DRV_LOG(INFO, "Error reading "
> +						"kickfd: %s",
> +						strerror(errno));
> +				}
> +				break;
> +			} while (1);
> +
> +			qid = events[i].data.u32 >> 1;
> +
> +			if (events[i].data.u32 & 1)
> +				update_used_ring(internal, qid);
> +			else
> +				update_avail_ring(internal, qid);
> +		}
> +	}
> +
> +	return NULL;
> +}
> +
> +static int
> +setup_vring_relay(struct ifcvf_internal *internal)
> +{
> +	int ret;
> +
> +	ret = pthread_create(&internal->tid, NULL, vring_relay,
> +			(void *)internal);

So it will be scheduled without any affinity?
Shouldn't it use a pmd thread instead?

> +	if (ret) {
> +		DRV_LOG(ERR, "failed to create ring relay pthread.");
> +		return -1;
> +	}
> +	return 0;
> +}
> +
> +static int
> +unset_vring_relay(struct ifcvf_internal *internal)
> +{
> +	void *status;
> +
> +	if (internal->tid) {
> +		pthread_cancel(internal->tid);
> +		pthread_join(internal->tid, &status);
> +	}
> +	internal->tid = 0;
> +
> +	if (internal->epfd >= 0)
> +		close(internal->epfd);
> +	internal->epfd = -1;
> +
> +	return 0;
> +}
> +
> +static int
> +ifcvf_sw_fallback_switchover(struct ifcvf_internal *internal)
> +{
> +	int ret;
> +
> +	/* stop the direct IO data path */
> +	unset_notify_relay(internal);
> +	vdpa_ifcvf_stop(internal);
> +	vdpa_disable_vfio_intr(internal);
> +
> +	ret = rte_vhost_host_notifier_ctrl(internal->vid, false);
> +	if (ret && ret != -ENOTSUP)
> +		goto error;
> +
> +	/* set up interrupt for interrupt relay */
> +	ret = m_enable_vfio_intr(internal);
> +	if (ret)
> +		goto unmap;
> +
> +	/* config the VF */
> +	ret = m_ifcvf_start(internal);
> +	if (ret)
> +		goto unset_intr;
> +
> +	/* set up vring relay thread */
> +	ret = setup_vring_relay(internal);
> +	if (ret)
> +		goto stop_vf;
> +
> +	internal->sw_fallback_running = true;
> +
> +	return 0;
> +
> +stop_vf:
> +	m_ifcvf_stop(internal);
> +unset_intr:
> +	m_disable_vfio_intr(internal);
> +unmap:
> +	ifcvf_dma_map(internal, 0);
> +error:
> +	return -1;
> +}
> +
>  static int
>  ifcvf_dev_config(int vid)
>  {
> @@ -579,8 +897,25 @@ ifcvf_dev_close(int vid)
>  	}
>
>  	internal = list->internal;
> -	rte_atomic32_set(&internal->dev_attached, 0);
> -	update_datapath(internal);
> +
> +	if (internal->sw_fallback_running) {
> +		/* unset ring relay */
> +		unset_vring_relay(internal);
> +
> +		/* reset VF */
> +		m_ifcvf_stop(internal);
> +
> +		/* remove interrupt setting */
> +		m_disable_vfio_intr(internal);
> +
> +		/* unset DMA map for guest memory */
> +		ifcvf_dma_map(internal, 0);
> +
> +		internal->sw_fallback_running = false;
> +	} else {
> +		rte_atomic32_set(&internal->dev_attached, 0);
> +		update_datapath(internal);
> +	}
>
>  	return 0;
>  }
> @@ -604,7 +939,12 @@ ifcvf_set_features(int vid)
>  	internal = list->internal;
>  	rte_vhost_get_negotiated_features(vid, &features);
>
> -	if (RTE_VHOST_NEED_LOG(features)) {
> +	if (!RTE_VHOST_NEED_LOG(features))
> +		return 0;
> +
> +	if (internal->sw_lm) {
> +		ifcvf_sw_fallback_switchover(internal);
> +	} else {
>  		rte_vhost_get_log_base(vid, &log_base, &log_size);
>  		rte_vfio_container_dma_map(internal->vfio_container_fd,
>  				log_base, IFCVF_LOG_BASE, log_size);
>
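
To make the affinity question above a bit more concrete, here is an untested
sketch of one option: create the relay as a DPDK control thread instead of a
bare pthread, so it gets a name and inherits the EAL control-thread affinity,
i.e. it stays off the datapath lcores. The "vring-relay" name is arbitrary,
the rest mirrors setup_vring_relay() from the patch:

static int
setup_vring_relay(struct ifcvf_internal *internal)
{
	int ret;

	/* Sketch only: rte_ctrl_thread_create() names the thread and sets
	 * its affinity to the cpuset computed at rte_eal_init() time,
	 * excluding the dataplane and service lcores.
	 */
	ret = rte_ctrl_thread_create(&internal->tid, "vring-relay", NULL,
			vring_relay, (void *)internal);
	if (ret) {
		DRV_LOG(ERR, "failed to create ring relay thread.");
		return -1;
	}
	return 0;
}

The alternative I was hinting at would be to run the relay on a dedicated
lcore via rte_eal_remote_launch(), which gives full control over placement
but permanently occupies a core even when, as the commit message notes, the
event-driven relay is mostly idle at low throughput.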