From mboxrd@z Thu Jan 1 00:00:00 1970
Return-Path:
Received: from mx1.redhat.com (mx1.redhat.com [209.132.183.28]) by dpdk.org (Postfix) with ESMTP id 4019A2F42 for ; Sun, 16 Dec 2018 10:35:13 +0100 (CET)
Received: from smtp.corp.redhat.com (int-mx05.intmail.prod.int.phx2.redhat.com [10.5.11.15]) (using TLSv1.2 with cipher AECDH-AES256-SHA (256/256 bits)) (No client certificate requested) by mx1.redhat.com (Postfix) with ESMTPS id 8171BC052512; Sun, 16 Dec 2018 09:35:12 +0000 (UTC)
Received: from [10.36.112.14] (unknown [10.36.112.14]) by smtp.corp.redhat.com (Postfix) with ESMTPS id 739085D756; Sun, 16 Dec 2018 09:35:10 +0000 (UTC)
To: Xiao Wang , tiwei.bie@intel.com
Cc: alejandro.lucero@netronome.com, dev@dpdk.org, zhihong.wang@intel.com, xiaolong.ye@intel.com
References: <20181213100910.13087-2-xiao.w.wang@intel.com> <20181214211612.167681-1-xiao.w.wang@intel.com> <20181214211612.167681-10-xiao.w.wang@intel.com>
From: Maxime Coquelin
Message-ID:
Date: Sun, 16 Dec 2018 10:35:07 +0100
User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:60.0) Gecko/20100101 Thunderbird/60.2.1
MIME-Version: 1.0
In-Reply-To: <20181214211612.167681-10-xiao.w.wang@intel.com>
Content-Type: text/plain; charset=utf-8; format=flowed
Content-Language: en-US
Content-Transfer-Encoding: 7bit
X-Scanned-By: MIMEDefang 2.79 on 10.5.11.15
X-Greylist: Sender IP whitelisted, not delayed by milter-greylist-4.5.16 (mx1.redhat.com [10.5.110.31]); Sun, 16 Dec 2018 09:35:12 +0000 (UTC)
Subject: Re: [dpdk-dev] [PATCH v4 09/10] net/ifc: support SW assisted VDPA live migration
X-BeenThere: dev@dpdk.org
X-Mailman-Version: 2.1.15
Precedence: list
List-Id: DPDK patches and discussions
List-Unsubscribe: ,
List-Archive:
List-Post:
List-Help:
List-Subscribe: ,
X-List-Received-Date: Sun, 16 Dec 2018 09:35:13 -0000

On 12/14/18 10:16 PM, Xiao Wang wrote:
> In SW assisted live migration mode, driver will stop the device and
> setup a mediate virtio ring to relay the communication between the
> virtio driver and the VDPA device.
>
> This data path intervention will allow SW to help on guest dirty page
> logging for live migration.
>
> This SW fallback is event driven relay thread, so when the network
> throughput is low, this SW fallback will take little CPU resource, but
> when the throughput goes up, the relay thread's CPU usage will goes up
> accordinly.

s/accordinly/accordingly/

>
> User needs to take all the factors including CPU usage, guest perf
> degradation, etc. into consideration when selecting the live migration
> support mode.
>
> Signed-off-by: Xiao Wang
> ---
>  drivers/net/ifc/base/ifcvf.h | 1 +
>  drivers/net/ifc/ifcvf_vdpa.c | 346 ++++++++++++++++++++++++++++++++++++++++++-
>  2 files changed, 344 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/net/ifc/base/ifcvf.h b/drivers/net/ifc/base/ifcvf.h
> index c15c69107..e8a30d2c6 100644
> --- a/drivers/net/ifc/base/ifcvf.h
> +++ b/drivers/net/ifc/base/ifcvf.h
> @@ -50,6 +50,7 @@
>  #define IFCVF_LM_ENABLE_VF 0x1
>  #define IFCVF_LM_ENABLE_PF 0x3
>  #define IFCVF_LOG_BASE 0x100000000000
> +#define IFCVF_MEDIATE_VRING 0x200000000000

MEDIATED?
>
>  #define IFCVF_32_BIT_MASK 0xffffffff
>
> diff --git a/drivers/net/ifc/ifcvf_vdpa.c b/drivers/net/ifc/ifcvf_vdpa.c
> index f181c5a6e..61757d0b4 100644
> --- a/drivers/net/ifc/ifcvf_vdpa.c
> +++ b/drivers/net/ifc/ifcvf_vdpa.c
> @@ -63,6 +63,9 @@ struct ifcvf_internal {
>  	rte_atomic32_t running;
>  	rte_spinlock_t lock;
>  	bool sw_lm;
> +	bool sw_fallback_running;
> +	/* mediated vring for sw fallback */
> +	struct vring m_vring[IFCVF_MAX_QUEUES * 2];
>  };
>
>  struct internal_list {
> @@ -308,6 +311,9 @@ vdpa_ifcvf_stop(struct ifcvf_internal *internal)
>  		rte_vhost_set_vring_base(vid, i, hw->vring[i].last_avail_idx,
>  				hw->vring[i].last_used_idx);
>
> +	if (internal->sw_lm)
> +		return;
> +
>  	rte_vhost_get_negotiated_features(vid, &features);
>  	if (RTE_VHOST_NEED_LOG(features)) {
>  		ifcvf_disable_logging(hw);
> @@ -539,6 +545,318 @@ update_datapath(struct ifcvf_internal *internal)
>  	return ret;
>  }
>
> +static int
> +m_ifcvf_start(struct ifcvf_internal *internal)
> +{
> +	struct ifcvf_hw *hw = &internal->hw;
> +	uint32_t i, nr_vring;
> +	int vid, ret;
> +	struct rte_vhost_vring vq;
> +	void *vring_buf;
> +	uint64_t m_vring_iova = IFCVF_MEDIATE_VRING;
> +	uint64_t size;
> +	uint64_t gpa;
> +
> +	vid = internal->vid;
> +	nr_vring = rte_vhost_get_vring_num(vid);
> +	rte_vhost_get_negotiated_features(vid, &hw->req_features);
> +
> +	for (i = 0; i < nr_vring; i++) {
> +		rte_vhost_get_vhost_vring(vid, i, &vq);
> +
> +		size = RTE_ALIGN_CEIL(vring_size(vq.size, PAGE_SIZE),
> +				PAGE_SIZE);
> +		vring_buf = rte_zmalloc("ifcvf", size, PAGE_SIZE);
> +		vring_init(&internal->m_vring[i], vq.size, vring_buf,
> +				PAGE_SIZE);
> +
> +		ret = rte_vfio_container_dma_map(internal->vfio_container_fd,
> +			(uint64_t)(uintptr_t)vring_buf, m_vring_iova, size);
> +		if (ret < 0) {
> +			DRV_LOG(ERR, "mediate vring DMA map failed.");
> +			goto error;
> +		}
> +
> +		gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.desc);
> +		if (gpa == 0) {
> +			DRV_LOG(ERR, "Fail to get GPA for descriptor ring.");
> +			return -1;
> +		}
> +		hw->vring[i].desc = gpa;
> +
> +		hw->vring[i].avail = m_vring_iova +
> +			(char *)internal->m_vring[i].avail -
> +			(char *)internal->m_vring[i].desc;
> +
> +		hw->vring[i].used = m_vring_iova +
> +			(char *)internal->m_vring[i].used -
> +			(char *)internal->m_vring[i].desc;
> +
> +		hw->vring[i].size = vq.size;
> +
> +		rte_vhost_get_vring_base(vid, i, &hw->vring[i].last_avail_idx,
> +				&hw->vring[i].last_used_idx);
> +
> +		m_vring_iova += size;
> +	}
> +	hw->nr_vring = nr_vring;
> +
> +	return ifcvf_start_hw(&internal->hw);
> +
> +error:
> +	for (i = 0; i < nr_vring; i++)
> +		if (internal->m_vring[i].desc)
> +			rte_free(internal->m_vring[i].desc);
> +
> +	return -1;
> +}
> +
> +static int
> +m_ifcvf_stop(struct ifcvf_internal *internal)
> +{
> +	int vid;
> +	uint32_t i;
> +	struct rte_vhost_vring vq;
> +	struct ifcvf_hw *hw = &internal->hw;
> +	uint64_t m_vring_iova = IFCVF_MEDIATE_VRING;
> +	uint64_t size, len;
> +
> +	vid = internal->vid;
> +	ifcvf_stop_hw(hw);
> +
> +	for (i = 0; i < hw->nr_vring; i++) {
> +		rte_vhost_get_vhost_vring(vid, i, &vq);
> +		len = IFCVF_USED_RING_LEN(vq.size);
> +		rte_vhost_log_used_vring(vid, i, 0, len);
> +
> +		size = RTE_ALIGN_CEIL(vring_size(vq.size, PAGE_SIZE),
> +				PAGE_SIZE);
> +		rte_vfio_container_dma_unmap(internal->vfio_container_fd,
> +			(uint64_t)(uintptr_t)internal->m_vring[i].desc,
> +			m_vring_iova, size);
> +
> +		rte_vhost_set_vring_base(vid, i, hw->vring[i].last_avail_idx,
> +				hw->vring[i].last_used_idx);
> +		rte_free(internal->m_vring[i].desc);
> +		m_vring_iova += size;
> +	}
> +
> +	return 0;
> +}
> +
> +static int
> +m_enable_vfio_intr(struct ifcvf_internal *internal)
> +{
> +	uint32_t nr_vring;
> +	struct rte_intr_handle *intr_handle = &internal->pdev->intr_handle;
> +	int ret;
> +
> +	nr_vring = rte_vhost_get_vring_num(internal->vid);
> +
> +	ret = rte_intr_efd_enable(intr_handle, nr_vring);
> +	if (ret)
> +		return -1;
> +
> +	ret = rte_intr_enable(intr_handle);
> +	if (ret)
> +		return -1;
> +
> +	return 0;
> +}
> +
> +static void
> +m_disable_vfio_intr(struct ifcvf_internal *internal)
> +{
> +	struct rte_intr_handle *intr_handle = &internal->pdev->intr_handle;
> +
> +	rte_intr_efd_disable(intr_handle);
> +	rte_intr_disable(intr_handle);
> +}
> +
> +static void
> +update_avail_ring(struct ifcvf_internal *internal, uint16_t qid)
> +{
> +	rte_vdpa_relay_vring_avail(internal->vid, qid, &internal->m_vring[qid]);
> +	ifcvf_notify_queue(&internal->hw, qid);
> +}
> +
> +static void
> +update_used_ring(struct ifcvf_internal *internal, uint16_t qid)
> +{
> +	rte_vdpa_relay_vring_used(internal->vid, qid, &internal->m_vring[qid]);
> +	rte_vhost_vring_call(internal->vid, qid);
> +}
> +
> +static void *
> +vring_relay(void *arg)
> +{
> +	int i, vid, epfd, fd, nfds;
> +	struct ifcvf_internal *internal = (struct ifcvf_internal *)arg;
> +	struct rte_vhost_vring vring;
> +	struct rte_intr_handle *intr_handle;
> +	uint16_t qid, q_num;
> +	struct epoll_event events[IFCVF_MAX_QUEUES * 4];
> +	struct epoll_event ev;
> +	int nbytes;
> +	uint64_t buf;
> +
> +	vid = internal->vid;
> +	q_num = rte_vhost_get_vring_num(vid);
> +	/* prepare the mediate vring */
> +	for (qid = 0; qid < q_num; qid++) {
> +		rte_vhost_get_vring_base(vid, qid,
> +				&internal->m_vring[qid].avail->idx,
> +				&internal->m_vring[qid].used->idx);
> +		rte_vdpa_relay_vring_avail(vid, qid, &internal->m_vring[qid]);
> +	}
> +
> +	/* add notify fd and interrupt fd to epoll */
> +	epfd = epoll_create(IFCVF_MAX_QUEUES * 2);
> +	if (epfd < 0) {
> +		DRV_LOG(ERR, "failed to create epoll instance.");
> +		return NULL;
> +	}
> +	internal->epfd = epfd;
> +
> +	for (qid = 0; qid < q_num; qid++) {
> +		ev.events = EPOLLIN | EPOLLPRI;
> +		rte_vhost_get_vhost_vring(vid, qid, &vring);
> +		ev.data.u64 = qid << 1 | (uint64_t)vring.kickfd << 32;
> +		if (epoll_ctl(epfd, EPOLL_CTL_ADD, vring.kickfd, &ev) < 0) {
> +			DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
> +			return NULL;
> +		}
> +	}
> +
> +	intr_handle = &internal->pdev->intr_handle;
> +	for (qid = 0; qid < q_num; qid++) {
> +		ev.events = EPOLLIN | EPOLLPRI;
> +		ev.data.u64 = 1 | qid << 1 |
> +			(uint64_t)intr_handle->efds[qid] << 32;
> +		if (epoll_ctl(epfd, EPOLL_CTL_ADD, intr_handle->efds[qid], &ev)
> +				< 0) {
> +			DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
> +			return NULL;
> +		}
> +	}
> +
> +	/* start relay with a first kick */
> +	for (qid = 0; qid < q_num; qid++)
> +		ifcvf_notify_queue(&internal->hw, qid);
> +
> +	/* listen to the events and react accordingly */
> +	for (;;) {
> +		nfds = epoll_wait(epfd, events, q_num * 2, -1);
> +		if (nfds < 0) {
> +			if (errno == EINTR)
> +				continue;
> +			DRV_LOG(ERR, "epoll_wait return fail\n");
> +			return NULL;
> +		}
> +
> +		for (i = 0; i < nfds; i++) {
> +			fd = (uint32_t)(events[i].data.u64 >> 32);
> +			do {
> +				nbytes = read(fd, &buf, 8);
> +				if (nbytes < 0) {
> +					if (errno == EINTR ||
> +						errno == EWOULDBLOCK ||
> +						errno == EAGAIN)
> +						continue;
> +					DRV_LOG(INFO, "Error reading "
> +						"kickfd: %s",
> +						strerror(errno));
> +				}
> +				break;
> +			} while (1);
> +
> +			qid = events[i].data.u32 >> 1;
> +
> +			if (events[i].data.u32 & 1)
> +				update_used_ring(internal, qid);
> +			else
> +				update_avail_ring(internal, qid);
> +		}
> +	}
> +
> +	return NULL;
> +}
> +
> +static int
> +setup_vring_relay(struct ifcvf_internal *internal)
> +{
> +	int ret;
> +
> +	ret = pthread_create(&internal->tid, NULL, vring_relay,
> +			(void *)internal);

So it will be scheduled without any affinity?
Shouldn't it use a pmd thread instead?

> +	if (ret) {
> +		DRV_LOG(ERR, "failed to create ring relay pthread.");
> +		return -1;
> +	}
> +	return 0;
> +}
> +
> +static int
> +unset_vring_relay(struct ifcvf_internal *internal)
> +{
> +	void *status;
> +
> +	if (internal->tid) {
> +		pthread_cancel(internal->tid);
> +		pthread_join(internal->tid, &status);
> +	}
> +	internal->tid = 0;
> +
> +	if (internal->epfd >= 0)
> +		close(internal->epfd);
> +	internal->epfd = -1;
> +
> +	return 0;
> +}
> +
> +static int
> +ifcvf_sw_fallback_switchover(struct ifcvf_internal *internal)
> +{
> +	int ret;
> +
> +	/* stop the direct IO data path */
> +	unset_notify_relay(internal);
> +	vdpa_ifcvf_stop(internal);
> +	vdpa_disable_vfio_intr(internal);
> +
> +	ret = rte_vhost_host_notifier_ctrl(internal->vid, false);
> +	if (ret && ret != -ENOTSUP)
> +		goto error;
> +
> +	/* set up interrupt for interrupt relay */
> +	ret = m_enable_vfio_intr(internal);
> +	if (ret)
> +		goto unmap;
> +
> +	/* config the VF */
> +	ret = m_ifcvf_start(internal);
> +	if (ret)
> +		goto unset_intr;
> +
> +	/* set up vring relay thread */
> +	ret = setup_vring_relay(internal);
> +	if (ret)
> +		goto stop_vf;
> +
> +	internal->sw_fallback_running = true;
> +
> +	return 0;
> +
> +stop_vf:
> +	m_ifcvf_stop(internal);
> +unset_intr:
> +	m_disable_vfio_intr(internal);
> +unmap:
> +	ifcvf_dma_map(internal, 0);
> +error:
> +	return -1;
> +}
> +
>  static int
>  ifcvf_dev_config(int vid)
>  {
> @@ -579,8 +897,25 @@ ifcvf_dev_close(int vid)
>  	}
>
>  	internal = list->internal;
> -	rte_atomic32_set(&internal->dev_attached, 0);
> -	update_datapath(internal);
> +
> +	if (internal->sw_fallback_running) {
> +		/* unset ring relay */
> +		unset_vring_relay(internal);
> +
> +		/* reset VF */
> +		m_ifcvf_stop(internal);
> +
> +		/* remove interrupt setting */
> +		m_disable_vfio_intr(internal);
> +
> +		/* unset DMA map for guest memory */
> +		ifcvf_dma_map(internal, 0);
> +
> +		internal->sw_fallback_running = false;
> +	} else {
> +		rte_atomic32_set(&internal->dev_attached, 0);
> +		update_datapath(internal);
> +	}
>
>  	return 0;
>  }
> @@ -604,7 +939,12 @@ ifcvf_set_features(int vid)
>  	internal = list->internal;
>  	rte_vhost_get_negotiated_features(vid, &features);
>
> -	if (RTE_VHOST_NEED_LOG(features)) {
> +	if (!RTE_VHOST_NEED_LOG(features))
> +		return 0;
> +
> +	if (internal->sw_lm) {
> +		ifcvf_sw_fallback_switchover(internal);
> +	} else {
>  		rte_vhost_get_log_base(vid, &log_base, &log_size);
>  		rte_vfio_container_dma_map(internal->vfio_container_fd,
>  				log_base, IFCVF_LOG_BASE, log_size);
>
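
To make the affinity question above a bit more concrete, here is an untested
sketch of one option: create the relay as a DPDK control thread instead of a
bare pthread, so it gets a name and inherits the EAL control-thread affinity,
i.e. it stays off the datapath lcores. The "vring-relay" name is arbitrary,
the rest mirrors setup_vring_relay() from the patch:

static int
setup_vring_relay(struct ifcvf_internal *internal)
{
	int ret;

	/* Sketch only: rte_ctrl_thread_create() names the thread and sets
	 * its affinity to the cpuset computed at rte_eal_init() time,
	 * excluding the dataplane and service lcores.
	 */
	ret = rte_ctrl_thread_create(&internal->tid, "vring-relay", NULL,
			vring_relay, (void *)internal);
	if (ret) {
		DRV_LOG(ERR, "failed to create ring relay thread.");
		return -1;
	}
	return 0;
}

The alternative I was hinting at would be to run the relay on a dedicated
lcore via rte_eal_remote_launch(), which gives full control over placement
but permanently occupies a core even when, as the commit message notes, the
event-driven relay is mostly idle at low throughput.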