Date: Fri, 24 Nov 2017 14:04:32 +0800
From: Tiwei Bie
To: Xiao Wang
Cc: dev@dpdk.org, yliu@fridaylinux.org
Message-ID: <20171124060432.wi5mducdymweuk7u@deepin-15.5-oivbkq>
References: <1511521440-57724-1-git-send-email-xiao.w.wang@intel.com> <1511521440-57724-3-git-send-email-xiao.w.wang@intel.com>
In-Reply-To: <1511521440-57724-3-git-send-email-xiao.w.wang@intel.com>
Subject: Re: [dpdk-dev] [PATCH 2/2] net/virtio: support GUEST ANNOUNCE

Hi,

Some quick comments. Will go through the whole patch later.

On Fri, Nov 24, 2017 at 03:04:00AM -0800, Xiao Wang wrote:
> When live migration is done, for the backup VM, either the virtio
> frontend or the vhost backend needs to send out gratuitous RARP packet
> to announce its new network location.
> 
> This patch enables VIRTIO_NET_F_GUEST_ANNOUNCE feature to support live
> migration scenario where the vhost backend doesn't have the ability to
> generate RARP packet.
> 
> Brief introduction of the work flow:
> 1. QEMU finishes live migration, pokes the backup VM with an interrupt.
> 2. Virtio interrupt handler reads out the interrupt status value, and
>    realizes it needs to send out RARP packet to announce its location.
> 3. Pause device to stop worker thread touching the queues.
> 4. Inject a RARP packet into a Tx Queue.
> 5. Ack the interrupt via control queue.
> 6. Resume device to continue packet processing.
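
Thanks for the clear description. Just to confirm I read the flow right,
the announce handling in the interrupt handler boils down to roughly the
following (pseudo code based on the function names introduced later in
this patch, not the actual code):

	/* in virtio_interrupt_handler(), when the ISR reports an announce */
	if (isr & VIRTIO_NET_S_ANNOUNCE) {
		generate_rarp(dev);             /* steps 3, 4 and 6 above */
		virtnet_ack_link_announce(dev); /* step 5 */
	}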
> 
> Signed-off-by: Xiao Wang
> ---
>  drivers/net/virtio/virtio_ethdev.c | 131 ++++++++++++++++++++++++++++++++++++-
>  drivers/net/virtio/virtio_ethdev.h |   4 ++
>  drivers/net/virtio/virtio_pci.h    |   1 +
>  drivers/net/virtio/virtio_rxtx.c   |  81 +++++++++++++++++++++++
>  drivers/net/virtio/virtqueue.h     |  11 ++++
>  5 files changed, 226 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/net/virtio/virtio_ethdev.c b/drivers/net/virtio/virtio_ethdev.c
> index 1959b11..6eaea0e 100644
> --- a/drivers/net/virtio/virtio_ethdev.c
> +++ b/drivers/net/virtio/virtio_ethdev.c
> @@ -48,6 +48,8 @@
>  #include
>  #include
>  #include
> +#include
> +#include
>  #include
>  #include
>  #include
> @@ -55,6 +57,7 @@
>  #include
>  #include
>  #include
> +#include
> 
>  #include "virtio_ethdev.h"
>  #include "virtio_pci.h"
> @@ -106,6 +109,13 @@ static int virtio_dev_queue_stats_mapping_set(
>  	uint8_t stat_idx,
>  	uint8_t is_rx);
> 
> +static int make_rarp_packet(struct rte_mbuf *rarp_mbuf,
> +		const struct ether_addr *mac);
> +static int virtio_dev_pause(struct rte_eth_dev *dev);
> +static void virtio_dev_resume(struct rte_eth_dev *dev);
> +static void generate_rarp(struct rte_eth_dev *dev);
> +static void virtnet_ack_link_announce(struct rte_eth_dev *dev);
> +
>  /*
>   * The set of PCI devices this driver supports
>   */
> @@ -1249,9 +1259,116 @@ static int virtio_dev_xstats_get_names(struct rte_eth_dev *dev,
>  	return 0;
>  }
> 
> +#define RARP_PKT_SIZE	64
> +
> +static int
> +make_rarp_packet(struct rte_mbuf *rarp_mbuf, const struct ether_addr *mac)
> +{
> +	struct ether_hdr *eth_hdr;
> +	struct arp_hdr *rarp;
> +
> +	if (rarp_mbuf->buf_len < RARP_PKT_SIZE) {
> +		PMD_DRV_LOG(ERR, "mbuf size too small %u (< %d)",
> +				rarp_mbuf->buf_len, RARP_PKT_SIZE);
> +		return -1;
> +	}
> +
> +	/* Ethernet header. */
> +	eth_hdr = rte_pktmbuf_mtod_offset(rarp_mbuf, struct ether_hdr *, 0);

You can use rte_pktmbuf_mtod() directly.

> +	memset(eth_hdr->d_addr.addr_bytes, 0xff, ETHER_ADDR_LEN);
> +	ether_addr_copy(mac, &eth_hdr->s_addr);
> +	eth_hdr->ether_type = htons(ETHER_TYPE_RARP);
> +
> +	/* RARP header. */
> +	rarp = (struct arp_hdr *)(eth_hdr + 1);
> +	rarp->arp_hrd = htons(ARP_HRD_ETHER);
> +	rarp->arp_pro = htons(ETHER_TYPE_IPv4);
> +	rarp->arp_hln = ETHER_ADDR_LEN;
> +	rarp->arp_pln = 4;
> +	rarp->arp_op = htons(ARP_OP_REVREQUEST);
> +
> +	ether_addr_copy(mac, &rarp->arp_data.arp_sha);
> +	ether_addr_copy(mac, &rarp->arp_data.arp_tha);
> +	memset(&rarp->arp_data.arp_sip, 0x00, 4);
> +	memset(&rarp->arp_data.arp_tip, 0x00, 4);
> +
> +	rarp_mbuf->data_len = RARP_PKT_SIZE;
> +	rarp_mbuf->pkt_len = RARP_PKT_SIZE;
> +
> +	return 0;
> +}
> +
> +static int
> +virtio_dev_pause(struct rte_eth_dev *dev)
> +{
> +	struct virtio_hw *hw = dev->data->dev_private;
> +
> +	if (hw->started == 0)
> +		return -1;
> +	hw->started = 0;
> +	/*
> +	 * Prevent the worker thread from touching queues to avoid condition,
> +	 * 1 ms should be enough for the ongoing Tx function to finish.
> +	 */
> +	rte_delay_ms(1);
> +	return 0;
> +}
> +
> +static void
> +virtio_dev_resume(struct rte_eth_dev *dev)
> +{
> +	struct virtio_hw *hw = dev->data->dev_private;
> +
> +	hw->started = 1;
> +}
> +
> +static void
> +generate_rarp(struct rte_eth_dev *dev)
> +{
> +	struct virtio_hw *hw = dev->data->dev_private;
> +	struct rte_mbuf *rarp_mbuf = NULL;
> +	struct virtnet_tx *txvq = dev->data->tx_queues[0];
> +	struct virtnet_rx *rxvq = dev->data->rx_queues[0];
> +
> +	rarp_mbuf = rte_mbuf_raw_alloc(rxvq->mpool);
> +	if (rarp_mbuf == NULL) {
> +		PMD_DRV_LOG(ERR, "mbuf allocate failed");
> +		return;
> +	}
> +
> +	if (make_rarp_packet(rarp_mbuf, (struct ether_addr *)hw->mac_addr)) {
> +		rte_pktmbuf_free(rarp_mbuf);
> +		rarp_mbuf = NULL;
> +		return;
> +	}
> +
> +	/* If virtio port just stopped, no need to send RARP */
> +	if (virtio_dev_pause(dev) < -1)

You mean < 0?

> +		return;
> +
> +	virtio_inject_pkts(txvq, &rarp_mbuf, 1);
> +	/* Recover the stored hw status to let worker thread continue */
> +	virtio_dev_resume(dev);
> +}
> +
> +static void
> +virtnet_ack_link_announce(struct rte_eth_dev *dev)
> +{
> +	struct virtio_hw *hw = dev->data->dev_private;
> +	struct virtio_pmd_ctrl ctrl;
> +	int len;
> +
> +	ctrl.hdr.class = VIRTIO_NET_CTRL_ANNOUNCE;
> +	ctrl.hdr.cmd = VIRTIO_NET_CTRL_ANNOUNCE_ACK;
> +	len = 0;
> +
> +	virtio_send_command(hw->cvq, &ctrl, &len, 0);
> +}
> +
>  /*
> - * Process Virtio Config changed interrupt and call the callback
> - * if link state changed.
> + * Process virtio config changed interrupt. Call the callback
> + * if link state changed; generate gratuitous RARP packet if
> + * the status indicates an ANNOUNCE.
>   */
>  void
>  virtio_interrupt_handler(void *param)
> @@ -1274,6 +1391,12 @@ static int virtio_dev_xstats_get_names(struct rte_eth_dev *dev,
>  					      NULL, NULL);
>  	}
> 
> +	if (isr & VIRTIO_NET_S_ANNOUNCE) {
> +		rte_spinlock_lock(&hw->sl);
> +		generate_rarp(dev);
> +		virtnet_ack_link_announce(dev);
> +		rte_spinlock_unlock(&hw->sl);
> +	}
>  }
> 
>  /* set rx and tx handlers according to what is supported */
> @@ -1786,6 +1909,8 @@ static int eth_virtio_pci_remove(struct rte_pci_device *pci_dev)
>  		return -EBUSY;
>  	}
> 
> +	rte_spinlock_init(&hw->sl);
> +
>  	hw->use_simple_rx = 1;
>  	hw->use_simple_tx = 1;
> 
> @@ -1952,12 +2077,14 @@ static void virtio_dev_free_mbufs(struct rte_eth_dev *dev)
> 
>  	PMD_INIT_LOG(DEBUG, "stop");
> 
> +	rte_spinlock_lock(&hw->sl);
>  	if (intr_conf->lsc || intr_conf->rxq)
>  		virtio_intr_disable(dev);
> 
>  	hw->started = 0;
>  	memset(&link, 0, sizeof(link));
>  	virtio_dev_atomic_write_link_status(dev, &link);
> +	rte_spinlock_unlock(&hw->sl);
>  }
> 
>  static int
> diff --git a/drivers/net/virtio/virtio_ethdev.h b/drivers/net/virtio/virtio_ethdev.h
> index 2039bc5..24271cb 100644
> --- a/drivers/net/virtio/virtio_ethdev.h
> +++ b/drivers/net/virtio/virtio_ethdev.h
> @@ -67,6 +67,7 @@
>  	 1u << VIRTIO_NET_F_HOST_TSO6 |		\
>  	 1u << VIRTIO_NET_F_MRG_RXBUF |		\
>  	 1u << VIRTIO_NET_F_MTU |		\
> +	 1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE |	\
>  	 1u << VIRTIO_RING_F_INDIRECT_DESC |	\
>  	 1ULL << VIRTIO_F_VERSION_1 |		\
>  	 1ULL << VIRTIO_F_IOMMU_PLATFORM)
> @@ -111,6 +112,9 @@ uint16_t virtio_recv_mergeable_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
>  uint16_t virtio_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
>  		uint16_t nb_pkts);
> 
> +uint16_t virtio_inject_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
> +		uint16_t nb_pkts);
> +
>  uint16_t virtio_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
>  		uint16_t nb_pkts);
> 
> diff --git a/drivers/net/virtio/virtio_pci.h b/drivers/net/virtio/virtio_pci.h
> index 3c5ce66..3cd367e 100644
> --- a/drivers/net/virtio/virtio_pci.h
> +++ b/drivers/net/virtio/virtio_pci.h
> @@ -270,6 +270,7 @@ struct virtio_hw {
>  	struct virtio_pci_common_cfg *common_cfg;
>  	struct virtio_net_config *dev_cfg;
>  	void *virtio_user_dev;
> +	rte_spinlock_t sl;

Need to add some detailed comments to describe what's protected by this lock.

> 
>  	struct virtqueue **vqs;
>  };
> diff --git a/drivers/net/virtio/virtio_rxtx.c b/drivers/net/virtio/virtio_rxtx.c
> index 6a24fde..7313bdd 100644
> --- a/drivers/net/virtio/virtio_rxtx.c
> +++ b/drivers/net/virtio/virtio_rxtx.c
> @@ -1100,3 +1100,84 @@
> 
>  	return nb_tx;
>  }
> +
> +uint16_t
> +virtio_inject_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
> +{
> +	struct virtnet_tx *txvq = tx_queue;
> +	struct virtqueue *vq = txvq->vq;
> +	struct virtio_hw *hw = vq->hw;
> +	uint16_t hdr_size = hw->vtnet_hdr_size;
> +	uint16_t nb_used, nb_tx = 0;
> +
> +	if (unlikely(nb_pkts < 1))
> +		return nb_pkts;
> +
> +	PMD_TX_LOG(DEBUG, "%d packets to xmit", nb_pkts);
> +	nb_used = VIRTQUEUE_NUSED(vq);
> +
> +	virtio_rmb();
> +	if (likely(nb_used > vq->vq_nentries - vq->vq_free_thresh))
> +		virtio_xmit_cleanup(vq, nb_used);
> +
> +	for (nb_tx = 0; nb_tx < nb_pkts; nb_tx++) {
> +		struct rte_mbuf *txm = tx_pkts[nb_tx];
> +		int can_push = 0, use_indirect = 0, slots, need;
> +
> +		/* optimize ring usage */
> +		if ((vtpci_with_feature(hw, VIRTIO_F_ANY_LAYOUT) ||
> +		     vtpci_with_feature(hw, VIRTIO_F_VERSION_1)) &&
> +		    rte_mbuf_refcnt_read(txm) == 1 &&
> +		    RTE_MBUF_DIRECT(txm) &&
> +		    txm->nb_segs == 1 &&
> +		    rte_pktmbuf_headroom(txm) >= hdr_size &&
> +		    rte_is_aligned(rte_pktmbuf_mtod(txm, char *),
> +				__alignof__(struct virtio_net_hdr_mrg_rxbuf)))
> +			can_push = 1;
> +		else if (vtpci_with_feature(hw, VIRTIO_RING_F_INDIRECT_DESC) &&
> +			 txm->nb_segs < VIRTIO_MAX_TX_INDIRECT)
> +			use_indirect = 1;
> +
> +		/* How many main ring entries are needed to this Tx?
> +		 * any_layout => number of segments
> +		 * indirect   => 1
> +		 * default    => number of segments + 1
> +		 */
> +		slots = use_indirect ? 1 : (txm->nb_segs + !can_push);
> +		need = slots - vq->vq_free_cnt;
> +
> +		/* Positive value indicates it need free vring descriptors */
> +		if (unlikely(need > 0)) {
> +			nb_used = VIRTQUEUE_NUSED(vq);
> +			virtio_rmb();
> +			need = RTE_MIN(need, (int)nb_used);
> +
> +			virtio_xmit_cleanup(vq, need);
> +			need = slots - vq->vq_free_cnt;
> +			if (unlikely(need > 0)) {
> +				PMD_TX_LOG(ERR,
> +					"No free tx descriptors to transmit");
> +				break;
> +			}
> +		}
> +
> +		/* Enqueue Packet buffers */
> +		virtqueue_enqueue_xmit(txvq, txm, slots, use_indirect, can_push);
> +
> +		txvq->stats.bytes += txm->pkt_len;
> +		virtio_update_packet_stats(&txvq->stats, txm);
> +	}
> +
> +	txvq->stats.packets += nb_tx;
> +
> +	if (likely(nb_tx)) {
> +		vq_update_avail_idx(vq);
> +
> +		if (unlikely(virtqueue_kick_prepare(vq))) {
> +			virtqueue_notify(vq);
> +			PMD_TX_LOG(DEBUG, "Notified backend after xmit");
> +		}
> +	}
> +
> +	return nb_tx;
> +}

What's the difference between virtio_inject_pkts() and virtio_xmit_pkts(),
except that the latter checks hw->started?

Best regards,
Tiwei Bie
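
P.S. For the new virtio_hw lock, something along the following lines is what
I had in mind -- just a sketch based on how the lock is used in this patch,
the exact wording is up to you:

	/*
	 * Taken by the interrupt handler while it handles GUEST ANNOUNCE
	 * (pause the device, inject the RARP packet into the Tx queue, ack
	 * via the control queue) and by virtio_dev_stop(), so that stopping
	 * the port and the announce handling don't race on hw->started and
	 * the queues.
	 */
	rte_spinlock_t sl;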