From: Stephen Hemminger <stephen@networkplumber.org>
To: alexmay@microsoft.com
Cc: dev@dpdk.org, Stas Egorov, Stephen Hemminger
Date: Tue, 21 Apr 2015 10:32:42 -0700
Message-Id: <1429637564-5656-6-git-send-email-stephen@networkplumber.org>
X-Mailer: git-send-email 2.1.4
In-Reply-To: <1429637564-5656-1-git-send-email-stephen@networkplumber.org>
References: <1429637564-5656-1-git-send-email-stephen@networkplumber.org>
Subject: [dpdk-dev] [PATCH v4 5/7] hv: poll mode driver

From: Stephen Hemminger <stephen@networkplumber.org>

This is a new poll mode driver for the Hyper-V virtual network
interface (netvsc).
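As a usage illustration only (not part of this patch): the sketch below
shows how an application would poll this driver through the standard
ethdev API. The port number, descriptor counts, and mempool sizes are
invented for the example; the driver itself exposes a single RX and a
single TX queue (HV_MAX_RX_QUEUES / HV_MAX_TX_QUEUES) and bursts of up
to HV_MAX_PKT_BURST (32) packets.

#include <stdlib.h>
#include <string.h>
#include <rte_eal.h>
#include <rte_debug.h>
#include <rte_lcore.h>
#include <rte_ethdev.h>
#include <rte_mbuf.h>

int
main(int argc, char **argv)
{
	struct rte_eth_conf port_conf;
	struct rte_mempool *mp;
	struct rte_mbuf *bufs[32];	/* HV_MAX_PKT_BURST */
	uint16_t nb, i;

	if (rte_eal_init(argc, argv) < 0)
		rte_exit(EXIT_FAILURE, "EAL init failed\n");

	/* Pool and ring sizes here are arbitrary example values */
	mp = rte_pktmbuf_pool_create("mbuf_pool", 8192, 256, 0,
			RTE_MBUF_DEFAULT_BUF_SIZE, rte_socket_id());
	if (mp == NULL)
		rte_exit(EXIT_FAILURE, "mbuf pool creation failed\n");

	memset(&port_conf, 0, sizeof(port_conf));
	/* One RX and one TX queue, matching what this PMD advertises */
	if (rte_eth_dev_configure(0, 1, 1, &port_conf) < 0 ||
	    rte_eth_rx_queue_setup(0, 0, 128, rte_socket_id(), NULL, mp) < 0 ||
	    rte_eth_tx_queue_setup(0, 0, 128, rte_socket_id(), NULL) < 0 ||
	    rte_eth_dev_start(0) < 0)
		rte_exit(EXIT_FAILURE, "port 0 init failed\n");

	for (;;) {
		/* hyperv_recv_pkts() is reached through this burst call */
		nb = rte_eth_rx_burst(0, 0, bufs, 32);
		for (i = 0; i < nb; i++)
			rte_pktmbuf_free(bufs[i]); /* drop; a real app would process */
	}
	return 0;
}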
Signed-off-by: Stas Egorov Signed-off-by: Stephen Hemminger --- lib/Makefile | 1 + lib/librte_pmd_hyperv/Makefile | 28 + lib/librte_pmd_hyperv/hyperv.h | 169 ++++ lib/librte_pmd_hyperv/hyperv_drv.c | 1653 +++++++++++++++++++++++++++++++++ lib/librte_pmd_hyperv/hyperv_drv.h | 558 +++++++++++ lib/librte_pmd_hyperv/hyperv_ethdev.c | 332 +++++++ lib/librte_pmd_hyperv/hyperv_logs.h | 69 ++ lib/librte_pmd_hyperv/hyperv_rxtx.c | 403 ++++++++ lib/librte_pmd_hyperv/hyperv_rxtx.h | 35 + mk/rte.app.mk | 4 + 10 files changed, 3252 insertions(+) create mode 100644 lib/librte_pmd_hyperv/Makefile create mode 100644 lib/librte_pmd_hyperv/hyperv.h create mode 100644 lib/librte_pmd_hyperv/hyperv_drv.c create mode 100644 lib/librte_pmd_hyperv/hyperv_drv.h create mode 100644 lib/librte_pmd_hyperv/hyperv_ethdev.c create mode 100644 lib/librte_pmd_hyperv/hyperv_logs.h create mode 100644 lib/librte_pmd_hyperv/hyperv_rxtx.c create mode 100644 lib/librte_pmd_hyperv/hyperv_rxtx.h diff --git a/lib/Makefile b/lib/Makefile index d94355d..6c1daf2 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -47,6 +47,7 @@ DIRS-$(CONFIG_RTE_LIBRTE_I40E_PMD) += librte_pmd_i40e DIRS-$(CONFIG_RTE_LIBRTE_FM10K_PMD) += librte_pmd_fm10k DIRS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += librte_pmd_mlx4 DIRS-$(CONFIG_RTE_LIBRTE_ENIC_PMD) += librte_pmd_enic +DIRS-$(CONFIG_RTE_LIBRTE_HV_PMD) += librte_pmd_hyperv DIRS-$(CONFIG_RTE_LIBRTE_PMD_BOND) += librte_pmd_bond DIRS-$(CONFIG_RTE_LIBRTE_PMD_RING) += librte_pmd_ring DIRS-$(CONFIG_RTE_LIBRTE_PMD_PCAP) += librte_pmd_pcap diff --git a/lib/librte_pmd_hyperv/Makefile b/lib/librte_pmd_hyperv/Makefile new file mode 100644 index 0000000..4ba08c8 --- /dev/null +++ b/lib/librte_pmd_hyperv/Makefile @@ -0,0 +1,28 @@ +# BSD LICENSE +# +# Copyright(c) 2013-2015 Brocade Communications Systems, Inc. +# All rights reserved. + +include $(RTE_SDK)/mk/rte.vars.mk + +# +# library name +# +LIB = librte_pmd_hyperv.a + +CFLAGS += -O3 +CFLAGS += $(WERROR_FLAGS) + +# +# all source are stored in SRCS-y +# +SRCS-$(CONFIG_RTE_LIBRTE_HV_PMD) += hyperv_ethdev.c +SRCS-$(CONFIG_RTE_LIBRTE_HV_PMD) += hyperv_rxtx.c +SRCS-$(CONFIG_RTE_LIBRTE_HV_PMD) += hyperv_drv.c + +# this lib depends upon: +DEPDIRS-$(CONFIG_RTE_LIBRTE_HV_PMD) += lib/librte_eal lib/librte_ether +DEPDIRS-$(CONFIG_RTE_LIBRTE_HV_PMD) += lib/librte_mempool lib/librte_mbuf +DEPDIRS-$(CONFIG_RTE_LIBRTE_HV_PMD) += lib/librte_malloc + +include $(RTE_SDK)/mk/rte.lib.mk diff --git a/lib/librte_pmd_hyperv/hyperv.h b/lib/librte_pmd_hyperv/hyperv.h new file mode 100644 index 0000000..5f66d8a --- /dev/null +++ b/lib/librte_pmd_hyperv/hyperv.h @@ -0,0 +1,169 @@ +/*- + * Copyright (c) 2013-2015 Brocade Communications Systems, Inc. + * All rights reserved. 
+ */ + +#ifndef _HYPERV_H_ +#define _HYPERV_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "hyperv_logs.h" + +#define PAGE_SHIFT 12 +#define PAGE_SIZE (1 << PAGE_SHIFT) + +/* + * Tunable ethdev params + */ +#define HV_MIN_RX_BUF_SIZE 1024 +#define HV_MAX_RX_PKT_LEN 4096 +#define HV_MAX_MAC_ADDRS 1 +#define HV_MAX_RX_QUEUES 1 +#define HV_MAX_TX_QUEUES 1 +#define HV_MAX_PKT_BURST 32 +#define HV_MAX_LINK_REQ 10 + +/* + * List of resources mapped from kspace + * need to be the same as defined in hv_uio.c + */ +enum { + TXRX_RING_MAP, + INT_PAGE_MAP, + MON_PAGE_MAP, + RECV_BUF_MAP +}; + +/* + * Statistics + */ +struct hv_stats { + uint64_t opkts; + uint64_t obytes; + uint64_t oerrors; + + uint64_t ipkts; + uint64_t ibytes; + uint64_t ierrors; + uint64_t rx_nombuf; +}; + +struct hv_data; +struct netvsc_packet; +struct rndis_msg; +typedef void (*receive_callback_t)(struct hv_data *hv, struct rndis_msg *msg, + struct netvsc_packet *pkt); + +/* + * Main driver structure + */ +struct hv_data { + int vmbus_device; + uint8_t monitor_bit; + uint8_t monitor_group; + uint8_t kernel_initialized; + int uio_fd; + /* Flag indicates channel state. If closed, RX/TX shouldn't work further */ + uint8_t closed; + /* Flag indicates whether HALT rndis request was received by host */ + uint8_t hlt_req_sent; + /* Flag indicates pending state for HALT request */ + uint8_t hlt_req_pending; + /* Counter for RNDIS requests */ + uint32_t new_request_id; + /* State of RNDIS device */ + uint8_t rndis_dev_state; + /* Number of transmitted packets but not completed yet by Hyper-V */ + int num_outstanding_sends; + /* Max pkt len to fit in rx mbufs */ + uint32_t max_rx_pkt_len; + + uint8_t jumbo_frame_support; + + struct hv_vmbus_ring_buffer *in; + struct hv_vmbus_ring_buffer *out; + + /* Size of each ring_buffer(in/out) */ + uint32_t rb_size; + /* Size of data in each ring_buffer(in/out) */ + uint32_t rb_data_size; + + void *int_page; + struct hv_vmbus_monitor_page *monitor_pages; + void *recv_interrupt_page; + void *send_interrupt_page; + void *ring_pages; + void *recv_buf; + + uint8_t link_req_cnt; + uint32_t link_status; + uint8_t hw_mac_addr[ETHER_ADDR_LEN]; + struct rndis_request *req; + struct netvsc_packet *netvsc_packet; + struct nvsp_msg *rx_comp_msg; + struct hv_rx_queue *rxq; + struct hv_tx_queue *txq; + struct hv_vm_packet_descriptor *desc; + receive_callback_t receive_callback; + int pkt_rxed; + + uint32_t debug; + struct hv_stats stats; +}; + +/* + * Extern functions declarations + */ +int hyperv_dev_tx_queue_setup(struct rte_eth_dev *dev, + uint16_t queue_idx, + uint16_t nb_desc, + unsigned int socket_id, + const struct rte_eth_txconf *tx_conf); + +void hyperv_dev_tx_queue_release(void *ptxq); + +int hyperv_dev_rx_queue_setup(struct rte_eth_dev *dev, + uint16_t queue_idx, + uint16_t nb_desc, + unsigned int socket_id, + const struct rte_eth_rxconf *rx_conf, + struct rte_mempool *mp); + +void hyperv_dev_rx_queue_release(void *prxq); + +uint16_t +hyperv_recv_pkts(void *prxq, + struct rte_mbuf **rx_pkts, uint16_t nb_pkts); + +uint16_t +hyperv_xmit_pkts(void *ptxq, + struct rte_mbuf **tx_pkts, uint16_t nb_pkts); + +int hv_rf_on_device_add(struct hv_data *hv); +int hv_rf_on_device_remove(struct hv_data *hv); +int hv_rf_on_send(struct hv_data *hv, struct netvsc_packet *pkt); +int hv_rf_on_open(struct hv_data *hv); +int hv_rf_on_close(struct hv_data *hv); +int hv_rf_set_device_mac(struct hv_data *hv, uint8_t *mac); +void hyperv_start_rx(struct hv_data *hv); 
+void hyperv_stop_rx(struct hv_data *hv); +int hyperv_get_buffer(struct hv_data *hv, void *buffer, uint32_t bufferlen); +void hyperv_scan_comps(struct hv_data *hv, int allow_rx_drop); +uint8_t hyperv_get_link_status(struct hv_data *hv); +int hyperv_set_rx_mode(struct hv_data *hv, uint8_t promisc, uint8_t mcast); + +inline int rte_hv_dev_atomic_write_link_status(struct rte_eth_dev *dev, + struct rte_eth_link *link); +inline int rte_hv_dev_atomic_read_link_status(struct rte_eth_dev *dev, + struct rte_eth_link *link); + +#endif /* _HYPERV_H_ */ diff --git a/lib/librte_pmd_hyperv/hyperv_drv.c b/lib/librte_pmd_hyperv/hyperv_drv.c new file mode 100644 index 0000000..4a37966 --- /dev/null +++ b/lib/librte_pmd_hyperv/hyperv_drv.c @@ -0,0 +1,1653 @@ +/*- + * Copyright (c) 2009-2012 Microsoft Corp. + * Copyright (c) 2010-2012 Citrix Inc. + * Copyright (c) 2012 NetApp Inc. + * Copyright (c) 2013-2015 Brocade Communications Systems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include "hyperv.h" +#include "hyperv_drv.h" +#include "hyperv_rxtx.h" + +#define LOOP_CNT 10000 +#define MAC_STRLEN 14 +#define MAC_PARAM_STR "NetworkAddress" + +#define hex "0123456789abcdef" +#define high(x) hex[(x & 0xf0) >> 4] +#define low(x) hex[x & 0x0f] + +static int hv_rf_on_receive(struct hv_data *hv, struct netvsc_packet *pkt); + +/* + * Ring buffer + */ + +/* Amount of space to write to */ +#define HV_BYTES_AVAIL_TO_WRITE(r, w, z) \ + (((w) >= (r)) ? ((z) - ((w) - (r))) : ((r) - (w))) + +/* + * Get number of bytes available to read and to write to + * for the specified ring buffer + */ +static inline void +get_ring_buffer_avail_bytes( + struct hv_data *hv, + struct hv_vmbus_ring_buffer *ring_buffer, + uint32_t *read, + uint32_t *write) +{ + rte_compiler_barrier(); + + /* + * Capture the read/write indices before they changed + */ + uint32_t read_loc = ring_buffer->read_index; + uint32_t write_loc = ring_buffer->write_index; + + *write = HV_BYTES_AVAIL_TO_WRITE( + read_loc, write_loc, hv->rb_data_size); + *read = hv->rb_data_size - *write; +} + +/* + * Helper routine to copy from source to ring buffer. + * + * Assume there is enough room. Handles wrap-around in dest case only! 
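+ *
+ * Worked example (added for illustration, not in the original patch):
+ * with ring_buffer_size 4096 and start_write_offset 4000, copying
+ * src_len 200 stores 96 bytes at offsets 4000..4095, the remaining
+ * 104 bytes at offsets 0..103, and returns (4000 + 200) % 4096 = 104.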
+ */ +static uint32_t +copy_to_ring_buffer( + struct hv_vmbus_ring_buffer *ring_buffer, + uint32_t ring_buffer_size, + uint32_t start_write_offset, + char *src, + uint32_t src_len) +{ + char *ring_buf = (char *)ring_buffer->buffer; + uint32_t fragLen; + + if (src_len > ring_buffer_size - start_write_offset) { + /* wrap-around detected! */ + fragLen = ring_buffer_size - start_write_offset; + rte_memcpy(ring_buf + start_write_offset, src, fragLen); + rte_memcpy(ring_buf, src + fragLen, src_len - fragLen); + } else { + rte_memcpy(ring_buf + start_write_offset, src, src_len); + } + + start_write_offset += src_len; + start_write_offset %= ring_buffer_size; + + return start_write_offset; +} + +/* + * Helper routine to copy to dest from ring buffer. + * + * Assume there is enough room. Handles wrap-around in src case only! + */ +static uint32_t +copy_from_ring_buffer( + struct hv_data *hv, + struct hv_vmbus_ring_buffer *ring_buffer, + char *dest, + uint32_t dest_len, + uint32_t start_read_offset) +{ + uint32_t fragLen; + char *ring_buf = (char *)ring_buffer->buffer; + + if (dest_len > hv->rb_data_size - start_read_offset) { + /* wrap-around detected at the src */ + fragLen = hv->rb_data_size - start_read_offset; + rte_memcpy(dest, ring_buf + start_read_offset, fragLen); + rte_memcpy(dest + fragLen, ring_buf, dest_len - fragLen); + } else { + rte_memcpy(dest, ring_buf + start_read_offset, dest_len); + } + + start_read_offset += dest_len; + start_read_offset %= hv->rb_data_size; + + return start_read_offset; +} + +/* + * Write to the ring buffer. + */ +static int +hv_ring_buffer_write( + struct hv_data *hv, + struct hv_vmbus_sg_buffer_list sg_buffers[], + uint32_t sg_buffer_count) +{ + struct hv_vmbus_ring_buffer *ring_buffer = hv->out; + uint32_t i = 0; + uint32_t byte_avail_to_write; + uint32_t byte_avail_to_read; + uint32_t total_bytes_to_write = 0; + volatile uint32_t next_write_location; + uint64_t prev_indices = 0; + + for (i = 0; i < sg_buffer_count; i++) + total_bytes_to_write += sg_buffers[i].length; + + total_bytes_to_write += sizeof(uint64_t); + + get_ring_buffer_avail_bytes(hv, ring_buffer, &byte_avail_to_read, + &byte_avail_to_write); + + /* + * If there is only room for the packet, assume it is full. + * Otherwise, the next time around, we think the ring buffer + * is empty since the read index == write index + */ + if (byte_avail_to_write <= total_bytes_to_write) { + PMD_PERROR_LOG(hv, DBG_RB, + "byte_avail_to_write = %u, total_bytes_to_write = %u", + byte_avail_to_write, total_bytes_to_write); + return -EAGAIN; + } + + /* + * Write to the ring buffer + */ + next_write_location = ring_buffer->write_index; + + for (i = 0; i < sg_buffer_count; i++) { + next_write_location = copy_to_ring_buffer(ring_buffer, + hv->rb_data_size, next_write_location, + (char *) sg_buffers[i].data, sg_buffers[i].length); + } + + /* + * Set previous packet start + */ + prev_indices = (uint64_t)ring_buffer->write_index << 32; + + next_write_location = copy_to_ring_buffer( + ring_buffer, hv->rb_data_size, next_write_location, + (char *) &prev_indices, sizeof(uint64_t)); + + /* + * Make sure we flush all writes before updating the writeIndex + */ + rte_compiler_barrier(); + + /* + * Now, update the write location + */ + ring_buffer->write_index = next_write_location; + + return 0; +} + +/* + * Read without advancing the read index. 
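+ *
+ * hv_vmbus_channel_recv_packet_raw() below relies on this: it peeks
+ * the fixed-size descriptor first to learn the packet length, and the
+ * read index only advances once hv_ring_buffer_read() copies out the
+ * whole packet.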
+ */ +static int +hv_ring_buffer_peek(struct hv_data *hv, void *buffer, uint32_t buffer_len) +{ + struct hv_vmbus_ring_buffer *ring_buffer = hv->in; + uint32_t bytesAvailToWrite; + uint32_t bytesAvailToRead; + + get_ring_buffer_avail_bytes(hv, ring_buffer, + &bytesAvailToRead, + &bytesAvailToWrite); + + /* Make sure there is something to read */ + if (bytesAvailToRead < buffer_len) + return -EAGAIN; + + copy_from_ring_buffer(hv, ring_buffer, + (char *)buffer, buffer_len, ring_buffer->read_index); + + return 0; +} + +/* + * Read and advance the read index. + */ +static int +hv_ring_buffer_read(struct hv_data *hv, void *buffer, + uint32_t buffer_len, uint32_t offset) +{ + struct hv_vmbus_ring_buffer *ring_buffer = hv->in; + uint32_t bytes_avail_to_write; + uint32_t bytes_avail_to_read; + uint32_t next_read_location = 0; + uint64_t prev_indices = 0; + + if (buffer_len <= 0) + return -EINVAL; + + get_ring_buffer_avail_bytes( + hv, + ring_buffer, + &bytes_avail_to_read, + &bytes_avail_to_write); + + /* + * Make sure there is something to read + */ + if (bytes_avail_to_read < buffer_len) { + PMD_PERROR_LOG(hv, DBG_RB, "bytes_avail_to_read = %u, buffer_len = %u", + bytes_avail_to_read, buffer_len); + return -EAGAIN; + } + + next_read_location = (ring_buffer->read_index + offset) % hv->rb_data_size; + + next_read_location = copy_from_ring_buffer( + hv, + ring_buffer, + (char *) buffer, + buffer_len, + next_read_location); + + next_read_location = copy_from_ring_buffer( + hv, + ring_buffer, + (char *) &prev_indices, + sizeof(uint64_t), + next_read_location); + + /* + * Make sure all reads are done before we update the read index since + * the writer may start writing to the read area once the read index + * is updated. + */ + rte_compiler_barrier(); + + /* + * Update the read index + */ + ring_buffer->read_index = next_read_location; + + return 0; +} + +/* + * VMBus + */ + +/* + * Retrieve the raw packet on the specified channel + */ +static int +hv_vmbus_channel_recv_packet_raw(struct hv_data *hv, void *buffer, + uint32_t buffer_len, + uint32_t *buffer_actual_len, + uint64_t *request_id, + int mode) +{ + int ret; + uint32_t packetLen; + struct hv_vm_packet_descriptor desc; + + *buffer_actual_len = 0; + *request_id = 0; + + ret = hv_ring_buffer_peek(hv, &desc, + sizeof(struct hv_vm_packet_descriptor)); + + if (ret != 0) + return 0; + + if ((desc.type == HV_VMBUS_PACKET_TYPE_DATA_USING_TRANSFER_PAGES + && !(mode & 1)) || + ((desc.type == HV_VMBUS_PACKET_TYPE_COMPLETION) && !(mode & 2))) { + return -1; + } + + packetLen = desc.length8 << 3; + + *buffer_actual_len = packetLen; + + if (unlikely(packetLen > buffer_len)) { + PMD_PERROR_LOG(hv, DBG_RX, "The buffer desc is too big, will drop it"); + return -ENOMEM; + } + + *request_id = desc.transaction_id; + + /* Copy over the entire packet to the user buffer */ + ret = hv_ring_buffer_read(hv, buffer, packetLen, 0); + + return 0; +} + +/* + * Trigger an event notification on the specified channel + */ +static void +vmbus_channel_set_event(struct hv_data *hv) +{ + /* Here we assume that channel->offer_msg.monitor_allocated == 1, + * in another case our driver will not work */ + /* Each uint32_t represents 32 channels */ + __sync_or_and_fetch(((uint32_t *)hv->send_interrupt_page + + ((hv->vmbus_device >> 5))), 1 << (hv->vmbus_device & 31) + ); + __sync_or_and_fetch((uint32_t *)&hv->monitor_pages-> + trigger_group[hv->monitor_group].u.pending, 1 << hv->monitor_bit); +} + +/** + * @brief Send the specified buffer on the given channel + */ +static int 
+hv_vmbus_channel_send_packet(struct hv_data *hv, void *buffer, + uint32_t buffer_len, uint64_t request_id, + enum hv_vmbus_packet_type type, + uint32_t flags) +{ + struct hv_vmbus_sg_buffer_list buffer_list[3]; + struct hv_vm_packet_descriptor desc; + uint32_t packet_len_aligned; + uint64_t aligned_data; + uint32_t packet_len; + int ret = 0; + uint32_t old_write = hv->out->write_index; + + packet_len = sizeof(struct hv_vm_packet_descriptor) + buffer_len; + packet_len_aligned = HV_ALIGN_UP(packet_len, sizeof(uint64_t)); + aligned_data = 0; + + /* Setup the descriptor */ + desc.type = type; /* HV_VMBUS_PACKET_TYPE_DATA_IN_BAND; */ + desc.flags = flags; /* HV_VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED */ + /* in 8-bytes granularity */ + desc.data_offset8 = sizeof(struct hv_vm_packet_descriptor) >> 3; + desc.length8 = (uint16_t) (packet_len_aligned >> 3); + desc.transaction_id = request_id; + + buffer_list[0].data = &desc; + buffer_list[0].length = sizeof(struct hv_vm_packet_descriptor); + + buffer_list[1].data = buffer; + buffer_list[1].length = buffer_len; + + buffer_list[2].data = &aligned_data; + buffer_list[2].length = packet_len_aligned - packet_len; + + ret = hv_ring_buffer_write(hv, buffer_list, 3); + + rte_mb(); + if (!ret && !hv->out->interrupt_mask && hv->out->read_index == old_write) + vmbus_channel_set_event(hv); + + return ret; +} + +/* + * Send a range of single-page buffer packets using + * a GPADL Direct packet type + */ +static int +hv_vmbus_channel_send_packet_pagebuffer( + struct hv_data *hv, + struct hv_vmbus_page_buffer page_buffers[], + uint32_t page_count, + void *buffer, + uint32_t buffer_len, + uint64_t request_id) +{ + + int ret = 0; + uint32_t packet_len, packetLen_aligned, descSize, i = 0; + struct hv_vmbus_sg_buffer_list buffer_list[3]; + struct hv_vmbus_channel_packet_page_buffer desc; + uint64_t alignedData = 0; + uint32_t old_write = hv->out->write_index; + + if (page_count > HV_MAX_PAGE_BUFFER_COUNT) { + PMD_PERROR_LOG(hv, DBG_VMBUS, "page_count %u goes out of the limit", + page_count); + return -EINVAL; + } + + /* + * Adjust the size down since hv_vmbus_channel_packet_page_buffer + * is the largest size we support + */ + descSize = sizeof(struct hv_vmbus_channel_packet_page_buffer) - + ((HV_MAX_PAGE_BUFFER_COUNT - page_count) * + sizeof(struct hv_vmbus_page_buffer)); + packet_len = descSize + buffer_len; + packetLen_aligned = HV_ALIGN_UP(packet_len, sizeof(uint64_t)); + + /* Setup the descriptor */ + desc.type = HV_VMBUS_PACKET_TYPE_DATA_USING_GPA_DIRECT; + desc.flags = HV_VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED; + desc.data_offset8 = descSize >> 3; /* in 8-bytes granularity */ + desc.length8 = (uint16_t) (packetLen_aligned >> 3); + desc.transaction_id = request_id; + desc.range_count = page_count; + + for (i = 0; i < page_count; i++) { + desc.range[i].length = page_buffers[i].length; + desc.range[i].offset = page_buffers[i].offset; + desc.range[i].pfn = page_buffers[i].pfn; + } + + buffer_list[0].data = &desc; + buffer_list[0].length = descSize; + + buffer_list[1].data = buffer; + buffer_list[1].length = buffer_len; + + buffer_list[2].data = &alignedData; + buffer_list[2].length = packetLen_aligned - packet_len; + + ret = hv_ring_buffer_write(hv, buffer_list, 3); + if (likely(ret == 0)) + ++hv->num_outstanding_sends; + + rte_mb(); + if (!ret && !hv->out->interrupt_mask && + hv->out->read_index == old_write) + vmbus_channel_set_event(hv); + + return ret; +} + +/* + * NetVSC + */ + +/* + * Net VSC on send + * Sends a packet on the specified Hyper-V 
device. + * Returns 0 on success, non-zero on failure. + */ +static int +hv_nv_on_send(struct hv_data *hv, struct netvsc_packet *pkt) +{ + struct nvsp_msg send_msg; + int ret; + + send_msg.msg_type = nvsp_msg_1_type_send_rndis_pkt; + if (pkt->is_data_pkt) { + /* 0 is RMC_DATA */ + send_msg.msgs.send_rndis_pkt.chan_type = 0; + } else { + /* 1 is RMC_CONTROL */ + send_msg.msgs.send_rndis_pkt.chan_type = 1; + } + + /* Not using send buffer section */ + send_msg.msgs.send_rndis_pkt.send_buf_section_idx = + 0xFFFFFFFF; + send_msg.msgs.send_rndis_pkt.send_buf_section_size = 0; + + if (likely(pkt->page_buf_count)) { + ret = hv_vmbus_channel_send_packet_pagebuffer(hv, + pkt->page_buffers, pkt->page_buf_count, + &send_msg, sizeof(struct nvsp_msg), + (uint64_t)pkt->is_data_pkt ? (hv->txq->tx_tail + 1) : 0); + } else { + PMD_PERROR_LOG(hv, DBG_TX, "pkt->page_buf_count value can't be zero"); + ret = -1; + } + + return ret; +} + +/* + * Net VSC on receive + * + * This function deals exclusively with virtual addresses. + */ +static void +hv_nv_on_receive(struct hv_data *hv, struct hv_vm_packet_descriptor *pkt) +{ + struct hv_vm_transfer_page_packet_header *vm_xfer_page_pkt; + struct nvsp_msg *nvsp_msg_pkt; + struct netvsc_packet *net_vsc_pkt = NULL; + unsigned long start; + int count, i; + + nvsp_msg_pkt = (struct nvsp_msg *)((unsigned long)pkt + + (pkt->data_offset8 << 3)); + + /* Make sure this is a valid nvsp packet */ + if (unlikely(nvsp_msg_pkt->msg_type != nvsp_msg_1_type_send_rndis_pkt)) { + PMD_PERROR_LOG(hv, DBG_RX, "NVSP packet is not valid"); + return; + } + + vm_xfer_page_pkt = (struct hv_vm_transfer_page_packet_header *)pkt; + + if (unlikely(vm_xfer_page_pkt->transfer_page_set_id + != NETVSC_RECEIVE_BUFFER_ID)) { + PMD_PERROR_LOG(hv, DBG_RX, "transfer_page_set_id is not valid"); + return; + } + + count = vm_xfer_page_pkt->range_count; + + /* + * Initialize the netvsc packet + */ + for (i = 0; i < count; ++i) { + net_vsc_pkt = hv->netvsc_packet; + + net_vsc_pkt->tot_data_buf_len = + vm_xfer_page_pkt->ranges[i].byte_count; + net_vsc_pkt->page_buf_count = 1; + + net_vsc_pkt->page_buffers[0].length = + vm_xfer_page_pkt->ranges[i].byte_count; + + /* The virtual address of the packet in the receive buffer */ + start = ((unsigned long)hv->recv_buf + + vm_xfer_page_pkt->ranges[i].byte_offset); + + /* Page number of the virtual page containing packet start */ + net_vsc_pkt->page_buffers[0].pfn = start >> PAGE_SHIFT; + + /* Calculate the page relative offset */ + net_vsc_pkt->page_buffers[0].offset = + vm_xfer_page_pkt->ranges[i].byte_offset & (PAGE_SIZE - 1); + + /* + * In this implementation, we are dealing with virtual + * addresses exclusively. Since we aren't using physical + * addresses at all, we don't care if a packet crosses a + * page boundary. For this reason, the original code to + * check for and handle page crossings has been removed. + */ + + /* + * Pass it to the upper layer. The receive completion call + * has been moved into this function. 
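+ *
+ * Note: a single receive completion is sent after this loop for the
+ * whole transfer-page packet, not one completion per range.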
+ */ + hv_rf_on_receive(hv, net_vsc_pkt); + } + /* Send a receive completion packet to RNDIS device (i.e. NetVSP) */ + hv_vmbus_channel_send_packet(hv, hv->rx_comp_msg, sizeof(struct nvsp_msg), + vm_xfer_page_pkt->d.transaction_id, + HV_VMBUS_PACKET_TYPE_COMPLETION, 0); +} + +/* + * Net VSC on send completion + */ +static void +hv_nv_on_send_completion(struct hv_data *hv, struct hv_vm_packet_descriptor *pkt) +{ + struct nvsp_msg *nvsp_msg_pkt; + + nvsp_msg_pkt = + (struct nvsp_msg *)((unsigned long)pkt + (pkt->data_offset8 << 3)); + + if (likely(nvsp_msg_pkt->msg_type == + nvsp_msg_1_type_send_rndis_pkt_complete)) { + + if (unlikely(hv->hlt_req_pending)) + hv->hlt_req_sent = 1; + else + if (pkt->transaction_id) + ++hv->txq->tx_free; + --hv->num_outstanding_sends; + return; + } + PMD_PINFO_LOG(hv, DBG_TX, "unhandled completion (for kernel req or so)"); +} + +/* + * Analogue of BSD's hv_nv_on_channel_callback + */ +static void +hv_nv_complete_request(struct hv_data *hv, struct rndis_request *request) +{ + uint32_t bytes_rxed, cnt = 0; + uint64_t request_id; + struct hv_vm_packet_descriptor *desc; + uint8_t *buffer; + int bufferlen = NETVSC_PACKET_SIZE; + int ret = 0; + + PMD_INIT_FUNC_TRACE(); + + hv->req = request; + + buffer = rte_malloc(NULL, bufferlen, RTE_CACHE_LINE_SIZE); + if (!buffer) { + PMD_PERROR_LOG(hv, DBG_LOAD, "failed to allocate packet"); + return; + } + + do { + rte_delay_us(1); + ret = hv_vmbus_channel_recv_packet_raw(hv, + buffer, bufferlen, &bytes_rxed, &request_id, 3); + if (ret == 0) { + if (bytes_rxed > 0) { + desc = (struct hv_vm_packet_descriptor *)buffer; + + switch (desc->type) { + case HV_VMBUS_PACKET_TYPE_COMPLETION: + hv_nv_on_send_completion(hv, desc); + break; + case HV_VMBUS_PACKET_TYPE_DATA_USING_TRANSFER_PAGES: + hv_nv_on_receive(hv, desc); + break; + default: + break; + } + PMD_PDEBUG_LOG(hv, DBG_LOAD, + "Made %d attempts until non-empty data was received", + cnt); + cnt = 0; + } else { + cnt++; + } + } else if (ret == -ENOMEM) { + /* Handle large packet */ + PMD_PDEBUG_LOG(hv, DBG_LOAD, + "recv_packet_raw returned -ENOMEM"); + rte_free(buffer); + buffer = rte_malloc(NULL, bytes_rxed, RTE_CACHE_LINE_SIZE); + if (buffer == NULL) { + PMD_PERROR_LOG(hv, DBG_LOAD, "failed to allocate buffer"); + break; + } + bufferlen = bytes_rxed; + } else { + PMD_PERROR_LOG(hv, DBG_LOAD, "Unexpected return code (%d)", ret); + } + if (!hv->req) { + PMD_PINFO_LOG(hv, DBG_LOAD, "Single request processed"); + break; + } + if (cnt >= LOOP_CNT) { + PMD_PERROR_LOG(hv, DBG_LOAD, "Emergency break from the loop"); + break; + } + if (hv->hlt_req_sent) { + PMD_PINFO_LOG(hv, DBG_LOAD, "Halt request processed"); + break; + } + /* The field hv->req->response_msg.ndis_msg_type + * should be set to a non-zero value when a response is received + */ + } while (!hv->req->response_msg.ndis_msg_type); + + rte_free(buffer); +} + +/* + * RNDIS + */ + +/* + * Create new RNDIS request + */ +static inline struct rndis_request * +hv_rndis_request(struct hv_data *hv, uint32_t message_type, + uint32_t message_length) +{ + struct rndis_request *request; + struct rndis_msg *rndis_mesg; + struct rndis_set_request *set; + char mz_name[RTE_MEMZONE_NAMESIZE]; + uint32_t size; + + PMD_INIT_FUNC_TRACE(); + + request = rte_zmalloc("rndis_req", sizeof(struct rndis_request), + RTE_CACHE_LINE_SIZE); + + if (!request) + return NULL; + + sprintf(mz_name, "hv_%d_%u_%d_%p", hv->vmbus_device, message_type, + hv->new_request_id, request); + + size = MAX(message_length, sizeof(struct rndis_msg)); + + request->request_msg_memzone = 
rte_memzone_reserve_aligned(mz_name, + size, rte_lcore_to_socket_id(rte_lcore_id()), 0, PAGE_SIZE); + if (!request->request_msg_memzone) { + PMD_PERROR_LOG(hv, DBG_LOAD, "memzone_reserve failed"); + rte_free(request); + return NULL; + } + request->request_msg = request->request_msg_memzone->addr; + rndis_mesg = request->request_msg; + rndis_mesg->ndis_msg_type = message_type; + rndis_mesg->msg_len = message_length; + + /* + * Set the request id. This field is always after the rndis header + * for request/response packet types so we just use the set_request + * as a template. + */ + set = &rndis_mesg->msg.set_request; + hv->new_request_id++; + set->request_id = hv->new_request_id; + + return request; +} + +/* + * RNDIS filter + */ + +static void +hv_rf_receive_response( + struct hv_data *hv, + struct rndis_msg *response) +{ + struct rndis_request *request = hv->req; + + PMD_INIT_FUNC_TRACE(); + + if (response->msg_len <= sizeof(struct rndis_msg)) { + rte_memcpy(&request->response_msg, response, + response->msg_len); + } else { + if (response->ndis_msg_type == REMOTE_NDIS_INITIALIZE_CMPLT) { + request->response_msg.msg.init_complete.status = + STATUS_BUFFER_OVERFLOW; + } + PMD_PERROR_LOG(hv, DBG_LOAD, "response buffer overflow\n"); + } +} + +/* + * RNDIS filter receive indicate status + */ +static void +hv_rf_receive_indicate_status(struct hv_data *hv, struct rndis_msg *response) +{ + struct rndis_indicate_status *indicate = &response->msg.indicate_status; + + PMD_INIT_FUNC_TRACE(); + + if (indicate->status == RNDIS_STATUS_MEDIA_CONNECT) + hv->link_status = 1; + else if (indicate->status == RNDIS_STATUS_MEDIA_DISCONNECT) + hv->link_status = 0; + else if (indicate->status == RNDIS_STATUS_INVALID_DATA) + PMD_PERROR_LOG(hv, DBG_RX, "Invalid data in RNDIS message"); + else + PMD_PERROR_LOG(hv, DBG_RX, "Unsupported status: %u", indicate->status); +} + +/* + * RNDIS filter receive data + */ +static void +hv_rf_receive_data(struct hv_data *hv, struct rndis_msg *msg, + struct netvsc_packet *pkt) +{ + struct rte_mbuf *m_new; + struct hv_rx_queue *rxq = hv->rxq; + struct rndis_packet *rndis_pkt; + uint32_t data_offset; + + if (unlikely(hv->closed)) + return; + + rndis_pkt = &msg->msg.packet; + + if (unlikely(hv->max_rx_pkt_len < rndis_pkt->data_length)) { + PMD_PWARN_LOG(hv, DBG_RX, "Packet is too large (%db), dropping.", + rndis_pkt->data_length); + ++hv->stats.ierrors; + return; + } + + /* Remove rndis header, then pass data packet up the stack */ + data_offset = RNDIS_HEADER_SIZE + rndis_pkt->data_offset; + + /* L2 frame length, with L2 header, not including CRC */ + pkt->tot_data_buf_len = rndis_pkt->data_length; + pkt->page_buffers[0].offset += data_offset; + /* Buffer length now L2 frame length plus trailing junk */ + pkt->page_buffers[0].length -= data_offset; + + pkt->vlan_tci = 0; + + /* + * Just put data into appropriate mbuf, all further work will be done + * by the upper layer (mbuf replacement, index adjustment, etc) + */ + m_new = rxq->sw_ring[rxq->rx_tail]; + if (++rxq->rx_tail == rxq->nb_rx_desc) + rxq->rx_tail = 0; + + /* + * Copy the received packet to mbuf. 
+ * The copy is required since the memory pointed to by netvsc_packet + * cannot be reallocated + */ + uint8_t *vaddr = (uint8_t *) + (pkt->page_buffers[0].pfn << PAGE_SHIFT) + + pkt->page_buffers[0].offset; + + m_new->nb_segs = 1; + m_new->pkt_len = m_new->data_len = pkt->tot_data_buf_len; + rte_memcpy(rte_pktmbuf_mtod(m_new, void *), vaddr, m_new->data_len); + + if (pkt->vlan_tci) { + m_new->vlan_tci = pkt->vlan_tci; + m_new->ol_flags |= PKT_RX_VLAN_PKT; + } + + hv->pkt_rxed = 1; +} + +/* + * RNDIS filter receive data, jumbo frames support + */ +static void +hv_rf_receive_data_sg(struct hv_data *hv, struct rndis_msg *msg, + struct netvsc_packet *pkt) +{ + struct rte_mbuf *m_new; + struct hv_rx_queue *rxq = hv->rxq; + struct rndis_packet *rndis_pkt; + uint32_t data_offset; + + if (unlikely(hv->closed)) + return; + + rndis_pkt = &msg->msg.packet; + + /* Remove rndis header, then pass data packet up the stack */ + data_offset = RNDIS_HEADER_SIZE + rndis_pkt->data_offset; + + /* L2 frame length, with L2 header, not including CRC */ + pkt->tot_data_buf_len = rndis_pkt->data_length; + pkt->page_buffers[0].offset += data_offset; + /* Buffer length now L2 frame length plus trailing junk */ + pkt->page_buffers[0].length -= data_offset; + + pkt->vlan_tci = 0; + + /* + * Just put data into appropriate mbuf, all further work will be done + * by the upper layer (mbuf replacement, index adjustment, etc) + */ + m_new = rxq->sw_ring[rxq->rx_tail]; + if (++rxq->rx_tail == rxq->nb_rx_desc) + rxq->rx_tail = 0; + + /* + * Copy the received packet to mbuf. + * The copy is required since the memory pointed to by netvsc_packet + * cannot be reallocated + */ + uint8_t *vaddr = (uint8_t *) + (pkt->page_buffers[0].pfn << PAGE_SHIFT) + + pkt->page_buffers[0].offset; + + /* Scatter-gather emulation */ + uint32_t carry_len = pkt->tot_data_buf_len; + struct rte_mbuf *m_next; + + m_new->pkt_len = carry_len; + m_new->nb_segs = (carry_len - 1) / hv->max_rx_pkt_len + 1; + + while (1) { + m_new->data_len = MIN(carry_len, hv->max_rx_pkt_len); + rte_memcpy(rte_pktmbuf_mtod(m_new, void *), + vaddr, m_new->data_len); + vaddr += m_new->data_len; + + if (carry_len <= hv->max_rx_pkt_len) + break; + + carry_len -= hv->max_rx_pkt_len; + m_next = rxq->sw_ring[rxq->rx_tail]; + if (++rxq->rx_tail == rxq->nb_rx_desc) + rxq->rx_tail = 0; + m_new->next = m_next; + m_new = m_next; + } + + if (pkt->vlan_tci) { + m_new->vlan_tci = pkt->vlan_tci; + m_new->ol_flags |= PKT_RX_VLAN_PKT; + } + + hv->pkt_rxed = 1; +} + +static int +hv_rf_send_request(struct hv_data *hv, struct rndis_request *request) +{ + struct netvsc_packet *packet; + + PMD_INIT_FUNC_TRACE(); + /* Set up the packet to send it */ + packet = &request->pkt; + + packet->is_data_pkt = 0; + packet->tot_data_buf_len = request->request_msg->msg_len; + packet->page_buf_count = 1; + + packet->page_buffers[0].pfn = + (request->request_msg_memzone->phys_addr) >> PAGE_SHIFT; + packet->page_buffers[0].length = request->request_msg->msg_len; + packet->page_buffers[0].offset = + (unsigned long)request->request_msg & (PAGE_SIZE - 1); + + return hv_nv_on_send(hv, packet); +} + +static void u8_to_u16(const char *src, int len, char *dst) +{ + int i; + + for (i = 0; i < len; ++i) { + dst[2 * i] = src[i]; + dst[2 * i + 1] = 0; + } +} + +int +hv_rf_set_device_mac(struct hv_data *hv, uint8_t *macaddr) +{ + struct rndis_request *request; + struct rndis_set_request *set_request; + struct rndis_config_parameter_info *info; + struct rndis_set_complete *set_complete; + char mac_str[2*ETHER_ADDR_LEN+1]; + 
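/*
+	 * Descriptive note: the host takes the new MAC as an RNDIS config
+	 * parameter, a UTF-16 "NetworkAddress" keyword plus a UTF-16 hex
+	 * string value, hence the 2x and 4x ETHER_ADDR_LEN sizing below.
+	 */
+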
wchar_t *param_value, *param_name; + uint32_t status; + uint32_t message_len = sizeof(struct rndis_config_parameter_info) + + 2 * MAC_STRLEN + 4 * ETHER_ADDR_LEN; + int ret, i; + + request = hv_rndis_request(hv, REMOTE_NDIS_SET_MSG, + RNDIS_MESSAGE_SIZE(struct rndis_set_request) + message_len); + if (!request) + return -ENOMEM; + + set_request = &request->request_msg->msg.set_request; + set_request->oid = RNDIS_OID_GEN_RNDIS_CONFIG_PARAMETER; + set_request->device_vc_handle = 0; + set_request->info_buffer_offset = sizeof(struct rndis_set_request); + set_request->info_buffer_length = message_len; + + info = (struct rndis_config_parameter_info *)((ulong)set_request + + set_request->info_buffer_offset); + info->parameter_type = RNDIS_CONFIG_PARAM_TYPE_STRING; + info->parameter_name_offset = + sizeof(struct rndis_config_parameter_info); + info->parameter_name_length = 2 * MAC_STRLEN; + info->parameter_value_offset = + info->parameter_name_offset + info->parameter_name_length; + /* Multiply by 2 because of string representation and by 2 + * because of utf16 representation + */ + info->parameter_value_length = 4 * ETHER_ADDR_LEN; + param_name = (wchar_t *)((ulong)info + info->parameter_name_offset); + param_value = (wchar_t *)((ulong)info + info->parameter_value_offset); + + u8_to_u16(MAC_PARAM_STR, MAC_STRLEN, (char *)param_name); + for (i = 0; i < ETHER_ADDR_LEN; ++i) { + mac_str[2*i] = high(macaddr[i]); + mac_str[2*i+1] = low(macaddr[i]); + } + + u8_to_u16((const char *)mac_str, 2 * ETHER_ADDR_LEN, (char *)param_value); + + ret = hv_rf_send_request(hv, request); + if (ret) + goto cleanup; + + request->response_msg.msg.set_complete.status = 0xFFFF; + hv_nv_complete_request(hv, request); + set_complete = &request->response_msg.msg.set_complete; + if (set_complete->status == 0xFFFF) { + /* Host is not responding, we can't free request in this case */ + ret = -1; + PMD_PERROR_LOG(hv, DBG_LOAD, "Host is not responding"); + goto exit; + } + /* Response received, check status */ + status = set_complete->status; + if (status) { + /* Bad response status, return error */ + PMD_PERROR_LOG(hv, DBG_LOAD, "set_complete->status = %u\n", status); + ret = -EINVAL; + } + +cleanup: + rte_free(request); +exit: + return ret; +} + +/* + * RNDIS filter on receive + */ +static int +hv_rf_on_receive(struct hv_data *hv, struct netvsc_packet *pkt) +{ + struct rndis_msg rndis_mesg; + struct rndis_msg *rndis_hdr; + + /* Shift virtual page number to form virtual page address */ + rndis_hdr = (struct rndis_msg *)(pkt->page_buffers[0].pfn << PAGE_SHIFT); + + rndis_hdr = (void *)((unsigned long)rndis_hdr + + pkt->page_buffers[0].offset); + + /* + * Make sure we got a valid rndis message + * Fixme: There seems to be a bug in set completion msg where + * its msg_len is 16 bytes but the byte_count field in the + * xfer page range shows 52 bytes + */ + if (unlikely(pkt->tot_data_buf_len != rndis_hdr->msg_len)) { + ++hv->stats.ierrors; + PMD_PERROR_LOG(hv, DBG_RX, + "invalid rndis message? (expected %u " + "bytes got %u)... dropping this message", + rndis_hdr->msg_len, pkt->tot_data_buf_len); + return -1; + } + + rte_memcpy(&rndis_mesg, rndis_hdr, + (rndis_hdr->msg_len > sizeof(struct rndis_msg)) ? 
+ sizeof(struct rndis_msg) : rndis_hdr->msg_len); + + switch (rndis_mesg.ndis_msg_type) { + + /* data message */ + case REMOTE_NDIS_PACKET_MSG: + hv->receive_callback(hv, &rndis_mesg, pkt); + break; + /* completion messages */ + case REMOTE_NDIS_INITIALIZE_CMPLT: + case REMOTE_NDIS_QUERY_CMPLT: + case REMOTE_NDIS_SET_CMPLT: + case REMOTE_NDIS_RESET_CMPLT: + case REMOTE_NDIS_KEEPALIVE_CMPLT: + hv_rf_receive_response(hv, &rndis_mesg); + break; + /* notification message */ + case REMOTE_NDIS_INDICATE_STATUS_MSG: + hv_rf_receive_indicate_status(hv, &rndis_mesg); + break; + default: + PMD_PERROR_LOG(hv, DBG_RX, "hv_rf_on_receive(): Unknown msg_type 0x%x", + rndis_mesg.ndis_msg_type); + break; + } + + return 0; +} + +/* + * RNDIS filter on send + */ +int +hv_rf_on_send(struct hv_data *hv, struct netvsc_packet *pkt) +{ + struct rndis_msg *rndis_mesg; + struct rndis_packet *rndis_pkt; + uint32_t rndis_msg_size; + + /* Add the rndis header */ + rndis_mesg = (struct rndis_msg *)pkt->extension; + + memset(rndis_mesg, 0, sizeof(struct rndis_msg)); + + rndis_msg_size = RNDIS_MESSAGE_SIZE(struct rndis_packet); + + rndis_mesg->ndis_msg_type = REMOTE_NDIS_PACKET_MSG; + rndis_mesg->msg_len = pkt->tot_data_buf_len + rndis_msg_size; + + rndis_pkt = &rndis_mesg->msg.packet; + rndis_pkt->data_offset = sizeof(struct rndis_packet); + rndis_pkt->data_length = pkt->tot_data_buf_len; + + pkt->is_data_pkt = 1; + + /* + * Invoke netvsc send. If return status is bad, the caller now + * resets the context pointers before retrying. + */ + return hv_nv_on_send(hv, pkt); +} + +static int +hv_rf_init_device(struct hv_data *hv) +{ + struct rndis_request *request; + struct rndis_initialize_request *init; + struct rndis_initialize_complete *init_complete; + uint32_t status; + int ret; + + PMD_INIT_FUNC_TRACE(); + + request = hv_rndis_request(hv, REMOTE_NDIS_INITIALIZE_MSG, + RNDIS_MESSAGE_SIZE(struct rndis_initialize_request)); + if (!request) { + ret = -1; + goto cleanup; + } + + /* Set up the rndis set */ + init = &request->request_msg->msg.init_request; + init->major_version = RNDIS_MAJOR_VERSION; + init->minor_version = RNDIS_MINOR_VERSION; + /* + * Per the RNDIS document, this should be set to the max MTU + * plus the header size. However, 2048 works fine, so leaving + * it as is. 
+ */ + init->max_xfer_size = 2048; + + hv->rndis_dev_state = RNDIS_DEV_INITIALIZING; + + ret = hv_rf_send_request(hv, request); + if (ret != 0) { + hv->rndis_dev_state = RNDIS_DEV_UNINITIALIZED; + goto cleanup; + } + + /* Putting -1 here to ensure that HyperV really answered us */ + request->response_msg.msg.init_complete.status = -1; + hv_nv_complete_request(hv, request); + + init_complete = &request->response_msg.msg.init_complete; + status = init_complete->status; + if (status == 0) { + PMD_PINFO_LOG(hv, DBG_LOAD, "Remote NDIS device is initialized"); + hv->rndis_dev_state = RNDIS_DEV_INITIALIZED; + ret = 0; + } else { + PMD_PINFO_LOG(hv, DBG_LOAD, "Remote NDIS device left uninitialized"); + hv->rndis_dev_state = RNDIS_DEV_UNINITIALIZED; + ret = -1; + } + +cleanup: + rte_free(request); + + return ret; +} + +/* + * RNDIS filter query device + */ +static int +hv_rf_query_device(struct hv_data *hv, uint32_t oid, void *result, + uint32_t result_size) +{ + struct rndis_request *request; + struct rndis_query_request *query; + struct rndis_query_complete *query_complete; + int ret = 0; + + PMD_INIT_FUNC_TRACE(); + + request = hv_rndis_request(hv, REMOTE_NDIS_QUERY_MSG, + RNDIS_MESSAGE_SIZE(struct rndis_query_request)); + if (request == NULL) { + ret = -1; + goto cleanup; + } + + /* Set up the rndis query */ + query = &request->request_msg->msg.query_request; + query->oid = oid; + query->info_buffer_offset = sizeof(struct rndis_query_request); + query->info_buffer_length = 0; + query->device_vc_handle = 0; + + ret = hv_rf_send_request(hv, request); + if (ret != 0) { + PMD_PERROR_LOG(hv, DBG_TX, "RNDISFILTER request failed to Send!"); + goto cleanup; + } + + hv_nv_complete_request(hv, request); + + /* Copy the response back */ + query_complete = &request->response_msg.msg.query_complete; + + if (query_complete->info_buffer_length > result_size) { + ret = -EINVAL; + goto cleanup; + } + + rte_memcpy(result, (void *)((unsigned long)query_complete + + query_complete->info_buffer_offset), + query_complete->info_buffer_length); + +cleanup: + rte_free(request); + + return ret; +} + +/* + * RNDIS filter query device MAC address + */ +static inline int +hv_rf_query_device_mac(struct hv_data *hv) +{ + uint32_t size = HW_MACADDR_LEN; + + int ret = hv_rf_query_device(hv, RNDIS_OID_802_3_PERMANENT_ADDRESS, + &hv->hw_mac_addr, size); + PMD_PDEBUG_LOG(hv, DBG_TX, "MAC: %02x:%02x:%02x:%02x:%02x:%02x, ret = %d", + hv->hw_mac_addr[0], hv->hw_mac_addr[1], hv->hw_mac_addr[2], + hv->hw_mac_addr[3], hv->hw_mac_addr[4], hv->hw_mac_addr[5], + ret); + return ret; +} + +/* + * RNDIS filter query device link status + */ +static inline int +hv_rf_query_device_link_status(struct hv_data *hv) +{ + uint32_t size = sizeof(uint32_t); + /* Set all bits to 1, it's to ensure that the response is actual */ + uint32_t status = -1; + + int ret = hv_rf_query_device(hv, RNDIS_OID_GEN_MEDIA_CONNECT_STATUS, + &status, size); + hv->link_status = status ? 0 : 1; + PMD_PDEBUG_LOG(hv, DBG_TX, "Link Status: %s", + hv->link_status ? 
"Up" : "Down"); + return ret; +} + +int +hv_rf_on_device_add(struct hv_data *hv) +{ + int ret; + + PMD_INIT_FUNC_TRACE(); + + hv->closed = 0; + hv->rb_data_size = hv->rb_size - sizeof(struct hv_vmbus_ring_buffer); + PMD_PDEBUG_LOG(hv, DBG_LOAD, "hv->rb_data_size = %u", hv->rb_data_size); + + if (unlikely(hv->in->interrupt_mask == 0)) { + PMD_PINFO_LOG(hv, DBG_LOAD, "Disabling interrupts from host"); + hv->in->interrupt_mask = 1; + rte_mb(); + } + + hv->netvsc_packet = rte_zmalloc("", sizeof(struct netvsc_packet), + RTE_CACHE_LINE_SIZE); + if (hv->netvsc_packet == NULL) + return -ENOMEM; + hv->netvsc_packet->is_data_pkt = 1; + + hv->rx_comp_msg = rte_zmalloc("", sizeof(struct nvsp_msg), + RTE_CACHE_LINE_SIZE); + if (hv->rx_comp_msg == NULL) + return -ENOMEM; + + hv->rx_comp_msg->msg_type = nvsp_msg_1_type_send_rndis_pkt_complete; + hv->rx_comp_msg->msgs.send_rndis_pkt_complete.status = + nvsp_status_success; + + memset(&hv->stats, 0, sizeof(struct hv_stats)); + + hv->receive_callback = hv_rf_receive_data; + + /* It's for completion of requests which were sent from kernel-space part */ + hv_nv_complete_request(hv, NULL); + hv_nv_complete_request(hv, NULL); + + hv->rndis_dev_state = RNDIS_DEV_UNINITIALIZED; + + /* Send the rndis initialization message */ + ret = hv_rf_init_device(hv); + if (ret != 0) { + PMD_PERROR_LOG(hv, DBG_LOAD, "rndis init failed!"); + hv_rf_on_device_remove(hv); + return ret; + } + + /* Get the mac address */ + ret = hv_rf_query_device_mac(hv); + if (ret != 0) { + PMD_PERROR_LOG(hv, DBG_LOAD, "rndis query mac failed!"); + hv_rf_on_device_remove(hv); + return ret; + } + + return ret; +} + +#define HALT_COMPLETION_WAIT_COUNT 25 + +/* + * RNDIS filter halt device + */ +static int +hv_rf_halt_device(struct hv_data *hv) +{ + struct rndis_request *request; + struct rndis_halt_request *halt; + int i, ret; + + PMD_INIT_FUNC_TRACE(); + + /* Attempt to do a rndis device halt */ + request = hv_rndis_request(hv, REMOTE_NDIS_HALT_MSG, + RNDIS_MESSAGE_SIZE(struct rndis_halt_request)); + if (!request) { + PMD_PERROR_LOG(hv, DBG_LOAD, "Unable to create RNDIS_HALT request"); + return -1; + } + + /* initialize "poor man's semaphore" */ + hv->hlt_req_sent = 0; + + /* Set up the rndis set */ + halt = &request->request_msg->msg.halt_request; + hv->new_request_id++; + halt->request_id = hv->new_request_id; + + ret = hv_rf_send_request(hv, request); + if (ret) { + PMD_PERROR_LOG(hv, DBG_LOAD, "Failed to send RNDIS_HALT request: %d", + ret); + return ret; + } + + /* + * Wait for halt response from halt callback. We must wait for + * the transaction response before freeing the request and other + * resources. 
+ */ + for (i = HALT_COMPLETION_WAIT_COUNT; i > 0; i--) { + hv_nv_complete_request(hv, request); + if (hv->hlt_req_sent != 0) { + PMD_PDEBUG_LOG(hv, DBG_LOAD, "Completed HALT request at %d try", + HALT_COMPLETION_WAIT_COUNT - i + 1); + break; + } + } + hv->hlt_req_sent = 0; + if (i == 0) { + PMD_PERROR_LOG(hv, DBG_LOAD, "RNDIS_HALT request was not completed!"); + rte_free(request); + return -1; + } + + hv->rndis_dev_state = RNDIS_DEV_UNINITIALIZED; + + rte_free(request); + + return 0; +} + +#define HV_TX_DRAIN_TRIES 50 +static inline int +hyperv_tx_drain(struct hv_data *hv) +{ + int i = HV_TX_DRAIN_TRIES; + + PMD_PDEBUG_LOG(hv, DBG_LOAD, "Waiting for TXs to be completed..."); + while (hv->num_outstanding_sends > 0 && --i) { + hv_nv_complete_request(hv, NULL); + rte_delay_ms(100); + } + + return hv->num_outstanding_sends; +} + +/* + * RNDIS filter on device remove + */ +int +hv_rf_on_device_remove(struct hv_data *hv) +{ + int ret; + + PMD_INIT_FUNC_TRACE(); + hv->closed = 1; + if (hyperv_tx_drain(hv) > 0) { + /* Hypervisor is not responding, exit with error here */ + PMD_PWARN_LOG(hv, DBG_LOAD, "Can't drain TX queue: no response"); + return -EAGAIN; + } + PMD_PDEBUG_LOG(hv, DBG_LOAD, "TX queue is empty, can halt the device"); + + /* Halt and release the rndis device */ + hv->hlt_req_pending = 1; + ret = hv_rf_halt_device(hv); + hv->hlt_req_pending = 0; + + rte_free(hv->netvsc_packet); + + return ret; +} + +/* + * RNDIS filter set packet filter + * Sends an rndis request with the new filter, then waits for a response + * from the host. + * Returns zero on success, non-zero on failure. + */ +static int +hv_rf_set_packet_filter(struct hv_data *hv, uint32_t new_filter) +{ + struct rndis_request *request; + struct rndis_set_request *set; + struct rndis_set_complete *set_complete; + uint32_t status; + int ret; + + PMD_INIT_FUNC_TRACE(); + + request = hv_rndis_request(hv, REMOTE_NDIS_SET_MSG, + RNDIS_MESSAGE_SIZE(struct rndis_set_request) + sizeof(uint32_t)); + if (!request) { + ret = -1; + goto cleanup; + } + + /* Set up the rndis set */ + set = &request->request_msg->msg.set_request; + set->oid = RNDIS_OID_GEN_CURRENT_PACKET_FILTER; + set->info_buffer_length = sizeof(uint32_t); + set->info_buffer_offset = sizeof(struct rndis_set_request); + + rte_memcpy((void *)((unsigned long)set + sizeof(struct rndis_set_request)), + &new_filter, sizeof(uint32_t)); + + ret = hv_rf_send_request(hv, request); + if (ret) + goto cleanup; + + /* + * Wait for the response from the host. 
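+ *
+ * The status field is pre-loaded with the 0xFFFF sentinel below; if it
+ * still reads 0xFFFF after hv_nv_complete_request(), the host never
+ * replied and the request cannot be freed safely.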
+ */ + request->response_msg.msg.set_complete.status = 0xFFFF; + hv_nv_complete_request(hv, request); + + set_complete = &request->response_msg.msg.set_complete; + if (set_complete->status == 0xFFFF) { + /* Host is not responding, we can't free request in this case */ + ret = -1; + goto exit; + } + /* Response received, check status */ + status = set_complete->status; + if (status) + /* Bad response status, return error */ + ret = -2; + +cleanup: + rte_free(request); +exit: + return ret; +} + +/* + * RNDIS filter open device + */ +int +hv_rf_on_open(struct hv_data *hv) +{ + int ret; + + if (hv->closed) + return 0; + + if (hv->jumbo_frame_support) + hv->receive_callback = hv_rf_receive_data_sg; + + ret = hyperv_set_rx_mode(hv, 1, 0); + if (!ret) { + PMD_PDEBUG_LOG(hv, DBG_LOAD, "RNDIS device opened"); + hv->rndis_dev_state = RNDIS_DEV_DATAINITIALIZED; + } else + PMD_PERROR_LOG(hv, DBG_LOAD, "RNDIS device is left unopened"); + + return ret; +} + +/* + * RNDIS filter on close + */ +int +hv_rf_on_close(struct hv_data *hv) +{ + int ret; + + PMD_INIT_FUNC_TRACE(); + + if (hv->closed) + return 0; + + if (hv->rndis_dev_state != RNDIS_DEV_DATAINITIALIZED) { + PMD_PDEBUG_LOG(hv, DBG_LOAD, "RNDIS device state should be" + " RNDIS_DEV_DATAINITIALIZED, but now it is %u", + hv->rndis_dev_state); + return 0; + } + + ret = hv_rf_set_packet_filter(hv, 0); + if (!ret) { + PMD_PDEBUG_LOG(hv, DBG_LOAD, "RNDIS device closed"); + hv->rndis_dev_state = RNDIS_DEV_INITIALIZED; + } else + PMD_PDEBUG_LOG(hv, DBG_LOAD, "RNDIS device is left unclosed"); + + return ret; +} + +/* + * RX Flow + */ +int +hyperv_get_buffer(struct hv_data *hv, void *buffer, uint32_t bufferlen) +{ + uint32_t bytes_rxed; + uint64_t request_id; + struct hv_vm_packet_descriptor *desc; + + int ret = hv_vmbus_channel_recv_packet_raw(hv, buffer, bufferlen, + &bytes_rxed, &request_id, 1); + if (likely(ret == 0)) { + if (bytes_rxed) { + desc = (struct hv_vm_packet_descriptor *)buffer; + + if (likely(desc->type == + HV_VMBUS_PACKET_TYPE_DATA_USING_TRANSFER_PAGES)) { + hv->pkt_rxed = 0; + hv_nv_on_receive(hv, desc); + return hv->pkt_rxed; + } + } + } + + return 0; +} + +/* + * TX completions handler + */ +void +hyperv_scan_comps(struct hv_data *hv, int allow_rx_drop) +{ + uint32_t bytes_rxed; + uint64_t request_id; + + while (1) { + int ret = hv_vmbus_channel_recv_packet_raw(hv, hv->desc, PAGE_SIZE, + &bytes_rxed, &request_id, 2 | allow_rx_drop); + + if (ret != 0 || !bytes_rxed) + break; + + if (likely(hv->desc->type == HV_VMBUS_PACKET_TYPE_COMPLETION)) + hv_nv_on_send_completion(hv, hv->desc); + } +} + +/* + * Get link status + */ +uint8_t +hyperv_get_link_status(struct hv_data *hv) +{ + if (hv_rf_query_device_link_status(hv)) + return 2; + return hv->link_status; +} + +/* + * Set/Reset RX mode + */ +int +hyperv_set_rx_mode(struct hv_data *hv, uint8_t promisc, uint8_t mcast) +{ + PMD_INIT_FUNC_TRACE(); + + if (!promisc) { + return hv_rf_set_packet_filter(hv, + NDIS_PACKET_TYPE_BROADCAST | + (mcast ? NDIS_PACKET_TYPE_ALL_MULTICAST : 0) | + NDIS_PACKET_TYPE_DIRECTED); + } + + return hv_rf_set_packet_filter(hv, NDIS_PACKET_TYPE_PROMISCUOUS); +} diff --git a/lib/librte_pmd_hyperv/hyperv_drv.h b/lib/librte_pmd_hyperv/hyperv_drv.h new file mode 100644 index 0000000..22acad5 --- /dev/null +++ b/lib/librte_pmd_hyperv/hyperv_drv.h @@ -0,0 +1,558 @@ +/*- + * Copyright (c) 2009-2012 Microsoft Corp. + * Copyright (c) 2010-2012 Citrix Inc. + * Copyright (c) 2012 NetApp Inc. + * Copyright (c) 2013-2015 Brocade Communications Systems, Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#ifndef _HYPERV_DRV_H_ +#define _HYPERV_DRV_H_ + +/* + * Definitions from hyperv.h + */ +#define HW_MACADDR_LEN 6 +#define HV_MAX_PAGE_BUFFER_COUNT 19 + +#define HV_ALIGN_UP(value, align) \ + (((value) & (align-1)) ? \ + (((value) + (align-1)) & ~(align-1)) : (value)) + +/* + * Connection identifier type + */ +union hv_vmbus_connection_id { + uint32_t as_uint32_t; + struct { + uint32_t id:24; + uint32_t reserved:8; + } u; + +} __attribute__((packed)); + +union hv_vmbus_monitor_trigger_state { + uint32_t as_uint32_t; + struct { + uint32_t group_enable:4; + uint32_t rsvd_z:28; + } u; +}; + +union hv_vmbus_monitor_trigger_group { + uint64_t as_uint64_t; + struct { + uint32_t pending; + uint32_t armed; + } u; +}; + +struct hv_vmbus_monitor_parameter { + union hv_vmbus_connection_id connection_id; + uint16_t flag_number; + uint16_t rsvd_z; +}; + +/* + * hv_vmbus_monitor_page Layout + * ------------------------------------------------------ + * | 0 | trigger_state (4 bytes) | Rsvd1 (4 bytes) | + * | 8 | trigger_group[0] | + * | 10 | trigger_group[1] | + * | 18 | trigger_group[2] | + * | 20 | trigger_group[3] | + * | 28 | Rsvd2[0] | + * | 30 | Rsvd2[1] | + * | 38 | Rsvd2[2] | + * | 40 | next_check_time[0][0] | next_check_time[0][1] | + * | ... | + * | 240 | latency[0][0..3] | + * | 340 | Rsvz3[0] | + * | 440 | parameter[0][0] | + * | 448 | parameter[0][1] | + * | ... 
| + * | 840 | Rsvd4[0] | + * ------------------------------------------------------ + */ + +struct hv_vmbus_monitor_page { + union hv_vmbus_monitor_trigger_state trigger_state; + uint32_t rsvd_z1; + + union hv_vmbus_monitor_trigger_group trigger_group[4]; + uint64_t rsvd_z2[3]; + + int32_t next_check_time[4][32]; + + uint16_t latency[4][32]; + uint64_t rsvd_z3[32]; + + struct hv_vmbus_monitor_parameter parameter[4][32]; + + uint8_t rsvd_z4[1984]; +}; + +enum hv_vmbus_packet_type { + HV_VMBUS_PACKET_TYPE_DATA_USING_TRANSFER_PAGES = 0x7, + HV_VMBUS_PACKET_TYPE_DATA_USING_GPA_DIRECT = 0x9, + HV_VMBUS_PACKET_TYPE_COMPLETION = 0xb, +}; + +#define HV_VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED 1 + +struct hv_vm_packet_descriptor { + uint16_t type; + uint16_t data_offset8; + uint16_t length8; + uint16_t flags; + uint64_t transaction_id; +} __attribute__((packed)); + +struct hv_vm_transfer_page { + uint32_t byte_count; + uint32_t byte_offset; +} __attribute__((packed)); + +struct hv_vm_transfer_page_packet_header { + struct hv_vm_packet_descriptor d; + uint16_t transfer_page_set_id; + uint8_t sender_owns_set; + uint8_t reserved; + uint32_t range_count; + struct hv_vm_transfer_page ranges[1]; +} __attribute__((packed)); + +struct hv_vmbus_ring_buffer { + volatile uint32_t write_index; + volatile uint32_t read_index; + /* + * NOTE: The interrupt_mask field is used only for channels, but + * vmbus connection also uses this data structure + */ + volatile uint32_t interrupt_mask; + /* pad it to PAGE_SIZE so that data starts on a page */ + uint8_t reserved[4084]; + + /* + * WARNING: Ring data starts here + ring_data_start_offset + * !!! DO NOT place any fields below this !!! + */ + uint8_t buffer[0]; /* doubles as interrupt mask */ +} __attribute__((packed)); + +struct hv_vmbus_page_buffer { + uint32_t length; + uint32_t offset; + uint64_t pfn; +} __attribute__((packed)); + +/* + * Definitions from hv_vmbus_priv.h + */ +struct hv_vmbus_sg_buffer_list { + void *data; + uint32_t length; +}; + +struct hv_vmbus_channel_packet_page_buffer { + uint16_t type; + uint16_t data_offset8; + uint16_t length8; + uint16_t flags; + uint64_t transaction_id; + uint32_t reserved; + uint32_t range_count; + struct hv_vmbus_page_buffer range[HV_MAX_PAGE_BUFFER_COUNT]; +} __attribute__((packed)); + +/* + * Definitions from hv_net_vsc.h + */ +#define NETVSC_PACKET_MAXPAGE 16 +#define NETVSC_PACKET_SIZE 256 + +/* + * This message is used by both the VSP and the VSC to complete + * a RNDIS message to the opposite channel endpoint. At this + * point, the initiator of this message cannot use any resources + * associated with the original RNDIS packet. + */ +enum nvsp_status_ { + nvsp_status_none = 0, + nvsp_status_success, + nvsp_status_failure, +}; + +struct nvsp_1_msg_send_rndis_pkt_complete { + uint32_t status; +} __attribute__((packed)); + +enum nvsp_msg_type { + /* + * Version 1 Messages + */ + nvsp_msg_1_type_send_ndis_vers = 100, + + nvsp_msg_1_type_send_rx_buf, + nvsp_msg_1_type_send_rx_buf_complete, + nvsp_msg_1_type_revoke_rx_buf, + + nvsp_msg_1_type_send_send_buf, + nvsp_msg_1_type_send_send_buf_complete, + nvsp_msg_1_type_revoke_send_buf, + + nvsp_msg_1_type_send_rndis_pkt, + nvsp_msg_1_type_send_rndis_pkt_complete, +}; + +struct nvsp_1_msg_send_rndis_pkt { + /* + * This field is specified by RNDIS. They assume there's + * two different channels of communication. However, + * the Network VSP only has one. Therefore, the channel + * travels with the RNDIS packet. 
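+ *
+ * (hv_nv_on_send() in hyperv_drv.c sets this to 0, RMC_DATA, for data
+ * packets and 1, RMC_CONTROL, for control packets.)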
+ */ + uint32_t chan_type; + + /* + * This field is used to send part or all of the data + * through a send buffer. This value specifies an + * index into the send buffer. If the index is + * 0xFFFFFFFF, then the send buffer is not being used + * and all of the data was sent through other VMBus + * mechanisms. + */ + uint32_t send_buf_section_idx; + uint32_t send_buf_section_size; +} __attribute__((packed)); + +/* + * ALL Messages + */ +struct nvsp_msg { + uint32_t msg_type; + union { + struct nvsp_1_msg_send_rndis_pkt send_rndis_pkt; + struct nvsp_1_msg_send_rndis_pkt_complete send_rndis_pkt_complete; + /* size is set like in linux kernel driver */ + uint8_t raw[24]; + } msgs; +} __attribute__((packed)); + +#define NETVSC_RECEIVE_BUFFER_ID 0xcafe + +struct netvsc_packet { + uint8_t is_data_pkt; /* One byte */ + uint8_t ext_pages; + uint16_t vlan_tci; + + void *extension; + uint64_t extension_phys_addr; + uint32_t tot_data_buf_len; + uint32_t page_buf_count; + struct hv_vmbus_page_buffer page_buffers[NETVSC_PACKET_MAXPAGE]; +}; + +/* + * Definitions from hv_rndis.h + */ +#define RNDIS_MAJOR_VERSION 0x00000001 +#define RNDIS_MINOR_VERSION 0x00000000 + +#define STATUS_BUFFER_OVERFLOW (0x80000005L) + +/* + * Remote NDIS message types + */ +#define REMOTE_NDIS_PACKET_MSG 0x00000001 +#define REMOTE_NDIS_INITIALIZE_MSG 0x00000002 +#define REMOTE_NDIS_HALT_MSG 0x00000003 +#define REMOTE_NDIS_QUERY_MSG 0x00000004 +#define REMOTE_NDIS_SET_MSG 0x00000005 +#define REMOTE_NDIS_RESET_MSG 0x00000006 +#define REMOTE_NDIS_INDICATE_STATUS_MSG 0x00000007 +#define REMOTE_NDIS_KEEPALIVE_MSG 0x00000008 +/* + * Remote NDIS message completion types + */ +#define REMOTE_NDIS_INITIALIZE_CMPLT 0x80000002 +#define REMOTE_NDIS_QUERY_CMPLT 0x80000004 +#define REMOTE_NDIS_SET_CMPLT 0x80000005 +#define REMOTE_NDIS_RESET_CMPLT 0x80000006 +#define REMOTE_NDIS_KEEPALIVE_CMPLT 0x80000008 + +#define RNDIS_OID_GEN_MEDIA_CONNECT_STATUS 0x00010114 +#define RNDIS_OID_GEN_CURRENT_PACKET_FILTER 0x0001010E +#define RNDIS_OID_802_3_PERMANENT_ADDRESS 0x01010101 +#define RNDIS_OID_802_3_CURRENT_ADDRESS 0x01010102 +#define RNDIS_OID_GEN_RNDIS_CONFIG_PARAMETER 0x0001021B + +#define RNDIS_CONFIG_PARAM_TYPE_STRING 2 +/* extended info after the RNDIS request message */ +#define RNDIS_EXT_LEN 100 +/* + * Packet extension field contents associated with a Data message. + */ +struct rndis_per_packet_info { + uint32_t size; + uint32_t type; + uint32_t per_packet_info_offset; +}; + +#define ieee_8021q_info 6 + +struct ndis_8021q_info { + union { + struct { + uint32_t user_pri:3; /* User Priority */ + uint32_t cfi:1; /* Canonical Format ID */ + uint32_t vlan_id:12; + uint32_t reserved:16; + } s1; + uint32_t value; + } u1; +}; + +/* Format of Information buffer passed in a SetRequest for the OID */ +/* OID_GEN_RNDIS_CONFIG_PARAMETER. 
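+ * The parameter name and value are not part of this structure; they
+ * follow it in the same information buffer, at parameter_name_offset
+ * and parameter_value_offset bytes from the start of the structure.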
*/ +struct rndis_config_parameter_info { + uint32_t parameter_name_offset; + uint32_t parameter_name_length; + uint32_t parameter_type; + uint32_t parameter_value_offset; + uint32_t parameter_value_length; +}; + +/* + * NdisInitialize message + */ +struct rndis_initialize_request { + /* RNDIS request ID */ + uint32_t request_id; + uint32_t major_version; + uint32_t minor_version; + uint32_t max_xfer_size; +}; + +/* + * Response to NdisInitialize + */ +struct rndis_initialize_complete { + /* RNDIS request ID */ + uint32_t request_id; + /* RNDIS status */ + uint32_t status; + uint32_t major_version; + uint32_t minor_version; + uint32_t device_flags; + /* RNDIS medium */ + uint32_t medium; + uint32_t max_pkts_per_msg; + uint32_t max_xfer_size; + uint32_t pkt_align_factor; + uint32_t af_list_offset; + uint32_t af_list_size; +}; + +/* + * NdisSetRequest message + */ +struct rndis_set_request { + /* RNDIS request ID */ + uint32_t request_id; + /* RNDIS OID */ + uint32_t oid; + uint32_t info_buffer_length; + uint32_t info_buffer_offset; + /* RNDIS handle */ + uint32_t device_vc_handle; +}; + +/* + * Response to NdisSetRequest + */ +struct rndis_set_complete { + /* RNDIS request ID */ + uint32_t request_id; + /* RNDIS status */ + uint32_t status; +}; + +/* + * NdisQueryRequest message + */ +struct rndis_query_request { + /* RNDIS request ID */ + uint32_t request_id; + /* RNDIS OID */ + uint32_t oid; + uint32_t info_buffer_length; + uint32_t info_buffer_offset; + /* RNDIS handle */ + uint32_t device_vc_handle; +}; + +/* + * Response to NdisQueryRequest + */ +struct rndis_query_complete { + /* RNDIS request ID */ + uint32_t request_id; + /* RNDIS status */ + uint32_t status; + uint32_t info_buffer_length; + uint32_t info_buffer_offset; +}; + +/* + * Data message. All offset fields contain byte offsets from the beginning + * of the rndis_packet structure. All length fields are in bytes. + * VcHandle is set to 0 for connectionless data, otherwise it + * contains the VC handle. 
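+ * For the common case of the payload immediately following this
+ * structure, data_offset equals sizeof(struct rndis_packet) and
+ * data_length is the payload length.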
+ */ +struct rndis_packet { + uint32_t data_offset; + uint32_t data_length; + uint32_t oob_data_offset; + uint32_t oob_data_length; + uint32_t num_oob_data_elements; + uint32_t per_pkt_info_offset; + uint32_t per_pkt_info_length; + /* RNDIS handle */ + uint32_t vc_handle; + uint32_t reserved; +}; + +/* + * NdisHalt message + */ +struct rndis_halt_request { + /* RNDIS request ID */ + uint32_t request_id; +}; + +/* + * NdisMIndicateStatus message + */ +struct rndis_indicate_status { + /* RNDIS status */ + uint32_t status; + uint32_t status_buf_length; + uint32_t status_buf_offset; +}; + +#define RNDIS_STATUS_MEDIA_CONNECT (0x4001000BL) +#define RNDIS_STATUS_MEDIA_DISCONNECT (0x4001000CL) +#define RNDIS_STATUS_INVALID_DATA (0xC0010015L) + +/* + * union with all of the RNDIS messages + */ +union rndis_msg_container { + struct rndis_initialize_request init_request; + struct rndis_initialize_complete init_complete; + struct rndis_set_request set_request; + struct rndis_set_complete set_complete; + struct rndis_query_request query_request; + struct rndis_query_complete query_complete; + struct rndis_packet packet; + struct rndis_halt_request halt_request; + struct rndis_indicate_status indicate_status; +#if 0 + rndis_keepalive_request keepalive_request; + rndis_reset_request reset_request; + rndis_reset_complete reset_complete; + rndis_keepalive_complete keepalive_complete; + rcondis_mp_create_vc co_miniport_create_vc; + rcondis_mp_delete_vc co_miniport_delete_vc; + rcondis_indicate_status co_miniport_status; + rcondis_mp_activate_vc_request co_miniport_activate_vc; + rcondis_mp_deactivate_vc_request co_miniport_deactivate_vc; + rcondis_mp_create_vc_complete co_miniport_create_vc_complete; + rcondis_mp_delete_vc_complete co_miniport_delete_vc_complete; + rcondis_mp_activate_vc_complete co_miniport_activate_vc_complete; + rcondis_mp_deactivate_vc_complete co_miniport_deactivate_vc_complete; +#endif + uint32_t packet_ex[16]; /* to pad the union size */ +}; + +struct rndis_msg { + uint32_t ndis_msg_type; + + /* + * Total length of this message, from the beginning + * of the rndis_msg struct, in bytes. + */ + uint32_t msg_len; + + /* Actual message */ + union rndis_msg_container msg; +}; + +#define RNDIS_HEADER_SIZE (sizeof(struct rndis_msg) - sizeof(union rndis_msg_container)) + +#define NDIS_PACKET_TYPE_DIRECTED 0x00000001 +#define NDIS_PACKET_TYPE_MULTICAST 0x00000002 +#define NDIS_PACKET_TYPE_ALL_MULTICAST 0x00000004 +#define NDIS_PACKET_TYPE_BROADCAST 0x00000008 +#define NDIS_PACKET_TYPE_SOURCE_ROUTING 0x00000010 +#define NDIS_PACKET_TYPE_PROMISCUOUS 0x00000020 + +/* + * get the size of an RNDIS message. Pass in the message type, + * rndis_set_request, rndis_packet for example + */ +#define RNDIS_MESSAGE_SIZE(message) \ + (sizeof(message) + (sizeof(struct rndis_msg) - sizeof(union rndis_msg_container))) + + +/* + * Definitions from hv_rndis_filter.h + */ +enum { + RNDIS_DEV_UNINITIALIZED = 0, + RNDIS_DEV_INITIALIZING, + RNDIS_DEV_INITIALIZED, + RNDIS_DEV_DATAINITIALIZED, +}; + +struct rndis_request { + /* assumed a fixed size response here. */ + struct rndis_msg response_msg; + + /* Simplify allocation by having a netvsc packet inline */ + struct netvsc_packet pkt; + /* set additional buffer since packet can cross page boundary */ + struct hv_vmbus_page_buffer buffer; + /* assumed a fixed size request here. 
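+	 * The message itself is allocated from a separate memzone
+	 * (request_msg / request_msg_memzone below) so that its physical
+	 * address is known when it is handed to the host.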
+	 */
+	struct rndis_msg *request_msg;
+	const struct rte_memzone *request_msg_memzone;
+};
+
+struct rndis_filter_packet {
+	struct rndis_msg message;
+};
+
+#endif /* _HYPERV_DRV_H_ */
diff --git a/lib/librte_pmd_hyperv/hyperv_ethdev.c b/lib/librte_pmd_hyperv/hyperv_ethdev.c
new file mode 100644
index 0000000..7b909db
--- /dev/null
+++ b/lib/librte_pmd_hyperv/hyperv_ethdev.c
@@ -0,0 +1,332 @@
+/*-
+ * Copyright (c) 2013-2015 Brocade Communications Systems, Inc.
+ * All rights reserved.
+ */
+
+#include <rte_ethdev.h>
+#include <rte_malloc.h>
+#include "hyperv.h"
+
+static struct rte_vmbus_id vmbus_id_hyperv_map[] = {
+	{
+		.device_id = 0x0,
+	},
+};
+
+static void
+hyperv_dev_info_get(__rte_unused struct rte_eth_dev *dev,
+		    struct rte_eth_dev_info *dev_info)
+{
+	PMD_INIT_FUNC_TRACE();
+	dev_info->max_rx_queues = HV_MAX_RX_QUEUES;
+	dev_info->max_tx_queues = HV_MAX_TX_QUEUES;
+	dev_info->min_rx_bufsize = HV_MIN_RX_BUF_SIZE;
+	dev_info->max_rx_pktlen = HV_MAX_RX_PKT_LEN;
+	dev_info->max_mac_addrs = HV_MAX_MAC_ADDRS;
+}
+
+inline int
+rte_hv_dev_atomic_write_link_status(struct rte_eth_dev *dev,
+				    struct rte_eth_link *link)
+{
+	struct rte_eth_link *dst = &(dev->data->dev_link);
+	struct rte_eth_link *src = link;
+
+	if (rte_atomic64_cmpset((uint64_t *)dst, *(uint64_t *)dst,
+				*(uint64_t *)src) == 0)
+		return -1;
+
+	return 0;
+}
+
+inline int
+rte_hv_dev_atomic_read_link_status(struct rte_eth_dev *dev,
+				   struct rte_eth_link *link)
+{
+	struct rte_eth_link *dst = link;
+	struct rte_eth_link *src = &(dev->data->dev_link);
+
+	if (rte_atomic64_cmpset((uint64_t *)dst, *(uint64_t *)dst,
+				*(uint64_t *)src) == 0)
+		return -1;
+
+	return 0;
+}
+
+/* return 0 means link status changed, -1 means not changed */
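+/*
+ * While the link is down, the host is queried via hyperv_get_link_status()
+ * only once every HV_MAX_LINK_REQ calls, which throttles RNDIS link-status
+ * requests issued by polling applications.
+ */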
+static int
+hyperv_dev_link_update(struct rte_eth_dev *dev,
+		       __rte_unused int wait_to_complete)
+{
+	uint8_t ret;
+	struct rte_eth_link old, link;
+	struct hv_data *hv = dev->data->dev_private;
+
+	PMD_INIT_FUNC_TRACE();
+	memset(&old, 0, sizeof(old));
+	memset(&link, 0, sizeof(link));
+	rte_hv_dev_atomic_read_link_status(dev, &old);
+	if (!hv->link_status && (hv->link_req_cnt == HV_MAX_LINK_REQ)) {
+		ret = hyperv_get_link_status(hv);
+		if (ret > 1)
+			return -1;
+		hv->link_req_cnt = 0;
+	}
+	link.link_duplex = ETH_LINK_FULL_DUPLEX;
+	link.link_speed = ETH_LINK_SPEED_10000;
+	link.link_status = hv->link_status;
+	hv->link_req_cnt++;
+	rte_hv_dev_atomic_write_link_status(dev, &link);
+
+	return (old.link_status == link.link_status) ? -1 : 0;
+}
+
+static int
+hyperv_dev_configure(struct rte_eth_dev *dev)
+{
+	struct hv_data *hv = dev->data->dev_private;
+	const struct rte_eth_rxmode *rxmode = &dev->data->dev_conf.rxmode;
+
+	PMD_INIT_FUNC_TRACE();
+
+	rte_memcpy(dev->data->mac_addrs->addr_bytes, hv->hw_mac_addr,
+		   ETHER_ADDR_LEN);
+	hv->jumbo_frame_support = rxmode->jumbo_frame;
+
+	return 0;
+}
+
+static int
+hyperv_init(struct rte_eth_dev *dev)
+{
+	struct hv_data *hv = dev->data->dev_private;
+	struct rte_vmbus_device *vmbus_dev;
+
+	vmbus_dev = dev->vmbus_dev;
+	hv->uio_fd = vmbus_dev->uio_fd;
+	hv->kernel_initialized = 1;
+	hv->vmbus_device = vmbus_dev->id.device_id;
+	hv->monitor_bit = (uint8_t)(vmbus_dev->vmbus_monitor_id % 32);
+	hv->monitor_group = (uint8_t)(vmbus_dev->vmbus_monitor_id / 32);
+	PMD_PDEBUG_LOG(hv, DBG_LOAD, "hyperv_init for vmbus device %d",
+		       vmbus_dev->id.device_id);
+
+	/* get the memory mappings */
+	hv->ring_pages = vmbus_dev->mem_resource[TXRX_RING_MAP].addr;
+	hv->int_page = vmbus_dev->mem_resource[INT_PAGE_MAP].addr;
+	hv->monitor_pages =
+		(struct hv_vmbus_monitor_page *)
+		vmbus_dev->mem_resource[MON_PAGE_MAP].addr;
+	hv->recv_buf = vmbus_dev->mem_resource[RECV_BUF_MAP].addr;
+	assert(hv->ring_pages);
+	assert(hv->int_page);
+	assert(hv->monitor_pages);
+	assert(hv->recv_buf);
+
+	/* separate send/recv int_pages */
+	hv->recv_interrupt_page = hv->int_page;
+
+	hv->send_interrupt_page =
+		((uint8_t *)hv->int_page + (PAGE_SIZE >> 1));
+
+	/* retrieve in/out ring_buffers */
+	hv->out = hv->ring_pages;
+	hv->in = (void *)((uint64_t)hv->out +
+			  (vmbus_dev->mem_resource[TXRX_RING_MAP].len / 2));
+	hv->rb_size = (vmbus_dev->mem_resource[TXRX_RING_MAP].len / 2);
+
+	dev->rx_pkt_burst = hyperv_recv_pkts;
+	dev->tx_pkt_burst = hyperv_xmit_pkts;
+
+	return hv_rf_on_device_add(hv);
+}
+
+#define HV_DEV_ID (hv->vmbus_device << 1)
+#define HV_MTU (dev->data->dev_conf.rxmode.max_rx_pkt_len << 9)
+
+static int
+hyperv_dev_start(struct rte_eth_dev *dev)
+{
+	int ret;
+	uint32_t cmd;
+	ssize_t bytes;
+	struct hv_data *hv = dev->data->dev_private;
+
+	PMD_INIT_FUNC_TRACE();
+	if (!hv->kernel_initialized) {
+		cmd = HV_DEV_ID | HV_MTU;
+		bytes = write(hv->uio_fd, &cmd, sizeof(uint32_t));
+		if (bytes < (ssize_t)sizeof(uint32_t)) {
+			PMD_PERROR_LOG(hv, DBG_LOAD, "write on uio_fd %d failed",
+				       hv->uio_fd);
+			return -1;
+		}
+		ret = vmbus_uio_map_resource(dev->vmbus_dev);
+		if (ret < 0) {
+			PMD_PERROR_LOG(hv, DBG_LOAD, "Failed to map resources");
+			return ret;
+		}
+		ret = hyperv_init(dev);
+		if (ret)
+			return ret;
+	}
+	ret = hv_rf_on_open(hv);
+	if (ret) {
+		PMD_PERROR_LOG(hv, DBG_LOAD, "hv_rf_on_open failed");
+		return ret;
+	}
+	hv->link_req_cnt = HV_MAX_LINK_REQ;
+
+	return ret;
+}
+
+static void
+hyperv_dev_stop(struct rte_eth_dev *dev)
+{
+	struct hv_data *hv = dev->data->dev_private;
+	uint32_t cmd;
+	ssize_t bytes;
+
+	PMD_INIT_FUNC_TRACE();
+	if (!hv->closed) {
+		hv_rf_on_close(hv);
+		hv_rf_on_device_remove(hv);
+		if (hv->kernel_initialized) {
+			cmd = 1 | HV_DEV_ID;
+			bytes = write(hv->uio_fd, &cmd, sizeof(uint32_t));
+			if (bytes == (ssize_t)sizeof(uint32_t))
+				hv->kernel_initialized = 0;
+			else
+				PMD_PWARN_LOG(hv, DBG_LOAD, "write to uio_fd %d failed: (%zd)b",
+					      hv->uio_fd, bytes);
+		}
+		hv->link_status = 0;
+	}
+}
+
+static void
+hyperv_dev_close(struct rte_eth_dev *dev)
+{
+	PMD_INIT_FUNC_TRACE();
+	hyperv_dev_stop(dev);
+}
+
+static void
+hyperv_dev_promisc_enable(struct rte_eth_dev *dev)
+{
+	struct hv_data *hv = dev->data->dev_private;
+
+	PMD_INIT_FUNC_TRACE();
+	hyperv_set_rx_mode(hv, 1, dev->data->all_multicast);
+}
+
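+/*
+ * All four rx-mode callbacks funnel into hyperv_set_rx_mode(), called
+ * with the current promiscuous and all-multicast state of the port.
+ */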
+static void
+hyperv_dev_promisc_disable(struct rte_eth_dev *dev)
+{
+	struct hv_data *hv = dev->data->dev_private;
+
+	PMD_INIT_FUNC_TRACE();
+	hyperv_set_rx_mode(hv, 0, dev->data->all_multicast);
+}
+
+static void
+hyperv_dev_allmulticast_enable(struct rte_eth_dev *dev)
+{
+	struct hv_data *hv = dev->data->dev_private;
+
+	PMD_INIT_FUNC_TRACE();
+	hyperv_set_rx_mode(hv, dev->data->promiscuous, 1);
+}
+
+static void
+hyperv_dev_allmulticast_disable(struct rte_eth_dev *dev)
+{
+	struct hv_data *hv = dev->data->dev_private;
+
+	PMD_INIT_FUNC_TRACE();
+	hyperv_set_rx_mode(hv, dev->data->promiscuous, 0);
+}
+
+static void
+hyperv_dev_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
+{
+	struct hv_data *hv = dev->data->dev_private;
+	struct hv_stats *st = &hv->stats;
+
+	PMD_INIT_FUNC_TRACE();
+
+	memset(stats, 0, sizeof(struct rte_eth_stats));
+
+	stats->opackets = st->opkts;
+	stats->obytes = st->obytes;
+	stats->oerrors = st->oerrors;
+	stats->ipackets = st->ipkts;
+	stats->ibytes = st->ibytes;
+	stats->ierrors = st->ierrors;
+	stats->rx_nombuf = st->rx_nombuf;
+}
+
+static struct eth_dev_ops hyperv_eth_dev_ops = {
+	.dev_configure = hyperv_dev_configure,
+	.dev_start = hyperv_dev_start,
+	.dev_stop = hyperv_dev_stop,
+	.dev_infos_get = hyperv_dev_info_get,
+	.rx_queue_release = hyperv_dev_rx_queue_release,
+	.tx_queue_release = hyperv_dev_tx_queue_release,
+	.rx_queue_setup = hyperv_dev_rx_queue_setup,
+	.tx_queue_setup = hyperv_dev_tx_queue_setup,
+	.dev_close = hyperv_dev_close,
+	.promiscuous_enable = hyperv_dev_promisc_enable,
+	.promiscuous_disable = hyperv_dev_promisc_disable,
+	.allmulticast_enable = hyperv_dev_allmulticast_enable,
+	.allmulticast_disable = hyperv_dev_allmulticast_disable,
+	.link_update = hyperv_dev_link_update,
+	.stats_get = hyperv_dev_stats_get,
+};
+
+static int
+eth_hyperv_dev_init(struct rte_eth_dev *eth_dev)
+{
+	struct hv_data *hv = eth_dev->data->dev_private;
+	int ret;
+
+	PMD_INIT_FUNC_TRACE();
+
+	eth_dev->dev_ops = &hyperv_eth_dev_ops;
+	eth_dev->data->mac_addrs = rte_malloc("mac_addrs",
+					      sizeof(struct ether_addr),
+					      RTE_CACHE_LINE_SIZE);
+	if (!eth_dev->data->mac_addrs) {
+		PMD_PERROR_LOG(hv, DBG_LOAD, "unable to allocate memory for mac addrs");
+		return -1;
+	}
+
+	ret = hyperv_init(eth_dev);
+
+	return ret;
+}
+
+static struct eth_driver rte_hyperv_pmd = {
+	.vmbus_drv = {
+		.name = "rte_hyperv_pmd",
+		.module_name = "hv_uio",
+		.id_table = vmbus_id_hyperv_map,
+	},
+	.bus_type = RTE_BUS_VMBUS,
+	.eth_dev_init = eth_hyperv_dev_init,
+	.dev_private_size = sizeof(struct hv_data),
+};
+
+static int
+rte_hyperv_pmd_init(const char *name __rte_unused,
+		    const char *param __rte_unused)
+{
+	rte_eth_driver_register(&rte_hyperv_pmd);
+	return 0;
+}
+
+static struct rte_driver rte_hyperv_driver = {
+	.type = PMD_PDEV,
+	.init = rte_hyperv_pmd_init,
+};
+
+PMD_REGISTER_DRIVER(rte_hyperv_driver);
diff --git a/lib/librte_pmd_hyperv/hyperv_logs.h b/lib/librte_pmd_hyperv/hyperv_logs.h
new file mode 100644
index 0000000..1b96468
--- /dev/null
+++ b/lib/librte_pmd_hyperv/hyperv_logs.h
@@ -0,0 +1,69 @@
+/*-
+ * Copyright(c) 2013-2015 Brocade Communications Systems, Inc.
+ * All rights reserved.
+ */
+
+#ifndef _HYPERV_LOGS_H_
+#define _HYPERV_LOGS_H_
+
+#ifdef RTE_LIBRTE_HV_DEBUG_INIT
+#define PMD_INIT_LOG(level, fmt, args...) \
+	RTE_LOG(level, PMD, "%s(): " fmt "\n", __func__, ## args)
+#define PMD_INIT_FUNC_TRACE() PMD_INIT_LOG(DEBUG, " >>")
+#else
+#define PMD_INIT_LOG(level, fmt, args...) do { } while (0)
+#define PMD_INIT_FUNC_TRACE() do { } while (0)
+#endif
+
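+/*
+ * The data-path logging below is gated at run time by the hv->debug
+ * bitmask: a message is emitted only when its codepath bit (DBG_TX,
+ * DBG_RX, ...) is set, so a disabled path costs one branch per call site.
+ */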
+#ifdef RTE_LIBRTE_HV_DEBUG
+
+#define RTE_DBG_LOAD INIT
+#define RTE_DBG_STATS STATS
+#define RTE_DBG_TX TX
+#define RTE_DBG_RX RX
+#define RTE_DBG_MBUF MBUF
+#define RTE_DBG_ASSERT ASRT
+#define RTE_DBG_RB RB
+#define RTE_DBG_VMBUS VMBUS
+#define RTE_DBG_ALL ALL
+
+#define STR(x) #x
+
+#define HV_RTE_LOG(hv, codepath, level, fmt, args...) \
+	RTE_LOG(level, PMD, "[%d]: %-6s: %s: " fmt "\n", \
+		hv->vmbus_device, STR(codepath), __func__, ## args)
+
+#define PMD_PDEBUG_LOG(hv, codepath, fmt, args...) \
+do { \
+	if (unlikely(hv->debug & (codepath))) \
+		HV_RTE_LOG(hv, RTE_##codepath, DEBUG, fmt, ## args); \
+} while (0)
+
+#define PMD_PINFO_LOG(hv, codepath, fmt, args...) \
+do { \
+	if (unlikely(hv->debug & (codepath))) \
+		HV_RTE_LOG(hv, RTE_##codepath, INFO, fmt, ## args); \
+} while (0)
+
+#define PMD_PWARN_LOG(hv, codepath, fmt, args...) \
+do { \
+	if (unlikely(hv->debug & (codepath))) \
+		HV_RTE_LOG(hv, RTE_##codepath, WARNING, fmt, ## args); \
+} while (0)
+
+#define PMD_PERROR_LOG(hv, codepath, fmt, args...) \
+do { \
+	if (unlikely(hv->debug & (codepath))) \
+		HV_RTE_LOG(hv, RTE_##codepath, ERR, fmt, ## args); \
+} while (0)
+#else
+#define HV_RTE_LOG(hv, codepath, level, fmt, args...) do { } while (0)
+#define PMD_PDEBUG_LOG(hv, codepath, fmt, args...) do { } while (0)
+#define PMD_PINFO_LOG(hv, codepath, fmt, args...) do { } while (0)
+#define PMD_PWARN_LOG(hv, codepath, fmt, args...) do { } while (0)
+#define PMD_PERROR_LOG(hv, codepath, fmt, args...) do { } while (0)
+#undef RTE_LIBRTE_HV_DEBUG_TX
+#undef RTE_LIBRTE_HV_DEBUG_RX
+#endif
+
+#endif /* _HYPERV_LOGS_H_ */
diff --git a/lib/librte_pmd_hyperv/hyperv_rxtx.c b/lib/librte_pmd_hyperv/hyperv_rxtx.c
new file mode 100644
index 0000000..9e423d0
--- /dev/null
+++ b/lib/librte_pmd_hyperv/hyperv_rxtx.c
@@ -0,0 +1,403 @@
+/*-
+ * Copyright(c) 2013-2015 Brocade Communications Systems, Inc.
+ * All rights reserved.
+ */
+
+#include "hyperv.h"
+#include "hyperv_rxtx.h"
+#include "hyperv_drv.h"
+
+#define RTE_MBUF_DATA_DMA_ADDR(mb) \
+	((uint64_t)((mb)->buf_physaddr + (mb)->data_off))
+
+#define RPPI_SIZE (sizeof(struct rndis_per_packet_info) \
+		   + sizeof(struct ndis_8021q_info))
+#define RNDIS_OFF (sizeof(struct netvsc_packet) + RPPI_SIZE)
+#define TX_PKT_SIZE (RNDIS_OFF + sizeof(struct rndis_filter_packet) * 2)
+
+static inline struct rte_mbuf *
+hv_rxmbuf_alloc(struct rte_mempool *mp)
+{
+	return __rte_mbuf_raw_alloc(mp);
+}
+
+static inline int
+hyperv_has_rx_work(struct hv_data *hv)
+{
+	return hv->in->read_index != hv->in->write_index;
+}
+
+#ifndef DEFAULT_TX_FREE_THRESHOLD
+#define DEFAULT_TX_FREE_THRESHOLD 32
+#endif
+
+int
+hyperv_dev_tx_queue_setup(struct rte_eth_dev *dev,
+			  uint16_t queue_idx,
+			  uint16_t nb_desc,
+			  unsigned int socket_id,
+			  const struct rte_eth_txconf *tx_conf)
+{
+	struct hv_data *hv = dev->data->dev_private;
+	const struct rte_memzone *tz;
+	struct hv_tx_queue *txq;
+	char tz_name[RTE_MEMZONE_NAMESIZE];
+	uint32_t i, delta = 0, new_delta;
+	struct netvsc_packet *pkt;
+
+	PMD_INIT_FUNC_TRACE();
+
+	txq = rte_zmalloc_socket("ethdev TX queue", sizeof(struct hv_tx_queue),
+				 RTE_CACHE_LINE_SIZE, socket_id);
+	if (txq == NULL) {
+		PMD_PERROR_LOG(hv, DBG_LOAD, "rte_zmalloc for tx_queue failed");
+		return -ENOMEM;
+	}
+
+	if (tx_conf->tx_free_thresh >= nb_desc) {
+		PMD_PERROR_LOG(hv, DBG_LOAD,
+			       "tx_free_thresh should be less than nb_desc");
+		rte_free(txq);
+		return -EINVAL;
+	}
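+	/*
+	 * tx_free_thresh is the number of completed descriptors that may
+	 * accumulate before hyperv_xmit_pkts() reclaims them via
+	 * hyperv_txeof().
+	 */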
+	txq->tx_free_thresh = (tx_conf->tx_free_thresh ? tx_conf->tx_free_thresh :
+			       DEFAULT_TX_FREE_THRESHOLD);
+	txq->pkts = rte_calloc_socket("TX pkts", nb_desc, sizeof(void *),
+				      RTE_CACHE_LINE_SIZE, socket_id);
+	if (txq->pkts == NULL) {
+		PMD_PERROR_LOG(hv, DBG_LOAD, "rte_zmalloc for pkts failed");
+		rte_free(txq);
+		return -ENOMEM;
+	}
+	snprintf(tz_name, sizeof(tz_name), "hv_%d_%u_%u",
+		 hv->vmbus_device, queue_idx, socket_id);
+	tz = rte_memzone_reserve_aligned(tz_name,
+					 (uint32_t)nb_desc * TX_PKT_SIZE,
+					 socket_id,
+					 0, PAGE_SIZE);
+	if (tz == NULL) {
+		PMD_PERROR_LOG(hv, DBG_LOAD, "netvsc packet ring alloc fail");
+		rte_free(txq->pkts);
+		rte_free(txq);
+		return -ENOMEM;
+	}
+	for (i = 0; i < nb_desc; i++) {
+		pkt = txq->pkts[i] = (struct netvsc_packet *)((uint8_t *)tz->addr +
+							      i * TX_PKT_SIZE + delta);
+		pkt->extension = (uint8_t *)tz->addr + i * TX_PKT_SIZE + RNDIS_OFF + delta;
+		if (!pkt->extension) {
+			PMD_PERROR_LOG(hv, DBG_TX,
+				       "pkt->extension is NULL for %d-th pkt", i);
+			return -EINVAL;
+		}
+		pkt->extension_phys_addr =
+			tz->phys_addr + i * TX_PKT_SIZE + RNDIS_OFF + delta;
+		pkt->ext_pages = 1;
+		pkt->page_buffers[0].pfn = pkt->extension_phys_addr >> PAGE_SHIFT;
+		pkt->page_buffers[0].offset =
+			(unsigned long)pkt->extension & (PAGE_SIZE - 1);
+		pkt->page_buffers[0].length = RNDIS_MESSAGE_SIZE(struct rndis_packet);
+		if (pkt->page_buffers[0].offset + pkt->page_buffers[0].length
+		    > PAGE_SIZE) {
+			new_delta = PAGE_SIZE - pkt->page_buffers[0].offset;
+			pkt->page_buffers[0].pfn++;
+			delta += new_delta;
+			pkt->page_buffers[0].offset = 0;
+			pkt->extension = (uint8_t *)pkt->extension + new_delta;
+			pkt->extension_phys_addr += new_delta;
+		}
+	}
+	txq->sw_ring = rte_calloc_socket("txq_sw_ring",
+					 nb_desc, sizeof(struct rte_mbuf *),
+					 RTE_CACHE_LINE_SIZE, socket_id);
+	if (txq->sw_ring == NULL) {
+		hyperv_dev_tx_queue_release(txq);
+		return -ENOMEM;
+	}
+	txq->port_id = dev->data->port_id;
+	txq->nb_tx_desc = txq->tx_avail = nb_desc;
+	txq->hv = hv;
+	dev->data->tx_queues[queue_idx] = txq;
+	hv->txq = txq;
+
+	return 0;
+}
+
+void
+hyperv_dev_tx_queue_release(void *ptxq)
+{
+	struct hv_tx_queue *txq = ptxq;
+
+	PMD_INIT_FUNC_TRACE();
+	if (txq == NULL)
+		return;
+	rte_free(txq->sw_ring);
+	rte_free(txq->pkts);
+	rte_free(txq);
+}
+
+int
+hyperv_dev_rx_queue_setup(struct rte_eth_dev *dev,
+			  uint16_t queue_idx,
+			  uint16_t nb_desc,
+			  unsigned int socket_id,
+			  const struct rte_eth_rxconf *rx_conf,
+			  struct rte_mempool *mp)
+{
+	uint16_t i;
+	struct hv_rx_queue *rxq;
+	struct rte_mbuf *mbuf;
+	struct hv_data *hv = dev->data->dev_private;
+
+	PMD_INIT_FUNC_TRACE();
+
+	rxq = rte_zmalloc_socket("ethdev RX queue", sizeof(struct hv_rx_queue),
+				 RTE_CACHE_LINE_SIZE, socket_id);
+	if (rxq == NULL) {
+		PMD_PERROR_LOG(hv, DBG_LOAD,
+			       "rte_zmalloc for rx_queue failed!");
+		return -ENOMEM;
+	}
+	hv->desc = rxq->desc = rte_zmalloc_socket(NULL, PAGE_SIZE,
+						  RTE_CACHE_LINE_SIZE, socket_id);
+	if (rxq->desc == NULL) {
+		PMD_PERROR_LOG(hv, DBG_LOAD,
+			       "rte_zmalloc for vmbus_desc failed!");
+		hyperv_dev_rx_queue_release(rxq);
+		return -ENOMEM;
+	}
+	rxq->sw_ring = rte_calloc_socket("rxq->sw_ring",
+					 nb_desc, sizeof(struct rte_mbuf *),
+					 RTE_CACHE_LINE_SIZE, socket_id);
+	if (rxq->sw_ring == NULL) {
+		hyperv_dev_rx_queue_release(rxq);
+		return -ENOMEM;
+	}
+
+	rxq->port_id = dev->data->port_id;
+	for (i = 0; i < nb_desc; i++) {
+		mbuf = hv_rxmbuf_alloc(mp);
+		if (mbuf == NULL) {
+			PMD_PERROR_LOG(hv, DBG_LOAD, "RX mbuf alloc failed");
+			hyperv_dev_rx_queue_release(rxq);
+			return -ENOMEM;
+		}
+
+		mbuf->nb_segs = 1;
+		mbuf->next = NULL;
+		mbuf->port = rxq->port_id;
+		rxq->sw_ring[i] = mbuf;
+	}
+
+	rxq->mb_pool = mp;
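+	/*
+	 * hv->rxq gives the receive path direct access to this queue;
+	 * max_rx_pkt_len below is what one mempool element can hold
+	 * after the rte_mbuf header and headroom.
+	 */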
+	rxq->nb_rx_desc = nb_desc;
+	rxq->rx_head = 0;
+	rxq->rx_tail = 0;
+	rxq->rx_free_thresh = rx_conf->rx_free_thresh;
+	rxq->hv = hv;
+	dev->data->rx_queues[queue_idx] = rxq;
+	hv->rxq = rxq;
+	hv->max_rx_pkt_len = mp->elt_size -
+		(sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM);
+
+	return 0;
+}
+
+void
+hyperv_dev_rx_queue_release(void *prxq)
+{
+	struct hv_rx_queue *rxq = prxq;
+
+	PMD_INIT_FUNC_TRACE();
+	if (rxq == NULL)
+		return;
+	rte_free(rxq->sw_ring);
+	rte_free(rxq->desc);
+	rte_free(rxq);
+}
+
+uint16_t
+hyperv_recv_pkts(void *prxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+{
+	struct hv_rx_queue *rxq = prxq;
+	struct hv_data *hv = rxq->hv;
+	struct rte_mbuf *new_mb, *rx_mbuf, *first_mbuf;
+	uint16_t nb_rx = 0;
+	uint16_t segs, i;
+
+	if (unlikely(hv->closed))
+		return 0;
+
+	nb_pkts = MIN(nb_pkts, HV_MAX_PKT_BURST);
+	hyperv_scan_comps(hv, 0);
+
+	while (nb_rx < nb_pkts) {
+		/*
+		 * If there are no mbufs in sw_ring,
+		 * we need to trigger the receive procedure
+		 */
+		if (rxq->rx_head == rxq->rx_tail) {
+			if (!hyperv_has_rx_work(hv))
+				break;
+
+			if (unlikely(!hyperv_get_buffer(hv, rxq->desc, PAGE_SIZE))) {
+				hyperv_scan_comps(hv, 0);
+				continue;
+			}
+		}
+
+		/*
+		 * The received data is now in this rxq's sw_ring;
+		 * extract it and replace it in sw_ring with a new mbuf
+		 */
+		rx_mbuf = first_mbuf = rxq->sw_ring[rxq->rx_head];
+		segs = first_mbuf->nb_segs;
+		for (i = 0; i < segs; ++i) {
+			new_mb = hv_rxmbuf_alloc(rxq->mb_pool);
+			if (unlikely(!new_mb)) {
+				PMD_PERROR_LOG(hv, DBG_RX, "mbuf alloc fail");
+				++hv->stats.rx_nombuf;
+				return nb_rx;
+			}
+
+			rx_mbuf = rxq->sw_ring[rxq->rx_head];
+			rxq->sw_ring[rxq->rx_head] = new_mb;
+
+			if (++rxq->rx_head == rxq->nb_rx_desc)
+				rxq->rx_head = 0;
+
+			rx_mbuf->ol_flags |= PKT_RX_IPV4_HDR;
+			rx_mbuf->port = rxq->port_id;
+		}
+		rx_mbuf->next = NULL;
+
+		rx_pkts[nb_rx++] = first_mbuf;
+		++hv->stats.ipkts;
+		hv->stats.ibytes += first_mbuf->pkt_len;
+	}
+
+	return nb_rx;
+}
+
+static void hyperv_txeof(struct hv_tx_queue *txq)
+{
+	struct rte_mbuf *mb, *mb_next;
+
+	txq->tx_avail += txq->tx_free;
+	while (txq->tx_free) {
+		--txq->tx_free;
+		mb = txq->sw_ring[txq->tx_head];
+		while (mb) {
+			mb_next = mb->next;
+			rte_mempool_put(mb->pool, mb);
+			mb = mb_next;
+		}
+		if (++txq->tx_head == txq->nb_tx_desc)
+			txq->tx_head = 0;
+	}
+}
+
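+/*
+ * TX completion flow: hyperv_scan_comps() harvests send completions
+ * from the host into txq->tx_free; hyperv_txeof() then returns the
+ * finished mbuf chains to their pools and advances tx_head.
+ */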
+uint16_t
+hyperv_xmit_pkts(void *ptxq, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
+{
+	struct hv_tx_queue *txq = ptxq;
+	struct hv_data *hv = txq->hv;
+	struct netvsc_packet *packet;
+	struct rte_mbuf *m;
+	uint32_t data_pages;
+	uint64_t first_data_page;
+	uint32_t total_len;
+	uint32_t len;
+	uint16_t i, nb_tx;
+	uint8_t rndis_pages;
+	int ret;
+
+	if (unlikely(hv->closed))
+		return 0;
+
+	for (nb_tx = 0; nb_tx < nb_pkts; ++nb_tx) {
+		hyperv_scan_comps(hv, 0);
+		/* Determine if the descriptor ring needs to be cleaned. */
+		if (txq->tx_free > txq->tx_free_thresh)
+			hyperv_txeof(txq);
+
+		if (!txq->tx_avail) {
+			hyperv_scan_comps(hv, 1);
+			hyperv_txeof(txq);
+			if (!txq->tx_avail) {
+				PMD_PWARN_LOG(hv, DBG_TX, "No TX mbuf available");
+				break;
+			}
+		}
+		m = tx_pkts[nb_tx];
+		len = m->data_len;
+		total_len = m->pkt_len;
+		first_data_page = RTE_MBUF_DATA_DMA_ADDR(m) >> PAGE_SHIFT;
+		data_pages = ((RTE_MBUF_DATA_DMA_ADDR(m) + len - 1) >> PAGE_SHIFT) -
+			     first_data_page + 1;
+
+		packet = txq->pkts[txq->tx_tail];
+		rndis_pages = packet->ext_pages;
+
+		txq->sw_ring[txq->tx_tail] = m;
+		packet->tot_data_buf_len = total_len;
+		packet->page_buffers[rndis_pages].pfn =
+			RTE_MBUF_DATA_DMA_ADDR(m) >> PAGE_SHIFT;
+		packet->page_buffers[rndis_pages].offset =
+			RTE_MBUF_DATA_DMA_ADDR(m) & (PAGE_SIZE - 1);
+		if (data_pages == 1)
+			packet->page_buffers[rndis_pages].length = len;
+		else
+			packet->page_buffers[rndis_pages].length = PAGE_SIZE -
+				packet->page_buffers[rndis_pages].offset;
+
+		for (i = 1; i < data_pages; ++i) {
+			packet->page_buffers[rndis_pages + i].pfn = first_data_page + i;
+			packet->page_buffers[rndis_pages + i].offset = 0;
+			packet->page_buffers[rndis_pages + i].length = PAGE_SIZE;
+		}
+		if (data_pages > 1)
+			packet->page_buffers[rndis_pages - 1 + data_pages].length =
+				((rte_pktmbuf_mtod(m, unsigned long) + len - 1)
+				 & (PAGE_SIZE - 1)) + 1;
+
+		uint16_t index = data_pages + rndis_pages;
+
+		for (i = 1; i < m->nb_segs; ++i) {
+			m = m->next;
+			len = m->data_len;
+			first_data_page = RTE_MBUF_DATA_DMA_ADDR(m) >> PAGE_SHIFT;
+			data_pages = ((RTE_MBUF_DATA_DMA_ADDR(m) + len - 1) >> PAGE_SHIFT) -
+				     first_data_page + 1;
+			packet->page_buffers[index].pfn =
+				RTE_MBUF_DATA_DMA_ADDR(m) >> PAGE_SHIFT;
+			packet->page_buffers[index].offset =
+				rte_pktmbuf_mtod(m, unsigned long)
+				& (PAGE_SIZE - 1);
+			packet->page_buffers[index].length = m->data_len;
+			if (data_pages > 1) {
+				/* data_pages can be 2 here with the usual 2048-byte mbufs */
+				packet->page_buffers[index].length = PAGE_SIZE -
+					packet->page_buffers[index].offset;
+				packet->page_buffers[++index].offset = 0;
+				packet->page_buffers[index].pfn =
+					packet->page_buffers[index - 1].pfn + 1;
+				packet->page_buffers[index].length =
+					m->data_len
+					- packet->page_buffers[index - 1].length;
+			}
+			++index;
+		}
+		packet->page_buf_count = index;
+
+		ret = hv_rf_on_send(hv, packet);
+		if (likely(ret == 0)) {
+			++hv->stats.opkts;
+			hv->stats.obytes += total_len;
+			if (++txq->tx_tail == txq->nb_tx_desc)
+				txq->tx_tail = 0;
+			--txq->tx_avail;
+		} else {
+			++hv->stats.oerrors;
+			PMD_PERROR_LOG(hv, DBG_TX, "TX ring buffer is busy");
+		}
+	}
+
+	return nb_tx;
+}
diff --git a/lib/librte_pmd_hyperv/hyperv_rxtx.h b/lib/librte_pmd_hyperv/hyperv_rxtx.h
new file mode 100644
index 0000000..c45a704
--- /dev/null
+++ b/lib/librte_pmd_hyperv/hyperv_rxtx.h
@@ -0,0 +1,35 @@
+/*-
+ * Copyright(c) 2013-2015 Brocade Communications Systems, Inc.
+ * All rights reserved.
+ */
+
+/**
+ * Structure associated with each TX queue.
+ */
+struct hv_tx_queue {
+	struct netvsc_packet **pkts;
+	struct rte_mbuf **sw_ring;
+	uint16_t nb_tx_desc;
+	uint16_t tx_avail;
+	uint16_t tx_head;
+	uint16_t tx_tail;
+	uint16_t tx_free_thresh;
+	uint16_t tx_free;
+	uint8_t port_id;
+	struct hv_data *hv;
+} __rte_cache_aligned;
+
+/**
+ * Structure associated with each RX queue.
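+ * sw_ring entries between rx_head and rx_tail hold packets already
+ * received from the host; hyperv_recv_pkts() delivers from rx_head,
+ * replacing each delivered mbuf in place with a freshly allocated one.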
+ */ +struct hv_rx_queue { + struct rte_mempool *mb_pool; + struct rte_mbuf **sw_ring; + uint16_t nb_rx_desc; + uint16_t rx_head; + uint16_t rx_tail; + uint16_t rx_free_thresh; + uint8_t port_id; + struct hv_data *hv; + struct hv_vm_packet_descriptor *desc; +} __rte_cache_aligned; diff --git a/mk/rte.app.mk b/mk/rte.app.mk index 62a76ae..e0416d1 100644 --- a/mk/rte.app.mk +++ b/mk/rte.app.mk @@ -133,6 +133,10 @@ LDLIBS += -lm LDLIBS += -lrt endif +ifeq ($(CONFIG_RTE_LIBRTE_HV_PMD),y) +LDLIBS += -lrte_pmd_hyperv +endif + ifeq ($(CONFIG_RTE_LIBRTE_VHOST), y) LDLIBS += -lrte_vhost endif -- 2.1.4