From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mx1.redhat.com (mx1.redhat.com [209.132.183.28]) by dpdk.org (Postfix) with ESMTP id A56CB5320 for ; Wed, 29 Jan 2014 11:00:19 +0100 (CET) Received: from int-mx09.intmail.prod.int.phx2.redhat.com (int-mx09.intmail.prod.int.phx2.redhat.com [10.5.11.22]) by mx1.redhat.com (8.14.4/8.14.4) with ESMTP id s0TA1aet026695 (version=TLSv1/SSLv3 cipher=DHE-RSA-AES256-SHA bits=256 verify=OK); Wed, 29 Jan 2014 05:01:36 -0500 Received: from lsx.localdomain (vpn1-5-163.ams2.redhat.com [10.36.5.163]) by int-mx09.intmail.prod.int.phx2.redhat.com (8.14.4/8.14.4) with ESMTP id s0TA1W0k023895; Wed, 29 Jan 2014 05:01:33 -0500 Message-ID: <52E8D17C.8050100@redhat.com> Date: Wed, 29 Jan 2014 11:01:32 +0100 From: Thomas Graf Organization: Red Hat, Inc. User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:24.0) Gecko/20100101 Thunderbird/24.2.0 MIME-Version: 1.0 To: pshelar@nicira.com, dev@openvswitch.org, dev@dpdk.org, dpdk-ovs@ml01.01.org References: <1390873715-26714-1-git-send-email-pshelar@nicira.com> In-Reply-To: <1390873715-26714-1-git-send-email-pshelar@nicira.com> Content-Type: text/plain; charset=UTF-8; format=flowed Content-Transfer-Encoding: 8bit X-Scanned-By: MIMEDefang 2.68 on 10.5.11.22 Cc: Gerald Rogers Subject: Re: [dpdk-dev] [ovs-dev] [PATCH RFC] dpif-netdev: Add support Intel DPDK based ports. X-BeenThere: dev@dpdk.org X-Mailman-Version: 2.1.15 Precedence: list List-Id: patches and discussions about DPDK List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Wed, 29 Jan 2014 10:00:20 -0000 On 01/28/2014 02:48 AM, pshelar@nicira.com wrote: > From: Pravin B Shelar > > Following patch adds DPDK netdev-class to userspace datapath. > Approach taken in this patch differs from IntelĀ® DPDK vSwitch > where DPDK datapath switching is done in saparate process. This > patch adds support for DPDK type port and uses OVS userspace > datapath for switching. Therefore all DPDK processing and flow > miss handling is done in single process. This also avoids code > duplication by reusing OVS userspace datapath switching and > therefore it supports all flow matching and actions that > user-space datapath supports. Refer to INSTALL.DPDK doc for > further info. > > With this patch I got similar performance for netperf TCP_STREAM > tests compared to kernel datapath. > > This is based a patch from Gerald Rogers. > > Signed-off-by: Pravin B Shelar > CC: "Gerald Rogers" Pravin, Some initial comments below. I will provide more after deeper digging. Do you have any ideas on how to implement the TX batching yet? > + > +static int > +netdev_dpdk_rx_drain(struct netdev_rx *rx_) > +{ > + struct netdev_rx_dpdk *rx = netdev_rx_dpdk_cast(rx_); > + int pending; > + int i; > + > + pending = rx->ofpbuf_cnt; > + if (pending) { This conditional seems unneeded. > + for (i = 0; i < pending; i++) { > + build_ofpbuf(rx, &rx->ofpbuf[i], NULL); > + } > + rx->ofpbuf_cnt = 0; > + return 0; > + } > + > + return 0; > +} > + > +/* Tx function. Transmit packets indefinitely */ > +static int > +dpdk_do_tx_copy(struct netdev *netdev, char *buf, int size) > +{ > + struct netdev_dpdk *dev = netdev_dpdk_cast(netdev); > + struct rte_mbuf *pkt; > + uint32_t nb_tx = 0; > + > + pkt = rte_pktmbuf_alloc(dev->dpdk_mp->mp); > + if (!pkt) { > + return 0; Silent drop? ;-) Shouldn't these drops be accounted for somehow? > + } > + > + /* We have to do a copy for now */ > + memcpy(pkt->pkt.data, buf, size); > + > + rte_pktmbuf_data_len(pkt) = size; > + rte_pktmbuf_pkt_len(pkt) = size; > + > + rte_spinlock_lock(&dev->tx_lock); What is the purpose of tx_lock here? Multiple threads writing to the same Q? The lock is not acquired for the zerocopy path below. > + nb_tx = rte_eth_tx_burst(dev->port_id, NR_QUEUE, &pkt, 1); > + rte_spinlock_unlock(&dev->tx_lock); > + > + if (nb_tx != 1) { > + /* free buffers if we couldn't transmit packets */ > + rte_mempool_put_bulk(dev->dpdk_mp->mp, (void **)&pkt, 1); > + } > + return nb_tx; > +} > + > +static int > +netdev_dpdk_send(struct netdev *netdev, > + struct ofpbuf *ofpbuf, bool may_steal) > +{ > + struct netdev_dpdk *dev = netdev_dpdk_cast(netdev); > + > + if (ofpbuf->size > dev->max_packet_len) { > + VLOG_ERR("2big size %d max_packet_len %d", > + (int)ofpbuf->size , dev->max_packet_len); Should probably use VLOG_RATE_LIMIT_INIT > + return E2BIG; > + } > + > + rte_prefetch0(&ofpbuf->private_p); > + if (!may_steal || > + !ofpbuf->private_p || ofpbuf->source != OFPBUF_DPDK) { > + dpdk_do_tx_copy(netdev, (char *) ofpbuf->data, ofpbuf->size); > + } else { > + struct rte_mbuf *pkt; > + uint32_t nb_tx; > + int qid; > + > + pkt = ofpbuf->private_p; > + ofpbuf->private_p = NULL; > + rte_pktmbuf_data_len(pkt) = ofpbuf->size; > + rte_pktmbuf_pkt_len(pkt) = ofpbuf->size; > + > + /* TODO: TX batching. */ > + qid = rte_lcore_id() % NR_QUEUE; > + nb_tx = rte_eth_tx_burst(dev->port_id, qid, &pkt, 1); > + if (nb_tx != 1) { > + struct netdev_dpdk *dev = netdev_dpdk_cast(netdev); > + > + rte_mempool_put_bulk(dev->dpdk_mp->mp, (void **)&pkt, 1); > + VLOG_ERR("TX error, zero packets sent"); Same here > + } > + } > + return 0; > +} > +static int > +netdev_dpdk_set_mtu(const struct netdev *netdev, int mtu) > +{ > + struct netdev_dpdk *dev = netdev_dpdk_cast(netdev); > + int old_mtu, err; > + struct dpdk_mp *old_mp; > + struct dpdk_mp *mp; > + > + ovs_mutex_lock(&dpdk_mutex); > + ovs_mutex_lock(&dev->mutex); > + if (dev->mtu == mtu) { > + err = 0; > + goto out; > + } > + > + mp = dpdk_mp_get(dev->socket_id, dev->mtu); > + if (!mp) { > + err = ENOMEM; > + goto out; > + } > + > + rte_eth_dev_stop(dev->port_id); > + > + old_mtu = dev->mtu; > + old_mp = dev->dpdk_mp; > + dev->dpdk_mp = mp; > + dev->mtu = mtu; > + dev->max_packet_len = MTU_TO_MAX_LEN(dev->mtu); > + > + err = dpdk_eth_dev_init(dev); > + if (err) { > + > + dpdk_mp_put(mp); > + dev->mtu = old_mtu; > + dev->dpdk_mp = old_mp; > + dev->max_packet_len = MTU_TO_MAX_LEN(dev->mtu); > + dpdk_eth_dev_init(dev); Would be nice if we don't need these constructs and DPDK would provide an all or nothing init method. > + goto out; > + } > + > + dpdk_mp_put(old_mp); > +out: > + ovs_mutex_unlock(&dev->mutex); > + ovs_mutex_unlock(&dpdk_mutex); > + return err; > +} > + > +static int > +netdev_dpdk_update_flags__(struct netdev_dpdk *dev, > + enum netdev_flags off, enum netdev_flags on, > + enum netdev_flags *old_flagsp) > + OVS_REQUIRES(dev->mutex) > +{ > + int err; > + > + if ((off | on) & ~(NETDEV_UP | NETDEV_PROMISC)) { > + return EINVAL; > + } > + > + *old_flagsp = dev->flags; > + dev->flags |= on; > + dev->flags &= ~off; > + > + if (dev->flags == *old_flagsp) { > + return 0; > + } > + > + rte_eth_dev_stop(dev->port_id); > + > + if (dev->flags & NETDEV_UP) { > + err = rte_eth_dev_start(dev->port_id); > + if (err) > + return err; > + } I'm not a DPDK expert but is it required to restart the device to change promisc settings or could we conditionally start and stop based on the previous flags state? > + > + if (dev->flags & NETDEV_PROMISC) { > + rte_eth_promiscuous_enable(dev->port_id); > + rte_eth_allmulticast_enable(dev->port_id); > + } > + > + return 0; > +} > > + > +static void > +netdev_dpdk_set_admin_state(struct unixctl_conn *conn, int argc, > + const char *argv[], void *aux OVS_UNUSED) > +{ > + bool up; > + > + if (!strcasecmp(argv[argc - 1], "up")) { > + up = true; > + } else if ( !strcasecmp(argv[argc - 1], "down")) { > + up = false; > + } else { > + unixctl_command_reply_error(conn, "Invalid Admin State"); > + return; > + } > + > + if (argc > 2) { > + struct netdev *netdev = netdev_from_name(argv[1]); For future refinement: Usability would be increased if either a strict one interface argument is enforced or multiple interface names could be passed in, e.g. set-admin-state dpdk0 dpdk1 up or set-admin-state dpdk0 up dpdk1 up As of now, dpdk1 is silently ignored which is not nice. > + if (netdev && is_dpdk_class(netdev->netdev_class)) { > + struct netdev_dpdk *dpdk_dev = netdev_dpdk_cast(netdev); > + > + ovs_mutex_lock(&dpdk_dev->mutex); > + netdev_dpdk_set_admin_state__(dpdk_dev, up); > + ovs_mutex_unlock(&dpdk_dev->mutex); > + > + netdev_close(netdev); > + } else { > + unixctl_command_reply_error(conn, "Unknown Dummy Interface"); I think this should read "Not a DPDK Interface" or something similar. > + netdev_close(netdev); > + return; > + } > + } else { > + struct netdev_dpdk *netdev; > + > + ovs_mutex_lock(&dpdk_mutex); > + LIST_FOR_EACH (netdev, list_node, &dpdk_list) { > + ovs_mutex_lock(&netdev->mutex); > + netdev_dpdk_set_admin_state__(netdev, up); > + ovs_mutex_unlock(&netdev->mutex); > + } > + ovs_mutex_unlock(&dpdk_mutex); > + } > + unixctl_command_reply(conn, "OK"); > +} > + > + > - retval = rx->netdev->netdev_class->rx_recv(rx, buffer); > - if (!retval) { > - COVERAGE_INC(netdev_received); Are you removing the netdev_receive counter on purpose here?