From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from dpdk.org (dpdk.org [92.243.14.124]) by dpdk.space (Postfix) with ESMTP id EF6B6A05D3 for ; Sun, 24 Mar 2019 10:37:05 +0100 (CET) Received: from [92.243.14.124] (localhost [127.0.0.1]) by dpdk.org (Postfix) with ESMTP id C83211B864; Sun, 24 Mar 2019 10:37:04 +0100 (CET) Received: from mga18.intel.com (mga18.intel.com [134.134.136.126]) by dpdk.org (Postfix) with ESMTP id 7CB861B857 for ; Sun, 24 Mar 2019 10:37:02 +0100 (CET) X-Amp-Result: UNSCANNABLE X-Amp-File-Uploaded: False Received: from orsmga004.jf.intel.com ([10.7.209.38]) by orsmga106.jf.intel.com with ESMTP/TLS/DHE-RSA-AES256-GCM-SHA384; 24 Mar 2019 02:37:01 -0700 X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="5.60,256,1549958400"; d="scan'208";a="285416879" Received: from yexl-server.sh.intel.com (HELO localhost) ([10.67.110.206]) by orsmga004.jf.intel.com with ESMTP; 24 Mar 2019 02:37:00 -0700 Date: Sun, 24 Mar 2019 17:32:47 +0800 From: Ye Xiaolong To: Maxime Coquelin Cc: dev@dpdk.org, Qi Zhang , Karlsson Magnus , Topel Bjorn Message-ID: <20190324093247.GC22909@intel.com> References: <20190301080947.91086-1-xiaolong.ye@intel.com> <20190322130129.109964-1-xiaolong.ye@intel.com> <20190322130129.109964-2-xiaolong.ye@intel.com> MIME-Version: 1.0 Content-Type: text/plain; charset="UTF-8" Content-Disposition: inline In-Reply-To: User-Agent: Mutt/1.9.4 (2018-02-28) Subject: Re: [dpdk-dev] [PATCH v4 1/5] net/af_xdp: introduce AF XDP PMD driver X-BeenThere: dev@dpdk.org X-Mailman-Version: 2.1.15 Precedence: list List-Id: DPDK patches and discussions List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: dev-bounces@dpdk.org Sender: "dev" Message-ID: <20190324093247.A90dM-x2w4YKvHy2tAHvYHs2dv5gx0H0DP0BAS5pGts@z> On 03/22, Maxime Coquelin wrote: > > >On 3/22/19 2:01 PM, Xiaolong Ye wrote: >> Add a new PMD driver for AF_XDP which is a proposed faster version of >> AF_PACKET interface in Linux. More info about AF_XDP, please refer to [1] >> [2]. >> >> This is the vanilla version PMD which just uses a raw buffer registered as >> the umem. >> >> [1] https://fosdem.org/2018/schedule/event/af_xdp/ >> [2] https://lwn.net/Articles/745934/ >> >> Signed-off-by: Xiaolong Ye >> --- >> MAINTAINERS | 6 + >> config/common_base | 5 + >> config/common_linux | 1 + >> doc/guides/nics/af_xdp.rst | 45 + >> doc/guides/nics/features/af_xdp.ini | 11 + >> doc/guides/nics/index.rst | 1 + >> doc/guides/rel_notes/release_19_05.rst | 7 + >> drivers/net/Makefile | 1 + >> drivers/net/af_xdp/Makefile | 32 + >> drivers/net/af_xdp/meson.build | 21 + >> drivers/net/af_xdp/rte_eth_af_xdp.c | 940 ++++++++++++++++++ >> drivers/net/af_xdp/rte_pmd_af_xdp_version.map | 3 + >> drivers/net/meson.build | 1 + >> mk/rte.app.mk | 1 + >> 14 files changed, 1075 insertions(+) >> create mode 100644 doc/guides/nics/af_xdp.rst >> create mode 100644 doc/guides/nics/features/af_xdp.ini >> create mode 100644 drivers/net/af_xdp/Makefile >> create mode 100644 drivers/net/af_xdp/meson.build >> create mode 100644 drivers/net/af_xdp/rte_eth_af_xdp.c >> create mode 100644 drivers/net/af_xdp/rte_pmd_af_xdp_version.map >> > >... > >> diff --git a/config/common_base b/config/common_base >> index 0b09a9348..4044de205 100644 >> --- a/config/common_base >> +++ b/config/common_base >> @@ -416,6 +416,11 @@ CONFIG_RTE_LIBRTE_VMXNET3_DEBUG_TX_FREE=n >> # >> CONFIG_RTE_LIBRTE_PMD_AF_PACKET=n >> +# >> +# Compile software PMD backed by AF_XDP sockets (Linux only) >> +# >> +CONFIG_RTE_LIBRTE_PMD_AF_XDP=n >> + >> # >> # Compile link bonding PMD library >> # >> diff --git a/config/common_linux b/config/common_linux >> index 75334273d..0b1249da0 100644 >> --- a/config/common_linux >> +++ b/config/common_linux >> @@ -19,6 +19,7 @@ CONFIG_RTE_LIBRTE_VHOST_POSTCOPY=n >> CONFIG_RTE_LIBRTE_PMD_VHOST=y >> CONFIG_RTE_LIBRTE_IFC_PMD=y >> CONFIG_RTE_LIBRTE_PMD_AF_PACKET=y >> +CONFIG_RTE_LIBRTE_PMD_AF_XDP=y > >It has to be disabled by default as it requires headers from kernels >more recent than minimum kernel version supported by DPDK. Ok, got it. > >> CONFIG_RTE_LIBRTE_PMD_SOFTNIC=y >> CONFIG_RTE_LIBRTE_PMD_TAP=y >> CONFIG_RTE_LIBRTE_AVP_PMD=y > >... > >> diff --git a/drivers/net/af_xdp/meson.build b/drivers/net/af_xdp/meson.build >> new file mode 100644 >> index 000000000..635e67483 >> --- /dev/null >> +++ b/drivers/net/af_xdp/meson.build >> @@ -0,0 +1,21 @@ >> +# SPDX-License-Identifier: BSD-3-Clause >> +# Copyright(c) 2018 Intel Corporation >> + >> +if host_machine.system() != 'linux' >> + build = false >> +endif >> + >> +bpf_dep = dependency('libbpf', required: false) >> +if bpf_dep.found() >> + build = true >> +else >> + bpf_dep = cc.find_library('libbpf', required: false) >> + if bpf_dep.found() and cc.has_header('xsk.h', dependencies: bpf_dep) >> + build = true >> + pkgconfig_extra_libs += '-lbpf' >> + else >> + build = false >> + endif >> +endif > >I think you need to add more checks, as above does not cover >linux/if_xdp.h header IIUC. will add more. > >> +sources = files('rte_eth_af_xdp.c') >> +ext_deps += bpf_dep >> diff --git a/drivers/net/af_xdp/rte_eth_af_xdp.c b/drivers/net/af_xdp/rte_eth_af_xdp.c >> new file mode 100644 >> index 000000000..9f0012347 >> --- /dev/null >> +++ b/drivers/net/af_xdp/rte_eth_af_xdp.c >> @@ -0,0 +1,940 @@ >> +/* SPDX-License-Identifier: BSD-3-Clause >> + * Copyright(c) 2019 Intel Corporation. >> + */ >> + >> +#include >> +#include >> +#include >> +#include >> +#include >> +#include >> +#include >> + >> +#include >> +#include >> +#include >> +#include >> +#include >> +#include >> +#include >> +#include >> +#include >> +#include >> +#include >> +#include >> +#include >> +#include >> + >> +#ifndef SOL_XDP >> +#define SOL_XDP 283 >> +#endif >> + >> +#ifndef AF_XDP >> +#define AF_XDP 44 >> +#endif >> + >> +#ifndef PF_XDP >> +#define PF_XDP AF_XDP >> +#endif >> + >> +static int af_xdp_logtype; >> + >> +#define AF_XDP_LOG(level, fmt, args...) \ >> + rte_log(RTE_LOG_ ## level, af_xdp_logtype, \ >> + "%s(): " fmt "\n", __func__, ##args) >> + >> +#define ETH_AF_XDP_IFACE_ARG "iface" >> +#define ETH_AF_XDP_QUEUE_IDX_ARG "queue" >> + >> +#define ETH_AF_XDP_FRAME_SIZE XSK_UMEM__DEFAULT_FRAME_SIZE >> +#define ETH_AF_XDP_NUM_BUFFERS 4096 >> +#define ETH_AF_XDP_DATA_HEADROOM 0 >> +#define ETH_AF_XDP_DFLT_NUM_DESCS XSK_RING_CONS__DEFAULT_NUM_DESCS >> +#define ETH_AF_XDP_DFLT_QUEUE_IDX 0 >> + >> +#define ETH_AF_XDP_RX_BATCH_SIZE 32 >> +#define ETH_AF_XDP_TX_BATCH_SIZE 32 >> + >> +#define ETH_AF_XDP_MAX_QUEUE_PAIRS 16 >> + >> +struct xsk_umem_info { >> + struct xsk_ring_prod fq; >> + struct xsk_ring_cons cq; >> + struct xsk_umem *umem; >> + struct rte_ring *buf_ring; >> + void *buffer; >> +}; >> + >> +struct rx_stats { >> + uint64_t rx_pkts; >> + uint64_t rx_bytes; >> + uint64_t rx_dropped; >> +}; >> + >> +struct pkt_rx_queue { >> + struct xsk_ring_cons rx; >> + struct xsk_umem_info *umem; >> + struct xsk_socket *xsk; >> + struct rte_mempool *mb_pool; >> + >> + struct rx_stats stats; >> + >> + struct pkt_tx_queue *pair; >> + uint16_t queue_idx; >> +}; >> + >> +struct tx_stats { >> + uint64_t tx_pkts; >> + uint64_t err_pkts; >> + uint64_t tx_bytes; >> +}; >> + >> +struct pkt_tx_queue { >> + struct xsk_ring_prod tx; >> + >> + struct tx_stats stats; >> + >> + struct pkt_rx_queue *pair; >> + uint16_t queue_idx; >> +}; >> + >> +struct pmd_internals { >> + int if_index; >> + char if_name[IFNAMSIZ]; >> + uint16_t queue_idx; >> + struct ether_addr eth_addr; >> + struct xsk_umem_info *umem; >> + struct rte_mempool *mb_pool_share; >> + >> + struct pkt_rx_queue rx_queues[ETH_AF_XDP_MAX_QUEUE_PAIRS]; >> + struct pkt_tx_queue tx_queues[ETH_AF_XDP_MAX_QUEUE_PAIRS]; >> +}; >> + >> +static const char * const valid_arguments[] = { >> + ETH_AF_XDP_IFACE_ARG, >> + ETH_AF_XDP_QUEUE_IDX_ARG, >> + NULL >> +}; >> + >> +static struct rte_eth_link pmd_link = { >> + .link_speed = ETH_SPEED_NUM_10G, >> + .link_duplex = ETH_LINK_FULL_DUPLEX, >> + .link_status = ETH_LINK_DOWN, >> + .link_autoneg = ETH_LINK_AUTONEG >> +}; >> + >> +static inline int >> +reserve_fill_queue(struct xsk_umem_info *umem, int reserve_size) >> +{ >> + struct xsk_ring_prod *fq = &umem->fq; >> + uint32_t idx; >> + void *addr = NULL; >> + int i, ret; >> + >> + ret = xsk_ring_prod__reserve(fq, reserve_size, &idx); >> + if (!ret) { > >You could use unlikely() here as it is called in the hot path. Got it. > >> + AF_XDP_LOG(ERR, "Failed to reserve enough fq descs.\n"); >> + return ret; >> + } >> + >> + for (i = 0; i < reserve_size; i++) { >> + __u64 *fq_addr; > >For consistency, you could either declare addr here, or fq_addr at the >top of the function. ok, will make the code consistent in next version. > >> + if (rte_ring_dequeue(umem->buf_ring, &addr)) { >> + i--; >> + break; >> + } >> + fq_addr = xsk_ring_prod__fill_addr(fq, idx++); >> + *fq_addr = (uint64_t)addr; >> + } >> + >> + xsk_ring_prod__submit(fq, i); >> + >> + return 0; >> +} >> + >> +static uint16_t >> +eth_af_xdp_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts) >> +{ >> + struct pkt_rx_queue *rxq = queue; >> + struct xsk_ring_cons *rx = &rxq->rx; >> + struct xsk_umem_info *umem = rxq->umem; >> + struct xsk_ring_prod *fq = &umem->fq; >> + uint32_t idx_rx; >> + uint32_t free_thresh = fq->size >> 1; >> + struct rte_mbuf *mbufs[ETH_AF_XDP_TX_BATCH_SIZE]; >> + unsigned long dropped = 0; >> + unsigned long rx_bytes = 0; >> + uint16_t count = 0; >> + int rcvd, i; >> + >> + nb_pkts = RTE_MIN(nb_pkts, ETH_AF_XDP_TX_BATCH_SIZE); >> + >> + rcvd = xsk_ring_cons__peek(rx, nb_pkts, &idx_rx); >> + if (rcvd == 0) >> + return 0; >> + >> + if (xsk_prod_nb_free(fq, free_thresh) >= free_thresh) >> + (void)reserve_fill_queue(umem, ETH_AF_XDP_RX_BATCH_SIZE); >> + >> + if (rte_pktmbuf_alloc_bulk(rxq->mb_pool, mbufs, rcvd) != 0) >unlikely()? Got it. >> + return 0; >> + >> + for (i = 0; i < rcvd; i++) { >> + const struct xdp_desc *desc; >> + uint64_t addr; >> + uint32_t len; >> + void *pkt; >> + >> + desc = xsk_ring_cons__rx_desc(rx, idx_rx++); >> + addr = desc->addr; >> + len = desc->len; >> + pkt = xsk_umem__get_data(rxq->umem->buffer, addr); >> + >> + rte_memcpy(rte_pktmbuf_mtod(mbufs[i], void *), pkt, len); >> + rte_pktmbuf_pkt_len(mbufs[i]) = len; >> + rte_pktmbuf_data_len(mbufs[i]) = len; >> + rx_bytes += len; >> + bufs[count++] = mbufs[i]; >> + >> + rte_ring_enqueue(umem->buf_ring, (void *)addr); >> + } >> + >> + xsk_ring_cons__release(rx, rcvd); >> + >> + /* statistics */ >> + rxq->stats.rx_pkts += (rcvd - dropped); >> + rxq->stats.rx_bytes += rx_bytes; >> + >> + return count; >> +} >> + > >... > >> + >> +/* This function gets called when the current port gets stopped. */ >> +static void >> +eth_dev_stop(struct rte_eth_dev *dev) >> +{ >> + dev->data->dev_link.link_status = ETH_LINK_DOWN; >> +} >> + >> +static int >> +eth_dev_configure(struct rte_eth_dev *dev __rte_unused) > >Remove __rte_unused. oops, my bad, will remove it. > >> +{ >> + /* rx/tx must be paired */ >> + if (dev->data->nb_rx_queues != dev->data->nb_tx_queues) >> + return -EINVAL; >> + >> + return 0; >> +} >> + > >... > >> + >> +static int >> +xsk_configure(struct pmd_internals *internals, struct pkt_rx_queue *rxq, >> + int ring_size) >> +{ >> + struct xsk_socket_config cfg; >> + struct pkt_tx_queue *txq = rxq->pair; >> + int ret = 0; >> + int reserve_size; >> + >> + rxq->umem = xdp_umem_configure(); >> + if (rxq->umem == NULL) { >> + ret = -ENOMEM; >> + goto err; >You can return directly here as umem == NULL. Got it. > >> + } >> + >> + cfg.rx_size = ring_size; >> + cfg.tx_size = ring_size; >> + cfg.libbpf_flags = 0; >> + cfg.xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; >> + cfg.bind_flags = 0; >> + ret = xsk_socket__create(&rxq->xsk, internals->if_name, >> + internals->queue_idx, rxq->umem->umem, &rxq->rx, >> + &txq->tx, &cfg); >> + if (ret) { >> + AF_XDP_LOG(ERR, "Failed to create xsk socket.\n"); >> + goto err; >> + } >> + >> + reserve_size = ETH_AF_XDP_DFLT_NUM_DESCS / 2; >> + ret = reserve_fill_queue(rxq->umem, reserve_size); >> + if (ret) { >> + AF_XDP_LOG(ERR, "Failed to reserve fill queue.\n"); >> + goto err; > >Shouldn't you call xsk_socket__delete(rxq->xsk) here? Good catch, xsk socket does need to be deleted here. > >> + } >> + >> + return 0; >> + >> +err: >> + xdp_umem_destroy(rxq->umem); >> + >> + return ret; >> +} >> + >> +static void >> +queue_reset(struct pmd_internals *internals, uint16_t queue_idx) >> +{ >> + struct pkt_rx_queue *rxq = &internals->rx_queues[queue_idx]; >> + struct pkt_tx_queue *txq = rxq->pair; >> + int xsk_fd = xsk_socket__fd(rxq->xsk); >> + >> + if (xsk_fd) { >> + close(xsk_fd); >> + if (internals->umem != NULL) { > >Moving this condition out would work and be cleaner. > >Anyway, it seems to never enter this condition as internal->umem is not >set yet when queue_reset() is called. You are right, internal->umem and xsk_fd shouldn't be handled here, will refine the code. > >> + xdp_umem_destroy(internals->umem); >> + internals->umem = NULL; >> + } >> + } >> + memset(rxq, 0, sizeof(*rxq)); >> + memset(txq, 0, sizeof(*txq)); >> + rxq->pair = txq; >> + txq->pair = rxq; >> + rxq->queue_idx = queue_idx; >> + txq->queue_idx = queue_idx; >> +} >> + >> +static int >> +eth_rx_queue_setup(struct rte_eth_dev *dev, >> + uint16_t rx_queue_id, >> + uint16_t nb_rx_desc, >> + unsigned int socket_id __rte_unused, >> + const struct rte_eth_rxconf *rx_conf __rte_unused, >> + struct rte_mempool *mb_pool) >> +{ >> + struct pmd_internals *internals = dev->data->dev_private; >> + uint32_t buf_size, data_size; >> + struct pkt_rx_queue *rxq; >> + int ret; >> + >> + rxq = &internals->rx_queues[rx_queue_id]; >> + queue_reset(internals, rx_queue_id); >> + >> + /* Now get the space available for data in the mbuf */ >> + buf_size = rte_pktmbuf_data_room_size(mb_pool) - >> + RTE_PKTMBUF_HEADROOM; >> + data_size = ETH_AF_XDP_FRAME_SIZE - ETH_AF_XDP_DATA_HEADROOM; >> + >> + if (data_size > buf_size) { >> + AF_XDP_LOG(ERR, "%s: %d bytes will not fit in mbuf (%d bytes)\n", >> + dev->device->name, data_size, buf_size); >> + ret = -ENOMEM; >> + goto err; >> + } >> + >> + rxq->mb_pool = mb_pool; >> + >> + if (xsk_configure(internals, rxq, nb_rx_desc)) { >> + AF_XDP_LOG(ERR, "Failed to configure xdp socket\n"); >> + ret = -EINVAL; >> + goto err; >> + } >> + >> + internals->umem = rxq->umem; > >If my previous comment is wrong, i.e. internals->umem may be already set >when queue_reset() is called, then it means you might have a leak here. > >> + >> + dev->data->rx_queues[rx_queue_id] = rxq; >> + return 0; >> + >> +err: >> + queue_reset(internals, rx_queue_id); >> + return ret; >> +} >> +