From: Qi Zhang <qi.z.zhang@intel.com>
To: dev@dpdk.org
Cc: magnus.karlsson@intel.com, bjorn.topel@intel.com,
Qi Zhang <qi.z.zhang@intel.com>
Subject: [dpdk-dev] [RFC v2 1/7] net/af_xdp: new PMD driver
Date: Thu, 8 Mar 2018 21:52:43 +0800 [thread overview]
Message-ID: <20180308135249.28187-2-qi.z.zhang@intel.com> (raw)
In-Reply-To: <20180308135249.28187-1-qi.z.zhang@intel.com>
This is the vanilla version.
Packet data is copied between the af_xdp memory buffer and the mbuf mempool.
Indexes of the memory buffers are simply managed by a FIFO ring.
Signed-off-by: Qi Zhang <qi.z.zhang@intel.com>
---
config/common_base | 5 +
config/common_linuxapp | 1 +
drivers/net/Makefile | 1 +
drivers/net/af_xdp/Makefile | 26 +
drivers/net/af_xdp/meson.build | 7 +
drivers/net/af_xdp/rte_eth_af_xdp.c | 760 ++++++++++++++++++++++++++
drivers/net/af_xdp/rte_pmd_af_xdp_version.map | 4 +
drivers/net/af_xdp/xdpsock_queue.h | 66 +++
mk/rte.app.mk | 1 +
9 files changed, 871 insertions(+)
create mode 100644 drivers/net/af_xdp/Makefile
create mode 100644 drivers/net/af_xdp/meson.build
create mode 100644 drivers/net/af_xdp/rte_eth_af_xdp.c
create mode 100644 drivers/net/af_xdp/rte_pmd_af_xdp_version.map
create mode 100644 drivers/net/af_xdp/xdpsock_queue.h
diff --git a/config/common_base b/config/common_base
index ad03cf433..84b7b3b7e 100644
--- a/config/common_base
+++ b/config/common_base
@@ -368,6 +368,11 @@ CONFIG_RTE_LIBRTE_VMXNET3_DEBUG_TX_FREE=n
CONFIG_RTE_LIBRTE_PMD_AF_PACKET=n
#
+# Compile software PMD backed by AF_XDP sockets (Linux only)
+#
+CONFIG_RTE_LIBRTE_PMD_AF_XDP=n
+
+#
# Compile link bonding PMD library
#
CONFIG_RTE_LIBRTE_PMD_BOND=y
diff --git a/config/common_linuxapp b/config/common_linuxapp
index ff98f2355..3b10695b6 100644
--- a/config/common_linuxapp
+++ b/config/common_linuxapp
@@ -16,6 +16,7 @@ CONFIG_RTE_LIBRTE_VHOST=y
CONFIG_RTE_LIBRTE_VHOST_NUMA=y
CONFIG_RTE_LIBRTE_PMD_VHOST=y
CONFIG_RTE_LIBRTE_PMD_AF_PACKET=y
+CONFIG_RTE_LIBRTE_PMD_AF_XDP=y
CONFIG_RTE_LIBRTE_PMD_TAP=y
CONFIG_RTE_LIBRTE_AVP_PMD=y
CONFIG_RTE_LIBRTE_VDEV_NETVSC_PMD=y
diff --git a/drivers/net/Makefile b/drivers/net/Makefile
index e1127326b..409234ac3 100644
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -9,6 +9,7 @@ ifeq ($(CONFIG_RTE_LIBRTE_THUNDERX_NICVF_PMD),d)
endif
DIRS-$(CONFIG_RTE_LIBRTE_PMD_AF_PACKET) += af_packet
+DIRS-$(CONFIG_RTE_LIBRTE_PMD_AF_XDP) += af_xdp
DIRS-$(CONFIG_RTE_LIBRTE_ARK_PMD) += ark
DIRS-$(CONFIG_RTE_LIBRTE_AVF_PMD) += avf
DIRS-$(CONFIG_RTE_LIBRTE_AVP_PMD) += avp
diff --git a/drivers/net/af_xdp/Makefile b/drivers/net/af_xdp/Makefile
new file mode 100644
index 000000000..990073655
--- /dev/null
+++ b/drivers/net/af_xdp/Makefile
@@ -0,0 +1,26 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2018 Intel Corporation
+
+include $(RTE_SDK)/mk/rte.vars.mk
+
+#
+# library name
+#
+LIB = librte_pmd_af_xdp.a
+
+EXPORT_MAP := rte_pmd_af_xdp_version.map
+
+LIBABIVER := 1
+
+CFLAGS += -O3 -I/opt/af_xdp/linux_headers/include
+CFLAGS += $(WERROR_FLAGS)
+LDLIBS += -lrte_eal -lrte_mbuf -lrte_mempool -lrte_ring
+LDLIBS += -lrte_ethdev -lrte_net -lrte_kvargs
+LDLIBS += -lrte_bus_vdev
+
+#
+# all source are stored in SRCS-y
+#
+SRCS-$(CONFIG_RTE_LIBRTE_PMD_AF_XDP) += rte_eth_af_xdp.c
+
+include $(RTE_SDK)/mk/rte.lib.mk
diff --git a/drivers/net/af_xdp/meson.build b/drivers/net/af_xdp/meson.build
new file mode 100644
index 000000000..4b6652685
--- /dev/null
+++ b/drivers/net/af_xdp/meson.build
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2018 Intel Corporation
+
+if host_machine.system() != 'linux'
+ build = false
+endif
+sources = files('rte_eth_af_xdp.c')
diff --git a/drivers/net/af_xdp/rte_eth_af_xdp.c b/drivers/net/af_xdp/rte_eth_af_xdp.c
new file mode 100644
index 000000000..5c7c53aeb
--- /dev/null
+++ b/drivers/net/af_xdp/rte_eth_af_xdp.c
@@ -0,0 +1,760 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation.
+ */
+
+#include <rte_mbuf.h>
+#include <rte_ethdev_driver.h>
+#include <rte_ethdev_vdev.h>
+#include <rte_malloc.h>
+#include <rte_kvargs.h>
+#include <rte_bus_vdev.h>
+
+#include <linux/if_ether.h>
+#include <linux/if_xdp.h>
+#include <arpa/inet.h>
+#include <net/if.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#include <poll.h>
+#include "xdpsock_queue.h"
+
+#ifndef SOL_XDP
+#define SOL_XDP 283
+#endif
+
+#ifndef AF_XDP
+#define AF_XDP 44
+#endif
+
+#ifndef PF_XDP
+#define PF_XDP AF_XDP
+#endif
+
+#define ETH_AF_XDP_IFACE_ARG "iface"
+#define ETH_AF_XDP_QUEUE_IDX_ARG "queue"
+#define ETH_AF_XDP_RING_SIZE_ARG "ringsz"
+
+#define ETH_AF_XDP_FRAME_SIZE 2048
+#define ETH_AF_XDP_NUM_BUFFERS 131072
+#define ETH_AF_XDP_DATA_HEADROOM 0
+#define ETH_AF_XDP_DFLT_RING_SIZE 1024
+#define ETH_AF_XDP_DFLT_QUEUE_IDX 0
+
+#define ETH_AF_XDP_RX_BATCH_SIZE 32
+#define ETH_AF_XDP_TX_BATCH_SIZE 32
+
+struct xdp_umem { /* packet frame area registered with the AF_XDP socket */
+ char *buffer; /* base address of the frame area */
+ size_t size; /* total size of the frame area in bytes */
+ unsigned int frame_size; /* size of one frame in bytes */
+ unsigned int frame_size_log2; /* log2(frame_size); used as a shift by get_pkt_data() */
+ unsigned int nframes; /* number of frames in the area */
+ int mr_fd; /* socket fd the area was registered on */
+};
+
+struct pmd_internals { /* per-port private data; also stored as the rx and tx queue handle */
+ int sfd; /* AF_XDP socket file descriptor */
+ int if_index; /* interface index obtained via SIOCGIFINDEX */
+ char if_name[IFNAMSIZ]; /* name of the backing network interface */
+ struct ether_addr eth_addr; /* hardware address obtained via SIOCGIFHWADDR */
+ struct xdp_queue rx; /* rx descriptor ring state */
+ struct xdp_queue tx; /* tx descriptor ring state */
+ struct xdp_umem *umem; /* registered frame area, set by xdp_configure() */
+ struct rte_mempool *mb_pool; /* pool rx mbufs are allocated from */
+
+ unsigned long rx_pkts; /* packets delivered by the rx burst */
+ unsigned long rx_bytes; /* bytes delivered by the rx burst */
+ unsigned long rx_dropped; /* rx frames dropped because no mbuf was available */
+
+ unsigned long tx_pkts; /* packets queued by the tx burst */
+ unsigned long err_pkts; /* tx packets discarded as too large for a frame */
+ unsigned long tx_bytes; /* bytes queued by the tx burst */
+
+ uint16_t port_id; /* ethdev port id */
+ uint16_t queue_idx; /* interface queue the socket is bound to */
+ int ring_size; /* number of descriptors in each of the rx and tx rings */
+ struct rte_ring *buf_ring; /* FIFO of free umem frame indexes */
+};
+
+static const char * const valid_arguments[] = { /* NULL-terminated key list passed to rte_kvargs_parse() */
+ ETH_AF_XDP_IFACE_ARG,
+ ETH_AF_XDP_QUEUE_IDX_ARG,
+ ETH_AF_XDP_RING_SIZE_ARG,
+ NULL
+};
+
+static struct rte_eth_link pmd_link = { /* initial link state copied into each port by init_internals() */
+ .link_speed = ETH_SPEED_NUM_10G,
+ .link_duplex = ETH_LINK_FULL_DUPLEX,
+ .link_status = ETH_LINK_DOWN, /* switched to ETH_LINK_UP by eth_dev_start() */
+ .link_autoneg = ETH_LINK_AUTONEG
+};
+
+/* Return the address of byte `offset` inside umem frame number `index`. */
+static void *get_pkt_data(struct pmd_internals *internals,
+			  uint32_t index,
+			  uint32_t offset)
+{
+	struct xdp_umem *umem = internals->umem;
+	char *frame = umem->buffer + (index << umem->frame_size_log2);
+
+	return (void *)(frame + offset);
+}
+
+/*
+ * Rx burst callback: copy received frames out of the umem into freshly
+ * allocated mbufs.
+ *
+ * queue   - the pmd_internals stored in rx_queues[] by eth_rx_queue_setup()
+ * bufs    - output array that receives the produced mbufs
+ * nb_pkts - capacity of bufs; clamped to ETH_AF_XDP_RX_BATCH_SIZE
+ *
+ * Returns the number of mbufs written to bufs. A frame for which no mbuf can
+ * be allocated is counted in rx_dropped; its umem frame index is still
+ * returned to buf_ring.
+ */
+static uint16_t
+eth_af_xdp_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
+{
+	struct pmd_internals *internals = queue;
+	struct xdp_queue *rxq = &internals->rx;
+	struct xdp_desc descs[ETH_AF_XDP_RX_BATCH_SIZE];
+	void *indexes[ETH_AF_XDP_RX_BATCH_SIZE];
+	struct rte_mbuf *mbuf;
+	unsigned long dropped = 0;
+	unsigned long rx_bytes = 0;
+	uint16_t count = 0;
+	int rcvd, i;
+
+	nb_pkts = nb_pkts < ETH_AF_XDP_RX_BATCH_SIZE ?
+		nb_pkts : ETH_AF_XDP_RX_BATCH_SIZE;
+
+	/* fill rx ring */
+	if (rxq->num_free >= ETH_AF_XDP_RX_BATCH_SIZE) {
+		int n = rte_ring_dequeue_bulk(internals->buf_ring,
+					      indexes,
+					      ETH_AF_XDP_RX_BATCH_SIZE,
+					      NULL);
+		for (i = 0; i < n; i++) {
+			descs[i].idx = (uint32_t)((long int)indexes[i]);
+			/*
+			 * xq_enq() copies these fields into the ring, so they
+			 * must not be left indeterminate.
+			 */
+			descs[i].len = 0;
+			descs[i].offset = 0;
+			descs[i].flags = 0;
+		}
+		xq_enq(rxq, descs, n);
+	}
+
+	/* read data */
+	rcvd = xq_deq(rxq, descs, nb_pkts);
+	if (rcvd == 0)
+		return 0;
+
+	for (i = 0; i < rcvd; i++) {
+		uint32_t idx = descs[i].idx;
+		char *pkt;
+
+		/* the frame goes back to buf_ring whether or not it is copied */
+		indexes[i] = (void *)((long int)idx);
+
+		mbuf = rte_pktmbuf_alloc(internals->mb_pool);
+		if (mbuf == NULL) {
+			/* must be checked before touching any mbuf field */
+			dropped++;
+			continue;
+		}
+
+		rte_pktmbuf_pkt_len(mbuf) = descs[i].len;
+		rte_pktmbuf_data_len(mbuf) = descs[i].len;
+		pkt = get_pkt_data(internals, idx, descs[i].offset);
+		memcpy(rte_pktmbuf_mtod(mbuf, void *), pkt, descs[i].len);
+		rx_bytes += descs[i].len;
+		bufs[count++] = mbuf;
+	}
+
+	rte_ring_enqueue_bulk(internals->buf_ring, indexes, rcvd, NULL);
+
+	internals->rx_pkts += (rcvd - dropped);
+	internals->rx_bytes += rx_bytes;
+	internals->rx_dropped += dropped;
+
+	return count;
+}
+
+/*
+ * Issue a zero-length, non-blocking sendto() on the AF_XDP socket `fd`.
+ *
+ * The call is retried while it fails with EAGAIN or EINTR. Success, ENOBUFS
+ * and every other error end the loop, so a permanently failing socket cannot
+ * make the tx path spin forever.
+ */
+static void kick_tx(int fd)
+{
+	int ret;
+
+	do {
+		ret = sendto(fd, NULL, 0, MSG_DONTWAIT, NULL, 0);
+	} while (ret < 0 && (errno == EAGAIN || errno == EINTR));
+}
+
+/*
+ * Tx burst callback: copy packets from mbufs into umem frames and post them
+ * on the tx ring.
+ *
+ * queue   - the pmd_internals stored in tx_queues[] by eth_tx_queue_setup()
+ * bufs    - packets to send
+ * nb_pkts - number of entries in bufs; clamped to ETH_AF_XDP_TX_BATCH_SIZE,
+ *           to the free tx descriptors and to the frames available in
+ *           buf_ring
+ *
+ * Returns the number of mbufs consumed (freed). Packets longer than one umem
+ * frame are consumed but not sent; they are counted in err_pkts.
+ */
+static uint16_t
+eth_af_xdp_tx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
+{
+	struct pmd_internals *internals = queue;
+	struct xdp_queue *txq = &internals->tx;
+	struct rte_mbuf *mbuf;
+	struct xdp_desc descs[ETH_AF_XDP_TX_BATCH_SIZE];
+	void *indexes[ETH_AF_XDP_TX_BATCH_SIZE];
+	const unsigned int buf_len =
+		internals->umem->frame_size - ETH_AF_XDP_DATA_HEADROOM;
+	uint16_t i, valid;
+	unsigned long tx_bytes = 0;
+
+	nb_pkts = nb_pkts < ETH_AF_XDP_TX_BATCH_SIZE ?
+		nb_pkts : ETH_AF_XDP_TX_BATCH_SIZE;
+
+	/* reclaim frames of completed descriptors when the ring runs low */
+	if (txq->num_free < ETH_AF_XDP_TX_BATCH_SIZE * 2) {
+		int n = xq_deq(txq, descs, ETH_AF_XDP_TX_BATCH_SIZE);
+
+		for (i = 0; i < n; i++)
+			indexes[i] = (void *)((long int)descs[i].idx);
+		rte_ring_enqueue_bulk(internals->buf_ring, indexes, n, NULL);
+	}
+
+	nb_pkts = nb_pkts > txq->num_free ? txq->num_free : nb_pkts;
+	nb_pkts = rte_ring_dequeue_bulk(internals->buf_ring, indexes,
+					nb_pkts, NULL);
+
+	valid = 0;
+	for (i = 0; i < nb_pkts; i++) {
+		mbuf = bufs[i];
+		if (mbuf->pkt_len <= buf_len) {
+			/*
+			 * Use slot `valid`, not `i`: once an oversized packet
+			 * has been skipped the two indexes differ.
+			 */
+			struct xdp_desc *desc = &descs[valid];
+			char *pkt;
+
+			desc->idx = (uint32_t)((long int)indexes[valid]);
+			desc->offset = ETH_AF_XDP_DATA_HEADROOM;
+			desc->flags = 0;
+			desc->len = mbuf->pkt_len;
+			pkt = get_pkt_data(internals, desc->idx, desc->offset);
+			/* NOTE(review): copies pkt_len bytes from the first
+			 * segment only - confirm multi-segment mbufs cannot
+			 * reach this path.
+			 */
+			memcpy(pkt, rte_pktmbuf_mtod(mbuf, void *), desc->len);
+			valid++;
+			tx_bytes += mbuf->pkt_len;
+		}
+		/* packet will be consumed anyway */
+		rte_pktmbuf_free(mbuf);
+	}
+
+	xq_enq(txq, descs, valid);
+	kick_tx(internals->sfd);
+
+	/* give back the frames reserved for packets that were not sent */
+	if (valid < nb_pkts)
+		rte_ring_enqueue_bulk(internals->buf_ring, &indexes[valid],
+				      nb_pkts - valid, NULL);
+
+	internals->err_pkts += (nb_pkts - valid);
+	internals->tx_pkts += valid;
+	internals->tx_bytes += tx_bytes;
+
+	return nb_pkts;
+}
+
+/*
+ * Post one umem frame on every free rx descriptor.
+ *
+ * Stops early when buf_ring has no more frame indexes, so a stale index is
+ * never posted twice.
+ */
+static void
+fill_rx_desc(struct pmd_internals *internals)
+{
+	int num_free = internals->rx.num_free;
+	void *p = NULL;
+	int i;
+
+	for (i = 0; i < num_free; i++) {
+		struct xdp_desc desc = {};
+
+		/* rte_ring_dequeue() returns non-zero when the ring is empty */
+		if (rte_ring_dequeue(internals->buf_ring, &p) != 0)
+			break;
+		desc.idx = (uint32_t)((long int)p);
+		xq_enq(&internals->rx, &desc, 1);
+	}
+}
+
+/* Start the port: mark the link up and post buffers on the free rx descriptors. */
+static int
+eth_dev_start(struct rte_eth_dev *dev)
+{
+	struct rte_eth_dev_data *data = dev->data;
+
+	data->dev_link.link_status = ETH_LINK_UP;
+	fill_rx_desc(data->dev_private);
+
+	return 0;
+}
+
+/* Stop callback: the only effect is to report the link as down. */
+static void
+eth_dev_stop(struct rte_eth_dev *dev)
+{
+	struct rte_eth_dev_data *data = dev->data;
+
+	data->dev_link.link_status = ETH_LINK_DOWN;
+}
+
+static int
+eth_dev_configure(struct rte_eth_dev *dev __rte_unused)
+{
+ return 0; /* no-op: always reports success */
+}
+
+/* Report the port's fixed capabilities: one rx queue, one tx queue, one MAC. */
+static void
+eth_dev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
+{
+	const struct pmd_internals *priv = dev->data->dev_private;
+
+	dev_info->if_index = priv->if_index;
+
+	dev_info->max_rx_queues = 1;
+	dev_info->max_tx_queues = 1;
+	dev_info->max_mac_addrs = 1;
+
+	dev_info->min_rx_bufsize = 0;
+	dev_info->max_rx_pktlen = (uint32_t)ETH_FRAME_LEN;
+}
+
+/*
+ * Copy the software counters into `stats`. The port has a single queue, so
+ * the port totals and the queue-0 entries carry the same values.
+ * Always returns 0.
+ */
+static int
+eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
+{
+	const struct pmd_internals *priv = dev->data->dev_private;
+
+	/* receive side */
+	stats->q_ipackets[0] = priv->rx_pkts;
+	stats->ipackets = stats->q_ipackets[0];
+	stats->q_ibytes[0] = priv->rx_bytes;
+	stats->ibytes = stats->q_ibytes[0];
+	stats->imissed = priv->rx_dropped;
+
+	/* transmit side */
+	stats->q_opackets[0] = priv->tx_pkts;
+	stats->opackets = stats->q_opackets[0];
+	stats->q_errors[0] = priv->err_pkts;
+	stats->oerrors = stats->q_errors[0];
+	stats->q_obytes[0] = priv->tx_bytes;
+	stats->obytes = stats->q_obytes[0];
+
+	return 0;
+}
+
+/* Zero every software rx/tx counter kept in the port's private data. */
+static void
+eth_stats_reset(struct rte_eth_dev *dev)
+{
+	struct pmd_internals *priv = dev->data->dev_private;
+
+	priv->rx_pkts = priv->rx_bytes = priv->rx_dropped = 0;
+	priv->tx_pkts = priv->tx_bytes = priv->err_pkts = 0;
+}
+
+static void
+eth_dev_close(struct rte_eth_dev *dev __rte_unused)
+{ /* no-op: nothing is released here */
+}
+
+static void
+eth_queue_release(void *q __rte_unused)
+{ /* no-op: used for both rx_queue_release and tx_queue_release */
+}
+
+static int
+eth_link_update(struct rte_eth_dev *dev __rte_unused,
+ int wait_to_complete __rte_unused)
+{
+ return 0; /* no-op: link status is only changed by eth_dev_start()/eth_dev_stop() */
+}
+
+/*
+ * Allocate a page-aligned area of `nbuffers` frames of ETH_AF_XDP_FRAME_SIZE
+ * bytes and register it on socket `sfd` with the XDP_MEM_REG socket option.
+ *
+ * Returns a heap-allocated descriptor of the area, or NULL when an
+ * allocation or the registration fails (nothing is leaked in that case).
+ * Both the descriptor and its `buffer` come from the C allocator and must be
+ * released with free().
+ */
+static struct xdp_umem *xsk_alloc_and_mem_reg_buffers(int sfd, size_t nbuffers)
+{
+	struct xdp_mr_req req = { .frame_size = ETH_AF_XDP_FRAME_SIZE,
+				  .data_headroom = ETH_AF_XDP_DATA_HEADROOM };
+	struct xdp_umem *umem;
+	void *bufs;
+	int ret;
+
+	ret = posix_memalign(&bufs, getpagesize(),
+			     nbuffers * req.frame_size);
+	if (ret)
+		return NULL;
+
+	umem = calloc(1, sizeof(*umem));
+	if (!umem) {
+		free(bufs);
+		return NULL;
+	}
+
+	req.addr = (unsigned long)bufs;
+	req.len = nbuffers * req.frame_size;
+	ret = setsockopt(sfd, SOL_XDP, XDP_MEM_REG, &req, sizeof(req));
+	if (ret != 0) {
+		/* RTE_ASSERT is compiled out in release builds; fail for real */
+		RTE_LOG(ERR, PMD, "Failed to register umem memory\n");
+		free(umem);
+		free(bufs);
+		return NULL;
+	}
+
+	umem->frame_size = ETH_AF_XDP_FRAME_SIZE;
+	umem->frame_size_log2 = 11; /* log2(ETH_AF_XDP_FRAME_SIZE) */
+	umem->buffer = bufs;
+	umem->size = nbuffers * req.frame_size;
+	umem->nframes = nbuffers;
+	umem->mr_fd = sfd;
+
+	return umem;
+}
+
+/*
+ * Set up everything the AF_XDP socket needs for I/O: the ring of free frame
+ * indexes, the registered umem, the rx/tx descriptor rings (created with
+ * setsockopt and mapped with mmap) and the bind to interface/queue.
+ *
+ * Returns 0 on success and -1 on failure. On failure everything acquired
+ * here is released again and the matching fields are reset to NULL.
+ */
+static int
+xdp_configure(struct pmd_internals *internals)
+{
+	struct sockaddr_xdp sxdp;
+	struct xdp_ring_req req;
+	char ring_name[0x100];
+	size_t ring_bytes;
+	long int i;
+
+	/* ring_mask below is desc_nr - 1, which needs a power of two */
+	if (internals->ring_size <= 0 ||
+	    (internals->ring_size & (internals->ring_size - 1)) != 0) {
+		RTE_LOG(ERR, PMD, "ring size %d is not a power of two\n",
+			internals->ring_size);
+		return -1;
+	}
+
+	snprintf(ring_name, sizeof(ring_name), "%s_%s_%d", "af_xdp_ring",
+		 internals->if_name, internals->queue_idx);
+	internals->buf_ring = rte_ring_create(ring_name,
+					      ETH_AF_XDP_NUM_BUFFERS,
+					      SOCKET_ID_ANY,
+					      0x0);
+	if (!internals->buf_ring)
+		return -1;
+
+	/*
+	 * NOTE(review): a ring created without RING_F_EXACT_SZ is documented
+	 * to hold count - 1 entries, so the last enqueue here presumably
+	 * fails and one frame stays unused - confirm against rte_ring docs.
+	 */
+	for (i = 0; i < ETH_AF_XDP_NUM_BUFFERS; i++)
+		rte_ring_enqueue(internals->buf_ring, (void *)i);
+
+	internals->umem = xsk_alloc_and_mem_reg_buffers(internals->sfd,
+						ETH_AF_XDP_NUM_BUFFERS);
+	if (!internals->umem)
+		goto err_ring;
+
+	memset(&req, 0, sizeof(req));
+	req.mr_fd = internals->umem->mr_fd;
+	req.desc_nr = internals->ring_size;
+	ring_bytes = req.desc_nr * sizeof(struct xdp_desc);
+
+	if (setsockopt(internals->sfd, SOL_XDP, XDP_RX_RING,
+		       &req, sizeof(req)) != 0)
+		goto err_umem;
+
+	if (setsockopt(internals->sfd, SOL_XDP, XDP_TX_RING,
+		       &req, sizeof(req)) != 0)
+		goto err_umem;
+
+	internals->rx.ring = mmap(0, ring_bytes,
+				  PROT_READ | PROT_WRITE,
+				  MAP_SHARED | MAP_LOCKED | MAP_POPULATE,
+				  internals->sfd,
+				  XDP_PGOFF_RX_RING);
+	if (internals->rx.ring == MAP_FAILED) {
+		internals->rx.ring = NULL;
+		goto err_umem;
+	}
+
+	internals->rx.num_free = req.desc_nr;
+	internals->rx.ring_mask = req.desc_nr - 1;
+
+	internals->tx.ring = mmap(0, ring_bytes,
+				  PROT_READ | PROT_WRITE,
+				  MAP_SHARED | MAP_LOCKED | MAP_POPULATE,
+				  internals->sfd,
+				  XDP_PGOFF_TX_RING);
+	if (internals->tx.ring == MAP_FAILED) {
+		internals->tx.ring = NULL;
+		goto err_unmap_rx;
+	}
+
+	internals->tx.num_free = req.desc_nr;
+	internals->tx.ring_mask = req.desc_nr - 1;
+
+	/* do not hand uninitialised stack bytes to the kernel */
+	memset(&sxdp, 0, sizeof(sxdp));
+	sxdp.sxdp_family = PF_XDP;
+	sxdp.sxdp_ifindex = internals->if_index;
+	sxdp.sxdp_queue_id = internals->queue_idx;
+
+	if (bind(internals->sfd, (struct sockaddr *)&sxdp, sizeof(sxdp)) != 0)
+		goto err_unmap_tx;
+
+	return 0;
+
+err_unmap_tx:
+	munmap(internals->tx.ring, ring_bytes);
+	internals->tx.ring = NULL;
+err_unmap_rx:
+	munmap(internals->rx.ring, ring_bytes);
+	internals->rx.ring = NULL;
+err_umem:
+	free(internals->umem->buffer);
+	free(internals->umem);
+	internals->umem = NULL;
+err_ring:
+	rte_ring_free(internals->buf_ring);
+	internals->buf_ring = NULL;
+	return -1;
+}
+
+/*
+ * Rx queue setup callback for the single rx queue (id 0).
+ *
+ * Remembers `mb_pool` as the pool rx mbufs come from, checks that one umem
+ * frame fits in the data room of an mbuf from that pool, then configures
+ * the AF_XDP socket through xdp_configure().
+ *
+ * Returns 0 on success, -ENOMEM when a frame does not fit in an mbuf, and
+ * -EINVAL when xdp_configure() fails.
+ */
+static int
+eth_rx_queue_setup(struct rte_eth_dev *dev,
+		   uint16_t rx_queue_id,
+		   uint16_t nb_rx_desc __rte_unused,
+		   unsigned int socket_id __rte_unused,
+		   const struct rte_eth_rxconf *rx_conf __rte_unused,
+		   struct rte_mempool *mb_pool)
+{
+	struct pmd_internals *internals = dev->data->dev_private;
+	unsigned int buf_size, data_size;
+
+	RTE_ASSERT(rx_queue_id == 0);
+	internals->mb_pool = mb_pool;
+
+	/*
+	 * Check the mbuf data room first so that nothing has been allocated
+	 * yet when the pool is unsuitable. The umem frame size is always
+	 * ETH_AF_XDP_FRAME_SIZE.
+	 */
+	buf_size = rte_pktmbuf_data_room_size(internals->mb_pool) -
+		RTE_PKTMBUF_HEADROOM;
+	data_size = ETH_AF_XDP_FRAME_SIZE;
+
+	if (data_size > buf_size) {
+		RTE_LOG(ERR, PMD,
+			"%s: %d bytes will not fit in mbuf (%d bytes)\n",
+			dev->device->name, data_size, buf_size);
+		return -ENOMEM;
+	}
+
+	/* on failure internals->umem is not usable, so stop here */
+	if (xdp_configure(internals) != 0)
+		return -EINVAL;
+
+	dev->data->rx_queues[rx_queue_id] = internals;
+	return 0;
+}
+
+/*
+ * Tx queue setup callback for the single tx queue (id 0): the port's private
+ * data doubles as the queue handle. Always returns 0.
+ */
+static int
+eth_tx_queue_setup(struct rte_eth_dev *dev,
+		   uint16_t tx_queue_id,
+		   uint16_t nb_tx_desc __rte_unused,
+		   unsigned int socket_id __rte_unused,
+		   const struct rte_eth_txconf *tx_conf __rte_unused)
+{
+	struct rte_eth_dev_data *data = dev->data;
+
+	RTE_ASSERT(tx_queue_id == 0);
+	data->tx_queues[tx_queue_id] = data->dev_private;
+
+	return 0;
+}
+
+/*
+ * Set the MTU of the backing interface with the SIOCSIFMTU ioctl, issued on
+ * a temporary datagram socket.
+ *
+ * Returns 0 on success, -EINVAL if the socket cannot be opened or the ioctl
+ * fails.
+ */
+static int
+eth_dev_mtu_set(struct rte_eth_dev *dev, uint16_t mtu)
+{
+	struct pmd_internals *priv = dev->data->dev_private;
+	struct ifreq ifr = { .ifr_mtu = mtu };
+	int rc;
+	int fd = socket(PF_INET, SOCK_DGRAM, 0);
+
+	if (fd < 0)
+		return -EINVAL;
+
+	snprintf(ifr.ifr_name, IFNAMSIZ, "%s", priv->if_name);
+	rc = ioctl(fd, SIOCSIFMTU, &ifr);
+	close(fd);
+
+	return rc < 0 ? -EINVAL : 0;
+}
+
+/*
+ * Read the flags of interface `if_name`, keep only the bits set in `mask`,
+ * OR in `flags`, and write the result back. Failures are silently ignored.
+ */
+static void
+eth_dev_change_flags(char *if_name, uint32_t flags, uint32_t mask)
+{
+	struct ifreq ifr;
+	int fd = socket(PF_INET, SOCK_DGRAM, 0);
+
+	if (fd < 0)
+		return;
+
+	snprintf(ifr.ifr_name, IFNAMSIZ, "%s", if_name);
+	if (ioctl(fd, SIOCGIFFLAGS, &ifr) >= 0) {
+		ifr.ifr_flags &= mask;
+		ifr.ifr_flags |= flags;
+		(void)ioctl(fd, SIOCSIFFLAGS, &ifr);
+	}
+
+	close(fd);
+}
+
+/* Set IFF_PROMISC on the backing interface, leaving all other flags alone. */
+static void
+eth_dev_promiscuous_enable(struct rte_eth_dev *dev)
+{
+	struct pmd_internals *priv = dev->data->dev_private;
+	char *ifname = priv->if_name;
+
+	eth_dev_change_flags(ifname, IFF_PROMISC, ~0);
+}
+
+/* Clear IFF_PROMISC on the backing interface, leaving all other flags alone. */
+static void
+eth_dev_promiscuous_disable(struct rte_eth_dev *dev)
+{
+	struct pmd_internals *priv = dev->data->dev_private;
+	char *ifname = priv->if_name;
+
+	eth_dev_change_flags(ifname, 0, ~IFF_PROMISC);
+}
+
+static const struct eth_dev_ops ops = { /* ethdev callbacks installed on the port by init_internals() */
+ .dev_start = eth_dev_start,
+ .dev_stop = eth_dev_stop,
+ .dev_close = eth_dev_close,
+ .dev_configure = eth_dev_configure,
+ .dev_infos_get = eth_dev_info,
+ .mtu_set = eth_dev_mtu_set,
+ .promiscuous_enable = eth_dev_promiscuous_enable,
+ .promiscuous_disable = eth_dev_promiscuous_disable,
+ .rx_queue_setup = eth_rx_queue_setup,
+ .tx_queue_setup = eth_tx_queue_setup,
+ .rx_queue_release = eth_queue_release, /* same no-op for rx and tx */
+ .tx_queue_release = eth_queue_release,
+ .link_update = eth_link_update,
+ .stats_get = eth_stats_get,
+ .stats_reset = eth_stats_reset,
+};
+
+static struct rte_vdev_driver pmd_af_xdp_drv;
+
+/*
+ * Walk the parsed key/value list and store each recognised value through the
+ * matching output pointer. Outputs whose key is absent are left untouched.
+ * *if_name points into kvlist's storage. Numbers are converted with atoi().
+ */
+static void
+parse_parameters(struct rte_kvargs *kvlist,
+		 char **if_name,
+		 int *queue_idx,
+		 int *ring_size)
+{
+	unsigned int n;
+
+	for (n = 0; n < kvlist->count; n++) {
+		struct rte_kvargs_pair *kv = &kvlist->pairs[n];
+
+		if (strstr(kv->key, ETH_AF_XDP_IFACE_ARG) != NULL) {
+			*if_name = kv->value;
+			continue;
+		}
+		if (strstr(kv->key, ETH_AF_XDP_QUEUE_IDX_ARG) != NULL) {
+			*queue_idx = atoi(kv->value);
+			continue;
+		}
+		if (strstr(kv->key, ETH_AF_XDP_RING_SIZE_ARG) != NULL)
+			*ring_size = atoi(kv->value);
+	}
+}
+
+/*
+ * Look up the interface index and hardware address of `if_name`.
+ *
+ * if_name  - interface name; must be non-NULL and shorter than IFNAMSIZ
+ * eth_addr - receives the ETH_ALEN-byte hardware address
+ * if_index - receives the interface index
+ *
+ * Returns 0 on success, -1 if the name is invalid, the socket cannot be
+ * opened, or either ioctl fails.
+ */
+static int
+get_iface_info(const char *if_name,
+	       struct ether_addr *eth_addr,
+	       int *if_index)
+{
+	struct ifreq ifr;
+	int sock;
+
+	/* ifr_name holds IFNAMSIZ bytes including the terminator */
+	if (if_name == NULL || strlen(if_name) >= IFNAMSIZ)
+		return -1;
+
+	sock = socket(AF_INET, SOCK_DGRAM, IPPROTO_IP);
+	if (sock < 0)
+		return -1;
+
+	memset(&ifr, 0, sizeof(ifr));
+	snprintf(ifr.ifr_name, IFNAMSIZ, "%s", if_name);
+	if (ioctl(sock, SIOCGIFINDEX, &ifr))
+		goto error;
+	*if_index = ifr.ifr_ifindex;
+
+	if (ioctl(sock, SIOCGIFHWADDR, &ifr))
+		goto error;
+
+	memcpy(eth_addr, ifr.ifr_hwaddr.sa_data, ETH_ALEN);
+
+	close(sock);
+	return 0;
+
+error:
+	close(sock);
+	return -1;
+}
+
+/*
+ * Create the ethdev port for vdev `dev`.
+ *
+ * Allocates the port's private data and a private rte_eth_dev_data copy,
+ * opens the AF_XDP socket, reads the interface index and MAC address of
+ * `if_name`, allocates the ethdev and installs the ops and burst callbacks.
+ *
+ * if_name   - backing interface; must be non-NULL and shorter than IFNAMSIZ
+ * queue_idx - interface queue the socket will later be bound to
+ * ring_size - descriptor count for the rx and tx rings
+ *
+ * Returns 0 on success, -1 on failure (everything acquired so far is
+ * released).
+ */
+static int
+init_internals(struct rte_vdev_device *dev,
+	       const char *if_name,
+	       int queue_idx,
+	       int ring_size)
+{
+	const char *name = rte_vdev_device_name(dev);
+	struct rte_eth_dev *eth_dev = NULL;
+	struct rte_eth_dev_data *data = NULL;
+	const unsigned int numa_node = dev->device.numa_node;
+	struct pmd_internals *internals = NULL;
+	int ret;
+
+	/* if_name is copied into a fixed IFNAMSIZ buffer below */
+	if (if_name == NULL || strlen(if_name) >= IFNAMSIZ) {
+		RTE_LOG(ERR, PMD, "%s: missing or too long interface name\n",
+			name);
+		return -1;
+	}
+
+	/* sized for rte_eth_dev_data: the whole struct is copied into it */
+	data = rte_zmalloc_socket(name, sizeof(*data), 0, numa_node);
+	if (!data)
+		return -1;
+
+	internals = rte_zmalloc_socket(name, sizeof(*internals), 0, numa_node);
+	if (!internals)
+		goto error_1;
+
+	internals->queue_idx = queue_idx;
+	internals->ring_size = ring_size;
+	snprintf(internals->if_name, sizeof(internals->if_name), "%s",
+		 if_name);
+	internals->sfd = socket(PF_XDP, SOCK_RAW, 0);
+	if (internals->sfd < 0)
+		goto error_2;
+
+	ret = get_iface_info(if_name, &internals->eth_addr,
+			     &internals->if_index);
+	if (ret)
+		goto error_3;
+
+	eth_dev = rte_eth_vdev_allocate(dev, 0);
+	if (!eth_dev)
+		goto error_3;
+
+	/* replace the ethdev's data with our own copy */
+	rte_memcpy(data, eth_dev->data, sizeof(*data));
+	internals->port_id = eth_dev->data->port_id;
+	data->dev_private = internals;
+	data->nb_rx_queues = 1;
+	data->nb_tx_queues = 1;
+	data->dev_link = pmd_link;
+	data->mac_addrs = &internals->eth_addr;
+
+	eth_dev->data = data;
+	eth_dev->dev_ops = &ops;
+
+	eth_dev->rx_pkt_burst = eth_af_xdp_rx;
+	eth_dev->tx_pkt_burst = eth_af_xdp_tx;
+
+	return 0;
+
+error_3:
+	close(internals->sfd);
+
+error_2:
+	rte_free(internals);
+
+error_1:
+	rte_free(data);
+	return -1;
+}
+
+/*
+ * vdev probe callback: parse the device arguments and create the port.
+ *
+ * The "iface" argument is mandatory. "queue" and "ringsz" fall back to
+ * ETH_AF_XDP_DFLT_QUEUE_IDX and ETH_AF_XDP_DFLT_RING_SIZE.
+ *
+ * Returns 0 on success, -1 on invalid arguments or when port creation fails.
+ */
+static int
+rte_pmd_af_xdp_probe(struct rte_vdev_device *dev)
+{
+	struct rte_kvargs *kvlist;
+	char *if_name = NULL;
+	int ring_size = ETH_AF_XDP_DFLT_RING_SIZE;
+	int queue_idx = ETH_AF_XDP_DFLT_QUEUE_IDX;
+	int ret;
+
+	RTE_LOG(INFO, PMD, "Initializing pmd_af_xdp for %s\n",
+		rte_vdev_device_name(dev));
+
+	kvlist = rte_kvargs_parse(rte_vdev_device_args(dev), valid_arguments);
+	if (!kvlist) {
+		RTE_LOG(ERR, PMD,
+			"Invalid kvargs\n");
+		return -1;
+	}
+
+	if (dev->device.numa_node == SOCKET_ID_ANY)
+		dev->device.numa_node = rte_socket_id();
+
+	parse_parameters(kvlist, &if_name, &queue_idx, &ring_size);
+
+	/* without an interface there is nothing to attach the socket to */
+	if (if_name == NULL) {
+		RTE_LOG(ERR, PMD, "Missing mandatory argument '%s'\n",
+			ETH_AF_XDP_IFACE_ARG);
+		rte_kvargs_free(kvlist);
+		return -1;
+	}
+
+	/* if_name points into kvlist, so free the list only afterwards */
+	ret = init_internals(dev, if_name, queue_idx, ring_size);
+	rte_kvargs_free(kvlist);
+
+	return ret;
+}
+
+/*
+ * vdev remove callback: release everything owned by the port and free the
+ * ethdev slot.
+ *
+ * Returns 0 on success, -1 when `dev` is NULL or no ethdev is allocated
+ * under its name.
+ */
+static int
+rte_pmd_af_xdp_remove(struct rte_vdev_device *dev)
+{
+	struct rte_eth_dev *eth_dev = NULL;
+	struct pmd_internals *internals;
+	size_t ring_bytes;
+
+	RTE_LOG(INFO, PMD, "Closing AF_XDP ethdev on numa socket %u\n",
+		rte_socket_id());
+
+	if (!dev)
+		return -1;
+
+	/* find the ethdev entry */
+	eth_dev = rte_eth_dev_allocated(rte_vdev_device_name(dev));
+	if (!eth_dev)
+		return -1;
+
+	internals = eth_dev->data->dev_private;
+
+	/* unmap the descriptor rings if xdp_configure() mapped them */
+	ring_bytes = (size_t)internals->ring_size * sizeof(struct xdp_desc);
+	if (internals->rx.ring != NULL &&
+	    (void *)internals->rx.ring != MAP_FAILED)
+		munmap(internals->rx.ring, ring_bytes);
+	if (internals->tx.ring != NULL &&
+	    (void *)internals->tx.ring != MAP_FAILED)
+		munmap(internals->tx.ring, ring_bytes);
+
+	/* everything that reads `internals` must run before it is freed */
+	close(internals->sfd);
+	rte_ring_free(internals->buf_ring);
+
+	/* the umem and its frame area come from calloc()/posix_memalign() */
+	if (internals->umem != NULL) {
+		free(internals->umem->buffer);
+		free(internals->umem);
+	}
+
+	rte_free(internals);
+	rte_free(eth_dev->data);
+
+	rte_eth_dev_release_port(eth_dev);
+
+	return 0;
+}
+
+static struct rte_vdev_driver pmd_af_xdp_drv = { /* vdev driver hooks, registered as net_af_xdp below */
+ .probe = rte_pmd_af_xdp_probe,
+ .remove = rte_pmd_af_xdp_remove,
+};
+
+RTE_PMD_REGISTER_VDEV(net_af_xdp, pmd_af_xdp_drv);
+RTE_PMD_REGISTER_ALIAS(net_af_xdp, eth_af_xdp);
+RTE_PMD_REGISTER_PARAM_STRING(net_af_xdp,
+ "iface=<string> "
+ "queue=<int> "
+ "ringsz=<int> ");
diff --git a/drivers/net/af_xdp/rte_pmd_af_xdp_version.map b/drivers/net/af_xdp/rte_pmd_af_xdp_version.map
new file mode 100644
index 000000000..ef3539840
--- /dev/null
+++ b/drivers/net/af_xdp/rte_pmd_af_xdp_version.map
@@ -0,0 +1,4 @@
+DPDK_2.0 {
+
+ local: *;
+};
diff --git a/drivers/net/af_xdp/xdpsock_queue.h b/drivers/net/af_xdp/xdpsock_queue.h
new file mode 100644
index 000000000..c5d0cb56a
--- /dev/null
+++ b/drivers/net/af_xdp/xdpsock_queue.h
@@ -0,0 +1,66 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation.
+ */
+
+#ifndef __XDPSOCK_QUEUE_H
+#define __XDPSOCK_QUEUE_H
+
+static inline int xq_enq(struct xdp_queue *q, /* post ndescs descriptors on ring q; returns 0, or -ENOSPC when they do not all fit */
+ const struct xdp_desc *descs,
+ unsigned int ndescs)
+{
+ unsigned int avail_idx = q->avail_idx;
+ unsigned int i;
+ int j;
+
+ if (q->num_free < ndescs)
+ return -ENOSPC; /* all-or-nothing: the ring is untouched when there is not enough room */
+
+ q->num_free -= ndescs;
+
+ for (i = 0; i < ndescs; i++) { /* first pass: write every field except flags */
+ unsigned int idx = avail_idx++ & q->ring_mask;
+
+ q->ring[idx].idx = descs[i].idx;
+ q->ring[idx].len = descs[i].len;
+ q->ring[idx].offset = descs[i].offset;
+ q->ring[idx].error = 0;
+ }
+ rte_smp_wmb(); /* order the descriptor contents before the flag writes below */
+
+ for (j = ndescs - 1; j >= 0; j--) { /* second pass, last slot first: set XDP_DESC_KERNEL */
+ unsigned int idx = (q->avail_idx + j) & q->ring_mask;
+
+ q->ring[idx].flags = descs[j].flags | XDP_DESC_KERNEL;
+ }
+ q->avail_idx += ndescs;
+
+ return 0;
+}
+
+static inline int xq_deq(struct xdp_queue *q, /* copy up to ndescs descriptors without XDP_DESC_KERNEL from ring q into descs; returns the count */
+ struct xdp_desc *descs,
+ int ndescs)
+{
+ unsigned int idx, last_used_idx = q->last_used_idx;
+ int i, entries = 0;
+
+ for (i = 0; i < ndescs; i++) { /* first pass: count consecutive slots whose flag is clear */
+ idx = (last_used_idx++) & q->ring_mask;
+ if (q->ring[idx].flags & XDP_DESC_KERNEL)
+ break; /* stop at the first slot still flagged XDP_DESC_KERNEL */
+ entries++;
+ }
+ q->num_free += entries;
+
+ rte_smp_rmb(); /* order the flag reads above before the descriptor reads below */
+
+ for (i = 0; i < entries; i++) { /* second pass: copy the slots out and advance last_used_idx */
+ idx = q->last_used_idx++ & q->ring_mask;
+ descs[i] = q->ring[idx];
+ }
+
+ return entries;
+}
+
+#endif /* __XDPSOCK_QUEUE_H */
diff --git a/mk/rte.app.mk b/mk/rte.app.mk
index 3eb41d176..bc26e1457 100644
--- a/mk/rte.app.mk
+++ b/mk/rte.app.mk
@@ -120,6 +120,7 @@ ifeq ($(CONFIG_RTE_BUILD_SHARED_LIB),n)
_LDLIBS-$(CONFIG_RTE_DRIVER_MEMPOOL_STACK) += -lrte_mempool_stack
_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_AF_PACKET) += -lrte_pmd_af_packet
+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_AF_XDP) += -lrte_pmd_af_xdp
_LDLIBS-$(CONFIG_RTE_LIBRTE_ARK_PMD) += -lrte_pmd_ark
_LDLIBS-$(CONFIG_RTE_LIBRTE_AVF_PMD) += -lrte_pmd_avf
_LDLIBS-$(CONFIG_RTE_LIBRTE_AVP_PMD) += -lrte_pmd_avp
--
2.13.6
next prev parent reply other threads:[~2018-03-08 13:52 UTC|newest]
Thread overview: 11+ messages / expand[flat|nested] mbox.gz Atom feed top
2018-03-08 13:52 [dpdk-dev] [RFC v2 0/7] PMD driver for AF_XDP Qi Zhang
2018-03-08 13:52 ` Qi Zhang [this message]
2018-03-08 13:52 ` [dpdk-dev] [RFC v2 2/7] lib/mbuf: enable parse flags when create mempool Qi Zhang
2018-03-08 13:52 ` [dpdk-dev] [RFC v2 3/7] lib/mempool: allow page size aligned mempool Qi Zhang
2018-03-08 13:52 ` [dpdk-dev] [RFC v2 4/7] net/af_xdp: use mbuf mempool for buffer management Qi Zhang
2018-03-08 13:52 ` [dpdk-dev] [RFC v2 5/7] net/af_xdp: enable share mempool Qi Zhang
2018-03-08 13:52 ` [dpdk-dev] [RFC v2 6/7] net/af_xdp: load BPF file Qi Zhang
2018-03-08 14:20 ` Zhang, Qi Z
2018-03-08 23:15 ` Stephen Hemminger
2018-05-09 7:02 ` Björn Töpel
2018-03-08 13:52 ` [dpdk-dev] [RFC v2 7/7] app/testpmd: enable parameter for mempool flags Qi Zhang
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20180308135249.28187-2-qi.z.zhang@intel.com \
--to=qi.z.zhang@intel.com \
--cc=bjorn.topel@intel.com \
--cc=dev@dpdk.org \
--cc=magnus.karlsson@intel.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).