* [dpdk-dev] [RFC v3 1/6] net/af_xdp: new PMD driver
2018-08-16 14:43 [dpdk-dev] [PATCH v3 0/6] PMD driver for AF_XDP Qi Zhang
@ 2018-08-16 14:43 ` Qi Zhang
2018-08-16 14:43 ` [dpdk-dev] [RFC v3 2/6] lib/mbuf: enable parse flags when create mempool Qi Zhang
` (6 subsequent siblings)
7 siblings, 0 replies; 11+ messages in thread
From: Qi Zhang @ 2018-08-16 14:43 UTC (permalink / raw)
To: dev
Cc: magnus.karlsson, bjorn.topel, jingjing.wu, xiaoyun.li,
ferruh.yigit, Qi Zhang
Add a new PMD driver for AF_XDP which is a proposed
faster version of AF_PACKET interface in Linux.
https://fosdem.org/2018/schedule/event/af_xdp/
https://lwn.net/Articles/745934/
This patch enables the vanilla version.
Packet data is copied between the xdp socket's memory buffer and the
rx queue's mbuf mempool; allocation of the xdp socket's memory
buffer is simply managed by a FIFO ring.
Further improvement will be covered in following patches.
Signed-off-by: Qi Zhang <qi.z.zhang@intel.com>
---
config/common_base | 5 +
config/common_linuxapp | 1 +
drivers/net/Makefile | 1 +
drivers/net/af_xdp/Makefile | 30 +
drivers/net/af_xdp/meson.build | 7 +
drivers/net/af_xdp/rte_eth_af_xdp.c | 1247 +++++++++++++++++++++++++
drivers/net/af_xdp/rte_pmd_af_xdp_version.map | 4 +
mk/rte.app.mk | 1 +
8 files changed, 1296 insertions(+)
create mode 100644 drivers/net/af_xdp/Makefile
create mode 100644 drivers/net/af_xdp/meson.build
create mode 100644 drivers/net/af_xdp/rte_eth_af_xdp.c
create mode 100644 drivers/net/af_xdp/rte_pmd_af_xdp_version.map
diff --git a/config/common_base b/config/common_base
index 4bcbaf923..81aa81754 100644
--- a/config/common_base
+++ b/config/common_base
@@ -383,6 +383,11 @@ CONFIG_RTE_LIBRTE_VMXNET3_DEBUG_TX_FREE=n
CONFIG_RTE_LIBRTE_PMD_AF_PACKET=n
#
+# Compile software PMD backed by AF_XDP sockets (Linux only)
+#
+CONFIG_RTE_LIBRTE_PMD_AF_XDP=n
+
+#
# Compile link bonding PMD library
#
CONFIG_RTE_LIBRTE_PMD_BOND=y
diff --git a/config/common_linuxapp b/config/common_linuxapp
index 9c5ea9d89..5fa1cfb87 100644
--- a/config/common_linuxapp
+++ b/config/common_linuxapp
@@ -18,6 +18,7 @@ CONFIG_RTE_LIBRTE_PMD_VHOST=y
CONFIG_RTE_LIBRTE_IFC_PMD=y
CONFIG_RTE_LIBRTE_PMD_AF_PACKET=y
CONFIG_RTE_LIBRTE_PMD_SOFTNIC=y
+CONFIG_RTE_LIBRTE_PMD_AF_XDP=y
CONFIG_RTE_LIBRTE_PMD_TAP=y
CONFIG_RTE_LIBRTE_AVP_PMD=y
CONFIG_RTE_LIBRTE_VDEV_NETVSC_PMD=y
diff --git a/drivers/net/Makefile b/drivers/net/Makefile
index 664398de9..7cff65c45 100644
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -9,6 +9,7 @@ ifeq ($(CONFIG_RTE_LIBRTE_THUNDERX_NICVF_PMD),d)
endif
DIRS-$(CONFIG_RTE_LIBRTE_PMD_AF_PACKET) += af_packet
+DIRS-$(CONFIG_RTE_LIBRTE_PMD_AF_XDP) += af_xdp
DIRS-$(CONFIG_RTE_LIBRTE_ARK_PMD) += ark
DIRS-$(CONFIG_RTE_LIBRTE_AVF_PMD) += avf
DIRS-$(CONFIG_RTE_LIBRTE_AVP_PMD) += avp
diff --git a/drivers/net/af_xdp/Makefile b/drivers/net/af_xdp/Makefile
new file mode 100644
index 000000000..8dee0144a
--- /dev/null
+++ b/drivers/net/af_xdp/Makefile
@@ -0,0 +1,30 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2018 Intel Corporation
+
+include $(RTE_SDK)/mk/rte.vars.mk
+
+#
+# library name
+#
+LIB = librte_pmd_af_xdp.a
+
+EXPORT_MAP := rte_pmd_af_xdp_version.map
+
+LIBABIVER := 1
+
+
+CFLAGS += -O3
+# FIXME: the line below is a developer-local include path and must be removed before merging
+CFLAGS += -I/home/qzhan15/bpf/usr/include
+
+CFLAGS += $(WERROR_FLAGS)
+LDLIBS += -lrte_eal -lrte_mbuf -lrte_mempool -lrte_ring
+LDLIBS += -lrte_ethdev -lrte_net -lrte_kvargs
+LDLIBS += -lrte_bus_vdev
+
+#
+# all source are stored in SRCS-y
+#
+SRCS-$(CONFIG_RTE_LIBRTE_PMD_AF_XDP) += rte_eth_af_xdp.c
+
+include $(RTE_SDK)/mk/rte.lib.mk
diff --git a/drivers/net/af_xdp/meson.build b/drivers/net/af_xdp/meson.build
new file mode 100644
index 000000000..4b6652685
--- /dev/null
+++ b/drivers/net/af_xdp/meson.build
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2018 Intel Corporation
+
+# Only build this driver when the host system is Linux.
+if host_machine.system() != 'linux'
+ build = false
+endif
+# The driver consists of a single source file.
+sources = files('rte_eth_af_xdp.c')
diff --git a/drivers/net/af_xdp/rte_eth_af_xdp.c b/drivers/net/af_xdp/rte_eth_af_xdp.c
new file mode 100644
index 000000000..12252014d
--- /dev/null
+++ b/drivers/net/af_xdp/rte_eth_af_xdp.c
@@ -0,0 +1,1247 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation.
+ */
+
+#include <rte_mbuf.h>
+#include <rte_ethdev_driver.h>
+#include <rte_ethdev_vdev.h>
+#include <rte_malloc.h>
+#include <rte_kvargs.h>
+#include <rte_bus_vdev.h>
+
+#include <linux/if_ether.h>
+#include <linux/if_xdp.h>
+#include <linux/if_link.h>
+#include <arpa/inet.h>
+#include <net/if.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#include <poll.h>
+#include <bpf/bpf.h>
+
+#ifndef SOL_XDP
+#define SOL_XDP 283
+#endif
+
+#ifndef AF_XDP
+#define AF_XDP 44
+#endif
+
+#ifndef PF_XDP
+#define PF_XDP AF_XDP
+#endif
+
+#define ETH_AF_XDP_IFACE_ARG "iface"
+#define ETH_AF_XDP_QUEUE_IDX_ARG "queue"
+#define ETH_AF_XDP_XSK_MAP_ID_ARG "xsk_map_id"
+#define ETH_AF_XDP_XSK_MAP_KEY_START_ARG "xsk_map_key_start"
+#define ETH_AF_XDP_XSK_MAP_KEY_COUNT_ARG "xsk_map_key_count"
+
+#define ETH_AF_XDP_FRAME_SIZE 2048
+#define ETH_AF_XDP_NUM_BUFFERS 4096
+#define ETH_AF_XDP_DATA_HEADROOM 0
+#define ETH_AF_XDP_DFLT_NUM_DESCS 1024
+#define ETH_AF_XDP_FQ_NUM_DESCS 1024
+#define ETH_AF_XDP_CQ_NUM_DESCS 1024
+#define ETH_AF_XDP_DFLT_QUEUE_IDX 0
+
+#define ETH_AF_XDP_RX_BATCH_SIZE 16
+#define ETH_AF_XDP_TX_BATCH_SIZE 16
+
+#define ETH_AF_XDP_MAX_QUEUE_PAIRS 16
+
+/*
+ * User-space handle for one umem ring (fill queue or completion queue).
+ * Ring entries are plain 64-bit values.
+ */
+struct xdp_umem_uqueue {
+ uint32_t cached_prod; /* locally cached producer position */
+ uint32_t cached_cons; /* locally cached consumer position */
+ uint32_t mask; /* set to ring size - 1; used to wrap indices */
+ uint32_t size; /* number of ring entries */
+ uint32_t *producer; /* producer index inside the mapped area */
+ uint32_t *consumer; /* consumer index inside the mapped area */
+ uint64_t *ring; /* entry array inside the mapped area */
+ void *map; /* value returned by mmap() for this ring */
+};
+
+/* Packet buffer area registered with an xdp socket, plus its two rings. */
+struct xdp_umem {
+ char *frames; /* base of the frame memory; ring addresses are offsets into it */
+ struct xdp_umem_uqueue fq; /* fill queue */
+ struct xdp_umem_uqueue cq; /* completion queue */
+ struct rte_ring *buf_ring; /* be used to manage the buffer */
+ int fd; /* socket the umem was registered on */
+};
+
+/*
+ * User-space handle for one socket ring (Rx or Tx).
+ * Ring entries are struct xdp_desc descriptors.
+ */
+struct xdp_uqueue {
+ uint32_t cached_prod; /* locally cached producer position */
+ uint32_t cached_cons; /* locally cached consumer position */
+ uint32_t mask; /* set to ring size - 1; used to wrap indices */
+ uint32_t size; /* number of ring entries */
+ uint32_t *producer; /* producer index inside the mapped area */
+ uint32_t *consumer; /* consumer index inside the mapped area */
+ struct xdp_desc *ring; /* descriptor array inside the mapped area */
+ void *map; /* value returned by mmap() for this ring */
+};
+
+/*
+ * Number of descriptors ready to be consumed from q, capped at ndescs.
+ * The shared producer index is only re-read when the cached view is empty.
+ */
+static inline uint32_t xq_nb_avail(struct xdp_uqueue *q, uint32_t ndescs)
+{
+	uint32_t pending = q->cached_prod - q->cached_cons;
+
+	if (!pending) {
+		q->cached_prod = *q->producer;
+		pending = q->cached_prod - q->cached_cons;
+	}
+
+	if (pending > ndescs)
+		return ndescs;
+	return pending;
+}
+
+/*
+ * Number of free slots in q.  The shared consumer index is re-read only
+ * when the cached view shows fewer than ndescs free slots.  The result is
+ * not capped at ndescs.
+ */
+static inline uint32_t xq_nb_free(struct xdp_uqueue *q, uint32_t ndescs)
+{
+	uint32_t room = q->cached_cons - q->cached_prod;
+
+	if (room < ndescs) {
+		/* Refresh the local tail pointer */
+		q->cached_cons = *q->consumer + q->size;
+		room = q->cached_cons - q->cached_prod;
+	}
+
+	return room;
+}
+
+/*
+ * Number of entries ready to be consumed from umem ring q, capped at nb.
+ * The shared producer index is only re-read when the cached view is empty.
+ */
+static inline uint32_t umem_nb_avail(struct xdp_umem_uqueue *q, uint32_t nb)
+{
+	uint32_t pending = q->cached_prod - q->cached_cons;
+
+	if (!pending) {
+		q->cached_prod = *q->producer;
+		pending = q->cached_prod - q->cached_cons;
+	}
+
+	if (pending > nb)
+		return nb;
+	return pending;
+}
+
+/*
+ * Number of free slots in umem ring q.  The shared consumer index is
+ * re-read only when the cached view shows fewer than nb free slots.  The
+ * result is not capped at nb.
+ */
+static inline uint32_t umem_nb_free(struct xdp_umem_uqueue *q, uint32_t nb)
+{
+	uint32_t room = q->cached_cons - q->cached_prod;
+
+	if (room < nb) {
+		/* Refresh the local tail pointer */
+		q->cached_cons = *q->consumer + q->size;
+		room = q->cached_cons - q->cached_prod;
+	}
+
+	return room;
+}
+
+/*
+ * Post the addresses of nb descriptors from d[] to the fill queue.
+ * Returns 0 on success, or -ENOSPC (nothing posted) when the queue has
+ * fewer than nb free slots.
+ */
+static inline int umem_fill_to_kernel_ex(struct xdp_umem_uqueue *fq,
+					 struct xdp_desc *d,
+					 size_t nb)
+{
+	uint32_t k = 0;
+
+	if (umem_nb_free(fq, nb) < nb)
+		return -ENOSPC;
+
+	while (k < nb) {
+		fq->ring[fq->cached_prod & fq->mask] = d[k].addr;
+		fq->cached_prod++;
+		k++;
+	}
+
+	/* entries must be written before the producer index is published */
+	rte_smp_wmb();
+	*fq->producer = fq->cached_prod;
+
+	return 0;
+}
+
+/*
+ * Post nb frame addresses from d[] to the fill queue.
+ * Returns 0 on success, or -ENOSPC (nothing posted) when the queue has
+ * fewer than nb free slots.
+ */
+static inline int umem_fill_to_kernel(struct xdp_umem_uqueue *fq,
+				      uint64_t *d,
+				      size_t nb)
+{
+	uint32_t k = 0;
+
+	if (umem_nb_free(fq, nb) < nb)
+		return -ENOSPC;
+
+	while (k < nb) {
+		fq->ring[fq->cached_prod & fq->mask] = d[k];
+		fq->cached_prod++;
+		k++;
+	}
+
+	/* entries must be written before the producer index is published */
+	rte_smp_wmb();
+	*fq->producer = fq->cached_prod;
+
+	return 0;
+}
+
+/*
+ * Collect up to nb entries from the completion queue into d[].
+ * Returns the number of entries copied; the consumer index is published
+ * only when at least one entry was taken.
+ */
+static inline size_t umem_complete_from_kernel(struct xdp_umem_uqueue *cq,
+					       uint64_t *d, size_t nb)
+{
+	uint32_t avail = umem_nb_avail(cq, nb);
+	uint32_t k;
+
+	rte_smp_rmb();
+
+	for (k = 0; k < avail; k++) {
+		d[k] = cq->ring[cq->cached_cons & cq->mask];
+		cq->cached_cons++;
+	}
+
+	if (avail == 0)
+		return avail;
+
+	rte_smp_wmb();
+	*cq->consumer = cq->cached_cons;
+
+	return avail;
+}
+
+/*
+ * Enqueue ndescs descriptors (addr and len only) onto ring uq.
+ * Returns 0 on success, or -ENOSPC (nothing enqueued) when the ring has
+ * fewer than ndescs free slots.
+ */
+static inline int xq_enq(struct xdp_uqueue *uq,
+			 const struct xdp_desc *descs,
+			 unsigned int ndescs)
+{
+	const struct xdp_desc *src = descs;
+	const struct xdp_desc *end = descs + ndescs;
+
+	if (xq_nb_free(uq, ndescs) < ndescs)
+		return -ENOSPC;
+
+	for (; src != end; src++) {
+		struct xdp_desc *slot = &uq->ring[uq->cached_prod & uq->mask];
+
+		uq->cached_prod++;
+		slot->addr = src->addr;
+		slot->len = src->len;
+	}
+
+	/* descriptors must be written before the producer index is published */
+	rte_smp_wmb();
+	*uq->producer = uq->cached_prod;
+
+	return 0;
+}
+
+/*
+ * Dequeue up to ndescs descriptors from ring uq into descs[].
+ * Returns the number of descriptors copied; the consumer index is
+ * published only when at least one descriptor was taken.
+ */
+static inline int xq_deq(struct xdp_uqueue *uq,
+			 struct xdp_desc *descs,
+			 int ndescs)
+{
+	int taken = xq_nb_avail(uq, ndescs);
+	int k = 0;
+
+	rte_smp_rmb();
+
+	while (k < taken) {
+		unsigned int slot = uq->cached_cons & uq->mask;
+
+		uq->cached_cons++;
+		descs[k] = uq->ring[slot];
+		k++;
+	}
+
+	if (taken <= 0)
+		return taken;
+
+	rte_smp_wmb();
+	*uq->consumer = uq->cached_cons;
+
+	return taken;
+}
+
+/* Per-queue Rx state; each Rx queue is paired with the Tx queue of the same index. */
+struct pkt_rx_queue {
+ int xsk_fd; /* xdp socket of this queue pair; 0 is treated as "no socket" */
+ uint16_t queue_idx;
+ struct xdp_uqueue rx; /* Rx descriptor ring of the socket */
+ struct xdp_umem *umem; /* buffer area used by this queue pair (may be shared) */
+ struct rte_mempool *mb_pool; /* pool that received packets are copied into */
+
+ /* software counters reported through the stats callbacks */
+ unsigned long rx_pkts;
+ unsigned long rx_bytes;
+ unsigned long rx_dropped;
+
+ struct pkt_tx_queue *pair; /* Tx half of the pair */
+};
+
+/* Per-queue Tx state; the socket and umem are reached through the paired Rx queue. */
+struct pkt_tx_queue {
+ uint16_t queue_idx;
+ struct xdp_uqueue tx; /* Tx descriptor ring of the socket */
+
+ /* software counters reported through the stats callbacks */
+ unsigned long tx_pkts;
+ unsigned long err_pkts;
+ unsigned long tx_bytes;
+
+ struct pkt_rx_queue *pair; /* Rx half of the pair */
+};
+
+/* Private data of one af_xdp ethdev port. */
+struct pmd_internals {
+ int if_index; /* index of the interface named by if_name */
+ char if_name[IFNAMSIZ]; /* value of the "iface" device argument */
+ uint16_t queue_idx; /* value of the "queue" device argument */
+ struct ether_addr eth_addr; /* hardware address read from the interface */
+ struct xdp_umem *umem_share; /* umem shared by the queues of this port */
+ int umem_share_count; /* number of queues currently using umem_share */
+ struct rte_mempool *mb_pool_share;
+ int xsk_map_id; /* value of the "xsk_map_id" device argument */
+ int xsk_map_key_start; /* map key used for queue 0; queue N uses start + N */
+ int xsk_map_key_count; /* value of the "xsk_map_key_count" device argument */
+
+ struct pkt_rx_queue rx_queues[ETH_AF_XDP_MAX_QUEUE_PAIRS];
+ struct pkt_tx_queue tx_queues[ETH_AF_XDP_MAX_QUEUE_PAIRS];
+};
+
+/* NULL-terminated list of device argument keys passed to rte_kvargs_parse(). */
+static const char * const valid_arguments[] = {
+ ETH_AF_XDP_IFACE_ARG,
+ ETH_AF_XDP_QUEUE_IDX_ARG,
+ ETH_AF_XDP_XSK_MAP_ID_ARG,
+ ETH_AF_XDP_XSK_MAP_KEY_START_ARG,
+ ETH_AF_XDP_XSK_MAP_KEY_COUNT_ARG,
+ NULL
+};
+
+/* Initial link state copied into every new port; the link starts down. */
+static struct rte_eth_link pmd_link = {
+ .link_speed = ETH_SPEED_NUM_10G,
+ .link_duplex = ETH_LINK_FULL_DUPLEX,
+ .link_status = ETH_LINK_DOWN,
+ .link_autoneg = ETH_LINK_AUTONEG
+};
+
+/* Translate a umem-relative address into a pointer inside the frame area. */
+static char *get_pkt_data(struct xdp_umem *umem, uint64_t addr)
+{
+	char *base = umem->frames;
+
+	return base + addr;
+}
+
+/*
+ * Rx burst callback.
+ *
+ * Tops up the umem fill queue when at least half of it is free, then
+ * dequeues up to ETH_AF_XDP_RX_BATCH_SIZE descriptors from the Rx ring and
+ * copies every frame into a newly allocated mbuf.  A frame for which no
+ * mbuf can be allocated is dropped and counted in rx_dropped.  All
+ * received umem frames are handed back to buf_ring afterwards.
+ *
+ * Returns the number of mbufs stored in bufs[].
+ */
+static uint16_t
+eth_af_xdp_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
+{
+	struct xdp_desc descs[ETH_AF_XDP_RX_BATCH_SIZE];
+	void *addrs[ETH_AF_XDP_RX_BATCH_SIZE];
+	struct pkt_rx_queue *rxq = queue;
+	struct xdp_uqueue *uq = &rxq->rx;
+	struct xdp_umem_uqueue *fq = &rxq->umem->fq;
+	uint32_t free_thresh = fq->size >> 1;
+	struct rte_mbuf *mbuf;
+	unsigned long dropped = 0;
+	unsigned long rx_bytes = 0;
+	uint16_t count = 0;
+	int rcvd, i;
+
+	nb_pkts = nb_pkts < ETH_AF_XDP_RX_BATCH_SIZE ?
+		nb_pkts : ETH_AF_XDP_RX_BATCH_SIZE;
+
+	/*
+	 * Refill is best effort.  When no spare buffers are available the
+	 * refill is skipped and reception continues: this function returns
+	 * a packet count, so a negative errno must never be returned here,
+	 * and draining the Rx ring is what gives buffers back to buf_ring.
+	 */
+	if (umem_nb_free(fq, free_thresh) >= free_thresh &&
+	    rte_ring_dequeue_bulk(rxq->umem->buf_ring, addrs,
+				  ETH_AF_XDP_RX_BATCH_SIZE, NULL) != 0) {
+		/* NOTE(review): the cast assumes 64-bit pointers */
+		if (umem_fill_to_kernel(fq, (uint64_t *)&addrs[0],
+					ETH_AF_XDP_RX_BATCH_SIZE)) {
+			rte_ring_enqueue_bulk(rxq->umem->buf_ring,
+					      addrs,
+					      ETH_AF_XDP_RX_BATCH_SIZE,
+					      NULL);
+		}
+	}
+
+	/* read data */
+	rcvd = xq_deq(uq, descs, nb_pkts);
+	if (rcvd == 0)
+		return 0;
+
+	for (i = 0; i < rcvd; i++) {
+		uint64_t addr = descs[i].addr;
+		uint32_t len = descs[i].len;
+
+		mbuf = rte_pktmbuf_alloc(rxq->mb_pool);
+		if (mbuf) {
+			/* only touch the mbuf once the allocation succeeded */
+			rte_pktmbuf_pkt_len(mbuf) = len;
+			rte_pktmbuf_data_len(mbuf) = len;
+			memcpy(rte_pktmbuf_mtod(mbuf, void *),
+			       get_pkt_data(rxq->umem, addr), len);
+			rx_bytes += len;
+			bufs[count++] = mbuf;
+		} else {
+			dropped++;
+		}
+		addrs[i] = (void *)(uintptr_t)addr;
+	}
+
+	rte_ring_enqueue_bulk(rxq->umem->buf_ring, addrs, rcvd, NULL);
+
+	rxq->rx_pkts += (rcvd - dropped);
+	rxq->rx_bytes += rx_bytes;
+	rxq->rx_dropped += dropped;
+
+	return count;
+}
+
+/*
+ * Ask the kernel to start transmitting what is queued on the Tx ring.
+ * The kick is retried while sendto() fails with EBUSY or EAGAIN; on EAGAIN
+ * the completion queue is drained first so that buffers go back to
+ * buf_ring.  Any other error ends the attempt.
+ */
+static void kick_tx(struct pkt_tx_queue *txq)
+{
+	void *done[ETH_AF_XDP_TX_BATCH_SIZE];
+	struct rte_ring *spare = txq->pair->umem->buf_ring;
+	struct xdp_umem_uqueue *cq = &txq->pair->umem->cq;
+	int sock = txq->pair->xsk_fd;
+	int nb_done;
+
+	while (sendto(sock, NULL, 0, MSG_DONTWAIT, NULL, 0) < 0) {
+		if (errno == EBUSY)
+			continue;
+
+		/* some thing unexpected */
+		if (errno != EAGAIN)
+			break;
+
+		/* pull from the completion queue to leave more space */
+		nb_done = umem_complete_from_kernel(cq,
+						    (uint64_t *)&done[0],
+						    ETH_AF_XDP_TX_BATCH_SIZE);
+		if (nb_done > 0)
+			rte_ring_enqueue_bulk(spare, done, nb_done, NULL);
+	}
+}
+
+/*
+ * Tx burst callback.
+ *
+ * Recycles completed frames into buf_ring, takes one umem frame per packet
+ * (at most ETH_AF_XDP_TX_BATCH_SIZE), copies each packet that fits into a
+ * frame, queues the descriptors on the Tx ring and kicks the kernel.
+ * Packets that do not fit in a frame, or that cannot be queued, are
+ * counted in err_pkts.  Every mbuf of the accepted burst is freed.
+ *
+ * Returns the number of mbufs consumed from bufs[], or 0 when buf_ring
+ * could not supply one frame per packet.
+ */
+static uint16_t
+eth_af_xdp_tx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
+{
+	struct pkt_tx_queue *txq = queue;
+	struct xdp_uqueue *uq = &txq->tx;
+	struct xdp_umem_uqueue *cq = &txq->pair->umem->cq;
+	struct rte_mbuf *mbuf;
+	struct xdp_desc descs[ETH_AF_XDP_TX_BATCH_SIZE];
+	void *addrs[ETH_AF_XDP_TX_BATCH_SIZE];
+	uint16_t i, valid;
+	unsigned long tx_bytes = 0;
+
+	nb_pkts = nb_pkts < ETH_AF_XDP_TX_BATCH_SIZE ?
+		nb_pkts : ETH_AF_XDP_TX_BATCH_SIZE;
+
+	/* NOTE(review): the uint64_t casts below assume 64-bit pointers */
+	int n = umem_complete_from_kernel(cq, (uint64_t *)&addrs[0],
+					  ETH_AF_XDP_TX_BATCH_SIZE);
+	if (n > 0)
+		rte_ring_enqueue_bulk(txq->pair->umem->buf_ring,
+				      addrs, n, NULL);
+
+	nb_pkts = rte_ring_dequeue_bulk(txq->pair->umem->buf_ring, addrs,
+					nb_pkts, NULL);
+	if (!nb_pkts)
+		return 0;
+
+	valid = 0;
+	for (i = 0; i < nb_pkts; i++) {
+		char *pkt;
+		unsigned int buf_len =
+			ETH_AF_XDP_FRAME_SIZE - ETH_AF_XDP_DATA_HEADROOM;
+
+		mbuf = bufs[i];
+		if (mbuf->pkt_len <= buf_len) {
+			descs[valid].addr = (uint64_t)(uintptr_t)addrs[valid];
+			descs[valid].len = mbuf->pkt_len;
+			descs[valid].options = 0;
+			pkt = get_pkt_data(txq->pair->umem, descs[valid].addr);
+			/*
+			 * The length comes from the descriptor just filled
+			 * (index "valid").  Index "i" runs ahead of "valid"
+			 * as soon as one packet has been skipped, and
+			 * descs[i] is then uninitialised.
+			 */
+			memcpy(pkt, rte_pktmbuf_mtod(mbuf, void *),
+			       descs[valid].len);
+			valid++;
+			tx_bytes += mbuf->pkt_len;
+		}
+		rte_pktmbuf_free(mbuf);
+	}
+
+	if (xq_enq(uq, descs, valid)) {
+		valid = 0;
+		tx_bytes = 0;
+	} else {
+		kick_tx(txq);
+	}
+
+	/* frames that were not handed to the kernel go back to buf_ring */
+	if (valid < nb_pkts)
+		rte_ring_enqueue_bulk(txq->pair->umem->buf_ring, &addrs[valid],
+				      nb_pkts - valid, NULL);
+
+	txq->err_pkts += (nb_pkts - valid);
+	txq->tx_pkts += valid;
+	txq->tx_bytes += tx_bytes;
+
+	return nb_pkts;
+}
+
+/*
+ * Pre-load the fill queue with up to half its size in frames taken from
+ * buf_ring.  Stops early when buf_ring runs out of frames or the fill
+ * queue rejects an entry.
+ */
+static void
+fill_rx_desc(struct xdp_umem *umem)
+{
+	struct xdp_umem_uqueue *fq = &umem->fq;
+	void *p = NULL;
+	uint32_t i;
+
+	for (i = 0; i < fq->size / 2; i++) {
+		/*
+		 * A failed dequeue leaves p untouched (NULL or the previous
+		 * frame); never hand that to the kernel.
+		 */
+		if (rte_ring_dequeue(umem->buf_ring, &p) != 0)
+			break;
+		/* NOTE(review): the cast assumes 64-bit pointers */
+		if (umem_fill_to_kernel(fq, (uint64_t *)&p, 1)) {
+			rte_ring_enqueue(umem->buf_ring, p);
+			break;
+		}
+	}
+}
+
+/* dev_start callback: mark the link as up.  Always returns 0. */
+static int
+eth_dev_start(struct rte_eth_dev *dev)
+{
+	struct rte_eth_link *link = &dev->data->dev_link;
+
+	link->link_status = ETH_LINK_UP;
+	return 0;
+}
+
+/* dev_stop callback, invoked when the port is stopped: mark the link as down. */
+static void
+eth_dev_stop(struct rte_eth_dev *dev)
+{
+	struct rte_eth_link *link = &dev->data->dev_link;
+
+	link->link_status = ETH_LINK_DOWN;
+}
+
+/*
+ * dev_configure callback.
+ * Returns 0, or -EINVAL when the numbers of Rx and Tx queues differ.
+ */
+static int
+eth_dev_configure(struct rte_eth_dev *dev)
+{
+	/* rx/tx must be paired */
+	if (dev->data->nb_rx_queues != dev->data->nb_tx_queues)
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * dev_infos_get callback: report the static capabilities of the port.
+ * The advertised queue count is the "xsk_map_key_count" device argument,
+ * limited to the size of the per-port queue arrays.
+ */
+static void
+eth_dev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
+{
+	struct pmd_internals *internals = dev->data->dev_private;
+	int nb_queues = internals->xsk_map_key_count;
+
+	/* rx_queues[]/tx_queues[] hold ETH_AF_XDP_MAX_QUEUE_PAIRS entries */
+	if (nb_queues > ETH_AF_XDP_MAX_QUEUE_PAIRS)
+		nb_queues = ETH_AF_XDP_MAX_QUEUE_PAIRS;
+	if (nb_queues < 0)
+		nb_queues = 0;
+
+	dev_info->if_index = internals->if_index;
+	dev_info->max_mac_addrs = 1;
+	dev_info->max_rx_pktlen = (uint32_t)ETH_FRAME_LEN;
+	dev_info->max_rx_queues = nb_queues;
+	dev_info->max_tx_queues = nb_queues;
+	dev_info->min_rx_bufsize = 0;
+
+	dev_info->default_rxportconf.nb_queues = 1;
+	dev_info->default_txportconf.nb_queues = 1;
+	dev_info->default_rxportconf.ring_size = ETH_AF_XDP_DFLT_NUM_DESCS;
+	dev_info->default_txportconf.ring_size = ETH_AF_XDP_DFLT_NUM_DESCS;
+}
+
+/*
+ * stats_get callback: accumulate the per-queue software counters into
+ * stats, and add the rx_dropped value of each socket's XDP_STATISTICS to
+ * imissed when it can be read.  Always returns 0.
+ */
+static int
+eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
+{
+	struct pmd_internals *internals = dev->data->dev_private;
+	struct xdp_statistics xdp_stats;
+	struct pkt_rx_queue *rxq;
+	socklen_t optlen;
+	int i;
+
+	for (i = 0; i < dev->data->nb_rx_queues; i++) {
+		rxq = &internals->rx_queues[i];
+		stats->q_ipackets[i] = internals->rx_queues[i].rx_pkts;
+		stats->q_ibytes[i] = internals->rx_queues[i].rx_bytes;
+
+		stats->q_opackets[i] = internals->tx_queues[i].tx_pkts;
+		stats->q_errors[i] = internals->tx_queues[i].err_pkts;
+		stats->q_obytes[i] = internals->tx_queues[i].tx_bytes;
+
+		stats->ipackets += stats->q_ipackets[i];
+		stats->ibytes += stats->q_ibytes[i];
+		stats->imissed += internals->rx_queues[i].rx_dropped;
+
+		/*
+		 * getsockopt() updates optlen, so reset it for every call,
+		 * and only use xdp_stats when the call succeeded; otherwise
+		 * the structure is uninitialised.
+		 */
+		optlen = sizeof(xdp_stats);
+		if (getsockopt(rxq->xsk_fd, SOL_XDP, XDP_STATISTICS,
+			       &xdp_stats, &optlen) == 0)
+			stats->imissed += xdp_stats.rx_dropped;
+
+		stats->opackets += stats->q_opackets[i];
+		stats->oerrors += stats->q_errors[i];
+		stats->obytes += stats->q_obytes[i];
+	}
+
+	return 0;
+}
+
+/* stats_reset callback: zero the software counters of every queue pair. */
+static void
+eth_stats_reset(struct rte_eth_dev *dev)
+{
+	struct pmd_internals *internals = dev->data->dev_private;
+	struct pkt_rx_queue *rxq = internals->rx_queues;
+	struct pkt_tx_queue *txq = internals->tx_queues;
+	int left = ETH_AF_XDP_MAX_QUEUE_PAIRS;
+
+	while (left-- > 0) {
+		rxq->rx_pkts = 0;
+		rxq->rx_bytes = 0;
+		rxq->rx_dropped = 0;
+		rxq++;
+
+		txq->tx_pkts = 0;
+		txq->err_pkts = 0;
+		txq->tx_bytes = 0;
+		txq++;
+	}
+}
+
+/* dev_close callback: intentionally empty. */
+static void
+eth_dev_close(struct rte_eth_dev *dev __rte_unused)
+{
+}
+
+/* rx/tx_queue_release callback: intentionally empty. */
+static void
+eth_queue_release(void *q __rte_unused)
+{
+}
+
+/* link_update callback: does nothing and always returns 0. */
+static int
+eth_link_update(struct rte_eth_dev *dev __rte_unused,
+ int wait_to_complete __rte_unused)
+{
+ return 0;
+}
+
+/*
+ * Release the frame memory, the buffer ring and the umem descriptor.
+ * The ring mappings (fq.map / cq.map) are not unmapped here.
+ */
+static void xdp_umem_destroy(struct xdp_umem *umem)
+{
+	struct rte_ring *spare = umem->buf_ring;
+
+	/* free(NULL) is a no-op, so frames needs no guard */
+	free(umem->frames);
+
+	if (spare != NULL)
+		rte_ring_free(spare);
+
+	free(umem);
+}
+
+/*
+ * Create a umem for socket sfd: allocate the frame memory and the ring
+ * that tracks spare frames, register the memory with the socket, create
+ * the fill and completion rings and map them.
+ *
+ * Returns the new umem, or NULL on failure.  On failure everything that
+ * was allocated or mapped here is released again.
+ */
+static struct xdp_umem *xdp_umem_configure(int sfd)
+{
+	int fq_size = ETH_AF_XDP_FQ_NUM_DESCS;
+	int cq_size = ETH_AF_XDP_CQ_NUM_DESCS;
+	struct xdp_mmap_offsets off;
+	struct xdp_umem_reg mr;
+	struct xdp_umem *umem;
+	char ring_name[0x100];
+	socklen_t optlen;
+	size_t fq_len = 0;
+	size_t cq_len;
+	void *bufs = NULL;
+	uint64_t i;
+
+	umem = calloc(1, sizeof(*umem));
+	if (!umem)
+		return NULL;
+
+	snprintf(ring_name, sizeof(ring_name), "%s_%d", "af_xdp_ring", sfd);
+	umem->buf_ring = rte_ring_create(ring_name,
+					 ETH_AF_XDP_NUM_BUFFERS,
+					 SOCKET_ID_ANY,
+					 0x0);
+	if (!umem->buf_ring) {
+		RTE_LOG(ERR, PMD,
+			"Failed to create rte_ring\n");
+		goto err;
+	}
+
+	/*
+	 * NOTE(review): the enqueue result is ignored; confirm whether a
+	 * ring created with this count and flags can hold all
+	 * ETH_AF_XDP_NUM_BUFFERS entries.
+	 */
+	for (i = 0; i < ETH_AF_XDP_NUM_BUFFERS; i++)
+		rte_ring_enqueue(umem->buf_ring,
+				 (void *)(uintptr_t)(i * ETH_AF_XDP_FRAME_SIZE +
+						     ETH_AF_XDP_DATA_HEADROOM));
+
+	if (posix_memalign(&bufs, getpagesize(), /* PAGE_SIZE aligned */
+			   ETH_AF_XDP_NUM_BUFFERS * ETH_AF_XDP_FRAME_SIZE)) {
+		RTE_LOG(ERR, PMD,
+			"Failed to allocate memory pool.\n");
+		goto err;
+	}
+	/* owned by umem from here on, so the error path releases it */
+	umem->frames = bufs;
+
+	/* zero the whole request so no stack garbage reaches the kernel */
+	memset(&mr, 0, sizeof(mr));
+	mr.addr = (uintptr_t)bufs;
+	mr.len = ETH_AF_XDP_NUM_BUFFERS * ETH_AF_XDP_FRAME_SIZE;
+	mr.chunk_size = ETH_AF_XDP_FRAME_SIZE;
+	mr.headroom = ETH_AF_XDP_DATA_HEADROOM;
+
+	if (setsockopt(sfd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr))) {
+		RTE_LOG(ERR, PMD,
+			"Failed to register memory pool.\n");
+		goto err;
+	}
+
+	if (setsockopt(sfd, SOL_XDP, XDP_UMEM_FILL_RING, &fq_size,
+		       sizeof(int))) {
+		RTE_LOG(ERR, PMD,
+			"Failed to setup fill ring.\n");
+		goto err;
+	}
+
+	if (setsockopt(sfd, SOL_XDP, XDP_UMEM_COMPLETION_RING, &cq_size,
+		       sizeof(int))) {
+		RTE_LOG(ERR, PMD,
+			"Failed to setup complete ring.\n");
+		goto err;
+	}
+
+	optlen = sizeof(off);
+	if (getsockopt(sfd, SOL_XDP, XDP_MMAP_OFFSETS, &off, &optlen)) {
+		RTE_LOG(ERR, PMD,
+			"Failed to get map fr/cr offset.\n");
+		goto err;
+	}
+
+	fq_len = off.fr.desc + fq_size * sizeof(uint64_t);
+	umem->fq.map = mmap(0, fq_len,
+			    PROT_READ | PROT_WRITE,
+			    MAP_SHARED | MAP_POPULATE, sfd,
+			    XDP_UMEM_PGOFF_FILL_RING);
+
+	/* mmap() reports failure through its return value, i.e. .map */
+	if (umem->fq.map == MAP_FAILED) {
+		umem->fq.map = NULL;
+		RTE_LOG(ERR, PMD,
+			"Failed to allocate memory for fq.\n");
+		goto err;
+	}
+
+	umem->fq.mask = fq_size - 1;
+	umem->fq.size = fq_size;
+	umem->fq.producer =
+		(uint32_t *)((char *)umem->fq.map + off.fr.producer);
+	umem->fq.consumer =
+		(uint32_t *)((char *)umem->fq.map + off.fr.consumer);
+	umem->fq.ring = (uint64_t *)((char *)umem->fq.map + off.fr.desc);
+	umem->fq.cached_cons = fq_size;
+
+	cq_len = off.cr.desc + cq_size * sizeof(uint64_t);
+	umem->cq.map = mmap(0, cq_len,
+			    PROT_READ | PROT_WRITE,
+			    MAP_SHARED | MAP_POPULATE, sfd,
+			    XDP_UMEM_PGOFF_COMPLETION_RING);
+
+	if (umem->cq.map == MAP_FAILED) {
+		umem->cq.map = NULL;
+		RTE_LOG(ERR, PMD,
+			"Failed to allocate memory for caq\n");
+		goto err;
+	}
+
+	umem->cq.mask = cq_size - 1;
+	umem->cq.size = cq_size;
+	umem->cq.producer =
+		(uint32_t *)((char *)umem->cq.map + off.cr.producer);
+	umem->cq.consumer =
+		(uint32_t *)((char *)umem->cq.map + off.cr.consumer);
+	umem->cq.ring = (uint64_t *)((char *)umem->cq.map + off.cr.desc);
+
+	umem->fd = sfd;
+
+	return umem;
+
+err:
+	/* the completion ring is mapped last, so only fq can be mapped here */
+	if (umem->fq.map)
+		munmap(umem->fq.map, fq_len);
+	xdp_umem_destroy(umem);
+	return NULL;
+}
+
+/*
+ * Create the xdp socket of a queue pair, attach a umem (a new one when
+ * umem is NULL, otherwise the one passed in), create the Rx and Tx rings
+ * and map them into rxq and its paired Tx queue.
+ *
+ * Returns 0 on success and -1 on failure.  On failure the socket is
+ * closed, rxq->xsk_fd is 0, a umem created here is destroyed and the Rx
+ * ring mapping is released.
+ */
+static int
+xsk_configure(struct pkt_rx_queue *rxq, int ring_size, struct xdp_umem *umem)
+{
+	struct pkt_tx_queue *txq = rxq->pair;
+	struct xdp_mmap_offsets off;
+	void *rx_map = NULL;
+	size_t rx_len = 0;
+	size_t tx_len;
+	int new_umem = 0;
+	socklen_t optlen;
+
+	rxq->xsk_fd = socket(PF_XDP, SOCK_RAW, 0);
+	if (rxq->xsk_fd < 0) {
+		/* 0 is the "no socket" marker the rest of the driver tests */
+		rxq->xsk_fd = 0;
+		return -1;
+	}
+
+	if (!umem) {
+		rxq->umem = xdp_umem_configure(rxq->xsk_fd);
+		if (!rxq->umem)
+			goto err;
+		new_umem = 1;
+	} else {
+		rxq->umem = umem;
+	}
+
+	if (setsockopt(rxq->xsk_fd, SOL_XDP, XDP_RX_RING,
+		       &ring_size, sizeof(int))) {
+		RTE_LOG(ERR, PMD, "Failed to setup Rx ring.\n");
+		goto err;
+	}
+
+	if (setsockopt(rxq->xsk_fd, SOL_XDP, XDP_TX_RING,
+		       &ring_size, sizeof(int))) {
+		RTE_LOG(ERR, PMD, "Failed to setup Tx ring.\n");
+		goto err;
+	}
+
+	optlen = sizeof(off);
+	if (getsockopt(rxq->xsk_fd, SOL_XDP, XDP_MMAP_OFFSETS,
+		       &off, &optlen)) {
+		RTE_LOG(ERR, PMD, "Failed to get map rx/tx offsets.\n");
+		goto err;
+	}
+
+	/* Rx */
+	rx_len = off.rx.desc + ring_size * sizeof(struct xdp_desc);
+	rxq->rx.map = mmap(NULL, rx_len,
+			   PROT_READ | PROT_WRITE,
+			   MAP_SHARED | MAP_POPULATE, rxq->xsk_fd,
+			   XDP_PGOFF_RX_RING);
+
+	/* mmap() reports failure through its return value, i.e. .map */
+	if (rxq->rx.map == MAP_FAILED) {
+		rxq->rx.map = NULL;
+		RTE_LOG(ERR, PMD, "Failed to map Rx ring memory.\n");
+		goto err;
+	}
+	rx_map = rxq->rx.map;
+
+	fill_rx_desc(rxq->umem);
+	/* Tx */
+	tx_len = off.tx.desc + ring_size * sizeof(struct xdp_desc);
+	txq->tx.map = mmap(NULL, tx_len,
+			   PROT_READ | PROT_WRITE,
+			   MAP_SHARED | MAP_POPULATE, rxq->xsk_fd,
+			   XDP_PGOFF_TX_RING);
+
+	if (txq->tx.map == MAP_FAILED) {
+		txq->tx.map = NULL;
+		RTE_LOG(ERR, PMD, "Failed to map Tx ring memory\n");
+		goto err;
+	}
+
+	rxq->rx.mask = ring_size - 1;
+	rxq->rx.size = ring_size;
+	rxq->rx.producer =
+		(uint32_t *)((char *)rxq->rx.map + off.rx.producer);
+	rxq->rx.consumer =
+		(uint32_t *)((char *)rxq->rx.map + off.rx.consumer);
+	rxq->rx.ring = (struct xdp_desc *)((char *)rxq->rx.map + off.rx.desc);
+
+	txq->tx.mask = ring_size - 1;
+	txq->tx.size = ring_size;
+	txq->tx.producer =
+		(uint32_t *)((char *)txq->tx.map + off.tx.producer);
+	txq->tx.consumer =
+		(uint32_t *)((char *)txq->tx.map + off.tx.consumer);
+	txq->tx.ring = (struct xdp_desc *)((char *)txq->tx.map + off.tx.desc);
+	txq->tx.cached_cons = ring_size;
+
+	return 0;
+
+err:
+	if (rx_map) {
+		munmap(rx_map, rx_len);
+		rxq->rx.map = NULL;
+	}
+	if (new_umem) {
+		xdp_umem_destroy(rxq->umem);
+		/* do not leave a dangling pointer behind */
+		rxq->umem = NULL;
+	}
+	close(rxq->xsk_fd);
+	rxq->xsk_fd = 0;
+
+	return -1;
+}
+
+/*
+ * Return a queue pair to its pristine state.  An open socket is closed
+ * and its reference on the shared umem dropped (the umem is destroyed
+ * with the last reference); both queue structures are then zeroed and
+ * their pair pointers and queue index restored.
+ */
+static void
+queue_reset(struct pmd_internals *internals, uint16_t queue_idx)
+{
+	struct pkt_rx_queue *rx = &internals->rx_queues[queue_idx];
+	struct pkt_tx_queue *tx = rx->pair;
+	int had_socket = (rx->xsk_fd != 0);
+
+	if (had_socket)
+		close(rx->xsk_fd);
+
+	if (had_socket && internals->umem_share_count > 0) {
+		internals->umem_share_count -= 1;
+		if (internals->umem_share_count == 0 &&
+		    internals->umem_share != NULL) {
+			xdp_umem_destroy(internals->umem_share);
+			internals->umem_share = NULL;
+		}
+	}
+
+	memset(tx, 0, sizeof(*tx));
+	memset(rx, 0, sizeof(*rx));
+
+	tx->queue_idx = queue_idx;
+	tx->pair = rx;
+	rx->queue_idx = queue_idx;
+	rx->pair = tx;
+}
+
+/*
+ * rx_queue_setup callback: create and bind the xdp socket of queue
+ * rx_queue_id and store the socket in the xsk map under key
+ * xsk_map_key_start + rx_queue_id.  The first queue creates the umem;
+ * later queues bind with XDP_SHARED_UMEM.
+ *
+ * Returns 0 on success, -ENOMEM when a umem frame does not fit in an mbuf
+ * of mb_pool, and -EINVAL on any other failure.
+ */
+static int
+eth_rx_queue_setup(struct rte_eth_dev *dev,
+		   uint16_t rx_queue_id,
+		   uint16_t nb_rx_desc,
+		   unsigned int socket_id __rte_unused,
+		   const struct rte_eth_rxconf *rx_conf __rte_unused,
+		   struct rte_mempool *mb_pool)
+{
+	struct pmd_internals *internals = dev->data->dev_private;
+	unsigned int buf_size, data_size;
+	struct pkt_rx_queue *rxq;
+	struct sockaddr_xdp sxdp = {0};
+	int xsk_key;
+	int map_fd;
+	int ret;
+
+	/* the second test protects the fixed-size rx_queues[] array */
+	if (dev->data->nb_rx_queues <= rx_queue_id ||
+	    rx_queue_id >= ETH_AF_XDP_MAX_QUEUE_PAIRS) {
+		RTE_LOG(ERR, PMD,
+			"Invalid rx queue id: %d\n", rx_queue_id);
+		return -EINVAL;
+	}
+
+	rxq = &internals->rx_queues[rx_queue_id];
+	queue_reset(internals, rx_queue_id);
+
+	/* Now get the space available for data in the mbuf */
+	buf_size = rte_pktmbuf_data_room_size(mb_pool) -
+		RTE_PKTMBUF_HEADROOM;
+	data_size = ETH_AF_XDP_FRAME_SIZE - ETH_AF_XDP_DATA_HEADROOM;
+
+	if (data_size > buf_size) {
+		RTE_LOG(ERR, PMD,
+			"%s: %d bytes will not fit in mbuf (%d bytes)\n",
+			dev->device->name, data_size, buf_size);
+		return -ENOMEM;
+	}
+
+	rxq->mb_pool = mb_pool;
+
+	if (xsk_configure(rxq, nb_rx_desc, internals->umem_share)) {
+		RTE_LOG(ERR, PMD,
+			"Failed to configure xdp socket\n");
+		return -EINVAL;
+	}
+
+	sxdp.sxdp_family = PF_XDP;
+	sxdp.sxdp_ifindex = internals->if_index;
+	sxdp.sxdp_queue_id = internals->queue_idx;
+	sxdp.sxdp_flags = 0;
+	if (internals->umem_share) {
+		RTE_LOG(INFO, PMD,
+			"use share umem at queue id %d\n", rx_queue_id);
+		sxdp.sxdp_flags = XDP_SHARED_UMEM;
+		sxdp.sxdp_shared_umem_fd = internals->umem_share->fd;
+	}
+
+	if (bind(rxq->xsk_fd, (struct sockaddr *)&sxdp, sizeof(sxdp))) {
+		RTE_LOG(ERR, PMD, "Failed to bind xdp socket\n");
+		if (!internals->umem_share)
+			xdp_umem_destroy(rxq->umem);
+		/*
+		 * This queue has not been counted in umem_share_count yet.
+		 * Close the socket here and clear xsk_fd so that
+		 * queue_reset() does not drop a reference owned by another
+		 * queue.
+		 */
+		close(rxq->xsk_fd);
+		rxq->xsk_fd = 0;
+		goto err;
+	}
+
+	if (!internals->umem_share)
+		internals->umem_share = rxq->umem;
+
+	internals->umem_share_count++;
+
+	map_fd = bpf_map_get_fd_by_id(internals->xsk_map_id);
+	if (map_fd < 0) {
+		RTE_LOG(ERR, PMD,
+			"Failed to get xsk map fd\n");
+		goto err;
+	}
+
+	xsk_key = internals->xsk_map_key_start + rx_queue_id;
+	ret = bpf_map_update_elem(map_fd, &xsk_key, &rxq->xsk_fd, 0);
+	/* the map descriptor is only needed for the update */
+	close(map_fd);
+	if (ret) {
+		RTE_LOG(ERR, PMD,
+			"Failed to update xsk map\n");
+		goto err;
+	}
+
+	dev->data->rx_queues[rx_queue_id] = rxq;
+	return 0;
+
+err:
+	queue_reset(internals, rx_queue_id);
+	return -EINVAL;
+}
+
+/*
+ * tx_queue_setup callback: publish the Tx queue structure.  The Tx ring
+ * itself is created together with the paired Rx queue, so nb_tx_desc is
+ * not used.  Returns 0, or -EINVAL for an invalid queue id.
+ */
+static int
+eth_tx_queue_setup(struct rte_eth_dev *dev,
+		   uint16_t tx_queue_id,
+		   uint16_t nb_tx_desc,
+		   unsigned int socket_id __rte_unused,
+		   const struct rte_eth_txconf *tx_conf __rte_unused)
+{
+	struct pmd_internals *internals = dev->data->dev_private;
+	struct pkt_tx_queue *txq;
+
+	/* the second test protects the fixed-size tx_queues[] array */
+	if (dev->data->nb_tx_queues <= tx_queue_id ||
+	    tx_queue_id >= ETH_AF_XDP_MAX_QUEUE_PAIRS) {
+		RTE_LOG(ERR, PMD, "Invalid tx queue id: %d\n", tx_queue_id);
+		return -EINVAL;
+	}
+
+	RTE_LOG(WARNING, PMD, "Warning tx queue setup size=%d will be skipped\n",
+		nb_tx_desc);
+	txq = &internals->tx_queues[tx_queue_id];
+
+	dev->data->tx_queues[tx_queue_id] = txq;
+	return 0;
+}
+
+/*
+ * mtu_set callback: apply mtu to the underlying interface with
+ * SIOCSIFMTU.  Returns 0 on success and -EINVAL on any failure.
+ */
+static int
+eth_dev_mtu_set(struct rte_eth_dev *dev, uint16_t mtu)
+{
+	struct pmd_internals *internals = dev->data->dev_private;
+	struct ifreq req;
+	int sock;
+	int rc;
+
+	sock = socket(PF_INET, SOCK_DGRAM, 0);
+	if (sock < 0)
+		return -EINVAL;
+
+	memset(&req, 0, sizeof(req));
+	req.ifr_mtu = mtu;
+	snprintf(req.ifr_name, IFNAMSIZ, "%s", internals->if_name);
+
+	rc = ioctl(sock, SIOCSIFMTU, &req);
+	close(sock);
+
+	return (rc < 0) ? -EINVAL : 0;
+}
+
+/*
+ * Read the flags of interface if_name, keep only the bits set in mask,
+ * set the bits in flags and write the result back.  Failures are ignored.
+ */
+static void
+eth_dev_change_flags(char *if_name, uint32_t flags, uint32_t mask)
+{
+	struct ifreq req;
+	int sock = socket(PF_INET, SOCK_DGRAM, 0);
+
+	if (sock < 0)
+		return;
+
+	snprintf(req.ifr_name, IFNAMSIZ, "%s", if_name);
+	if (ioctl(sock, SIOCGIFFLAGS, &req) >= 0) {
+		req.ifr_flags &= mask;
+		req.ifr_flags |= flags;
+		(void)ioctl(sock, SIOCSIFFLAGS, &req);
+	}
+
+	close(sock);
+}
+
+/* promiscuous_enable callback: set IFF_PROMISC on the underlying interface. */
+static void
+eth_dev_promiscuous_enable(struct rte_eth_dev *dev)
+{
+	struct pmd_internals *priv = dev->data->dev_private;
+	char *ifname = priv->if_name;
+
+	/* the all-ones mask keeps every existing flag */
+	eth_dev_change_flags(ifname, IFF_PROMISC, ~0);
+}
+
+/* promiscuous_disable callback: clear IFF_PROMISC on the underlying interface. */
+static void
+eth_dev_promiscuous_disable(struct rte_eth_dev *dev)
+{
+	struct pmd_internals *priv = dev->data->dev_private;
+	char *ifname = priv->if_name;
+
+	/* the mask drops IFF_PROMISC and keeps every other flag */
+	eth_dev_change_flags(ifname, 0, ~IFF_PROMISC);
+}
+
+/* ethdev callback table installed on every port created by this driver. */
+static const struct eth_dev_ops ops = {
+ .dev_start = eth_dev_start,
+ .dev_stop = eth_dev_stop,
+ .dev_close = eth_dev_close,
+ .dev_configure = eth_dev_configure,
+ .dev_infos_get = eth_dev_info,
+ .mtu_set = eth_dev_mtu_set,
+ .promiscuous_enable = eth_dev_promiscuous_enable,
+ .promiscuous_disable = eth_dev_promiscuous_disable,
+ .rx_queue_setup = eth_rx_queue_setup,
+ .tx_queue_setup = eth_tx_queue_setup,
+ /* Rx and Tx share the same (empty) release callback */
+ .rx_queue_release = eth_queue_release,
+ .tx_queue_release = eth_queue_release,
+ .link_update = eth_link_update,
+ .stats_get = eth_stats_get,
+ .stats_reset = eth_stats_reset,
+};
+
+static struct rte_vdev_driver pmd_af_xdp_drv;
+
+/*
+ * Copy the device arguments found in kvlist into the output parameters.
+ * An output is left untouched when its key is absent, so callers must
+ * preset defaults.  *if_name points into kvlist and is only valid until
+ * the list is freed.  Integer values are converted with atoi().
+ */
+static void
+parse_parameters(struct rte_kvargs *kvlist,
+		 char **if_name,
+		 int *queue_idx,
+		 int *xsk_map_id,
+		 int *xsk_map_key_start,
+		 int *xsk_map_key_count)
+{
+	struct rte_kvargs_pair *pair = NULL;
+	unsigned int k_idx;
+
+	for (k_idx = 0; k_idx < kvlist->count; k_idx++) {
+		pair = &kvlist->pairs[k_idx];
+		/*
+		 * Compare the key (never the value) and require an exact
+		 * match, so one argument name cannot be mistaken for
+		 * another that merely contains it.
+		 */
+		if (strcmp(pair->key, ETH_AF_XDP_IFACE_ARG) == 0)
+			*if_name = pair->value;
+		else if (strcmp(pair->key, ETH_AF_XDP_QUEUE_IDX_ARG) == 0)
+			*queue_idx = atoi(pair->value);
+		else if (strcmp(pair->key, ETH_AF_XDP_XSK_MAP_ID_ARG) == 0)
+			*xsk_map_id = atoi(pair->value);
+		else if (strcmp(pair->key,
+				ETH_AF_XDP_XSK_MAP_KEY_START_ARG) == 0)
+			*xsk_map_key_start = atoi(pair->value);
+		else if (strcmp(pair->key,
+				ETH_AF_XDP_XSK_MAP_KEY_COUNT_ARG) == 0)
+			*xsk_map_key_count = atoi(pair->value);
+	}
+}
+
+/*
+ * Look up the interface index and hardware address of if_name.
+ * Returns 0 on success, or -1 when the name is missing or too long for
+ * struct ifreq, or when a socket or ioctl call fails.  The outputs are
+ * only written on success.
+ */
+static int
+get_iface_info(const char *if_name,
+	       struct ether_addr *eth_addr,
+	       int *if_index)
+{
+	struct ifreq ifr;
+	int index;
+	int sock;
+
+	/* reject names that would overflow ifr_name */
+	if (if_name == NULL || strlen(if_name) >= sizeof(ifr.ifr_name))
+		return -1;
+
+	sock = socket(AF_INET, SOCK_DGRAM, IPPROTO_IP);
+	if (sock < 0)
+		return -1;
+
+	memset(&ifr, 0, sizeof(ifr));
+	snprintf(ifr.ifr_name, sizeof(ifr.ifr_name), "%s", if_name);
+	if (ioctl(sock, SIOCGIFINDEX, &ifr))
+		goto error;
+
+	/* save the index now: the next ioctl reuses the same union */
+	index = ifr.ifr_ifindex;
+
+	if (ioctl(sock, SIOCGIFHWADDR, &ifr))
+		goto error;
+
+	memcpy(eth_addr, ifr.ifr_hwaddr.sa_data, sizeof(*eth_addr));
+	*if_index = index;
+
+	close(sock);
+	return 0;
+
+error:
+	close(sock);
+	return -1;
+}
+
+/*
+ * Allocate and initialise the private data of a new port, query the
+ * interface and register the ethdev.
+ *
+ * Returns 0 on success, -EINVAL for a missing or over-long interface
+ * name, -ENOMEM when the private data cannot be allocated and -1 on any
+ * other failure.
+ */
+static int
+init_internals(struct rte_vdev_device *dev,
+	       const char *if_name,
+	       int queue_idx,
+	       int xsk_map_id,
+	       int xsk_map_key_start,
+	       int xsk_map_key_count)
+{
+	const char *name = rte_vdev_device_name(dev);
+	struct rte_eth_dev *eth_dev = NULL;
+	const unsigned int numa_node = dev->device.numa_node;
+	struct pmd_internals *internals = NULL;
+	int ret;
+	int i;
+
+	/* if_name is copied into a fixed IFNAMSIZ buffer below */
+	if (if_name == NULL || strlen(if_name) >= IFNAMSIZ)
+		return -EINVAL;
+
+	internals = rte_zmalloc_socket(name, sizeof(*internals), 0, numa_node);
+	if (!internals)
+		return -ENOMEM;
+
+	internals->queue_idx = queue_idx;
+	internals->xsk_map_id = xsk_map_id;
+	internals->xsk_map_key_start = xsk_map_key_start;
+	internals->xsk_map_key_count = xsk_map_key_count;
+	snprintf(internals->if_name, sizeof(internals->if_name), "%s",
+		 if_name);
+
+	/* Rx queue i and Tx queue i always form a pair */
+	for (i = 0; i < ETH_AF_XDP_MAX_QUEUE_PAIRS; i++) {
+		internals->tx_queues[i].pair = &internals->rx_queues[i];
+		internals->rx_queues[i].pair = &internals->tx_queues[i];
+	}
+
+	ret = get_iface_info(if_name, &internals->eth_addr,
+			     &internals->if_index);
+	if (ret)
+		goto err;
+
+	eth_dev = rte_eth_vdev_allocate(dev, 0);
+	if (!eth_dev)
+		goto err;
+
+	eth_dev->data->dev_private = internals;
+	eth_dev->data->dev_link = pmd_link;
+	eth_dev->data->mac_addrs = &internals->eth_addr;
+	eth_dev->dev_ops = &ops;
+	eth_dev->rx_pkt_burst = eth_af_xdp_rx;
+	eth_dev->tx_pkt_burst = eth_af_xdp_tx;
+
+	rte_eth_dev_probing_finish(eth_dev);
+	return 0;
+
+err:
+	rte_free(internals);
+	return -1;
+}
+
+/*
+ * vdev probe callback.
+ *
+ * A secondary process started without device arguments attaches to the
+ * port created by the primary process.  Otherwise the device arguments
+ * are parsed and validated and a new port is created.
+ *
+ * Returns 0 on success, -EINVAL for invalid arguments or a failed attach,
+ * or the error returned by init_internals().
+ */
+static int
+rte_pmd_af_xdp_probe(struct rte_vdev_device *dev)
+{
+	struct rte_kvargs *kvlist;
+	char *if_name = NULL;
+	int queue_idx = ETH_AF_XDP_DFLT_QUEUE_IDX;
+	struct rte_eth_dev *eth_dev;
+	int xsk_map_id = -1;
+	int xsk_map_key_start = 0;
+	int xsk_map_key_count = 1;
+	const char *name;
+	int ret;
+
+	RTE_LOG(INFO, PMD, "Initializing pmd_af_xdp for %s\n",
+		rte_vdev_device_name(dev));
+
+	name = rte_vdev_device_name(dev);
+	if (rte_eal_process_type() == RTE_PROC_SECONDARY &&
+	    strlen(rte_vdev_device_args(dev)) == 0) {
+		eth_dev = rte_eth_dev_attach_secondary(name);
+		if (!eth_dev) {
+			RTE_LOG(ERR, PMD, "Failed to probe %s\n", name);
+			return -EINVAL;
+		}
+		eth_dev->dev_ops = &ops;
+		rte_eth_dev_probing_finish(eth_dev);
+		/* attached to the existing port: do not create another */
+		return 0;
+	}
+
+	kvlist = rte_kvargs_parse(rte_vdev_device_args(dev), valid_arguments);
+	if (!kvlist) {
+		RTE_LOG(ERR, PMD,
+			"Invalid kvargs\n");
+		return -EINVAL;
+	}
+
+	if (dev->device.numa_node == SOCKET_ID_ANY)
+		dev->device.numa_node = rte_socket_id();
+
+	parse_parameters(kvlist, &if_name,
+			 &queue_idx,
+			 &xsk_map_id,
+			 &xsk_map_key_start,
+			 &xsk_map_key_count);
+
+	/* every failure below leaves through "out" so kvlist is freed */
+	if (if_name == NULL) {
+		RTE_LOG(ERR, PMD,
+			"Missing iface argument\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (xsk_map_id < 0) {
+		RTE_LOG(ERR, PMD,
+			"Invalid map id\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* the per-port queue arrays hold ETH_AF_XDP_MAX_QUEUE_PAIRS entries */
+	if (xsk_map_key_count < 1 ||
+	    xsk_map_key_count > ETH_AF_XDP_MAX_QUEUE_PAIRS) {
+		RTE_LOG(ERR, PMD,
+			"Invalid xsk map key count\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* if_name points into kvlist; init_internals() copies it */
+	ret = init_internals(dev, if_name, queue_idx, xsk_map_id,
+			     xsk_map_key_start, xsk_map_key_count);
+
+out:
+	rte_kvargs_free(kvlist);
+
+	return ret;
+}
+
+/*
+ * vdev remove callback: reset every queue pair, release the shared umem
+ * if it is still allocated, then free the private data and the port.
+ * Returns 0 on success, or -1 when dev is NULL or no port is registered
+ * under its name.
+ */
+static int
+rte_pmd_af_xdp_remove(struct rte_vdev_device *dev)
+{
+	struct rte_eth_dev *eth_dev = NULL;
+	struct pmd_internals *internals;
+	int nb_queues;
+	int i;
+
+	RTE_LOG(INFO, PMD, "Closing AF_XDP ethdev on numa socket %u\n",
+		rte_socket_id());
+
+	if (!dev)
+		return -1;
+
+	/* find the ethdev entry */
+	eth_dev = rte_eth_dev_allocated(rte_vdev_device_name(dev));
+	if (!eth_dev)
+		return -1;
+
+	internals = eth_dev->data->dev_private;
+
+	/* never walk past the fixed-size queue arrays */
+	nb_queues = internals->xsk_map_key_count;
+	if (nb_queues > ETH_AF_XDP_MAX_QUEUE_PAIRS)
+		nb_queues = ETH_AF_XDP_MAX_QUEUE_PAIRS;
+
+	for (i = 0; i < nb_queues; i++)
+		queue_reset(internals, i);
+
+	/*
+	 * queue_reset() destroys the shared umem and clears the pointer
+	 * when the last queue drops its reference, so it is normally NULL
+	 * here.  If it is still set, release it with xdp_umem_destroy():
+	 * the umem and its frames come from calloc()/posix_memalign() and
+	 * must not be passed to rte_free().
+	 */
+	if (internals->umem_share) {
+		xdp_umem_destroy(internals->umem_share);
+		internals->umem_share = NULL;
+	}
+
+	rte_free(internals);
+
+	rte_eth_dev_release_port(eth_dev);
+
+	return 0;
+}
+
+/* vdev driver descriptor: probe and remove entry points of this PMD. */
+static struct rte_vdev_driver pmd_af_xdp_drv = {
+ .probe = rte_pmd_af_xdp_probe,
+ .remove = rte_pmd_af_xdp_remove,
+};
+
+/* Register the driver under "net_af_xdp", with "eth_af_xdp" as an alias. */
+RTE_PMD_REGISTER_VDEV(net_af_xdp, pmd_af_xdp_drv);
+RTE_PMD_REGISTER_ALIAS(net_af_xdp, eth_af_xdp);
+/* Device arguments accepted by the driver; all integer-valued except iface. */
+RTE_PMD_REGISTER_PARAM_STRING(net_af_xdp,
+			      "iface=<string> "
+			      "queue=<int> "
+			      "xsk_map_id=<int> "
+			      "xsk_map_key_start=<int> "
+			      "xsk_map_key_count=<int> ");
diff --git a/drivers/net/af_xdp/rte_pmd_af_xdp_version.map b/drivers/net/af_xdp/rte_pmd_af_xdp_version.map
new file mode 100644
index 000000000..ef3539840
--- /dev/null
+++ b/drivers/net/af_xdp/rte_pmd_af_xdp_version.map
@@ -0,0 +1,4 @@
+DPDK_2.0 {
+
+ local: *;
+};
diff --git a/mk/rte.app.mk b/mk/rte.app.mk
index de33883be..428ad8ab0 100644
--- a/mk/rte.app.mk
+++ b/mk/rte.app.mk
@@ -118,6 +118,7 @@ _LDLIBS-$(CONFIG_RTE_LIBRTE_DPAA2_MEMPOOL) += -lrte_mempool_dpaa2
endif
_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_AF_PACKET) += -lrte_pmd_af_packet
+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_AF_XDP) += -lrte_pmd_af_xdp -lelf -lbpf
_LDLIBS-$(CONFIG_RTE_LIBRTE_ARK_PMD) += -lrte_pmd_ark
_LDLIBS-$(CONFIG_RTE_LIBRTE_AVF_PMD) += -lrte_pmd_avf
_LDLIBS-$(CONFIG_RTE_LIBRTE_AVP_PMD) += -lrte_pmd_avp
--
2.13.6
^ permalink raw reply [flat|nested] 11+ messages in thread
* [dpdk-dev] [RFC v3 2/6] lib/mbuf: enable parse flags when create mempool
2018-08-16 14:43 [dpdk-dev] [PATCH v3 0/6] PMD driver for AF_XDP Qi Zhang
2018-08-16 14:43 ` [dpdk-dev] [RFC v3 1/6] net/af_xdp: new PMD driver Qi Zhang
@ 2018-08-16 14:43 ` Qi Zhang
2018-08-16 14:43 ` [dpdk-dev] [RFC v3 3/6] lib/mempool: allow page size aligned mempool Qi Zhang
` (5 subsequent siblings)
7 siblings, 0 replies; 11+ messages in thread
From: Qi Zhang @ 2018-08-16 14:43 UTC (permalink / raw)
To: dev
Cc: magnus.karlsson, bjorn.topel, jingjing.wu, xiaoyun.li,
ferruh.yigit, Qi Zhang
This gives the application the option to configure each
memory chunk's size precisely (by MEMPOOL_F_NO_SPREAD).
Signed-off-by: Qi Zhang <qi.z.zhang@intel.com>
---
lib/librte_mbuf/rte_mbuf.c | 15 ++++++++++++---
lib/librte_mbuf/rte_mbuf.h | 8 +++++++-
2 files changed, 19 insertions(+), 4 deletions(-)
diff --git a/lib/librte_mbuf/rte_mbuf.c b/lib/librte_mbuf/rte_mbuf.c
index e714c5a59..dd119f5ac 100644
--- a/lib/librte_mbuf/rte_mbuf.c
+++ b/lib/librte_mbuf/rte_mbuf.c
@@ -110,7 +110,7 @@ rte_pktmbuf_init(struct rte_mempool *mp,
struct rte_mempool *
rte_pktmbuf_pool_create_by_ops(const char *name, unsigned int n,
unsigned int cache_size, uint16_t priv_size, uint16_t data_room_size,
- int socket_id, const char *ops_name)
+ unsigned int flags, int socket_id, const char *ops_name)
{
struct rte_mempool *mp;
struct rte_pktmbuf_pool_private mbp_priv;
@@ -130,7 +130,7 @@ rte_pktmbuf_pool_create_by_ops(const char *name, unsigned int n,
mbp_priv.mbuf_priv_size = priv_size;
mp = rte_mempool_create_empty(name, n, elt_size, cache_size,
- sizeof(struct rte_pktmbuf_pool_private), socket_id, 0);
+ sizeof(struct rte_pktmbuf_pool_private), socket_id, flags);
if (mp == NULL)
return NULL;
@@ -164,9 +164,18 @@ rte_pktmbuf_pool_create(const char *name, unsigned int n,
int socket_id)
{
return rte_pktmbuf_pool_create_by_ops(name, n, cache_size, priv_size,
- data_room_size, socket_id, NULL);
+ data_room_size, 0, socket_id, NULL);
}
+/* helper to create a mbuf pool with caller-supplied mempool flags */
+struct rte_mempool *
+rte_pktmbuf_pool_create_with_flags(const char *name, unsigned int n,
+ unsigned int cache_size, uint16_t priv_size, uint16_t data_room_size,
+ unsigned int flags, int socket_id)
+{
+ return rte_pktmbuf_pool_create_by_ops(name, n, cache_size, priv_size,
+ data_room_size, flags, socket_id, NULL);
+}
/* do some sanity checks on a mbuf: panic if it fails */
void
rte_mbuf_sanity_check(const struct rte_mbuf *m, int is_header)
diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
index 9ce5d76d7..d83d17b79 100644
--- a/lib/librte_mbuf/rte_mbuf.h
+++ b/lib/librte_mbuf/rte_mbuf.h
@@ -1127,6 +1127,12 @@ rte_pktmbuf_pool_create(const char *name, unsigned n,
unsigned cache_size, uint16_t priv_size, uint16_t data_room_size,
int socket_id);
+struct rte_mempool *
+rte_pktmbuf_pool_create_with_flags(const char *name, unsigned int n,
+ unsigned cache_size, uint16_t priv_size, uint16_t data_room_size,
+ unsigned flags, int socket_id);
+
+
/**
* Create a mbuf pool with a given mempool ops name
*
@@ -1167,7 +1173,7 @@ rte_pktmbuf_pool_create(const char *name, unsigned n,
struct rte_mempool *
rte_pktmbuf_pool_create_by_ops(const char *name, unsigned int n,
unsigned int cache_size, uint16_t priv_size, uint16_t data_room_size,
- int socket_id, const char *ops_name);
+ unsigned int flags, int socket_id, const char *ops_name);
/**
* Get the data room size of mbufs stored in a pktmbuf_pool
--
2.13.6
^ permalink raw reply [flat|nested] 11+ messages in thread
* [dpdk-dev] [RFC v3 3/6] lib/mempool: allow page size aligned mempool
2018-08-16 14:43 [dpdk-dev] [PATCH v3 0/6] PMD driver for AF_XDP Qi Zhang
2018-08-16 14:43 ` [dpdk-dev] [RFC v3 1/6] net/af_xdp: new PMD driver Qi Zhang
2018-08-16 14:43 ` [dpdk-dev] [RFC v3 2/6] lib/mbuf: enable parse flags when create mempool Qi Zhang
@ 2018-08-16 14:43 ` Qi Zhang
2018-08-19 6:56 ` Jerin Jacob
2018-08-16 14:43 ` [dpdk-dev] [RFC v3 4/6] net/af_xdp: use mbuf mempool for buffer management Qi Zhang
` (4 subsequent siblings)
7 siblings, 1 reply; 11+ messages in thread
From: Qi Zhang @ 2018-08-16 14:43 UTC (permalink / raw)
To: dev
Cc: magnus.karlsson, bjorn.topel, jingjing.wu, xiaoyun.li,
ferruh.yigit, Qi Zhang
Allow creating a mempool with a page-size-aligned base address.
Signed-off-by: Qi Zhang <qi.z.zhang@intel.com>
---
lib/librte_mempool/rte_mempool.c | 3 +++
lib/librte_mempool/rte_mempool.h | 1 +
2 files changed, 4 insertions(+)
diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
index 03e6b5f73..61f7764c5 100644
--- a/lib/librte_mempool/rte_mempool.c
+++ b/lib/librte_mempool/rte_mempool.c
@@ -508,6 +508,9 @@ rte_mempool_populate_default(struct rte_mempool *mp)
if (try_contig)
flags |= RTE_MEMZONE_IOVA_CONTIG;
+ if (mp->flags & MEMPOOL_F_PAGE_ALIGN)
+ align = getpagesize();
+
mz = rte_memzone_reserve_aligned(mz_name, mem_size,
mp->socket_id, flags, align);
diff --git a/lib/librte_mempool/rte_mempool.h b/lib/librte_mempool/rte_mempool.h
index 7c9cd9a2f..75553b36f 100644
--- a/lib/librte_mempool/rte_mempool.h
+++ b/lib/librte_mempool/rte_mempool.h
@@ -264,6 +264,7 @@ struct rte_mempool {
#define MEMPOOL_F_POOL_CREATED 0x0010 /**< Internal: pool is created. */
#define MEMPOOL_F_NO_IOVA_CONTIG 0x0020 /**< Don't need IOVA contiguous objs. */
#define MEMPOOL_F_NO_PHYS_CONTIG MEMPOOL_F_NO_IOVA_CONTIG /* deprecated */
+#define MEMPOOL_F_PAGE_ALIGN 0x0040 /**< Chunk's base address is page aligned */
/**
* @internal When debug is enabled, store some statistics.
--
2.13.6
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [dpdk-dev] [RFC v3 3/6] lib/mempool: allow page size aligned mempool
2018-08-16 14:43 ` [dpdk-dev] [RFC v3 3/6] lib/mempool: allow page size aligned mempool Qi Zhang
@ 2018-08-19 6:56 ` Jerin Jacob
0 siblings, 0 replies; 11+ messages in thread
From: Jerin Jacob @ 2018-08-19 6:56 UTC (permalink / raw)
To: Qi Zhang
Cc: dev, magnus.karlsson, bjorn.topel, jingjing.wu, xiaoyun.li, ferruh.yigit
-----Original Message-----
> Date: Thu, 16 Aug 2018 22:43:18 +0800
> From: Qi Zhang <qi.z.zhang@intel.com>
> To: dev@dpdk.org
> CC: magnus.karlsson@intel.com, bjorn.topel@intel.com,
> jingjing.wu@intel.com, xiaoyun.li@intel.com, ferruh.yigit@intel.com, Qi
> Zhang <qi.z.zhang@intel.com>
> Subject: [dpdk-dev] [RFC v3 3/6] lib/mempool: allow page size aligned
> mempool
> X-Mailer: git-send-email 2.13.6
>
>
> Allow create a mempool with page size aligned base address.
If I understand it correctly, you are doing this to enable the AF_XDP PMD, which
has some constraints. If so, I think a more transparent way to handle this
from the application's perspective would be to
1) add new mempool ops based on this new flag
and
2) call rte_mbuf_set_platform_mempool_ops("new mempool op for af_xdp");
on af_xdp device probe()
That will avoid the need for a new API and testpmd changes for flag
selection.
The proposed RFC has a problem: the end user needs to know which flags are
relevant for a certain PMD.
>
> Signed-off-by: Qi Zhang <qi.z.zhang@intel.com>
> ---
> lib/librte_mempool/rte_mempool.c | 3 +++
> lib/librte_mempool/rte_mempool.h | 1 +
> 2 files changed, 4 insertions(+)
>
> diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
> index 03e6b5f73..61f7764c5 100644
> --- a/lib/librte_mempool/rte_mempool.c
> +++ b/lib/librte_mempool/rte_mempool.c
> @@ -508,6 +508,9 @@ rte_mempool_populate_default(struct rte_mempool *mp)
> if (try_contig)
> flags |= RTE_MEMZONE_IOVA_CONTIG;
>
> + if (mp->flags & MEMPOOL_F_PAGE_ALIGN)
> + align = getpagesize();
> +
> mz = rte_memzone_reserve_aligned(mz_name, mem_size,
> mp->socket_id, flags, align);
>
> diff --git a/lib/librte_mempool/rte_mempool.h b/lib/librte_mempool/rte_mempool.h
> index 7c9cd9a2f..75553b36f 100644
> --- a/lib/librte_mempool/rte_mempool.h
> +++ b/lib/librte_mempool/rte_mempool.h
> @@ -264,6 +264,7 @@ struct rte_mempool {
> #define MEMPOOL_F_POOL_CREATED 0x0010 /**< Internal: pool is created. */
> #define MEMPOOL_F_NO_IOVA_CONTIG 0x0020 /**< Don't need IOVA contiguous objs. */
> #define MEMPOOL_F_NO_PHYS_CONTIG MEMPOOL_F_NO_IOVA_CONTIG /* deprecated */
> +#define MEMPOOL_F_PAGE_ALIGN 0x0040 /**< Chunk's base address is page aligned */
>
> /**
> * @internal When debug is enabled, store some statistics.
> --
> 2.13.6
>
^ permalink raw reply [flat|nested] 11+ messages in thread
* [dpdk-dev] [RFC v3 4/6] net/af_xdp: use mbuf mempool for buffer management
2018-08-16 14:43 [dpdk-dev] [PATCH v3 0/6] PMD driver for AF_XDP Qi Zhang
` (2 preceding siblings ...)
2018-08-16 14:43 ` [dpdk-dev] [RFC v3 3/6] lib/mempool: allow page size aligned mempool Qi Zhang
@ 2018-08-16 14:43 ` Qi Zhang
2018-08-16 14:43 ` [dpdk-dev] [RFC v3 5/6] net/af_xdp: enable zero copy Qi Zhang
` (3 subsequent siblings)
7 siblings, 0 replies; 11+ messages in thread
From: Qi Zhang @ 2018-08-16 14:43 UTC (permalink / raw)
To: dev
Cc: magnus.karlsson, bjorn.topel, jingjing.wu, xiaoyun.li,
ferruh.yigit, Qi Zhang
Now, the memory buffer registered with af_xdp is managed by rte_mempool.
An mbuf allocated from the rte_mempool can be converted to an xdp_desc
address and vice versa.
Signed-off-by: Qi Zhang <qi.z.zhang@intel.com>
---
drivers/net/af_xdp/rte_eth_af_xdp.c | 184 +++++++++++++++++++++---------------
1 file changed, 108 insertions(+), 76 deletions(-)
diff --git a/drivers/net/af_xdp/rte_eth_af_xdp.c b/drivers/net/af_xdp/rte_eth_af_xdp.c
index 12252014d..69bc38536 100644
--- a/drivers/net/af_xdp/rte_eth_af_xdp.c
+++ b/drivers/net/af_xdp/rte_eth_af_xdp.c
@@ -42,7 +42,11 @@
#define ETH_AF_XDP_FRAME_SIZE 2048
#define ETH_AF_XDP_NUM_BUFFERS 4096
-#define ETH_AF_XDP_DATA_HEADROOM 0
+/* mempool hdrobj size (64 bytes) + sizeof(struct rte_mbuf) (128 bytes) */
+#define ETH_AF_XDP_MBUF_OVERHEAD 192
+/* data start from offset 320 (192 + 128) bytes */
+#define ETH_AF_XDP_DATA_HEADROOM \
+ (ETH_AF_XDP_MBUF_OVERHEAD + RTE_PKTMBUF_HEADROOM)
#define ETH_AF_XDP_DFLT_NUM_DESCS 1024
#define ETH_AF_XDP_FQ_NUM_DESCS 1024
#define ETH_AF_XDP_CQ_NUM_DESCS 1024
@@ -68,7 +72,7 @@ struct xdp_umem {
char *frames;
struct xdp_umem_uqueue fq;
struct xdp_umem_uqueue cq;
- struct rte_ring *buf_ring; /* be used to manage the buffer */
+ struct rte_mempool *mb_pool; /* be used to manage the buffer */
int fd;
};
@@ -304,11 +308,25 @@ static char *get_pkt_data(struct xdp_umem *umem, uint64_t addr)
return &umem->frames[addr];
}
+static inline struct rte_mbuf *
+addr_to_mbuf(struct xdp_umem *umem, uint64_t addr)
+{
+ return (struct rte_mbuf *)((uint64_t)umem->frames + addr - 0x100);
+}
+
+static inline uint64_t
+mbuf_to_addr(struct xdp_umem *umem, struct rte_mbuf *mbuf)
+{
+ return (uint64_t)mbuf->buf_addr + mbuf->data_off -
+ (uint64_t)umem->frames;
+}
+
static uint16_t
eth_af_xdp_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
{
struct xdp_desc descs[ETH_AF_XDP_RX_BATCH_SIZE];
- void *addrs[ETH_AF_XDP_RX_BATCH_SIZE];
+ struct rte_mbuf *bufs_to_fill[ETH_AF_XDP_RX_BATCH_SIZE];
+ uint64_t addrs[ETH_AF_XDP_RX_BATCH_SIZE];
struct pkt_rx_queue *rxq = queue;
struct xdp_uqueue *uq = &rxq->rx;
struct xdp_umem_uqueue *fq = &rxq->umem->fq;
@@ -317,25 +335,25 @@ eth_af_xdp_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
unsigned long dropped = 0;
unsigned long rx_bytes = 0;
uint16_t count = 0;
- int rcvd, i;
+ int rcvd, i, ret;
nb_pkts = nb_pkts < ETH_AF_XDP_RX_BATCH_SIZE ?
nb_pkts : ETH_AF_XDP_RX_BATCH_SIZE;
if (umem_nb_free(fq, free_thresh) >= free_thresh) {
- int n = rte_ring_dequeue_bulk(rxq->umem->buf_ring,
- addrs,
- ETH_AF_XDP_RX_BATCH_SIZE,
- NULL);
- if (n == 0)
+ ret = rte_pktmbuf_alloc_bulk(rxq->umem->mb_pool,
+ bufs_to_fill,
+ ETH_AF_XDP_RX_BATCH_SIZE);
+ if (ret)
return -ENOMEM;
- if (umem_fill_to_kernel(fq, (uint64_t *)&addrs[0],
- ETH_AF_XDP_RX_BATCH_SIZE)) {
- rte_ring_enqueue_bulk(rxq->umem->buf_ring,
- addrs,
- ETH_AF_XDP_RX_BATCH_SIZE,
- NULL);
+ for (i = 0; i < ETH_AF_XDP_RX_BATCH_SIZE; i++)
+ addrs[i] = mbuf_to_addr(rxq->umem, bufs_to_fill[i]);
+
+ if (umem_fill_to_kernel(fq, addrs,
+ ETH_AF_XDP_RX_BATCH_SIZE)) {
+ for (i = 0; i < ETH_AF_XDP_RX_BATCH_SIZE; i++)
+ rte_pktmbuf_free(bufs_to_fill[i]);
}
}
@@ -361,11 +379,9 @@ eth_af_xdp_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
} else {
dropped++;
}
- addrs[i] = (void *)addr;
+ rte_pktmbuf_free(addr_to_mbuf(rxq->umem, addr));
}
- rte_ring_enqueue_bulk(rxq->umem->buf_ring, addrs, rcvd, NULL);
-
rxq->rx_pkts += (rcvd - dropped);
rxq->rx_bytes += rx_bytes;
rxq->rx_dropped += dropped;
@@ -375,11 +391,10 @@ eth_af_xdp_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
static void kick_tx(struct pkt_tx_queue *txq)
{
- void *addrs[ETH_AF_XDP_TX_BATCH_SIZE];
- struct rte_ring *buf_ring = txq->pair->umem->buf_ring;
struct xdp_umem_uqueue *cq = &txq->pair->umem->cq;
+ uint64_t addrs[ETH_AF_XDP_TX_BATCH_SIZE];
int fd = txq->pair->xsk_fd;
- int ret, n;
+ int ret, n, i;
while (1) {
@@ -398,9 +413,10 @@ static void kick_tx(struct pkt_tx_queue *txq)
n = umem_complete_from_kernel(cq,
(uint64_t *)&addrs[0],
ETH_AF_XDP_TX_BATCH_SIZE);
- if (n > 0)
- rte_ring_enqueue_bulk(buf_ring,
- addrs, n, NULL);
+ for (i = 0; i < n; i++)
+ rte_pktmbuf_free(
+ addr_to_mbuf(txq->pair->umem,
+ addrs[i]));
}
}
}
@@ -413,23 +429,21 @@ eth_af_xdp_tx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
struct xdp_umem_uqueue *cq = &txq->pair->umem->cq;
struct rte_mbuf *mbuf;
struct xdp_desc descs[ETH_AF_XDP_TX_BATCH_SIZE];
- void *addrs[ETH_AF_XDP_TX_BATCH_SIZE];
- uint16_t i, valid;
+ uint64_t addrs[ETH_AF_XDP_TX_BATCH_SIZE];
+ struct rte_mbuf *bufs_to_fill[ETH_AF_XDP_TX_BATCH_SIZE];
unsigned long tx_bytes = 0;
+ int i, valid, n;
nb_pkts = nb_pkts < ETH_AF_XDP_TX_BATCH_SIZE ?
nb_pkts : ETH_AF_XDP_TX_BATCH_SIZE;
- int n = umem_complete_from_kernel(cq, (uint64_t *)&addrs[0],
- ETH_AF_XDP_TX_BATCH_SIZE);
- if (n > 0)
- rte_ring_enqueue_bulk(txq->pair->umem->buf_ring,
- addrs, n, NULL);
-
- nb_pkts = rte_ring_dequeue_bulk(txq->pair->umem->buf_ring, addrs,
- nb_pkts, NULL);
- if (!nb_pkts)
- return 0;
+ n = umem_complete_from_kernel(cq, addrs,
+ ETH_AF_XDP_TX_BATCH_SIZE);
+ if (n > 0) {
+ for (i = 0; i < n; i++)
+ rte_pktmbuf_free(addr_to_mbuf(txq->pair->umem,
+ addrs[i]));
+ }
valid = 0;
for (i = 0; i < nb_pkts; i++) {
@@ -438,7 +452,13 @@ eth_af_xdp_tx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
ETH_AF_XDP_FRAME_SIZE - ETH_AF_XDP_DATA_HEADROOM;
mbuf = bufs[i];
if (mbuf->pkt_len <= buf_len) {
- descs[valid].addr = (uint64_t)addrs[valid];
+ bufs_to_fill[valid] =
+ rte_pktmbuf_alloc(txq->pair->umem->mb_pool);
+ if (!bufs_to_fill[valid])
+ break;
+ descs[valid].addr =
+ mbuf_to_addr(txq->pair->umem,
+ bufs_to_fill[valid]);
descs[valid].len = mbuf->pkt_len;
descs[valid].options = 0;
pkt = get_pkt_data(txq->pair->umem, descs[valid].addr);
@@ -447,20 +467,20 @@ eth_af_xdp_tx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
valid++;
tx_bytes += mbuf->pkt_len;
}
- rte_pktmbuf_free(mbuf);
}
if (xq_enq(uq, descs, valid)) {
+ for (i = 0; i < valid; i++)
+ rte_pktmbuf_free(bufs_to_fill[i]);
+ nb_pkts = 0;
valid = 0;
tx_bytes = 0;
} else {
kick_tx(txq);
+ for (i = 0; i < nb_pkts; i++)
+ rte_pktmbuf_free(bufs[i]);
}
- if (valid < nb_pkts)
- rte_ring_enqueue_bulk(txq->pair->umem->buf_ring, &addrs[valid],
- nb_pkts - valid, NULL);
-
txq->err_pkts += (nb_pkts - valid);
txq->tx_pkts += valid;
txq->tx_bytes += tx_bytes;
@@ -472,13 +492,15 @@ static void
fill_rx_desc(struct xdp_umem *umem)
{
struct xdp_umem_uqueue *fq = &umem->fq;
- void *p = NULL;
+ struct rte_mbuf *mbuf;
+ uint64_t addr;
uint32_t i;
for (i = 0; i < fq->size / 2; i++) {
- rte_ring_dequeue(umem->buf_ring, &p);
- if (umem_fill_to_kernel(fq, (uint64_t *)&p, 1)) {
- rte_ring_enqueue(umem->buf_ring, p);
+ mbuf = rte_pktmbuf_alloc(umem->mb_pool);
+ addr = mbuf_to_addr(umem, mbuf);
+ if (umem_fill_to_kernel(fq, &addr, 1)) {
+ rte_pktmbuf_free(mbuf);
break;
}
}
@@ -597,14 +619,28 @@ eth_link_update(struct rte_eth_dev *dev __rte_unused,
static void xdp_umem_destroy(struct xdp_umem *umem)
{
- if (umem->frames)
- free(umem->frames);
- if (umem->buf_ring)
- rte_ring_free(umem->buf_ring);
+ if (umem->mb_pool)
+ rte_mempool_free(umem->mb_pool);
free(umem);
}
+static inline uint64_t get_base_addr(struct rte_mempool *mp)
+{
+ struct rte_mempool_memhdr *memhdr;
+
+ memhdr = STAILQ_FIRST(&mp->mem_list);
+ return (uint64_t)(memhdr->addr);
+}
+
+static inline uint64_t get_len(struct rte_mempool *mp)
+{
+ struct rte_mempool_memhdr *memhdr;
+
+ memhdr = STAILQ_FIRST(&mp->mem_list);
+ return (uint64_t)(memhdr->len);
+}
+
static struct xdp_umem *xdp_umem_configure(int sfd)
{
int fq_size = ETH_AF_XDP_FQ_NUM_DESCS;
@@ -612,40 +648,29 @@ static struct xdp_umem *xdp_umem_configure(int sfd)
struct xdp_mmap_offsets off;
struct xdp_umem_reg mr;
struct xdp_umem *umem;
- char ring_name[0x100];
+ char pool_name[0x100];
socklen_t optlen;
- void *bufs = NULL;
- uint64_t i;
umem = calloc(1, sizeof(*umem));
if (!umem)
return NULL;
- snprintf(ring_name, 0x100, "%s_%d", "af_xdp_ring", sfd);
- umem->buf_ring = rte_ring_create(ring_name,
- ETH_AF_XDP_NUM_BUFFERS,
- SOCKET_ID_ANY,
- 0x0);
- if (!umem->buf_ring) {
- RTE_LOG(ERR, PMD,
- "Failed to create rte_ring\n");
- goto err;
- }
+ snprintf(pool_name, 0x100, "%s_%d", "af_xdp_ring", sfd);
+ umem->mb_pool = rte_pktmbuf_pool_create_with_flags(
+ pool_name, ETH_AF_XDP_NUM_BUFFERS,
+ 250, 0,
+ ETH_AF_XDP_FRAME_SIZE - ETH_AF_XDP_MBUF_OVERHEAD,
+ MEMPOOL_F_NO_SPREAD | MEMPOOL_F_PAGE_ALIGN,
+ SOCKET_ID_ANY);
- for (i = 0; i < ETH_AF_XDP_NUM_BUFFERS; i++)
- rte_ring_enqueue(umem->buf_ring,
- (void *)(i * ETH_AF_XDP_FRAME_SIZE +
- ETH_AF_XDP_DATA_HEADROOM));
-
- if (posix_memalign(&bufs, getpagesize(), /* PAGE_SIZE aligned */
- ETH_AF_XDP_NUM_BUFFERS * ETH_AF_XDP_FRAME_SIZE)) {
+ if (!umem->mb_pool || umem->mb_pool->nb_mem_chunks != 1) {
RTE_LOG(ERR, PMD,
- "Failed to allocate memory pool.\n");
+ "Failed to create rte_mempool\n");
goto err;
}
- mr.addr = (uint64_t)bufs;
- mr.len = ETH_AF_XDP_NUM_BUFFERS * ETH_AF_XDP_FRAME_SIZE;
+ mr.addr = get_base_addr(umem->mb_pool);
+ mr.len = get_len(umem->mb_pool);
mr.chunk_size = ETH_AF_XDP_FRAME_SIZE;
mr.headroom = ETH_AF_XDP_DATA_HEADROOM;
@@ -717,7 +742,7 @@ static struct xdp_umem *xdp_umem_configure(int sfd)
(uint32_t *)((uint64_t)umem->cq.map + off.cr.consumer);
umem->cq.ring = (uint64_t *)((uint64_t)umem->cq.map + off.cr.desc);
- umem->frames = bufs;
+ umem->frames = (void *)get_base_addr(umem->mb_pool);
umem->fd = sfd;
return umem;
@@ -729,7 +754,8 @@ static struct xdp_umem *xdp_umem_configure(int sfd)
}
static int
-xsk_configure(struct pkt_rx_queue *rxq, int ring_size, struct xdp_umem *umem)
+xsk_configure(struct pkt_rx_queue *rxq, int ring_size,
+ struct xdp_umem *umem)
{
struct pkt_tx_queue *txq = rxq->pair;
struct xdp_mmap_offsets off;
@@ -863,6 +889,12 @@ eth_rx_queue_setup(struct rte_eth_dev *dev,
int xsk_key;
int map_fd;
+ if (mb_pool == NULL) {
+ RTE_LOG(ERR, PMD,
+ "Invalid mb_pool\n");
+ return -EINVAL;
+ }
+
if (dev->data->nb_rx_queues <= rx_queue_id) {
RTE_LOG(ERR, PMD,
"Invalid rx queue id: %d\n", rx_queue_id);
@@ -1222,7 +1254,7 @@ rte_pmd_af_xdp_remove(struct rte_vdev_device *dev)
for (i = 0; i < internals->xsk_map_key_count; i++)
queue_reset(internals, i);
- rte_ring_free(internals->umem_share->buf_ring);
+ rte_mempool_free(internals->umem_share->mb_pool);
rte_free(internals->umem_share->frames);
rte_free(internals->umem_share);
rte_free(internals);
--
2.13.6
^ permalink raw reply [flat|nested] 11+ messages in thread
* [dpdk-dev] [RFC v3 5/6] net/af_xdp: enable zero copy
2018-08-16 14:43 [dpdk-dev] [PATCH v3 0/6] PMD driver for AF_XDP Qi Zhang
` (3 preceding siblings ...)
2018-08-16 14:43 ` [dpdk-dev] [RFC v3 4/6] net/af_xdp: use mbuf mempool for buffer management Qi Zhang
@ 2018-08-16 14:43 ` Qi Zhang
2018-08-16 14:43 ` [dpdk-dev] [RFC v3 6/6] app/testpmd: add mempool flags parameter Qi Zhang
` (2 subsequent siblings)
7 siblings, 0 replies; 11+ messages in thread
From: Qi Zhang @ 2018-08-16 14:43 UTC (permalink / raw)
To: dev
Cc: magnus.karlsson, bjorn.topel, jingjing.wu, xiaoyun.li,
ferruh.yigit, Qi Zhang
Check whether the external mempool (from rx_queue_setup) is fit for
af_xdp. If it is, it will be registered to the af_xdp socket directly and
there will be no packet data copy on Rx and Tx.
Signed-off-by: Qi Zhang <qi.z.zhang@intel.com>
---
drivers/net/af_xdp/rte_eth_af_xdp.c | 158 +++++++++++++++++++++++++-----------
1 file changed, 112 insertions(+), 46 deletions(-)
diff --git a/drivers/net/af_xdp/rte_eth_af_xdp.c b/drivers/net/af_xdp/rte_eth_af_xdp.c
index 69bc38536..c78c66a8c 100644
--- a/drivers/net/af_xdp/rte_eth_af_xdp.c
+++ b/drivers/net/af_xdp/rte_eth_af_xdp.c
@@ -73,6 +73,7 @@ struct xdp_umem {
struct xdp_umem_uqueue fq;
struct xdp_umem_uqueue cq;
struct rte_mempool *mb_pool; /* be used to manage the buffer */
+ uint8_t zc;
int fd;
};
@@ -258,6 +259,7 @@ struct pkt_rx_queue {
unsigned long rx_dropped;
struct pkt_tx_queue *pair;
+ uint8_t zc;
};
struct pkt_tx_queue {
@@ -366,20 +368,24 @@ eth_af_xdp_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
char *pkt;
uint64_t addr = descs[i].addr;
- mbuf = rte_pktmbuf_alloc(rxq->mb_pool);
- rte_pktmbuf_pkt_len(mbuf) =
- rte_pktmbuf_data_len(mbuf) =
- descs[i].len;
- if (mbuf) {
- pkt = get_pkt_data(rxq->umem, addr);
- memcpy(rte_pktmbuf_mtod(mbuf, void *),
- pkt, descs[i].len);
- rx_bytes += descs[i].len;
- bufs[count++] = mbuf;
+ if (!rxq->zc) {
+ mbuf = rte_pktmbuf_alloc(rxq->mb_pool);
+ rte_pktmbuf_pkt_len(mbuf) =
+ rte_pktmbuf_data_len(mbuf) =
+ descs[i].len;
+ if (mbuf) {
+ pkt = get_pkt_data(rxq->umem, addr);
+ memcpy(rte_pktmbuf_mtod(mbuf, void *),
+ pkt, descs[i].len);
+ rx_bytes += descs[i].len;
+ bufs[count++] = mbuf;
+ } else {
+ dropped++;
+ }
+ rte_pktmbuf_free(addr_to_mbuf(rxq->umem, addr));
} else {
- dropped++;
+ bufs[count++] = addr_to_mbuf(rxq->umem, addr);
}
- rte_pktmbuf_free(addr_to_mbuf(rxq->umem, addr));
}
rxq->rx_pkts += (rcvd - dropped);
@@ -425,14 +431,17 @@ static uint16_t
eth_af_xdp_tx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
{
struct pkt_tx_queue *txq = queue;
+ struct xdp_umem *umem = txq->pair->umem;
struct xdp_uqueue *uq = &txq->tx;
struct xdp_umem_uqueue *cq = &txq->pair->umem->cq;
+ struct rte_mempool *mp = umem->mb_pool;
struct rte_mbuf *mbuf;
struct xdp_desc descs[ETH_AF_XDP_TX_BATCH_SIZE];
uint64_t addrs[ETH_AF_XDP_TX_BATCH_SIZE];
struct rte_mbuf *bufs_to_fill[ETH_AF_XDP_TX_BATCH_SIZE];
+ struct rte_mbuf *bufs_to_free[ETH_AF_XDP_TX_BATCH_SIZE];
unsigned long tx_bytes = 0;
- int i, valid, n;
+ int i, valid, n, free, fill;
nb_pkts = nb_pkts < ETH_AF_XDP_TX_BATCH_SIZE ?
nb_pkts : ETH_AF_XDP_TX_BATCH_SIZE;
@@ -446,39 +455,57 @@ eth_af_xdp_tx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
}
valid = 0;
+ free = 0;
+ fill = 0;
for (i = 0; i < nb_pkts; i++) {
- char *pkt;
- unsigned int buf_len =
- ETH_AF_XDP_FRAME_SIZE - ETH_AF_XDP_DATA_HEADROOM;
mbuf = bufs[i];
- if (mbuf->pkt_len <= buf_len) {
- bufs_to_fill[valid] =
- rte_pktmbuf_alloc(txq->pair->umem->mb_pool);
- if (!bufs_to_fill[valid])
- break;
- descs[valid].addr =
- mbuf_to_addr(txq->pair->umem,
- bufs_to_fill[valid]);
+ /* mbuf is in shared mempool, zero copy */
+ if (txq->pair->zc && bufs[i]->pool == mp) {
+ descs[valid].addr = mbuf_to_addr(umem, mbuf);
descs[valid].len = mbuf->pkt_len;
descs[valid].options = 0;
- pkt = get_pkt_data(txq->pair->umem, descs[valid].addr);
- memcpy(pkt, rte_pktmbuf_mtod(mbuf, void *),
- descs[i].len);
valid++;
tx_bytes += mbuf->pkt_len;
+ } else {
+ char *pkt;
+ unsigned int buf_len =
+ ETH_AF_XDP_FRAME_SIZE -
+ ETH_AF_XDP_DATA_HEADROOM;
+ if (mbuf->pkt_len <= buf_len) {
+
+ bufs_to_fill[fill] = rte_pktmbuf_alloc(mp);
+ if (bufs_to_fill[fill] == NULL) {
+ bufs_to_free[free++] = mbuf;
+ continue;
+ }
+
+ descs[valid].addr =
+ mbuf_to_addr(umem, bufs_to_fill[fill]);
+ fill++;
+ descs[valid].len = mbuf->pkt_len;
+ descs[valid].options = 0;
+ pkt = get_pkt_data(umem, descs[valid].addr);
+ memcpy(pkt, rte_pktmbuf_mtod(mbuf, void *),
+ descs[i].len);
+ valid++;
+ tx_bytes += mbuf->pkt_len;
+ }
+ bufs_to_free[free++] = mbuf;
}
}
if (xq_enq(uq, descs, valid)) {
- for (i = 0; i < valid; i++)
+ /* if failed, all tmp mbufs need to be free */
+ for (i = 0; i < fill; i++)
rte_pktmbuf_free(bufs_to_fill[i]);
nb_pkts = 0;
valid = 0;
tx_bytes = 0;
} else {
+ /* if passed, original mbuf need to be free */
+ for (i = 0; i < free; i++)
+ rte_pktmbuf_free(bufs_to_free[i]);
kick_tx(txq);
- for (i = 0; i < nb_pkts; i++)
- rte_pktmbuf_free(bufs[i]);
}
txq->err_pkts += (nb_pkts - valid);
@@ -641,7 +668,7 @@ static inline uint64_t get_len(struct rte_mempool *mp)
return (uint64_t)(memhdr->len);
}
-static struct xdp_umem *xdp_umem_configure(int sfd)
+static struct xdp_umem *xdp_umem_configure(int sfd, struct rte_mempool *mb_pool)
{
int fq_size = ETH_AF_XDP_FQ_NUM_DESCS;
int cq_size = ETH_AF_XDP_CQ_NUM_DESCS;
@@ -655,18 +682,24 @@ static struct xdp_umem *xdp_umem_configure(int sfd)
if (!umem)
return NULL;
- snprintf(pool_name, 0x100, "%s_%d", "af_xdp_ring", sfd);
- umem->mb_pool = rte_pktmbuf_pool_create_with_flags(
- pool_name, ETH_AF_XDP_NUM_BUFFERS,
- 250, 0,
- ETH_AF_XDP_FRAME_SIZE - ETH_AF_XDP_MBUF_OVERHEAD,
- MEMPOOL_F_NO_SPREAD | MEMPOOL_F_PAGE_ALIGN,
- SOCKET_ID_ANY);
-
- if (!umem->mb_pool || umem->mb_pool->nb_mem_chunks != 1) {
- RTE_LOG(ERR, PMD,
- "Failed to create rte_mempool\n");
- goto err;
+ if (!mb_pool) {
+ snprintf(pool_name, 0x100, "%s_%d", "af_xdp_ring", sfd);
+ umem->mb_pool = rte_pktmbuf_pool_create_with_flags(
+ pool_name, ETH_AF_XDP_NUM_BUFFERS,
+ 250, 0,
+ ETH_AF_XDP_FRAME_SIZE -
+ ETH_AF_XDP_MBUF_OVERHEAD,
+ MEMPOOL_F_NO_SPREAD | MEMPOOL_F_PAGE_ALIGN,
+ SOCKET_ID_ANY);
+
+ if (!umem->mb_pool || umem->mb_pool->nb_mem_chunks != 1) {
+ RTE_LOG(ERR, PMD,
+ "Failed to create rte_mempool\n");
+ goto err;
+ }
+ } else {
+ umem->mb_pool = mb_pool;
+ umem->zc = 1;
}
mr.addr = get_base_addr(umem->mb_pool);
@@ -753,9 +786,34 @@ static struct xdp_umem *xdp_umem_configure(int sfd)
}
+static uint8_t
+check_mempool_zc(struct rte_mempool *mp)
+{
+ RTE_ASSERT(mp);
+
+ /* memory must be contiguous: reject pools with more than one chunk */
+ if (mp->nb_mem_chunks > 1)
+ return 0;
+
+ /* per-object header must be exactly one cache line */
+ if (mp->header_size != RTE_CACHE_LINE_SIZE)
+ return 0;
+
+ /* base address of the pool memory must be page aligned */
+ if ((uint64_t)get_base_addr(mp) % getpagesize() != 0)
+ return 0;
+
+ /* total object size must be a multiple of the AF_XDP frame size */
+ if ((mp->elt_size + mp->header_size + mp->trailer_size) %
+ ETH_AF_XDP_FRAME_SIZE != 0)
+ return 0;
+
+ return 1;
+}
+
static int
xsk_configure(struct pkt_rx_queue *rxq, int ring_size,
- struct xdp_umem *umem)
+ struct xdp_umem *umem, struct rte_mempool *mb_pool)
{
struct pkt_tx_queue *txq = rxq->pair;
struct xdp_mmap_offsets off;
@@ -767,7 +825,8 @@ xsk_configure(struct pkt_rx_queue *rxq, int ring_size,
return -1;
if (!umem) {
- rxq->umem = xdp_umem_configure(rxq->xsk_fd);
+ mb_pool = check_mempool_zc(mb_pool) ? mb_pool : NULL;
+ rxq->umem = xdp_umem_configure(rxq->xsk_fd, mb_pool);
if (!rxq->umem)
goto err;
new_umem = 1;
@@ -918,7 +977,7 @@ eth_rx_queue_setup(struct rte_eth_dev *dev,
rxq->mb_pool = mb_pool;
- if (xsk_configure(rxq, nb_rx_desc, internals->umem_share)) {
+ if (xsk_configure(rxq, nb_rx_desc, internals->umem_share, mb_pool)) {
RTE_LOG(ERR, PMD,
"Failed to configure xdp socket\n");
return -EINVAL;
@@ -945,6 +1004,13 @@ eth_rx_queue_setup(struct rte_eth_dev *dev,
if (!internals->umem_share)
internals->umem_share = rxq->umem;
+ if (mb_pool == internals->umem_share->mb_pool)
+ rxq->zc = internals->umem_share->zc;
+
+ if (rxq->zc)
+ RTE_LOG(INFO, PMD,
+ "zero copy enabled on rx queue %d\n", rx_queue_id);
+
internals->umem_share_count++;
map_fd = bpf_map_get_fd_by_id(internals->xsk_map_id);
--
2.13.6
^ permalink raw reply [flat|nested] 11+ messages in thread
* [dpdk-dev] [RFC v3 6/6] app/testpmd: add mempool flags parameter
2018-08-16 14:43 [dpdk-dev] [PATCH v3 0/6] PMD driver for AF_XDP Qi Zhang
` (4 preceding siblings ...)
2018-08-16 14:43 ` [dpdk-dev] [RFC v3 5/6] net/af_xdp: enable zero copy Qi Zhang
@ 2018-08-16 14:43 ` Qi Zhang
2018-08-23 16:25 ` [dpdk-dev] [PATCH v3 0/6] PMD driver for AF_XDP William Tu
2018-08-25 6:11 ` Zhang, Qi Z
7 siblings, 0 replies; 11+ messages in thread
From: Qi Zhang @ 2018-08-16 14:43 UTC (permalink / raw)
To: dev
Cc: magnus.karlsson, bjorn.topel, jingjing.wu, xiaoyun.li,
ferruh.yigit, Qi Zhang
When creating an rte_mempool, flags can be parsed from the command line.
Now, it is possible for testpmd to create an af_xdp-friendly
mempool (which enables zero copy).
Signed-off-by: Qi Zhang <qi.z.zhang@intel.com>
---
app/test-pmd/parameters.c | 12 ++++++++++++
app/test-pmd/testpmd.c | 15 +++++++++------
app/test-pmd/testpmd.h | 1 +
3 files changed, 22 insertions(+), 6 deletions(-)
diff --git a/app/test-pmd/parameters.c b/app/test-pmd/parameters.c
index 962fad789..a5778e1a2 100644
--- a/app/test-pmd/parameters.c
+++ b/app/test-pmd/parameters.c
@@ -61,6 +61,7 @@ usage(char* progname)
"--tx-first | --stats-period=PERIOD | "
"--coremask=COREMASK --portmask=PORTMASK --numa "
"--mbuf-size= | --total-num-mbufs= | "
+ "--mp-flags= | "
"--nb-cores= | --nb-ports= | "
#ifdef RTE_LIBRTE_CMDLINE
"--eth-peers-configfile= | "
@@ -105,6 +106,7 @@ usage(char* progname)
printf(" --socket-num=N: set socket from which all memory is allocated "
"in NUMA mode.\n");
printf(" --mbuf-size=N: set the data size of mbuf to N bytes.\n");
+ printf(" --mp-flags=N: set the flags when create mbuf memory pool.\n");
printf(" --total-num-mbufs=N: set the number of mbufs to be allocated "
"in mbuf pools.\n");
printf(" --max-pkt-len=N: set the maximum size of packet to N bytes.\n");
@@ -568,6 +570,7 @@ launch_args_parse(int argc, char** argv)
{ "ring-numa-config", 1, 0, 0 },
{ "socket-num", 1, 0, 0 },
{ "mbuf-size", 1, 0, 0 },
+ { "mp-flags", 1, 0, 0 },
{ "total-num-mbufs", 1, 0, 0 },
{ "max-pkt-len", 1, 0, 0 },
{ "pkt-filter-mode", 1, 0, 0 },
@@ -772,6 +775,15 @@ launch_args_parse(int argc, char** argv)
rte_exit(EXIT_FAILURE,
"mbuf-size should be > 0 and < 65536\n");
}
+ if (!strcmp(lgopts[opt_idx].name, "mp-flags")) {
+ n = atoi(optarg);
+ if (n > 0 && n <= 0xFFFF)
+ mp_flags = (uint16_t)n;
+ else
+ rte_exit(EXIT_FAILURE,
+ "mp-flags should be > 0 and < 65536\n");
+ }
+
if (!strcmp(lgopts[opt_idx].name, "total-num-mbufs")) {
n = atoi(optarg);
if (n > 1024)
diff --git a/app/test-pmd/testpmd.c b/app/test-pmd/testpmd.c
index ee48db2a3..0567cc5dd 100644
--- a/app/test-pmd/testpmd.c
+++ b/app/test-pmd/testpmd.c
@@ -173,6 +173,7 @@ uint32_t burst_tx_delay_time = BURST_TX_WAIT_US;
uint32_t burst_tx_retry_num = BURST_TX_RETRIES;
uint16_t mbuf_data_size = DEFAULT_MBUF_DATA_SIZE; /**< Mbuf data space size. */
+uint16_t mp_flags = 0; /**< flags parsed when create mempool */
uint32_t param_total_num_mbufs = 0; /**< number of mbufs in all pools - if
* specified on command-line. */
uint16_t stats_period; /**< Period to show statistics (disabled by default) */
@@ -533,6 +534,7 @@ set_def_fwd_config(void)
*/
static void
mbuf_pool_create(uint16_t mbuf_seg_size, unsigned nb_mbuf,
+ unsigned int flags,
unsigned int socket_id)
{
char pool_name[RTE_MEMPOOL_NAMESIZE];
@@ -550,7 +552,7 @@ mbuf_pool_create(uint16_t mbuf_seg_size, unsigned nb_mbuf,
rte_mp = rte_mempool_create_empty(pool_name, nb_mbuf,
mb_size, (unsigned) mb_mempool_cache,
sizeof(struct rte_pktmbuf_pool_private),
- socket_id, 0);
+ socket_id, flags);
if (rte_mp == NULL)
goto err;
@@ -565,8 +567,8 @@ mbuf_pool_create(uint16_t mbuf_seg_size, unsigned nb_mbuf,
/* wrapper to rte_mempool_create() */
TESTPMD_LOG(INFO, "preferred mempool ops selected: %s\n",
rte_mbuf_best_mempool_ops());
- rte_mp = rte_pktmbuf_pool_create(pool_name, nb_mbuf,
- mb_mempool_cache, 0, mbuf_seg_size, socket_id);
+ rte_mp = rte_pktmbuf_pool_create_with_flags(pool_name, nb_mbuf,
+ mb_mempool_cache, 0, mbuf_seg_size, flags, socket_id);
}
err:
@@ -797,13 +799,14 @@ init_config(void)
for (i = 0; i < num_sockets; i++)
mbuf_pool_create(mbuf_data_size, nb_mbuf_per_pool,
- socket_ids[i]);
+ mp_flags, socket_ids[i]);
} else {
if (socket_num == UMA_NO_CONFIG)
- mbuf_pool_create(mbuf_data_size, nb_mbuf_per_pool, 0);
+ mbuf_pool_create(mbuf_data_size, nb_mbuf_per_pool,
+ mp_flags, 0);
else
mbuf_pool_create(mbuf_data_size, nb_mbuf_per_pool,
- socket_num);
+ mp_flags, socket_num);
}
init_port_config();
diff --git a/app/test-pmd/testpmd.h b/app/test-pmd/testpmd.h
index a1f661472..f5f8692ea 100644
--- a/app/test-pmd/testpmd.h
+++ b/app/test-pmd/testpmd.h
@@ -379,6 +379,7 @@ extern uint8_t dcb_config;
extern uint8_t dcb_test;
extern uint16_t mbuf_data_size; /**< Mbuf data space size. */
+extern uint16_t mp_flags; /**< flags for mempool creation. */
extern uint32_t param_total_num_mbufs;
extern uint16_t stats_period;
--
2.13.6
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [dpdk-dev] [PATCH v3 0/6] PMD driver for AF_XDP
2018-08-16 14:43 [dpdk-dev] [PATCH v3 0/6] PMD driver for AF_XDP Qi Zhang
` (5 preceding siblings ...)
2018-08-16 14:43 ` [dpdk-dev] [RFC v3 6/6] app/testpmd: add mempool flags parameter Qi Zhang
@ 2018-08-23 16:25 ` William Tu
2018-08-28 14:11 ` Zhang, Qi Z
2018-08-25 6:11 ` Zhang, Qi Z
7 siblings, 1 reply; 11+ messages in thread
From: William Tu @ 2018-08-23 16:25 UTC (permalink / raw)
To: Zhang, Qi Z
Cc: dev, Karlsson, Magnus, Björn Töpel, jingjing.wu,
xiaoyun.li, ferruh.yigit
Hi Zhang Qi,
I'm not familiar with DPDK code, but I'm curious about the
benefits of using AF_XDP pmd, specifically I have a couple questions:
1) With zero-copy driver support, is the AF_XDP pmd expected to have
performance similar to other pmds? Since AF_XDP still uses the
native device driver, aren't interrupts still there, so that it is not
really "poll-mode" anymore?
2) does the patch expect user to customize the ebpf/xdp code
so that this becomes another way to extend dpdk datapath?
Thank you
William
On Thu, Aug 16, 2018 at 7:42 AM Qi Zhang <qi.z.zhang@intel.com> wrote:
>
> Overview
> ========
>
> The patch set add a new PMD driver for AF_XDP which is a proposed
> faster version of AF_PACKET interface in Linux, see below link for
> detail AF_XDP introduction:
> https://lwn.net/Articles/750845/
> https://fosdem.org/2018/schedule/event/af_xdp/
>
> AF_XDP roadmap
> ==============
> - The kernel 4.18 is out and af_xdp is included.
> https://kernelnewbies.org/Linux_4.18
> - So far there is no zero copy supported driver be merged, but some are
> on the way.
>
> Change logs
> ===========
>
> v3:
> - Re-work base on AF_XDP's interface changes.
> - Support multi-queues, each dpdk queue has its own xdp socket.
> An xdp socket is always bound to a netdev queue.
> We assume all xdp socket from the same ethdev are bound to the
> same netdev queue, though a netdev queue still can be bound by
> xdp sockets from different ethdev instances.
> Below is an example of the mapping.
> ------------------------------------------------------
> | dpdk q0 | dpdk q1 | dpdk q0 | dpdk q0 | dpdk q1 |
> ------------------------------------------------------
> | xsk A | xsk B | xsk C | xsk D | xsk E |<---|
> ------------------------------------------------------ |
> | ETHDEV 0 | ETHDEV 1 | ETHDEV 2 | | DPDK
> ------------------------------------------------------------------
> | netdev queue 0 | netdev queue 1 | | KERNEL
> ------------------------------------------------------ |
> | NETDEV eth0 | |
> ------------------------------------------------------ |
> | key xsk | |
> | ---------- -------------- | |
> | | | | 0 | xsk A | | |
> | | | -------------- | |
> | | | | 2 | xsk B | | |
> | | ebpf | ---------------------------------------
> | | | | 3 | xsk C | |
> | | redirect ->|-------------- |
> | | | | 4 | xsk D | |
> | | | -------------- |
> | |---------| | 5 | xsk E | |
> | -------------- |
> |-----------------------------------------------------
>
> - It is an open question that how to load ebpf to kernel and link to
> specific netdev in DPDK, should it be part of PMD, or it should be handled by
> an independent tool? In this patchset, it takes the second option, there will
> be a "bind" stage before we start AF_XDP PMD, this includes below steps:
> a) load ebpf program to the kernel, (the ebpf program must contain the
> logic to redirect packet to a xdp socket base on a redirect map).
> b) link ebpf program to specific network interface.
> c) expose the xdp socket redirect map id and entries number to user,
> so this will be parsed to PMD, and PMD will create xdp socket
> for each queue and update the redirect map correctly.
> (example: --vdev,iface=eth0,xsk_map_id=53,xsk_map_key_base=0,xsk_map_key_count=4)
>
> v2:
> - fix lisence header
> - clean up bpf dependency, bpf program is embedded, no "xdpsock_kern.o"
> required
> - clean up make file, only linux_header is required
> - fix all the compile warning.
> - fix packet number return in Tx.
>
> How to try
> ==========
>
> 1. Take the kernel v4.18.
> make sure you turn on XDP sockets when compiling
> Networking support -->
> Networking options -->
> [ * ] XDP sockets
> 2. in the kernel source code, apply below patch and compile the bpf sample code.
> #make samples/bpf/
> so the sample xdpsock can be used as a bind/unbind tool for af_xdp
> PMD, sorry for this ugly, but in future, there could be a dedicated
> tool in DPDK, if we agree with the idea that bpf configure in the kernel
> should be separated from PMD.
>
> ~~~~~~~~~~~~~~~~~~~~~~~PATCH START~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
>
> diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c
> index d69c8d78d3fd..44a6318043e7 100644
> --- a/samples/bpf/xdpsock_user.c
> +++ b/samples/bpf/xdpsock_user.c
> @@ -76,6 +76,8 @@ static int opt_poll;
> static int opt_shared_packet_buffer;
> static int opt_interval = 1;
> static u32 opt_xdp_bind_flags;
> +static int opt_bind;
> +static int opt_unbind;
>
> struct xdp_umem_uqueue {
> u32 cached_prod;
> @@ -662,6 +664,8 @@ static void usage(const char *prog)
> " -S, --xdp-skb=n Use XDP skb-mod\n"
> " -N, --xdp-native=n Enfore XDP native mode\n"
> " -n, --interval=n Specify statistics update interval (default 1 sec).\n"
> + " -b, --bind Bind only.\n"
> + " -u, --unbind Unbind only.\n"
> "\n";
> fprintf(stderr, str, prog);
> exit(EXIT_FAILURE);
> @@ -674,7 +678,7 @@ static void parse_command_line(int argc, char **argv)
> opterr = 0;
>
> for (;;) {
> - c = getopt_long(argc, argv, "rtli:q:psSNn:", long_options,
> + c = getopt_long(argc, argv, "rtli:q:psSNn:bu", long_options,
> &option_index);
> if (c == -1)
> break;
> @@ -711,6 +715,12 @@ static void parse_command_line(int argc, char **argv)
> case 'n':
> opt_interval = atoi(optarg);
> break;
> + case 'b':
> + opt_bind = 1;
> + break;
> + case 'u':
> + opt_unbind = 1;
> + break;
> default:
> usage(basename(argv[0]));
> }
> @@ -898,6 +908,12 @@ int main(int argc, char **argv)
> exit(EXIT_FAILURE);
> }
>
> + if (opt_unbind) {
> + bpf_set_link_xdp_fd(opt_ifindex, -1, opt_xdp_flags);
>
> ~~~~~~~~~~~~~~~~~~~~~~~PATCH END~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
>
> 3. bind
> #./samples/bpf/xdpsock -i eth0 -b
>
> in this step, an ebpf binary xdpsock_kern.o is be loaded into the kernel
> and linked to eth0, the ebpf source code is /samples/bpf/xdpsock_kern.c
> you can modify it and re-compile for a different test.
>
> 4. dump xdp socket map information.
> #./tools/bpf/bpftool/bpftool map -p, you will see something like below.
>
> },{
> "id": 56,
> "type": "xskmap",
> "name": "xsks_map",
> "flags": 0,
> "bytes_key": 4,
> "bytes_value": 4,
> "max_entries": 4,
> "bytes_memlock": 4096
> }
>
> in this case 56 is the map id and it has 4 entries
>
> 5. start testpmd
>
> ./build/app/testpmd -c 0xc -n 4 --vdev eth_af_xdp,iface=enp59s0f0,xsk_map_id=56,xsk_map_key_start=2,xsk_map_key_count=2 -- -i --rxq=2 --txq=2
>
> in this case, we reserved 2 entries (2,3) in the map, and they will be mapped to queue 0 and queue 1.
>
> 6. unbind after test
> ./samples/bpf/xdpsock -i eth0 -u
>
> Performance
> ===========
> Since no zero copy driver is ready yet.
> So far only tested with DRV and SKB mode on i40e 25G
> the result show identical with kernel sample "xdpsock"
>
> Qi Zhang (6):
> net/af_xdp: new PMD driver
> lib/mbuf: enable parse flags when create mempool
> lib/mempool: allow page size aligned mempool
> net/af_xdp: use mbuf mempool for buffer management
> net/af_xdp: enable zero copy
> app/testpmd: add mempool flags parameter
>
> app/test-pmd/parameters.c | 12 +
> app/test-pmd/testpmd.c | 15 +-
> app/test-pmd/testpmd.h | 1 +
> config/common_base | 5 +
> config/common_linuxapp | 1 +
> drivers/net/Makefile | 1 +
> drivers/net/af_xdp/Makefile | 30 +
> drivers/net/af_xdp/meson.build | 7 +
> drivers/net/af_xdp/rte_eth_af_xdp.c | 1345 +++++++++++++++++++++++++
> drivers/net/af_xdp/rte_pmd_af_xdp_version.map | 4 +
> lib/librte_mbuf/rte_mbuf.c | 15 +-
> lib/librte_mbuf/rte_mbuf.h | 8 +-
> lib/librte_mempool/rte_mempool.c | 3 +
> lib/librte_mempool/rte_mempool.h | 1 +
> mk/rte.app.mk | 1 +
> 15 files changed, 1439 insertions(+), 10 deletions(-)
> create mode 100644 drivers/net/af_xdp/Makefile
> create mode 100644 drivers/net/af_xdp/meson.build
> create mode 100644 drivers/net/af_xdp/rte_eth_af_xdp.c
> create mode 100644 drivers/net/af_xdp/rte_pmd_af_xdp_version.map
>
> --
> 2.13.6
>
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [dpdk-dev] [PATCH v3 0/6] PMD driver for AF_XDP
2018-08-23 16:25 ` [dpdk-dev] [PATCH v3 0/6] PMD driver for AF_XDP William Tu
@ 2018-08-28 14:11 ` Zhang, Qi Z
0 siblings, 0 replies; 11+ messages in thread
From: Zhang, Qi Z @ 2018-08-28 14:11 UTC (permalink / raw)
To: William Tu
Cc: dev, Karlsson, Magnus, Topel, Bjorn, Wu, Jingjing, Li, Xiaoyun,
Yigit, Ferruh
Hi William:
> -----Original Message-----
> From: William Tu [mailto:u9012063@gmail.com]
> Sent: Friday, August 24, 2018 12:25 AM
> To: Zhang, Qi Z <qi.z.zhang@intel.com>
> Cc: dev@dpdk.org; Karlsson, Magnus <magnus.karlsson@intel.com>; Topel,
> Bjorn <bjorn.topel@intel.com>; Wu, Jingjing <jingjing.wu@intel.com>; Li,
> Xiaoyun <xiaoyun.li@intel.com>; Yigit, Ferruh <ferruh.yigit@intel.com>
> Subject: Re: [dpdk-dev] [PATCH v3 0/6] PMD driver for AF_XDP
>
> Hi Zhang Qi,
>
> I'm not familiar with DPDK code, but I'm curious about the benefits of using
> AF_XDP pmd, specifically I have a couple questions:
>
> 1) With zero-copy driver support, is AF_XDP pmd expects to have similar
> performance than other pmd?
Zero-copy will improve performance a lot, but there is still a gap compared with a native DPDK PMD.
Basically it is a lower-performance but more flexible solution.
BTW, patches to enable zero copy for i40e have just been published by Bjorn; they include some performance data for your reference.
http://lists.openwall.net/netdev/2018/08/28/62
> Since AF_XDP is still using native device driver,
> isn't the interrupt still there and not "poll-mode"
> anymore?
Yes, it's still napi->poll triggered by interrupt.
>
> 2) does the patch expect user to customize the ebpf/xdp code so that this
> becomes another way to extend dpdk datapath?
Yes, this provides another option to use the kernel's eBPF ecosystem for packet filtering.
And I think it will be easy for us to develop a tool to load/link/expose eBPF as part of DPDK.
Regarding the AF_XDP PMD, my view is that since DPDK is very popular, it is becoming a standard way to develop network applications.
So a DPDK PMD is going to be a bridge for developers to take advantage of the AF_XDP technology, compared with dealing with the XDP socket and libc directly.
Regards
Qi
>
> Thank you
> William
>
> On Thu, Aug 16, 2018 at 7:42 AM Qi Zhang <qi.z.zhang@intel.com> wrote:
> >
> > Overview
> > ========
> >
> > The patch set add a new PMD driver for AF_XDP which is a proposed
> > faster version of AF_PACKET interface in Linux, see below link for
> > detail AF_XDP introduction:
> > https://lwn.net/Articles/750845/
> > https://fosdem.org/2018/schedule/event/af_xdp/
> >
> > AF_XDP roadmap
> > ==============
> > - The kernel 4.18 is out and af_xdp is included.
> > https://kernelnewbies.org/Linux_4.18
> > - So far there is no zero copy supported driver be merged, but some are
> > on the way.
> >
> > Change logs
> > ===========
> >
> > v3:
> > - Re-work base on AF_XDP's interface changes.
> > - Support multi-queues, each dpdk queue has its own xdp socket.
> > An xdp socket is always bound to a netdev queue.
> > We assume all xdp socket from the same ethdev are bound to the
> > same netdev queue, though a netdev queue still can be bound by
> > xdp sockets from different ethdev instances.
> > Below is an example of the mapping.
> > ------------------------------------------------------
> > | dpdk q0 | dpdk q1 | dpdk q0 | dpdk q0 | dpdk q1 |
> > ------------------------------------------------------
> > | xsk A | xsk B | xsk C | xsk D | xsk E |<---|
> > ------------------------------------------------------ |
> > | ETHDEV 0 | ETHDEV 1 | ETHDEV 2 | |
> DPDK
> > ------------------------------------------------------------------
> > | netdev queue 0 | netdev queue 1 | |
> KERNEL
> > ------------------------------------------------------ |
> > | NETDEV eth0 | |
> > ------------------------------------------------------ |
> > | key xsk | |
> > | ---------- -------------- | |
> > | | | | 0 | xsk A | | |
> > | | | -------------- | |
> > | | | | 2 | xsk B | | |
> > | | ebpf | ---------------------------------------
> > | | | | 3 | xsk C | |
> > | | redirect ->|-------------- |
> > | | | | 4 | xsk D | |
> > | | | -------------- |
> > | |---------| | 5 | xsk E | |
> > | -------------- |
> > |-----------------------------------------------------
> >
> > - It is an open question that how to load ebpf to kernel and link to
> > specific netdev in DPDK, should it be part of PMD, or it should be
> handled by
> > an independent tool? In this patchset, it takes the second option, there
> will
> > be a "bind" stage before we start AF_XDP PMD, this includes below
> steps:
> > a) load ebpf program to the kernel, (the ebpf program must contain the
> > logic to redirect packet to a xdp socket base on a redirect map).
> > b) link ebpf program to specific network interface.
> > c) expose the xdp socket redirect map id and entries number to user,
> > so this will be parsed to PMD, and PMD will create xdp socket
> > for each queue and update the redirect map correctly.
> > (example:
> >
> --vdev,iface=eth0,xsk_map_id=53,xsk_map_key_base=0,xsk_map_key_count
> =4
> > )
> >
> > v2:
> > - fix lisence header
> > - clean up bpf dependency, bpf program is embedded, no
> "xdpsock_kern.o"
> > required
> > - clean up make file, only linux_header is required
> > - fix all the compile warning.
> > - fix packet number return in Tx.
> >
> > How to try
> > ==========
> >
> > 1. Take the kernel v4.18.
> > make sure you turn on XDP sockets when compiling
> > Networking support -->
> > Networking options -->
> > [ * ] XDP sockets
> > 2. in the kernel source code, apply below patch and compile the bpf sample
> code.
> > #make samples/bpf/
> > so the sample xdpsock can be used as a bind/unbind tool for af_xdp
> > PMD, sorry for this ugly, but in future, there could be a dedicated
> > tool in DPDK, if we agree with the idea that bpf configure in the kernel
> > should be separated from PMD.
> >
> > ~~~~~~~~~~~~~~~~~~~~~~~PATCH
> > START~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> >
> > diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c
> > index d69c8d78d3fd..44a6318043e7 100644
> > --- a/samples/bpf/xdpsock_user.c
> > +++ b/samples/bpf/xdpsock_user.c
> > @@ -76,6 +76,8 @@ static int opt_poll; static int
> > opt_shared_packet_buffer; static int opt_interval = 1; static u32
> > opt_xdp_bind_flags;
> > +static int opt_bind;
> > +static int opt_unbind;
> >
> > struct xdp_umem_uqueue {
> > u32 cached_prod;
> > @@ -662,6 +664,8 @@ static void usage(const char *prog)
> > " -S, --xdp-skb=n Use XDP skb-mod\n"
> > " -N, --xdp-native=n Enfore XDP native mode\n"
> > " -n, --interval=n Specify statistics update
> interval (default 1 sec).\n"
> > + " -b, --bind Bind only.\n"
> > + " -u, --unbind Unbind only.\n"
> > "\n";
> > fprintf(stderr, str, prog);
> > exit(EXIT_FAILURE);
> > @@ -674,7 +678,7 @@ static void parse_command_line(int argc, char
> **argv)
> > opterr = 0;
> >
> > for (;;) {
> > - c = getopt_long(argc, argv, "rtli:q:psSNn:", long_options,
> > + c = getopt_long(argc, argv, "rtli:q:psSNn:bu",
> > + long_options,
> > &option_index);
> > if (c == -1)
> > break;
> > @@ -711,6 +715,12 @@ static void parse_command_line(int argc, char
> **argv)
> > case 'n':
> > opt_interval = atoi(optarg);
> > break;
> > + case 'b':
> > + opt_bind = 1;
> > + break;
> > + case 'u':
> > + opt_unbind = 1;
> > + break;
> > default:
> > usage(basename(argv[0]));
> > }
> > @@ -898,6 +908,12 @@ int main(int argc, char **argv)
> > exit(EXIT_FAILURE);
> > }
> >
> > + if (opt_unbind) {
> > + bpf_set_link_xdp_fd(opt_ifindex, -1, opt_xdp_flags);
> >
> > ~~~~~~~~~~~~~~~~~~~~~~~PATCH
> > END~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> >
> > 3. bind
> > #./samples/bpf/xdpsock -i eth0 -b
> >
> > in this step, an ebpf binary xdpsock_kern.o is be loaded into the kernel
> > and linked to eth0, the ebpf source code is
> /samples/bpf/xdpsock_kern.c
> > you can modify it and re-compile for a different test.
> >
> > 4. dump xdp socket map information.
> > #./tools/bpf/bpftool/bpftool map -p, you will see something like below.
> >
> > },{
> > "id": 56,
> > "type": "xskmap",
> > "name": "xsks_map",
> > "flags": 0,
> > "bytes_key": 4,
> > "bytes_value": 4,
> > "max_entries": 4,
> > "bytes_memlock": 4096
> > }
> >
> > in this case 56 is the map id and it has 4 entries
> >
> > 5. start testpmd
> >
> > ./build/app/testpmd -c 0xc -n 4 --vdev
> >
> eth_af_xdp,iface=enp59s0f0,xsk_map_id=56,xsk_map_key_start=2,xsk_map_
> ke
> > y_count=2 -- -i --rxq=2 --txq=2
> >
> > in this case, we reserved 2 entries (2,3) in the map, and they will be
> mapped to queue 0 and queue 1.
> >
> > 6. unbind after test
> > ./samples/bpf/xdpsock -i eth0 -u
> >
> > Performance
> > ===========
> > Since no zero copy driver is ready yet.
> > So far only tested with DRV and SKB mode on i40e 25G the result show
> > identical with kernel sample "xdpsock"
> >
> > Qi Zhang (6):
> > net/af_xdp: new PMD driver
> > lib/mbuf: enable parse flags when create mempool
> > lib/mempool: allow page size aligned mempool
> > net/af_xdp: use mbuf mempool for buffer management
> > net/af_xdp: enable zero copy
> > app/testpmd: add mempool flags parameter
> >
> > app/test-pmd/parameters.c | 12 +
> > app/test-pmd/testpmd.c | 15 +-
> > app/test-pmd/testpmd.h | 1 +
> > config/common_base | 5 +
> > config/common_linuxapp | 1 +
> > drivers/net/Makefile | 1 +
> > drivers/net/af_xdp/Makefile | 30 +
> > drivers/net/af_xdp/meson.build | 7 +
> > drivers/net/af_xdp/rte_eth_af_xdp.c | 1345
> +++++++++++++++++++++++++
> > drivers/net/af_xdp/rte_pmd_af_xdp_version.map | 4 +
> > lib/librte_mbuf/rte_mbuf.c | 15 +-
> > lib/librte_mbuf/rte_mbuf.h | 8 +-
> > lib/librte_mempool/rte_mempool.c | 3 +
> > lib/librte_mempool/rte_mempool.h | 1 +
> > mk/rte.app.mk | 1 +
> > 15 files changed, 1439 insertions(+), 10 deletions(-) create mode
> > 100644 drivers/net/af_xdp/Makefile create mode 100644
> > drivers/net/af_xdp/meson.build create mode 100644
> > drivers/net/af_xdp/rte_eth_af_xdp.c
> > create mode 100644 drivers/net/af_xdp/rte_pmd_af_xdp_version.map
> >
> > --
> > 2.13.6
> >
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [dpdk-dev] [PATCH v3 0/6] PMD driver for AF_XDP
2018-08-16 14:43 [dpdk-dev] [PATCH v3 0/6] PMD driver for AF_XDP Qi Zhang
` (6 preceding siblings ...)
2018-08-23 16:25 ` [dpdk-dev] [PATCH v3 0/6] PMD driver for AF_XDP William Tu
@ 2018-08-25 6:11 ` Zhang, Qi Z
7 siblings, 0 replies; 11+ messages in thread
From: Zhang, Qi Z @ 2018-08-25 6:11 UTC (permalink / raw)
To: dev
Cc: Karlsson, Magnus, Topel, Bjorn, Wu, Jingjing, Li, Xiaoyun, Yigit, Ferruh
Sorry, the patch for the kernel sample code was incomplete. The complete patch is below:
~~~~~~~~~~~~~~~~~~~~~~~PATCH START ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c
index d69c8d78d3fd..44a6318043e7 100644
--- a/samples/bpf/xdpsock_user.c
+++ b/samples/bpf/xdpsock_user.c
@@ -76,6 +76,8 @@ static int opt_poll;
static int opt_shared_packet_buffer;
static int opt_interval = 1;
static u32 opt_xdp_bind_flags;
+static int opt_bind;
+static int opt_unbind;
struct xdp_umem_uqueue {
u32 cached_prod;
@@ -662,6 +664,8 @@ static void usage(const char *prog)
" -S, --xdp-skb=n Use XDP skb-mod\n"
" -N, --xdp-native=n Enfore XDP native mode\n"
" -n, --interval=n Specify statistics update interval (default 1 sec).\n"
+ " -b, --bind Bind only.\n"
+ " -u, --unbind Unbind only.\n"
"\n";
fprintf(stderr, str, prog);
exit(EXIT_FAILURE);
@@ -674,7 +678,7 @@ static void parse_command_line(int argc, char **argv)
opterr = 0;
for (;;) {
- c = getopt_long(argc, argv, "rtli:q:psSNn:", long_options,
+ c = getopt_long(argc, argv, "rtli:q:psSNn:bu", long_options,
&option_index);
if (c == -1)
break;
@@ -711,6 +715,12 @@ static void parse_command_line(int argc, char **argv)
case 'n':
opt_interval = atoi(optarg);
break;
+ case 'b':
+ opt_bind = 1;
+ break;
+ case 'u':
+ opt_unbind = 1;
+ break;
default:
usage(basename(argv[0]));
}
@@ -898,6 +908,12 @@ int main(int argc, char **argv)
exit(EXIT_FAILURE);
}
+ if (opt_unbind) {
+ bpf_set_link_xdp_fd(opt_ifindex, -1, opt_xdp_flags);
+ printf("unbind.\n");
+ return 0;
+ }
+
snprintf(xdp_filename, sizeof(xdp_filename), "%s_kern.o", argv[0]);
if (load_bpf_file(xdp_filename)) {
@@ -922,6 +938,11 @@ int main(int argc, char **argv)
exit(EXIT_FAILURE);
}
+ if (opt_bind) {
+ printf("bind.\n");
+ return 0;
+ }
+
/* Create sockets... */
xsks[num_socks++] = xsk_configure(NULL);
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~PATCH END~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
BTW, there is a bug in the kernel bpftool on 4.18: it causes a segmentation fault when you try to dump the bpf maps with
#./tools/bpf/bpftool/bpftool map -p
So, please also apply the patch below.
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~PATCH START ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c
index 097b1a5e046b..0c661de58976 100644
--- a/tools/bpf/bpftool/map.c
+++ b/tools/bpf/bpftool/map.c
@@ -67,6 +67,7 @@ static const char * const map_type_name[] = {
[BPF_MAP_TYPE_SOCKMAP] = "sockmap",
[BPF_MAP_TYPE_CPUMAP] = "cpumap",
[BPF_MAP_TYPE_SOCKHASH] = "sockhash",
+ [BPF_MAP_TYPE_XSKMAP] = "xskmap"
};
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~PATCH END~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> -----Original Message-----
> From: Zhang, Qi Z
> Sent: Thursday, August 16, 2018 10:43 PM
> To: dev@dpdk.org
> Cc: Karlsson, Magnus <magnus.karlsson@intel.com>; Topel, Bjorn
> <bjorn.topel@intel.com>; Wu, Jingjing <jingjing.wu@intel.com>; Li, Xiaoyun
> <xiaoyun.li@intel.com>; Yigit, Ferruh <ferruh.yigit@intel.com>; Zhang, Qi Z
> <qi.z.zhang@intel.com>
> Subject: [PATCH v3 0/6] PMD driver for AF_XDP
>
> Overview
> ========
>
> The patch set add a new PMD driver for AF_XDP which is a proposed faster
> version of AF_PACKET interface in Linux, see below link for detail AF_XDP
> introduction:
> https://lwn.net/Articles/750845/
> https://fosdem.org/2018/schedule/event/af_xdp/
>
> AF_XDP roadmap
> ==============
> - The kernel 4.18 is out and af_xdp is included.
> https://kernelnewbies.org/Linux_4.18
> - So far there is no zero copy supported driver be merged, but some are
> on the way.
>
> Change logs
> ===========
>
> v3:
> - Re-work base on AF_XDP's interface changes.
> - Support multi-queues, each dpdk queue has its own xdp socket.
> An xdp socket is always bound to a netdev queue.
> We assume all xdp socket from the same ethdev are bound to the
> same netdev queue, though a netdev queue still can be bound by
> xdp sockets from different ethdev instances.
> Below is an example of the mapping.
> ------------------------------------------------------
> | dpdk q0 | dpdk q1 | dpdk q0 | dpdk q0 | dpdk q1 |
> ------------------------------------------------------
> | xsk A | xsk B | xsk C | xsk D | xsk E |<---|
> ------------------------------------------------------ |
> | ETHDEV 0 | ETHDEV 1 | ETHDEV 2 | |
> DPDK
> ------------------------------------------------------------------
> | netdev queue 0 | netdev queue 1 | |
> KERNEL
> ------------------------------------------------------ |
> | NETDEV eth0 | |
> ------------------------------------------------------ |
> | key xsk | |
> | ---------- -------------- | |
> | | | | 0 | xsk A | | |
> | | | -------------- | |
> | | | | 2 | xsk B | | |
> | | ebpf | ---------------------------------------
> | | | | 3 | xsk C | |
> | | redirect ->|-------------- |
> | | | | 4 | xsk D | |
> | | | -------------- |
> | |---------| | 5 | xsk E | |
> | -------------- |
> |-----------------------------------------------------
>
> - It is an open question that how to load ebpf to kernel and link to
> specific netdev in DPDK, should it be part of PMD, or it should be handled
> by
> an independent tool? In this patchset, it takes the second option, there will
> be a "bind" stage before we start AF_XDP PMD, this includes below steps:
> a) load ebpf program to the kernel, (the ebpf program must contain the
> logic to redirect packet to a xdp socket base on a redirect map).
> b) link ebpf program to specific network interface.
> c) expose the xdp socket redirect map id and entries number to user,
> so this will be parsed to PMD, and PMD will create xdp socket
> for each queue and update the redirect map correctly.
> (example:
> --vdev,iface=eth0,xsk_map_id=53,xsk_map_key_base=0,xsk_map_key_count
> =4)
>
> v2:
> - fix lisence header
> - clean up bpf dependency, bpf program is embedded, no
> "xdpsock_kern.o"
> required
> - clean up make file, only linux_header is required
> - fix all the compile warning.
> - fix packet number return in Tx.
>
> How to try
> ==========
>
> 1. Take the kernel v4.18.
> make sure you turn on XDP sockets when compiling
> Networking support -->
> Networking options -->
> [ * ] XDP sockets
> 2. in the kernel source code, apply below patch and compile the bpf sample
> code.
> #make samples/bpf/
> so the sample xdpsock can be used as a bind/unbind tool for af_xdp
> PMD, sorry for this ugly, but in future, there could be a dedicated
> tool in DPDK, if we agree with the idea that bpf configure in the kernel
> should be separated from PMD.
>
> ~~~~~~~~~~~~~~~~~~~~~~~PATCH
> START~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
>
> diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c index
> d69c8d78d3fd..44a6318043e7 100644
> --- a/samples/bpf/xdpsock_user.c
> +++ b/samples/bpf/xdpsock_user.c
> @@ -76,6 +76,8 @@ static int opt_poll;
> static int opt_shared_packet_buffer;
> static int opt_interval = 1;
> static u32 opt_xdp_bind_flags;
> +static int opt_bind;
> +static int opt_unbind;
>
> struct xdp_umem_uqueue {
> u32 cached_prod;
> @@ -662,6 +664,8 @@ static void usage(const char *prog)
> " -S, --xdp-skb=n Use XDP skb-mod\n"
> " -N, --xdp-native=n Enfore XDP native mode\n"
> " -n, --interval=n Specify statistics update interval (default 1
> sec).\n"
> + " -b, --bind Bind only.\n"
> + " -u, --unbind Unbind only.\n"
> "\n";
> fprintf(stderr, str, prog);
> exit(EXIT_FAILURE);
> @@ -674,7 +678,7 @@ static void parse_command_line(int argc, char
> **argv)
> opterr = 0;
>
> for (;;) {
> - c = getopt_long(argc, argv, "rtli:q:psSNn:", long_options,
> + c = getopt_long(argc, argv, "rtli:q:psSNn:bu", long_options,
> &option_index);
> if (c == -1)
> break;
> @@ -711,6 +715,12 @@ static void parse_command_line(int argc, char
> **argv)
> case 'n':
> opt_interval = atoi(optarg);
> break;
> + case 'b':
> + opt_bind = 1;
> + break;
> + case 'u':
> + opt_unbind = 1;
> + break;
> default:
> usage(basename(argv[0]));
> }
> @@ -898,6 +908,12 @@ int main(int argc, char **argv)
> exit(EXIT_FAILURE);
> }
>
> + if (opt_unbind) {
> + bpf_set_link_xdp_fd(opt_ifindex, -1, opt_xdp_flags);
>
> ~~~~~~~~~~~~~~~~~~~~~~~PATCH
> END~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
>
> 3. bind
> #./samples/bpf/xdpsock -i eth0 -b
>
> in this step, an ebpf binary xdpsock_kern.o is be loaded into the kernel
> and linked to eth0, the ebpf source code is /samples/bpf/xdpsock_kern.c
> you can modify it and re-compile for a different test.
>
> 4. dump xdp socket map information.
> #./tools/bpf/bpftool/bpftool map -p, you will see something like below.
>
> },{
> "id": 56,
> "type": "xskmap",
> "name": "xsks_map",
> "flags": 0,
> "bytes_key": 4,
> "bytes_value": 4,
> "max_entries": 4,
> "bytes_memlock": 4096
> }
>
> in this case 56 is the map id and it has 4 entries
>
> 5. start testpmd
>
> ./build/app/testpmd -c 0xc -n 4 --vdev
> eth_af_xdp,iface=enp59s0f0,xsk_map_id=56,xsk_map_key_start=2,xsk_map_
> key_count=2 -- -i --rxq=2 --txq=2
>
> in this case, we reserved 2 entries (2,3) in the map, and they will be
> mapped to queue 0 and queue 1.
>
> 6. unbind after test
> ./samples/bpf/xdpsock -i eth0 -u
>
> Performance
> ===========
> Since no zero copy driver is ready yet.
> So far only tested with DRV and SKB mode on i40e 25G the result show
> identical with kernel sample "xdpsock"
>
> Qi Zhang (6):
> net/af_xdp: new PMD driver
> lib/mbuf: enable parse flags when create mempool
> lib/mempool: allow page size aligned mempool
> net/af_xdp: use mbuf mempool for buffer management
> net/af_xdp: enable zero copy
> app/testpmd: add mempool flags parameter
>
> app/test-pmd/parameters.c | 12 +
> app/test-pmd/testpmd.c | 15 +-
> app/test-pmd/testpmd.h | 1 +
> config/common_base | 5 +
> config/common_linuxapp | 1 +
> drivers/net/Makefile | 1 +
> drivers/net/af_xdp/Makefile | 30 +
> drivers/net/af_xdp/meson.build | 7 +
> drivers/net/af_xdp/rte_eth_af_xdp.c | 1345
> +++++++++++++++++++++++++
> drivers/net/af_xdp/rte_pmd_af_xdp_version.map | 4 +
> lib/librte_mbuf/rte_mbuf.c | 15 +-
> lib/librte_mbuf/rte_mbuf.h | 8 +-
> lib/librte_mempool/rte_mempool.c | 3 +
> lib/librte_mempool/rte_mempool.h | 1 +
> mk/rte.app.mk | 1 +
> 15 files changed, 1439 insertions(+), 10 deletions(-) create mode 100644
> drivers/net/af_xdp/Makefile create mode 100644
> drivers/net/af_xdp/meson.build create mode 100644
> drivers/net/af_xdp/rte_eth_af_xdp.c
> create mode 100644 drivers/net/af_xdp/rte_pmd_af_xdp_version.map
>
> --
> 2.13.6
^ permalink raw reply [flat|nested] 11+ messages in thread