DPDK patches and discussions
 help / color / mirror / Atom feed
From: Ophir Munk <ophirmu@mellanox.com>
To: dev@dpdk.org, Pascal Mazon <pascal.mazon@6wind.com>
Cc: Thomas Monjalon <thomas@monjalon.net>,
	Olga Shern <olgas@mellanox.com>,
	Ophir Munk <ophirmu@mellanox.com>
Subject: [dpdk-dev] [PATCH v1 2/2] net/tap: support TSO (TCP Segment Offload)
Date: Mon,  9 Apr 2018 22:33:12 +0000	[thread overview]
Message-ID: <1523313192-18048-3-git-send-email-ophirmu@mellanox.com> (raw)
In-Reply-To: <1523313192-18048-1-git-send-email-ophirmu@mellanox.com>

This commit implements TCP segmentation offload in TAP.
librte_gso library is used to segment large TCP payloads (e.g. packets
of 64K bytes size) into smaller MTU size buffers.
By supporting TSO offload capability in software a TAP device can be used
as a failsafe sub device and be paired with another PCI device which
supports TSO capability in HW.

For more details on librte_gso implementation please refer to dpdk
documentation.
The number of newly generated TCP TSO segments is limited to 64.

Signed-off-by: Ophir Munk <ophirmu@mellanox.com>
---
 drivers/net/tap/Makefile      |   2 +-
 drivers/net/tap/rte_eth_tap.c | 153 +++++++++++++++++++++++++++++++++++-------
 drivers/net/tap/rte_eth_tap.h |   4 ++
 mk/rte.app.mk                 |   4 +-
 4 files changed, 135 insertions(+), 28 deletions(-)

diff --git a/drivers/net/tap/Makefile b/drivers/net/tap/Makefile
index ccc5c5f..3243365 100644
--- a/drivers/net/tap/Makefile
+++ b/drivers/net/tap/Makefile
@@ -24,7 +24,7 @@ CFLAGS += -I.
 CFLAGS += $(WERROR_FLAGS)
 LDLIBS += -lrte_eal -lrte_mbuf -lrte_mempool -lrte_ring
 LDLIBS += -lrte_ethdev -lrte_net -lrte_kvargs -lrte_hash
-LDLIBS += -lrte_bus_vdev
+LDLIBS += -lrte_bus_vdev -lrte_gso
 
 CFLAGS += -DTAP_MAX_QUEUES=$(TAP_MAX_QUEUES)
 
diff --git a/drivers/net/tap/rte_eth_tap.c b/drivers/net/tap/rte_eth_tap.c
index df23c4d..717a2b1 100644
--- a/drivers/net/tap/rte_eth_tap.c
+++ b/drivers/net/tap/rte_eth_tap.c
@@ -17,6 +17,7 @@
 #include <rte_ip.h>
 #include <rte_string_fns.h>
 
+#include <assert.h>
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <sys/socket.h>
@@ -408,7 +409,8 @@ tap_tx_offload_get_port_capa(void)
 	return DEV_TX_OFFLOAD_MULTI_SEGS |
 	       DEV_TX_OFFLOAD_IPV4_CKSUM |
 	       DEV_TX_OFFLOAD_UDP_CKSUM |
-	       DEV_TX_OFFLOAD_TCP_CKSUM;
+	       DEV_TX_OFFLOAD_TCP_CKSUM |
+	       DEV_TX_OFFLOAD_TCP_TSO;
 }
 
 static uint64_t
@@ -417,7 +419,8 @@ tap_tx_offload_get_queue_capa(void)
 	return DEV_TX_OFFLOAD_MULTI_SEGS |
 	       DEV_TX_OFFLOAD_IPV4_CKSUM |
 	       DEV_TX_OFFLOAD_UDP_CKSUM |
-	       DEV_TX_OFFLOAD_TCP_CKSUM;
+	       DEV_TX_OFFLOAD_TCP_CKSUM |
+	       DEV_TX_OFFLOAD_TCP_TSO;
 }
 
 static bool
@@ -486,38 +489,26 @@ tap_tx_offload(char *packet, uint64_t ol_flags, unsigned int l2_len,
 	}
 }
 
-/* Callback to handle sending packets from the tap interface
- */
-static uint16_t
-pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
+static inline void
+tap_write_mbufs(struct tx_queue *txq, uint16_t num_mbufs,
+			struct rte_mbuf **pmbufs,
+			uint16_t *num_packets, unsigned long *num_tx_bytes)
 {
-	struct tx_queue *txq = queue;
-	uint16_t num_tx = 0;
-	unsigned long num_tx_bytes = 0;
-	uint32_t max_size;
 	int i;
 
-	if (unlikely(nb_pkts == 0))
-		return 0;
-
-	max_size = *txq->mtu + (ETHER_HDR_LEN + ETHER_CRC_LEN + 4);
-	for (i = 0; i < nb_pkts; i++) {
-		struct rte_mbuf *mbuf = bufs[num_tx];
-		struct iovec iovecs[mbuf->nb_segs + 1];
+	for (i = 0; i < num_mbufs; i++) {
+		struct rte_mbuf *mbuf = pmbufs[i];
+		struct iovec iovecs[mbuf->nb_segs + 2];
 		struct tun_pi pi = { .flags = 0, .proto = 0x00 };
 		struct rte_mbuf *seg = mbuf;
 		char m_copy[mbuf->data_len];
 		int n;
 		int j;
-		int k; /* first index in iovecs for copying segments */
+		int k; /* current index in iovecs for copying segments */
 		uint16_t l234_len; /* length of layers 2,3,4 headers */
 		uint16_t seg_len; /* length of first segment */
 		uint16_t nb_segs;
 
-		/* stats.errs will be incremented */
-		if (rte_pktmbuf_pkt_len(mbuf) > max_size)
-			break;
-
 		/*
 		 * TUN and TAP are created with IFF_NO_PI disabled.
 		 * For TUN PMD this mandatory as fields are used by
@@ -581,13 +572,75 @@ pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 		n = writev(txq->fd, iovecs, j);
 		if (n <= 0)
 			break;
+		(*num_packets)++;
+		(*num_tx_bytes) += rte_pktmbuf_pkt_len(mbuf);
+	}
+}
+
+/* Callback to handle sending packets from the tap interface
+ */
+static uint16_t
+pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
+{
+	struct tx_queue *txq = queue;
+	uint16_t num_tx = 0;
+	uint16_t num_packets = 0;
+	unsigned long num_tx_bytes = 0;
+	uint16_t tso_segsz = 0;
+	uint32_t max_size;
+	int i;
+	uint64_t tso;
+	int ret;
+
+	if (unlikely(nb_pkts == 0))
+		return 0;
+
+	struct rte_mbuf *gso_mbufs[MAX_GSO_MBUFS];
+	max_size = *txq->mtu + (ETHER_HDR_LEN + ETHER_CRC_LEN + 4);
+	for (i = 0; i < nb_pkts; i++) {
+		struct rte_mbuf *mbuf_in = bufs[num_tx];
+		struct rte_mbuf **mbuf;
+		uint16_t num_mbufs;
+
+		tso = mbuf_in->ol_flags & PKT_TX_TCP_SEG;
+		if (tso) {
+			struct rte_gso_ctx *gso_ctx = &txq->gso_ctx;
+			assert(gso_ctx != NULL);
+			/* gso size is calculated without ETHER_CRC_LEN */
+			tso_segsz = mbuf_in->tso_segsz;
+			if (unlikely(tso_segsz == 0) ||
+				tso_segsz > max_size) {
+				txq->stats.errs++;
+				break;
+			}
+			gso_ctx->gso_size = tso_segsz;
+			ret = rte_gso_segment(mbuf_in, /* packet to segment */
+				gso_ctx, /* gso control block */
+				(struct rte_mbuf **)&gso_mbufs, /* out mbufs */
+				RTE_DIM(gso_mbufs)); /* max tso mbufs */
+
+			/* ret contains the number of new created mbufs */
+			if (ret < 0)
+				break;
 
+			mbuf = gso_mbufs;
+			num_mbufs = ret;
+		} else {
+			/* stats.errs will be incremented */
+			if (rte_pktmbuf_pkt_len(mbuf_in) > max_size)
+				break;
+
+			mbuf = &mbuf_in;
+			num_mbufs = 1;
+		}
+
+		tap_write_mbufs(txq, num_mbufs, mbuf,
+				&num_packets, &num_tx_bytes);
 		num_tx++;
-		num_tx_bytes += mbuf->pkt_len;
-		rte_pktmbuf_free(mbuf);
+		rte_pktmbuf_free(mbuf_in);
 	}
 
-	txq->stats.opackets += num_tx;
+	txq->stats.opackets += num_packets;
 	txq->stats.errs += nb_pkts - num_tx;
 	txq->stats.obytes += num_tx_bytes;
 
@@ -1027,32 +1080,77 @@ tap_mac_set(struct rte_eth_dev *dev, struct ether_addr *mac_addr)
 	}
 }
 
+#define TAP_GSO_MBUFS_NUM 64
+#define TAP_GSO_MBUF_SEG_SIZE 128
+
+static int
+tap_gso_ctx_setup(struct rte_gso_ctx *gso_ctx, struct rte_eth_dev *dev)
+{
+	uint32_t gso_types;
+	char pool_name[64];
+
+	/* Create private mbuf pool with 128 bytes size per mbuf
+	 * use this pool for both direct and indirect mbufs
+	 */
+
+	struct rte_mempool *mp;      /* Mempool for GSO packets */
+	/* initialize GSO context */
+	gso_types = DEV_TX_OFFLOAD_TCP_TSO | DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
+		DEV_TX_OFFLOAD_GRE_TNL_TSO;
+	snprintf(pool_name, sizeof(pool_name), "mp_%s", dev->device->name);
+	mp = rte_mempool_lookup((const char *)pool_name);
+	if (!mp) {
+		mp = rte_pktmbuf_pool_create(pool_name, TAP_GSO_MBUFS_NUM,
+			0, 0, RTE_PKTMBUF_HEADROOM + TAP_GSO_MBUF_SEG_SIZE,
+			SOCKET_ID_ANY);
+		if (!mp) {
+			struct pmd_internals *pmd = dev->data->dev_private;
+			RTE_LOG(DEBUG, PMD, "%s: failed to create mbuf pool for device %s\n",
+				pmd->name, dev->device->name);
+			return -1;
+		}
+	}
+
+	gso_ctx->direct_pool = mp;
+	gso_ctx->indirect_pool = mp;
+	gso_ctx->gso_types = gso_types;
+	gso_ctx->gso_size = 0; /* gso_size is set in tx_burst() per packet */
+	gso_ctx->flag = 0;
+
+	return 0;
+}
+
 static int
 tap_setup_queue(struct rte_eth_dev *dev,
 		struct pmd_internals *internals,
 		uint16_t qid,
 		int is_rx)
 {
+	int ret;
 	int *fd;
 	int *other_fd;
 	const char *dir;
 	struct pmd_internals *pmd = dev->data->dev_private;
 	struct rx_queue *rx = &internals->rxq[qid];
 	struct tx_queue *tx = &internals->txq[qid];
+	struct rte_gso_ctx *gso_ctx;
 
 	if (is_rx) {
 		fd = &rx->fd;
 		other_fd = &tx->fd;
 		dir = "rx";
+		gso_ctx = NULL;
 	} else {
 		fd = &tx->fd;
 		other_fd = &rx->fd;
 		dir = "tx";
+		gso_ctx = &tx->gso_ctx;
 	}
 	if (*fd != -1) {
 		/* fd for this queue already exists */
 		RTE_LOG(DEBUG, PMD, "%s: fd %d for %s queue qid %d exists\n",
 			pmd->name, *fd, dir, qid);
+		gso_ctx = NULL;
 	} else if (*other_fd != -1) {
 		/* Only other_fd exists. dup it */
 		*fd = dup(*other_fd);
@@ -1079,6 +1177,11 @@ tap_setup_queue(struct rte_eth_dev *dev,
 
 	tx->mtu = &dev->data->mtu;
 	rx->rxmode = &dev->data->dev_conf.rxmode;
+	if (gso_ctx) {
+		ret = tap_gso_ctx_setup(gso_ctx, dev);
+		if (ret)
+			return -1;
+	}
 
 	return *fd;
 }
diff --git a/drivers/net/tap/rte_eth_tap.h b/drivers/net/tap/rte_eth_tap.h
index 53a506a..65da5f8 100644
--- a/drivers/net/tap/rte_eth_tap.h
+++ b/drivers/net/tap/rte_eth_tap.h
@@ -15,6 +15,7 @@
 
 #include <rte_ethdev_driver.h>
 #include <rte_ether.h>
+#include <rte_gso.h>
 
 #ifdef IFF_MULTI_QUEUE
 #define RTE_PMD_TAP_MAX_QUEUES	TAP_MAX_QUEUES
@@ -22,6 +23,8 @@
 #define RTE_PMD_TAP_MAX_QUEUES	1
 #endif
 
+#define MAX_GSO_MBUFS 64
+
 struct pkt_stats {
 	uint64_t opackets;              /* Number of output packets */
 	uint64_t ipackets;              /* Number of input packets */
@@ -50,6 +53,7 @@ struct tx_queue {
 	uint16_t *mtu;                  /* Pointer to MTU from dev_data */
 	uint16_t csum:1;                /* Enable checksum offloading */
 	struct pkt_stats stats;         /* Stats for this TX queue */
+	struct rte_gso_ctx gso_ctx;     /* GSO context */
 };
 
 struct pmd_internals {
diff --git a/mk/rte.app.mk b/mk/rte.app.mk
index 005803a..62cf545 100644
--- a/mk/rte.app.mk
+++ b/mk/rte.app.mk
@@ -66,8 +66,6 @@ _LDLIBS-$(CONFIG_RTE_LIBRTE_PORT)           += -lrte_port
 _LDLIBS-$(CONFIG_RTE_LIBRTE_PDUMP)          += -lrte_pdump
 _LDLIBS-$(CONFIG_RTE_LIBRTE_DISTRIBUTOR)    += -lrte_distributor
 _LDLIBS-$(CONFIG_RTE_LIBRTE_IP_FRAG)        += -lrte_ip_frag
-_LDLIBS-$(CONFIG_RTE_LIBRTE_GRO)            += -lrte_gro
-_LDLIBS-$(CONFIG_RTE_LIBRTE_GSO)            += -lrte_gso
 _LDLIBS-$(CONFIG_RTE_LIBRTE_METER)          += -lrte_meter
 _LDLIBS-$(CONFIG_RTE_LIBRTE_LPM)            += -lrte_lpm
 # librte_acl needs --whole-archive because of weak functions
@@ -86,6 +84,8 @@ _LDLIBS-$(CONFIG_RTE_LIBRTE_EFD)            += -lrte_efd
 _LDLIBS-y += --whole-archive
 
 _LDLIBS-$(CONFIG_RTE_LIBRTE_CFGFILE)        += -lrte_cfgfile
+_LDLIBS-$(CONFIG_RTE_LIBRTE_GRO)            += -lrte_gro
+_LDLIBS-$(CONFIG_RTE_LIBRTE_GSO)            += -lrte_gso
 _LDLIBS-$(CONFIG_RTE_LIBRTE_HASH)           += -lrte_hash
 _LDLIBS-$(CONFIG_RTE_LIBRTE_MEMBER)         += -lrte_member
 _LDLIBS-$(CONFIG_RTE_LIBRTE_VHOST)          += -lrte_vhost
-- 
2.7.4

  parent reply	other threads:[~2018-04-09 22:33 UTC|newest]

Thread overview: 31+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-03-09 21:10 [dpdk-dev] [RFC 0/2] TAP TSO Implementation Ophir Munk
2018-03-09 21:10 ` [dpdk-dev] [RFC 1/2] net/tap: calculate checksum for multi segs packets Ophir Munk
2018-04-09 22:33   ` [dpdk-dev] [PATCH v1 0/2] TAP TSO Ophir Munk
2018-04-09 22:33     ` [dpdk-dev] [PATCH v1 1/2] net/tap: calculate checksums of multi segs packets Ophir Munk
2018-04-09 22:33     ` Ophir Munk [this message]
2018-04-22 11:30       ` [dpdk-dev] [PATCH v2 0/2] TAP TSO Ophir Munk
2018-04-22 11:30         ` [dpdk-dev] [PATCH v2 1/2] net/tap: calculate checksums of multi segs packets Ophir Munk
2018-05-07 21:54           ` [dpdk-dev] [PATCH v3 0/2] TAP TSO Ophir Munk
2018-05-07 21:54             ` [dpdk-dev] [PATCH v3 1/2] net/tap: calculate checksums of multi segs packets Ophir Munk
2018-05-07 21:54             ` [dpdk-dev] [PATCH v3 2/2] net/tap: support TSO (TCP Segment Offload) Ophir Munk
2018-05-31 13:52           ` [dpdk-dev] [PATCH v2 1/2] net/tap: calculate checksums of multi segs packets Ferruh Yigit
2018-05-31 13:54             ` Ferruh Yigit
2018-04-22 11:30         ` [dpdk-dev] [PATCH v2 2/2] net/tap: support TSO (TCP Segment Offload) Ophir Munk
2018-06-12 16:31   ` [dpdk-dev] [PATCH v4 0/2] TAP TSO Ophir Munk
2018-06-12 16:31     ` [dpdk-dev] [PATCH v4 1/2] net/tap: calculate checksums of multi segs packets Ophir Munk
2018-06-12 17:17       ` Wiles, Keith
2018-06-12 16:31     ` [dpdk-dev] [PATCH v4 2/2] net/tap: support TSO (TCP Segment Offload) Ophir Munk
2018-06-12 17:22       ` Wiles, Keith
2018-06-13 16:04       ` Wiles, Keith
2018-06-14  7:59         ` Ophir Munk
2018-06-14 12:58           ` Wiles, Keith
2018-06-23 23:17       ` [dpdk-dev] [PATCH v5 0/2] TAP TSO Ophir Munk
2018-06-23 23:17         ` [dpdk-dev] [PATCH v5 1/2] net/tap: calculate checksums of multi segs packets Ophir Munk
2018-06-24 13:45           ` Wiles, Keith
2018-06-27 13:11             ` Ferruh Yigit
2018-06-23 23:17         ` [dpdk-dev] [PATCH v5 2/2] net/tap: support TSO (TCP Segment Offload) Ophir Munk
2018-03-09 21:10 ` [dpdk-dev] [RFC 2/2] net/tap: implement TAP TSO Ophir Munk
2018-04-09 16:38 ` [dpdk-dev] [RFC 0/2] TAP TSO Implementation Ferruh Yigit
2018-04-09 22:37   ` Ophir Munk
2018-04-10 14:30     ` Ferruh Yigit
2018-04-10 15:31       ` Ophir Munk

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1523313192-18048-3-git-send-email-ophirmu@mellanox.com \
    --to=ophirmu@mellanox.com \
    --cc=dev@dpdk.org \
    --cc=olgas@mellanox.com \
    --cc=pascal.mazon@6wind.com \
    --cc=thomas@monjalon.net \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).