DPDK patches and discussions
 help / color / mirror / Atom feed
From: Ophir Munk <ophirmu@mellanox.com>
To: "Wiles, Keith" <keith.wiles@intel.com>
Cc: DPDK <dev@dpdk.org>, Pascal Mazon <pascal.mazon@6wind.com>,
	Thomas Monjalon <thomas@monjalon.net>,
	Olga Shern <olgas@mellanox.com>
Subject: Re: [dpdk-dev] [PATCH v4 2/2] net/tap: support TSO (TCP Segment Offload)
Date: Thu, 14 Jun 2018 07:59:20 +0000	[thread overview]
Message-ID: <HE1PR0501MB2314A1484171849E502E543FD17D0@HE1PR0501MB2314.eurprd05.prod.outlook.com> (raw)
In-Reply-To: <37D23262-6A5D-4931-A874-1733643C7F95@intel.com>



> -----Original Message-----
> From: Wiles, Keith [mailto:keith.wiles@intel.com]
> Sent: Wednesday, June 13, 2018 7:04 PM
> To: Ophir Munk <ophirmu@mellanox.com>
> Cc: DPDK <dev@dpdk.org>; Pascal Mazon <pascal.mazon@6wind.com>;
> Thomas Monjalon <thomas@monjalon.net>; Olga Shern
> <olgas@mellanox.com>
> Subject: Re: [PATCH v4 2/2] net/tap: support TSO (TCP Segment Offload)
> 
> 
> 
> > On Jun 12, 2018, at 11:31 AM, Ophir Munk <ophirmu@mellanox.com>
> wrote:
> >
> > This commit implements TCP segmentation offload in TAP.
> > librte_gso library is used to segment large TCP payloads (e.g. packets
> > of 64K bytes size) into smaller MTU size buffers.
> > By supporting TSO offload capability in software a TAP device can be
> > used as a failsafe sub device and be paired with another PCI device
> > which supports TSO capability in HW.
> >
> > For more details on librte_gso implementation please refer to dpdk
> > documentation.
> > The number of newly generated TCP TSO segments is limited to 64.
> >
> > Reviewed-by: Raslan Darawsheh <rasland@mellanox.com>
> > Signed-off-by: Ophir Munk <ophirmu@mellanox.com>
> > ---
> > drivers/net/tap/Makefile      |   2 +-
> > drivers/net/tap/rte_eth_tap.c | 159
> +++++++++++++++++++++++++++++++++++-------
> > drivers/net/tap/rte_eth_tap.h |   3 +
> > mk/rte.app.mk                 |   4 +-
> > 4 files changed, 138 insertions(+), 30 deletions(-)
> >
> > diff --git a/drivers/net/tap/Makefile b/drivers/net/tap/Makefile index
> > ccc5c5f..3243365 100644
> > --- a/drivers/net/tap/Makefile
> > +++ b/drivers/net/tap/Makefile
> > @@ -24,7 +24,7 @@ CFLAGS += -I.
> > CFLAGS += $(WERROR_FLAGS)
> > LDLIBS += -lrte_eal -lrte_mbuf -lrte_mempool -lrte_ring LDLIBS +=
> > -lrte_ethdev -lrte_net -lrte_kvargs -lrte_hash -LDLIBS +=
> > -lrte_bus_vdev
> > +LDLIBS += -lrte_bus_vdev -lrte_gso
> >
> > CFLAGS += -DTAP_MAX_QUEUES=$(TAP_MAX_QUEUES)
> >
> > diff --git a/drivers/net/tap/rte_eth_tap.c
> > b/drivers/net/tap/rte_eth_tap.c index c19f053..62b931f 100644
> > --- a/drivers/net/tap/rte_eth_tap.c
> > +++ b/drivers/net/tap/rte_eth_tap.c
> > @@ -17,6 +17,7 @@
> > #include <rte_ip.h>
> > #include <rte_string_fns.h>
> >
> > +#include <assert.h>
> > #include <sys/types.h>
> > #include <sys/stat.h>
> > #include <sys/socket.h>
> > @@ -55,6 +56,9 @@
> > #define ETH_TAP_CMP_MAC_FMT     "0123456789ABCDEFabcdef"
> > #define ETH_TAP_MAC_ARG_FMT     ETH_TAP_MAC_FIXED "|"
> ETH_TAP_USR_MAC_FMT
> >
> > +#define TAP_GSO_MBUFS_NUM	128
> > +#define TAP_GSO_MBUF_SEG_SIZE	128
> > +
> > static struct rte_vdev_driver pmd_tap_drv; static struct
> > rte_vdev_driver pmd_tun_drv;
> >
> > @@ -412,7 +416,8 @@ tap_tx_offload_get_queue_capa(void)
> > 	return DEV_TX_OFFLOAD_MULTI_SEGS |
> > 	       DEV_TX_OFFLOAD_IPV4_CKSUM |
> > 	       DEV_TX_OFFLOAD_UDP_CKSUM |
> > -	       DEV_TX_OFFLOAD_TCP_CKSUM;
> > +	       DEV_TX_OFFLOAD_TCP_CKSUM |
> > +	       DEV_TX_OFFLOAD_TCP_TSO;
> > }
> >
> > /* Finalize l4 checksum calculation */ @@ -479,23 +484,15 @@
> > tap_tx_l3_cksum(char *packet, uint64_t ol_flags, unsigned int l2_len,
> > 	}
> > }
> >
> > -/* Callback to handle sending packets from the tap interface
> > - */
> > -static uint16_t
> > -pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
> > +static inline void
> > +tap_write_mbufs(struct tx_queue *txq, uint16_t num_mbufs,
> > +			struct rte_mbuf **pmbufs, uint16_t l234_hlen,
> > +			uint16_t *num_packets, unsigned long
> *num_tx_bytes)
> > {
> > -	struct tx_queue *txq = queue;
> > -	uint16_t num_tx = 0;
> > -	unsigned long num_tx_bytes = 0;
> > -	uint32_t max_size;
> > 	int i;
> >
> > -	if (unlikely(nb_pkts == 0))
> > -		return 0;
> > -
> > -	max_size = *txq->mtu + (ETHER_HDR_LEN + ETHER_CRC_LEN + 4);
> > -	for (i = 0; i < nb_pkts; i++) {
> > -		struct rte_mbuf *mbuf = bufs[num_tx];
> > +	for (i = 0; i < num_mbufs; i++) {
> > +		struct rte_mbuf *mbuf = pmbufs[i];
> > 		struct iovec iovecs[mbuf->nb_segs + 2];
> > 		struct tun_pi pi = { .flags = 0, .proto = 0x00 };
> > 		struct rte_mbuf *seg = mbuf;
> > @@ -503,8 +500,7 @@ pmd_tx_burst(void *queue, struct rte_mbuf
> **bufs, uint16_t nb_pkts)
> > 		int proto;
> > 		int n;
> > 		int j;
> > -		int k; /* first index in iovecs for copying segments */
> > -		uint16_t l234_hlen; /* length of layers 2,3,4 headers */
> > +		int k; /* current index in iovecs for copying segments */
> > 		uint16_t seg_len; /* length of first segment */
> > 		uint16_t nb_segs;
> > 		uint16_t *l4_cksum; /* l4 checksum (pseudo header +
> payload) */ @@
> > -512,10 +508,6 @@ pmd_tx_burst(void *queue, struct rte_mbuf **bufs,
> uint16_t nb_pkts)
> > 		uint16_t l4_phdr_cksum = 0; /* TCP/UDP pseudo header
> checksum */
> > 		uint16_t is_cksum = 0; /* in case cksum should be offloaded
> */
> >
> > -		/* stats.errs will be incremented */
> > -		if (rte_pktmbuf_pkt_len(mbuf) > max_size)
> > -			break;
> > -
> > 		l4_cksum = NULL;
> > 		if (txq->type == ETH_TUNTAP_TYPE_TUN) {
> > 			/*
> > @@ -554,9 +546,8 @@ pmd_tx_burst(void *queue, struct rte_mbuf
> **bufs, uint16_t nb_pkts)
> > 			l234_hlen = mbuf->l2_len + mbuf->l3_len + mbuf-
> >l4_len;
> > 			if (seg_len < l234_hlen)
> > 				break;
> > -
> > -			/* To change checksums, work on a
> > -			 * copy of l2, l3 l4 headers.
> 
> Adding back in the blank line above would be nice for readability.
> > +			/* To change checksums, work on a * copy of l2, l3
> > +			 * headers + l4 pseudo header
> > 			 */
> > 			rte_memcpy(m_copy, rte_pktmbuf_mtod(mbuf, void
> *),
> > 					l234_hlen);
> > @@ -598,13 +589,80 @@ pmd_tx_burst(void *queue, struct rte_mbuf
> **bufs, uint16_t nb_pkts)
> > 		n = writev(txq->fd, iovecs, j);
> > 		if (n <= 0)
> > 			break;
> > +		(*num_packets)++;
> > +		(*num_tx_bytes) += rte_pktmbuf_pkt_len(mbuf);
> > +	}
> > +}
> > +
> > +/* Callback to handle sending packets from the tap interface  */
> > +static uint16_t pmd_tx_burst(void *queue, struct rte_mbuf **bufs,
> > +uint16_t nb_pkts) {
> > +	struct tx_queue *txq = queue;
> > +	uint16_t num_tx = 0;
> > +	uint16_t num_packets = 0;
> > +	unsigned long num_tx_bytes = 0;
> > +	uint16_t tso_segsz = 0;
> > +	uint16_t hdrs_len;
> > +	uint32_t max_size;
> > +	int i;
> > +	uint64_t tso;
> > +	int ret;
> >
> > +	if (unlikely(nb_pkts == 0))
> > +		return 0;
> > +
> > +	struct rte_mbuf *gso_mbufs[MAX_GSO_MBUFS];
> > +	max_size = *txq->mtu + (ETHER_HDR_LEN + ETHER_CRC_LEN + 4);
> > +	for (i = 0; i < nb_pkts; i++) {
> > +		struct rte_mbuf *mbuf_in = bufs[num_tx];
> > +		struct rte_mbuf **mbuf;
> > +		uint16_t num_mbufs;
> > +
> > +		tso = mbuf_in->ol_flags & PKT_TX_TCP_SEG;
> > +		if (tso) {
> > +			struct rte_gso_ctx *gso_ctx = &txq->gso_ctx;
> 
> Missing blank line here, this one is not optional.
> > +			assert(gso_ctx != NULL);
> 
> Blank line would be nice.
> > +			/* TCP segmentation implies TCP checksum offload
> */
> > +			mbuf_in->ol_flags |= PKT_TX_TCP_CKSUM;
> 
> Blank line would be nice.
> > +			/* gso size is calculated without ETHER_CRC_LEN */
> > +			hdrs_len = mbuf_in->l2_len + mbuf_in->l3_len +
> > +					mbuf_in->l4_len;
> > +			tso_segsz = mbuf_in->tso_segsz + hdrs_len;
> > +			if (unlikely(tso_segsz == hdrs_len) ||
> > +				tso_segsz > *txq->mtu) {
> > +				txq->stats.errs++;
> > +				break;
> > +			}
> > +			gso_ctx->gso_size = tso_segsz;
> > +			ret = rte_gso_segment(mbuf_in, /* packet to
> segment */
> > +				gso_ctx, /* gso control block */
> > +				(struct rte_mbuf **)&gso_mbufs, /* out
> mbufs */
> > +				RTE_DIM(gso_mbufs)); /* max tso mbufs */
> > +
> > +			/* ret contains the number of new created mbufs */
> > +			if (ret < 0)
> > +				break;
> > +
> > +			mbuf = gso_mbufs;
> > +			num_mbufs = ret;
> > +		} else {
> > +			/* stats.errs will be incremented */
> > +			if (rte_pktmbuf_pkt_len(mbuf_in) > max_size)
> > +				break;
> > +
> > +			mbuf = &mbuf_in;
> > +			num_mbufs = 1;
> > +		}
> > +
> > +		tap_write_mbufs(txq, num_mbufs, mbuf, hdrs_len,
> > +				&num_packets, &num_tx_bytes);
> > 		num_tx++;
> > -		num_tx_bytes += mbuf->pkt_len;
> > -		rte_pktmbuf_free(mbuf);
> > +		rte_pktmbuf_free(mbuf_in);
> > 	}
> >
> > -	txq->stats.opackets += num_tx;
> > +	txq->stats.opackets += num_packets;
> > 	txq->stats.errs += nb_pkts - num_tx;
> > 	txq->stats.obytes += num_tx_bytes;
> >
> > @@ -1066,31 +1124,73 @@ tap_mac_set(struct rte_eth_dev *dev, struct
> > ether_addr *mac_addr) }
> >
> > static int
> > +tap_gso_ctx_setup(struct rte_gso_ctx *gso_ctx, struct rte_eth_dev
> > +*dev) {
> > +	uint32_t gso_types;
> > +	char pool_name[64];
> > +
> > +	/*
> > +	 * Create private mbuf pool with TAP_GSO_MBUF_SEG_SIZE bytes
> > +	 * size per mbuf use this pool for both direct and indirect mbufs
> > +	 */
> > +
> > +	struct rte_mempool *mp;      /* Mempool for GSO packets */
> > +	/* initialize GSO context */
> > +	gso_types = DEV_TX_OFFLOAD_TCP_TSO;
> > +	snprintf(pool_name, sizeof(pool_name), "mp_%s", dev->device-
> >name);
> > +	mp = rte_mempool_lookup((const char *)pool_name);
> > +	if (!mp) {
> > +		mp = rte_pktmbuf_pool_create(pool_name,
> TAP_GSO_MBUFS_NUM,
> > +			0, 0, RTE_PKTMBUF_HEADROOM +
> TAP_GSO_MBUF_SEG_SIZE,
> > +			SOCKET_ID_ANY);
> 
> You have setup the mempool with no cache size, which means you have to
> take a lock for each allocate. This could changed to have a small cache per
> lcore say 8, but the total number of mbufs needs to be large enough to not
> allow starvation for a lcore.  total_mbufs = (max_num_ports * cache_size) +
> some_extra mbufs;
> 

I will set cache_size as 4. 
The total_mbufs should be mbufs_per_core(128) * cache_size(4) where the max_num_ports 
is already taken into consideration in mbufs_per_core. For example, for a TCP packet of 1024 bytes and TSO max seg size of 256 bytes GSO will allocate 5 mbufs (one direct and four indirect) regardless of the number of ports.

> > +		if (!mp) {
> > +			struct pmd_internals *pmd = dev->data-
> >dev_private;
> > +			RTE_LOG(DEBUG, PMD, "%s: failed to create mbuf
> pool for device %s\n",
> > +				pmd->name, dev->device->name);
> > +			return -1;
> > +		}
> > +	}
> > +
> > +	gso_ctx->direct_pool = mp;
> > +	gso_ctx->indirect_pool = mp;
> > +	gso_ctx->gso_types = gso_types;
> > +	gso_ctx->gso_size = 0; /* gso_size is set in tx_burst() per packet */
> > +	gso_ctx->flag = 0;
> > +
> > +	return 0;
> > +}
> > +
> > +static int
> > tap_setup_queue(struct rte_eth_dev *dev,
> > 		struct pmd_internals *internals,
> > 		uint16_t qid,
> > 		int is_rx)
> > {
> > +	int ret;
> > 	int *fd;
> > 	int *other_fd;
> > 	const char *dir;
> > 	struct pmd_internals *pmd = dev->data->dev_private;
> > 	struct rx_queue *rx = &internals->rxq[qid];
> > 	struct tx_queue *tx = &internals->txq[qid];
> > +	struct rte_gso_ctx *gso_ctx;
> >
> > 	if (is_rx) {
> > 		fd = &rx->fd;
> > 		other_fd = &tx->fd;
> > 		dir = "rx";
> > +		gso_ctx = NULL;
> > 	} else {
> > 		fd = &tx->fd;
> > 		other_fd = &rx->fd;
> > 		dir = "tx";
> > +		gso_ctx = &tx->gso_ctx;
> > 	}
> > 	if (*fd != -1) {
> > 		/* fd for this queue already exists */
> > 		TAP_LOG(DEBUG, "%s: fd %d for %s queue qid %d exists",
> > 			pmd->name, *fd, dir, qid);
> > +		gso_ctx = NULL;
> > 	} else if (*other_fd != -1) {
> > 		/* Only other_fd exists. dup it */
> > 		*fd = dup(*other_fd);
> > @@ -1115,6 +1215,11 @@ tap_setup_queue(struct rte_eth_dev *dev,
> >
> > 	tx->mtu = &dev->data->mtu;
> > 	rx->rxmode = &dev->data->dev_conf.rxmode;
> > +	if (gso_ctx) {
> > +		ret = tap_gso_ctx_setup(gso_ctx, dev);
> > +		if (ret)
> > +			return -1;
> > +	}
> >
> > 	tx->type = pmd->type;
> >
> > diff --git a/drivers/net/tap/rte_eth_tap.h
> > b/drivers/net/tap/rte_eth_tap.h index 7b21d0d..44e2773 100644
> > --- a/drivers/net/tap/rte_eth_tap.h
> > +++ b/drivers/net/tap/rte_eth_tap.h
> > @@ -15,6 +15,7 @@
> >
> > #include <rte_ethdev_driver.h>
> > #include <rte_ether.h>
> > +#include <rte_gso.h>
> > #include "tap_log.h"
> >
> > #ifdef IFF_MULTI_QUEUE
> > @@ -22,6 +23,7 @@
> > #else
> > #define RTE_PMD_TAP_MAX_QUEUES	1
> > #endif
> > +#define MAX_GSO_MBUFS 64
> >
> > enum rte_tuntap_type {
> > 	ETH_TUNTAP_TYPE_UNKNOWN,
> > @@ -59,6 +61,7 @@ struct tx_queue {
> > 	uint16_t *mtu;                  /* Pointer to MTU from dev_data */
> > 	uint16_t csum:1;                /* Enable checksum offloading */
> > 	struct pkt_stats stats;         /* Stats for this TX queue */
> > +	struct rte_gso_ctx gso_ctx;     /* GSO context */
> > };
> >
> > struct pmd_internals {
> > diff --git a/mk/rte.app.mk b/mk/rte.app.mk index 1e32c83..e2ee879
> > 100644
> > --- a/mk/rte.app.mk
> > +++ b/mk/rte.app.mk
> > @@ -38,8 +38,6 @@ _LDLIBS-$(CONFIG_RTE_LIBRTE_PORT)           += -
> lrte_port
> > _LDLIBS-$(CONFIG_RTE_LIBRTE_PDUMP)          += -lrte_pdump
> > _LDLIBS-$(CONFIG_RTE_LIBRTE_DISTRIBUTOR)    += -lrte_distributor
> > _LDLIBS-$(CONFIG_RTE_LIBRTE_IP_FRAG)        += -lrte_ip_frag
> > -_LDLIBS-$(CONFIG_RTE_LIBRTE_GRO)            += -lrte_gro
> > -_LDLIBS-$(CONFIG_RTE_LIBRTE_GSO)            += -lrte_gso
> > _LDLIBS-$(CONFIG_RTE_LIBRTE_METER)          += -lrte_meter
> > _LDLIBS-$(CONFIG_RTE_LIBRTE_LPM)            += -lrte_lpm
> > # librte_acl needs --whole-archive because of weak functions @@ -61,6
> > +59,8 @@ endif _LDLIBS-y += --whole-archive
> >
> > _LDLIBS-$(CONFIG_RTE_LIBRTE_CFGFILE)        += -lrte_cfgfile
> > +_LDLIBS-$(CONFIG_RTE_LIBRTE_GRO)            += -lrte_gro
> > +_LDLIBS-$(CONFIG_RTE_LIBRTE_GSO)            += -lrte_gso
> > _LDLIBS-$(CONFIG_RTE_LIBRTE_HASH)           += -lrte_hash
> > _LDLIBS-$(CONFIG_RTE_LIBRTE_MEMBER)         += -lrte_member
> > _LDLIBS-$(CONFIG_RTE_LIBRTE_VHOST)          += -lrte_vhost
> > --
> > 2.7.4
> >
> 
> Regards,
> Keith

  reply	other threads:[~2018-06-14  7:59 UTC|newest]

Thread overview: 31+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-03-09 21:10 [dpdk-dev] [RFC 0/2] TAP TSO Implementation Ophir Munk
2018-03-09 21:10 ` [dpdk-dev] [RFC 1/2] net/tap: calculate checksum for multi segs packets Ophir Munk
2018-04-09 22:33   ` [dpdk-dev] [PATCH v1 0/2] TAP TSO Ophir Munk
2018-04-09 22:33     ` [dpdk-dev] [PATCH v1 1/2] net/tap: calculate checksums of multi segs packets Ophir Munk
2018-04-09 22:33     ` [dpdk-dev] [PATCH v1 2/2] net/tap: support TSO (TCP Segment Offload) Ophir Munk
2018-04-22 11:30       ` [dpdk-dev] [PATCH v2 0/2] TAP TSO Ophir Munk
2018-04-22 11:30         ` [dpdk-dev] [PATCH v2 1/2] net/tap: calculate checksums of multi segs packets Ophir Munk
2018-05-07 21:54           ` [dpdk-dev] [PATCH v3 0/2] TAP TSO Ophir Munk
2018-05-07 21:54             ` [dpdk-dev] [PATCH v3 1/2] net/tap: calculate checksums of multi segs packets Ophir Munk
2018-05-07 21:54             ` [dpdk-dev] [PATCH v3 2/2] net/tap: support TSO (TCP Segment Offload) Ophir Munk
2018-05-31 13:52           ` [dpdk-dev] [PATCH v2 1/2] net/tap: calculate checksums of multi segs packets Ferruh Yigit
2018-05-31 13:54             ` Ferruh Yigit
2018-04-22 11:30         ` [dpdk-dev] [PATCH v2 2/2] net/tap: support TSO (TCP Segment Offload) Ophir Munk
2018-06-12 16:31   ` [dpdk-dev] [PATCH v4 0/2] TAP TSO Ophir Munk
2018-06-12 16:31     ` [dpdk-dev] [PATCH v4 1/2] net/tap: calculate checksums of multi segs packets Ophir Munk
2018-06-12 17:17       ` Wiles, Keith
2018-06-12 16:31     ` [dpdk-dev] [PATCH v4 2/2] net/tap: support TSO (TCP Segment Offload) Ophir Munk
2018-06-12 17:22       ` Wiles, Keith
2018-06-13 16:04       ` Wiles, Keith
2018-06-14  7:59         ` Ophir Munk [this message]
2018-06-14 12:58           ` Wiles, Keith
2018-06-23 23:17       ` [dpdk-dev] [PATCH v5 0/2] TAP TSO Ophir Munk
2018-06-23 23:17         ` [dpdk-dev] [PATCH v5 1/2] net/tap: calculate checksums of multi segs packets Ophir Munk
2018-06-24 13:45           ` Wiles, Keith
2018-06-27 13:11             ` Ferruh Yigit
2018-06-23 23:17         ` [dpdk-dev] [PATCH v5 2/2] net/tap: support TSO (TCP Segment Offload) Ophir Munk
2018-03-09 21:10 ` [dpdk-dev] [RFC 2/2] net/tap: implement TAP TSO Ophir Munk
2018-04-09 16:38 ` [dpdk-dev] [RFC 0/2] TAP TSO Implementation Ferruh Yigit
2018-04-09 22:37   ` Ophir Munk
2018-04-10 14:30     ` Ferruh Yigit
2018-04-10 15:31       ` Ophir Munk

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=HE1PR0501MB2314A1484171849E502E543FD17D0@HE1PR0501MB2314.eurprd05.prod.outlook.com \
    --to=ophirmu@mellanox.com \
    --cc=dev@dpdk.org \
    --cc=keith.wiles@intel.com \
    --cc=olgas@mellanox.com \
    --cc=pascal.mazon@6wind.com \
    --cc=thomas@monjalon.net \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).