From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mga18.intel.com (mga18.intel.com [134.134.136.126]) by dpdk.org (Postfix) with ESMTP id 9FE401E2A3 for ; Wed, 13 Jun 2018 18:04:27 +0200 (CEST) X-Amp-Result: SKIPPED(no attachment in message) X-Amp-File-Uploaded: False Received: from fmsmga006.fm.intel.com ([10.253.24.20]) by orsmga106.jf.intel.com with ESMTP/TLS/DHE-RSA-AES256-GCM-SHA384; 13 Jun 2018 09:04:24 -0700 X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="5.51,219,1526367600"; d="scan'208";a="237145915" Received: from fmsmsx103.amr.corp.intel.com ([10.18.124.201]) by fmsmga006.fm.intel.com with ESMTP; 13 Jun 2018 09:04:24 -0700 Received: from fmsmsx122.amr.corp.intel.com (10.18.125.37) by FMSMSX103.amr.corp.intel.com (10.18.124.201) with Microsoft SMTP Server (TLS) id 14.3.319.2; Wed, 13 Jun 2018 09:04:24 -0700 Received: from fmsmsx117.amr.corp.intel.com ([169.254.3.220]) by fmsmsx122.amr.corp.intel.com ([169.254.5.149]) with mapi id 14.03.0319.002; Wed, 13 Jun 2018 09:04:23 -0700 From: "Wiles, Keith" To: Ophir Munk CC: DPDK , Pascal Mazon , "Thomas Monjalon" , Olga Shern Thread-Topic: [PATCH v4 2/2] net/tap: support TSO (TCP Segment Offload) Thread-Index: AQHUAmr3BG+m+4IWp0SAx5DEpgQx8qRe0OUA Date: Wed, 13 Jun 2018 16:04:22 +0000 Message-ID: <37D23262-6A5D-4931-A874-1733643C7F95@intel.com> References: <1520629826-23055-2-git-send-email-ophirmu@mellanox.com> <1528821108-12405-1-git-send-email-ophirmu@mellanox.com> <1528821108-12405-3-git-send-email-ophirmu@mellanox.com> In-Reply-To: <1528821108-12405-3-git-send-email-ophirmu@mellanox.com> Accept-Language: en-US Content-Language: en-US X-MS-Has-Attach: X-MS-TNEF-Correlator: x-originating-ip: [10.254.116.191] Content-Type: text/plain; charset="us-ascii" Content-ID: <7800E02B44A4F940837EDBC5E169239B@intel.com> Content-Transfer-Encoding: quoted-printable MIME-Version: 1.0 Subject: Re: [dpdk-dev] [PATCH v4 2/2] net/tap: support TSO (TCP Segment Offload) X-BeenThere: dev@dpdk.org X-Mailman-Version: 2.1.15 Precedence: list List-Id: DPDK patches and discussions List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Wed, 13 Jun 2018 16:04:29 -0000 > On Jun 12, 2018, at 11:31 AM, Ophir Munk wrote: >=20 > This commit implements TCP segmentation offload in TAP. > librte_gso library is used to segment large TCP payloads (e.g. packets > of 64K bytes size) into smaller MTU size buffers. > By supporting TSO offload capability in software a TAP device can be used > as a failsafe sub device and be paired with another PCI device which > supports TSO capability in HW. >=20 > For more details on librte_gso implementation please refer to dpdk > documentation. > The number of newly generated TCP TSO segments is limited to 64. >=20 > Reviewed-by: Raslan Darawsheh > Signed-off-by: Ophir Munk > --- > drivers/net/tap/Makefile | 2 +- > drivers/net/tap/rte_eth_tap.c | 159 +++++++++++++++++++++++++++++++++++--= ----- > drivers/net/tap/rte_eth_tap.h | 3 + > mk/rte.app.mk | 4 +- > 4 files changed, 138 insertions(+), 30 deletions(-) >=20 > diff --git a/drivers/net/tap/Makefile b/drivers/net/tap/Makefile > index ccc5c5f..3243365 100644 > --- a/drivers/net/tap/Makefile > +++ b/drivers/net/tap/Makefile > @@ -24,7 +24,7 @@ CFLAGS +=3D -I. > CFLAGS +=3D $(WERROR_FLAGS) > LDLIBS +=3D -lrte_eal -lrte_mbuf -lrte_mempool -lrte_ring > LDLIBS +=3D -lrte_ethdev -lrte_net -lrte_kvargs -lrte_hash > -LDLIBS +=3D -lrte_bus_vdev > +LDLIBS +=3D -lrte_bus_vdev -lrte_gso >=20 > CFLAGS +=3D -DTAP_MAX_QUEUES=3D$(TAP_MAX_QUEUES) >=20 > diff --git a/drivers/net/tap/rte_eth_tap.c b/drivers/net/tap/rte_eth_tap.= c > index c19f053..62b931f 100644 > --- a/drivers/net/tap/rte_eth_tap.c > +++ b/drivers/net/tap/rte_eth_tap.c > @@ -17,6 +17,7 @@ > #include > #include >=20 > +#include > #include > #include > #include > @@ -55,6 +56,9 @@ > #define ETH_TAP_CMP_MAC_FMT "0123456789ABCDEFabcdef" > #define ETH_TAP_MAC_ARG_FMT ETH_TAP_MAC_FIXED "|" ETH_TAP_USR_MAC_FMT >=20 > +#define TAP_GSO_MBUFS_NUM 128 > +#define TAP_GSO_MBUF_SEG_SIZE 128 > + > static struct rte_vdev_driver pmd_tap_drv; > static struct rte_vdev_driver pmd_tun_drv; >=20 > @@ -412,7 +416,8 @@ tap_tx_offload_get_queue_capa(void) > return DEV_TX_OFFLOAD_MULTI_SEGS | > DEV_TX_OFFLOAD_IPV4_CKSUM | > DEV_TX_OFFLOAD_UDP_CKSUM | > - DEV_TX_OFFLOAD_TCP_CKSUM; > + DEV_TX_OFFLOAD_TCP_CKSUM | > + DEV_TX_OFFLOAD_TCP_TSO; > } >=20 > /* Finalize l4 checksum calculation */ > @@ -479,23 +484,15 @@ tap_tx_l3_cksum(char *packet, uint64_t ol_flags, un= signed int l2_len, > } > } >=20 > -/* Callback to handle sending packets from the tap interface > - */ > -static uint16_t > -pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts) > +static inline void > +tap_write_mbufs(struct tx_queue *txq, uint16_t num_mbufs, > + struct rte_mbuf **pmbufs, uint16_t l234_hlen, > + uint16_t *num_packets, unsigned long *num_tx_bytes) > { > - struct tx_queue *txq =3D queue; > - uint16_t num_tx =3D 0; > - unsigned long num_tx_bytes =3D 0; > - uint32_t max_size; > int i; >=20 > - if (unlikely(nb_pkts =3D=3D 0)) > - return 0; > - > - max_size =3D *txq->mtu + (ETHER_HDR_LEN + ETHER_CRC_LEN + 4); > - for (i =3D 0; i < nb_pkts; i++) { > - struct rte_mbuf *mbuf =3D bufs[num_tx]; > + for (i =3D 0; i < num_mbufs; i++) { > + struct rte_mbuf *mbuf =3D pmbufs[i]; > struct iovec iovecs[mbuf->nb_segs + 2]; > struct tun_pi pi =3D { .flags =3D 0, .proto =3D 0x00 }; > struct rte_mbuf *seg =3D mbuf; > @@ -503,8 +500,7 @@ pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uin= t16_t nb_pkts) > int proto; > int n; > int j; > - int k; /* first index in iovecs for copying segments */ > - uint16_t l234_hlen; /* length of layers 2,3,4 headers */ > + int k; /* current index in iovecs for copying segments */ > uint16_t seg_len; /* length of first segment */ > uint16_t nb_segs; > uint16_t *l4_cksum; /* l4 checksum (pseudo header + payload) */ > @@ -512,10 +508,6 @@ pmd_tx_burst(void *queue, struct rte_mbuf **bufs, ui= nt16_t nb_pkts) > uint16_t l4_phdr_cksum =3D 0; /* TCP/UDP pseudo header checksum */ > uint16_t is_cksum =3D 0; /* in case cksum should be offloaded */ >=20 > - /* stats.errs will be incremented */ > - if (rte_pktmbuf_pkt_len(mbuf) > max_size) > - break; > - > l4_cksum =3D NULL; > if (txq->type =3D=3D ETH_TUNTAP_TYPE_TUN) { > /* > @@ -554,9 +546,8 @@ pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uin= t16_t nb_pkts) > l234_hlen =3D mbuf->l2_len + mbuf->l3_len + mbuf->l4_len; > if (seg_len < l234_hlen) > break; > - > - /* To change checksums, work on a > - * copy of l2, l3 l4 headers. Adding back in the blank line above would be nice for readability. > + /* To change checksums, work on a * copy of l2, l3 > + * headers + l4 pseudo header > */ > rte_memcpy(m_copy, rte_pktmbuf_mtod(mbuf, void *), > l234_hlen); > @@ -598,13 +589,80 @@ pmd_tx_burst(void *queue, struct rte_mbuf **bufs, u= int16_t nb_pkts) > n =3D writev(txq->fd, iovecs, j); > if (n <=3D 0) > break; > + (*num_packets)++; > + (*num_tx_bytes) +=3D rte_pktmbuf_pkt_len(mbuf); > + } > +} > + > +/* Callback to handle sending packets from the tap interface > + */ > +static uint16_t > +pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts) > +{ > + struct tx_queue *txq =3D queue; > + uint16_t num_tx =3D 0; > + uint16_t num_packets =3D 0; > + unsigned long num_tx_bytes =3D 0; > + uint16_t tso_segsz =3D 0; > + uint16_t hdrs_len; > + uint32_t max_size; > + int i; > + uint64_t tso; > + int ret; >=20 > + if (unlikely(nb_pkts =3D=3D 0)) > + return 0; > + > + struct rte_mbuf *gso_mbufs[MAX_GSO_MBUFS]; > + max_size =3D *txq->mtu + (ETHER_HDR_LEN + ETHER_CRC_LEN + 4); > + for (i =3D 0; i < nb_pkts; i++) { > + struct rte_mbuf *mbuf_in =3D bufs[num_tx]; > + struct rte_mbuf **mbuf; > + uint16_t num_mbufs; > + > + tso =3D mbuf_in->ol_flags & PKT_TX_TCP_SEG; > + if (tso) { > + struct rte_gso_ctx *gso_ctx =3D &txq->gso_ctx; Missing blank line here, this one is not optional. > + assert(gso_ctx !=3D NULL); Blank line would be nice. > + /* TCP segmentation implies TCP checksum offload */ > + mbuf_in->ol_flags |=3D PKT_TX_TCP_CKSUM; Blank line would be nice. > + /* gso size is calculated without ETHER_CRC_LEN */ > + hdrs_len =3D mbuf_in->l2_len + mbuf_in->l3_len + > + mbuf_in->l4_len; > + tso_segsz =3D mbuf_in->tso_segsz + hdrs_len; > + if (unlikely(tso_segsz =3D=3D hdrs_len) || > + tso_segsz > *txq->mtu) { > + txq->stats.errs++; > + break; > + } > + gso_ctx->gso_size =3D tso_segsz; > + ret =3D rte_gso_segment(mbuf_in, /* packet to segment */ > + gso_ctx, /* gso control block */ > + (struct rte_mbuf **)&gso_mbufs, /* out mbufs */ > + RTE_DIM(gso_mbufs)); /* max tso mbufs */ > + > + /* ret contains the number of new created mbufs */ > + if (ret < 0) > + break; > + > + mbuf =3D gso_mbufs; > + num_mbufs =3D ret; > + } else { > + /* stats.errs will be incremented */ > + if (rte_pktmbuf_pkt_len(mbuf_in) > max_size) > + break; > + > + mbuf =3D &mbuf_in; > + num_mbufs =3D 1; > + } > + > + tap_write_mbufs(txq, num_mbufs, mbuf, hdrs_len, > + &num_packets, &num_tx_bytes); > num_tx++; > - num_tx_bytes +=3D mbuf->pkt_len; > - rte_pktmbuf_free(mbuf); > + rte_pktmbuf_free(mbuf_in); > } >=20 > - txq->stats.opackets +=3D num_tx; > + txq->stats.opackets +=3D num_packets; > txq->stats.errs +=3D nb_pkts - num_tx; > txq->stats.obytes +=3D num_tx_bytes; >=20 > @@ -1066,31 +1124,73 @@ tap_mac_set(struct rte_eth_dev *dev, struct ether= _addr *mac_addr) > } >=20 > static int > +tap_gso_ctx_setup(struct rte_gso_ctx *gso_ctx, struct rte_eth_dev *dev) > +{ > + uint32_t gso_types; > + char pool_name[64]; > + > + /* > + * Create private mbuf pool with TAP_GSO_MBUF_SEG_SIZE bytes > + * size per mbuf use this pool for both direct and indirect mbufs > + */ > + > + struct rte_mempool *mp; /* Mempool for GSO packets */ > + /* initialize GSO context */ > + gso_types =3D DEV_TX_OFFLOAD_TCP_TSO; > + snprintf(pool_name, sizeof(pool_name), "mp_%s", dev->device->name); > + mp =3D rte_mempool_lookup((const char *)pool_name); > + if (!mp) { > + mp =3D rte_pktmbuf_pool_create(pool_name, TAP_GSO_MBUFS_NUM, > + 0, 0, RTE_PKTMBUF_HEADROOM + TAP_GSO_MBUF_SEG_SIZE, > + SOCKET_ID_ANY); You have setup the mempool with no cache size, which means you have to take= a lock for each allocate. This could changed to have a small cache per lco= re say 8, but the total number of mbufs needs to be large enough to not all= ow starvation for a lcore. total_mbufs =3D (max_num_ports * cache_size) + = some_extra mbufs; > + if (!mp) { > + struct pmd_internals *pmd =3D dev->data->dev_private; > + RTE_LOG(DEBUG, PMD, "%s: failed to create mbuf pool for device %s\n", > + pmd->name, dev->device->name); > + return -1; > + } > + } > + > + gso_ctx->direct_pool =3D mp; > + gso_ctx->indirect_pool =3D mp; > + gso_ctx->gso_types =3D gso_types; > + gso_ctx->gso_size =3D 0; /* gso_size is set in tx_burst() per packet */ > + gso_ctx->flag =3D 0; > + > + return 0; > +} > + > +static int > tap_setup_queue(struct rte_eth_dev *dev, > struct pmd_internals *internals, > uint16_t qid, > int is_rx) > { > + int ret; > int *fd; > int *other_fd; > const char *dir; > struct pmd_internals *pmd =3D dev->data->dev_private; > struct rx_queue *rx =3D &internals->rxq[qid]; > struct tx_queue *tx =3D &internals->txq[qid]; > + struct rte_gso_ctx *gso_ctx; >=20 > if (is_rx) { > fd =3D &rx->fd; > other_fd =3D &tx->fd; > dir =3D "rx"; > + gso_ctx =3D NULL; > } else { > fd =3D &tx->fd; > other_fd =3D &rx->fd; > dir =3D "tx"; > + gso_ctx =3D &tx->gso_ctx; > } > if (*fd !=3D -1) { > /* fd for this queue already exists */ > TAP_LOG(DEBUG, "%s: fd %d for %s queue qid %d exists", > pmd->name, *fd, dir, qid); > + gso_ctx =3D NULL; > } else if (*other_fd !=3D -1) { > /* Only other_fd exists. dup it */ > *fd =3D dup(*other_fd); > @@ -1115,6 +1215,11 @@ tap_setup_queue(struct rte_eth_dev *dev, >=20 > tx->mtu =3D &dev->data->mtu; > rx->rxmode =3D &dev->data->dev_conf.rxmode; > + if (gso_ctx) { > + ret =3D tap_gso_ctx_setup(gso_ctx, dev); > + if (ret) > + return -1; > + } >=20 > tx->type =3D pmd->type; >=20 > diff --git a/drivers/net/tap/rte_eth_tap.h b/drivers/net/tap/rte_eth_tap.= h > index 7b21d0d..44e2773 100644 > --- a/drivers/net/tap/rte_eth_tap.h > +++ b/drivers/net/tap/rte_eth_tap.h > @@ -15,6 +15,7 @@ >=20 > #include > #include > +#include > #include "tap_log.h" >=20 > #ifdef IFF_MULTI_QUEUE > @@ -22,6 +23,7 @@ > #else > #define RTE_PMD_TAP_MAX_QUEUES 1 > #endif > +#define MAX_GSO_MBUFS 64 >=20 > enum rte_tuntap_type { > ETH_TUNTAP_TYPE_UNKNOWN, > @@ -59,6 +61,7 @@ struct tx_queue { > uint16_t *mtu; /* Pointer to MTU from dev_data */ > uint16_t csum:1; /* Enable checksum offloading */ > struct pkt_stats stats; /* Stats for this TX queue */ > + struct rte_gso_ctx gso_ctx; /* GSO context */ > }; >=20 > struct pmd_internals { > diff --git a/mk/rte.app.mk b/mk/rte.app.mk > index 1e32c83..e2ee879 100644 > --- a/mk/rte.app.mk > +++ b/mk/rte.app.mk > @@ -38,8 +38,6 @@ _LDLIBS-$(CONFIG_RTE_LIBRTE_PORT) +=3D -lrte_= port > _LDLIBS-$(CONFIG_RTE_LIBRTE_PDUMP) +=3D -lrte_pdump > _LDLIBS-$(CONFIG_RTE_LIBRTE_DISTRIBUTOR) +=3D -lrte_distributor > _LDLIBS-$(CONFIG_RTE_LIBRTE_IP_FRAG) +=3D -lrte_ip_frag > -_LDLIBS-$(CONFIG_RTE_LIBRTE_GRO) +=3D -lrte_gro > -_LDLIBS-$(CONFIG_RTE_LIBRTE_GSO) +=3D -lrte_gso > _LDLIBS-$(CONFIG_RTE_LIBRTE_METER) +=3D -lrte_meter > _LDLIBS-$(CONFIG_RTE_LIBRTE_LPM) +=3D -lrte_lpm > # librte_acl needs --whole-archive because of weak functions > @@ -61,6 +59,8 @@ endif > _LDLIBS-y +=3D --whole-archive >=20 > _LDLIBS-$(CONFIG_RTE_LIBRTE_CFGFILE) +=3D -lrte_cfgfile > +_LDLIBS-$(CONFIG_RTE_LIBRTE_GRO) +=3D -lrte_gro > +_LDLIBS-$(CONFIG_RTE_LIBRTE_GSO) +=3D -lrte_gso > _LDLIBS-$(CONFIG_RTE_LIBRTE_HASH) +=3D -lrte_hash > _LDLIBS-$(CONFIG_RTE_LIBRTE_MEMBER) +=3D -lrte_member > _LDLIBS-$(CONFIG_RTE_LIBRTE_VHOST) +=3D -lrte_vhost > --=20 > 2.7.4 >=20 Regards, Keith