From: Olivier Matz <olivier.matz@6wind.com>
To: dev@dpdk.org
Subject: [dpdk-dev] [PATCH v2 13/17] ixgbe: support TCP segmentation offload
Date: Mon, 19 May 2014 15:56:25 +0200
Message-ID: <1400507789-18453-14-git-send-email-olivier.matz@6wind.com>
In-Reply-To: <1400507789-18453-1-git-send-email-olivier.matz@6wind.com>
Implement TSO (TCP segmentation offload) in ixgbe driver.
The driver can now use the PKT_TX_TCP_SEG mbuf flag and m->hw_offload
to program hardware TCP segmentation.
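For reference, here is a minimal sketch (not part of the patch) of how
an application could request TSO on a packet with this series applied.
The flag and field names come from the mbuf rework earlier in the
series (patches 09, 10 and 12/17); the header lengths and MSS are only
example values for an Ethernet/IPv4/TCP packet without options:

#include <rte_mbuf.h>

/* sketch only: ask the PMD to segment this mbuf in hardware */
static void
request_tso(struct rte_mbuf *m)
{
    /* in ixgbe, PKT_TX_TCP_SEG implies IP and TCP checksum offload */
    m->ol_flags |= PKT_TX_TCP_SEG;
    m->hw_offload.l2_len = 14;   /* Ethernet header */
    m->hw_offload.l3_len = 20;   /* IPv4 header, no options */
    m->hw_offload.l4_len = 20;   /* TCP header, no options */
    m->hw_offload.mss = 1400;    /* max TCP payload per segment */
}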
In this patch, the tx_desc_cksum_flags_to_olinfo() and
tx_desc_ol_flags_to_cmdtype() functions have been reworked to make them
clearer. This should not impact performance, as gcc (version 4.8 in my
case) is smart enough to convert the tests into code that contains no
branch instruction.
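To illustrate this point, here is a small standalone example (not taken
from the patch) showing both styles; at -O2, gcc typically lowers each
of them to flag tests plus or/setcc sequences with no conditional jump:

#include <stdint.h>

#define FLAG_L4  (1u << 0)
#define FLAG_IP  (1u << 1)
#define BIT_TXSM (1u << 8)
#define BIT_IXSM (1u << 9)

/* old style: lookup tables indexed by a boolean expression */
static inline uint32_t style_lookup(uint32_t f)
{
    static const uint32_t l4[2] = {0, BIT_TXSM};
    static const uint32_t ip[2] = {0, BIT_IXSM};
    return l4[(f & FLAG_L4) != 0] | ip[(f & FLAG_IP) != 0];
}

/* new style: plain tests, still branch-free after optimization */
static inline uint32_t style_if(uint32_t f)
{
    uint32_t tmp = 0;
    if (f & FLAG_L4)
        tmp |= BIT_TXSM;
    if (f & FLAG_IP)
        tmp |= BIT_IXSM;
    return tmp;
}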
Validation
==========
platform:
Tester (linux) <----> DUT (DPDK)
Run testpmd on DUT:
cd dpdk.org/
make install T=x86_64-default-linuxapp-gcc
cd x86_64-default-linuxapp-gcc/
modprobe uio
insmod kmod/igb_uio.ko
python ../tools/igb_uio_bind.py -b igb_uio 0000:02:00.0
echo 0 > /proc/sys/kernel/randomize_va_space
echo 1000 > /sys/devices/system/node/node0/hugepages/hugepages-2048kB/nr_hugepages
echo 1000 > /sys/devices/system/node/node1/hugepages/hugepages-2048kB/nr_hugepages
mount -t hugetlbfs none /mnt/huge
./app/testpmd -c 0x55 -n 4 -m 800 -- -i --port-topology=chained
Disable all offload features on the Tester, and start the capture:
ethtool -K ixgbe0 rx off tx off tso off gso off gro off lro off
ip l set ixgbe0 up
tcpdump -n -e -i ixgbe0 -s 0 -w /tmp/cap
We use the following scapy script for testing:
def test():
    ############### IPv4
    # checksum TCP
    p=Ether()/IP(src=RandIP(), dst=RandIP())/TCP(flags=0x10)/Raw(RandString(50))
    sendp(p, iface="ixgbe0", count=5)
    # checksum UDP
    p=Ether()/IP(src=RandIP(), dst=RandIP())/UDP()/Raw(RandString(50))
    sendp(p, iface="ixgbe0", count=5)
    # bad IP checksum
    p=Ether()/IP(src=RandIP(), dst=RandIP(), chksum=0x1234)/TCP(flags=0x10)/Raw(RandString(50))
    sendp(p, iface="ixgbe0", count=5)
    # bad TCP checksum
    p=Ether()/IP(src=RandIP(), dst=RandIP())/TCP(flags=0x10, chksum=0x1234)/Raw(RandString(50))
    sendp(p, iface="ixgbe0", count=5)
    # large packet
    p=Ether()/IP(src=RandIP(), dst=RandIP())/TCP(flags=0x10)/Raw(RandString(1400))
    sendp(p, iface="ixgbe0", count=5)
    ############### IPv6
    # checksum TCP
    p=Ether()/IPv6(src=RandIP6(), dst=RandIP6())/TCP(flags=0x10)/Raw(RandString(50))
    sendp(p, iface="ixgbe0", count=5)
    # checksum UDP
    p=Ether()/IPv6(src=RandIP6(), dst=RandIP6())/UDP()/Raw(RandString(50))
    sendp(p, iface="ixgbe0", count=5)
    # bad TCP checksum
    p=Ether()/IPv6(src=RandIP6(), dst=RandIP6())/TCP(flags=0x10, chksum=0x1234)/Raw(RandString(50))
    sendp(p, iface="ixgbe0", count=5)
    # large packet
    p=Ether()/IPv6(src=RandIP6(), dst=RandIP6())/TCP(flags=0x10)/Raw(RandString(1400))
    sendp(p, iface="ixgbe0", count=5)
Without hw cksum
----------------
On DUT:
# disable hw cksum (use sw) in csumonly test, disable tso
stop
set fwd csum
tx_checksum set 0x0 0
tso set 0 0
start
On tester:
>>> test()
Then check the capture file.
With hw cksum
-------------
On DUT:
# enable hw cksum in csumonly test, disable tso
stop
set fwd csum
tx_checksum set 0xf 0
tso set 0 0
start
On tester:
>>> test()
Then check the capture file.
With TSO
--------
On DUT:
stop
set fwd csum
tx_checksum set 0xf 0
tso set 800 0
start
On tester:
>>> test()
Then check the capture file.
Performance tests
=================
Performance tests have been done on v1 of this patch series.
See http://dpdk.org/ml/archives/dev/2014-May/002516.html
Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
---
lib/librte_pmd_ixgbe/ixgbe_ethdev.c | 3 +-
lib/librte_pmd_ixgbe/ixgbe_rxtx.c | 169 +++++++++++++++++++++++++++---------
2 files changed, 129 insertions(+), 43 deletions(-)
diff --git a/lib/librte_pmd_ixgbe/ixgbe_ethdev.c b/lib/librte_pmd_ixgbe/ixgbe_ethdev.c
index e78c208..2d5e524 100644
--- a/lib/librte_pmd_ixgbe/ixgbe_ethdev.c
+++ b/lib/librte_pmd_ixgbe/ixgbe_ethdev.c
@@ -1751,7 +1751,8 @@ ixgbe_dev_info_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
DEV_TX_OFFLOAD_IPV4_CKSUM |
DEV_TX_OFFLOAD_UDP_CKSUM |
DEV_TX_OFFLOAD_TCP_CKSUM |
- DEV_TX_OFFLOAD_SCTP_CKSUM;
+ DEV_TX_OFFLOAD_SCTP_CKSUM |
+ DEV_TX_OFFLOAD_TCP_TSO;
}
/* return 0 means link status changed, -1 means not changed */
diff --git a/lib/librte_pmd_ixgbe/ixgbe_rxtx.c b/lib/librte_pmd_ixgbe/ixgbe_rxtx.c
index e1eb59d..541dd58 100644
--- a/lib/librte_pmd_ixgbe/ixgbe_rxtx.c
+++ b/lib/librte_pmd_ixgbe/ixgbe_rxtx.c
@@ -347,13 +347,59 @@ ixgbe_xmit_pkts_simple(void *tx_queue, struct rte_mbuf **tx_pkts,
return nb_tx;
}
+/* When doing TSO, the IP length must not be included in the pseudo
+ * header checksum of the packet given to the hardware */
+static inline void
+ixgbe_fix_tcp_phdr_cksum(struct rte_mbuf *m)
+{
+ char *data;
+ uint16_t *cksum_ptr;
+ uint16_t prev_cksum;
+ uint16_t new_cksum;
+ uint16_t ip_len, ip_paylen;
+ uint32_t tmp;
+ uint8_t ip_version;
+
+ /* get phdr cksum at offset 16 of TCP header */
+ data = rte_pktmbuf_mtod(m, char *);
+ cksum_ptr = (uint16_t *)(data + m->hw_offload.l2_len +
+ m->hw_offload.l3_len + 16);
+ prev_cksum = *cksum_ptr;
+
+ /* get ip_version */
+ ip_version = (*(uint8_t *)(data + m->hw_offload.l2_len)) >> 4;
+
+ /* get ip_len at offset 2 of IP header or offset 4 of IPv6 header */
+ if (ip_version == 4) {
+ /* override ip cksum to 0 */
+ data[m->hw_offload.l2_len + 10] = 0;
+ data[m->hw_offload.l2_len + 11] = 0;
+
+ ip_len = *(uint16_t *)(data + m->hw_offload.l2_len + 2);
+ ip_paylen = rte_cpu_to_be_16(rte_be_to_cpu_16(ip_len) -
+ m->hw_offload.l3_len);
+ } else {
+ ip_paylen = *(uint16_t *)(data + m->hw_offload.l2_len + 4);
+ }
+
+ /* calculate the new phdr checksum that doesn't include ip_paylen */
+ tmp = prev_cksum ^ 0xffff;
+ if (tmp < ip_paylen)
+ tmp += 0xffff;
+ tmp -= ip_paylen;
+ new_cksum = tmp;
+
+ /* replace it in the packet */
+ *cksum_ptr = new_cksum;
+}
+
static inline void
ixgbe_set_xmit_ctx(struct igb_tx_queue* txq,
volatile struct ixgbe_adv_tx_context_desc *ctx_txd,
uint32_t ol_flags, union rte_hw_offload hw_offload)
{
uint32_t type_tucmd_mlhl;
- uint32_t mss_l4len_idx;
+ uint32_t mss_l4len_idx = 0;
uint32_t ctx_idx;
uint32_t vlan_macip_lens;
union rte_hw_offload offload_mask;
@@ -362,44 +408,61 @@ ixgbe_set_xmit_ctx(struct igb_tx_queue* txq,
offload_mask.u64 = 0;
type_tucmd_mlhl = 0;
+ /* Specify which HW CTX to upload. */
+ mss_l4len_idx |= (ctx_idx << IXGBE_ADVTXD_IDX_SHIFT);
+
if (ol_flags & PKT_TX_VLAN_PKT) {
offload_mask.vlan_tci = 0xffff;
}
- if (ol_flags & PKT_TX_IP_CKSUM) {
- type_tucmd_mlhl = IXGBE_ADVTXD_TUCMD_IPV4;
+ /* check if TCP segmentation required for this packet */
+ if (ol_flags & PKT_TX_TCP_SEG) {
+ /* implies IP cksum and TCP cksum */
+ type_tucmd_mlhl = IXGBE_ADVTXD_TUCMD_IPV4 |
+ IXGBE_ADVTXD_TUCMD_L4T_TCP |
+ IXGBE_ADVTXD_DTYP_CTXT | IXGBE_ADVTXD_DCMD_DEXT;
+
offload_mask.l2_len = HW_OFFLOAD_L2_LEN_MASK;
offload_mask.l3_len = HW_OFFLOAD_L3_LEN_MASK;
- }
+ offload_mask.l4_len = HW_OFFLOAD_L4_LEN_MASK;
+ offload_mask.mss = 0xffff;
+ mss_l4len_idx |= hw_offload.mss << IXGBE_ADVTXD_MSS_SHIFT;
+ mss_l4len_idx |= hw_offload.l4_len << IXGBE_ADVTXD_L4LEN_SHIFT;
+ } else { /* no TSO, check if hardware checksum is needed */
+ if (ol_flags & PKT_TX_IP_CKSUM) {
+ type_tucmd_mlhl = IXGBE_ADVTXD_TUCMD_IPV4;
+ offload_mask.l2_len = HW_OFFLOAD_L2_LEN_MASK;
+ offload_mask.l3_len = HW_OFFLOAD_L3_LEN_MASK;
+ }
- /* Specify which HW CTX to upload. */
- mss_l4len_idx = (ctx_idx << IXGBE_ADVTXD_IDX_SHIFT);
- switch (ol_flags & PKT_TX_L4_MASK) {
- case PKT_TX_UDP_CKSUM:
- type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_UDP |
+ switch (ol_flags & PKT_TX_L4_MASK) {
+ case PKT_TX_UDP_CKSUM:
+ type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_UDP |
IXGBE_ADVTXD_DTYP_CTXT | IXGBE_ADVTXD_DCMD_DEXT;
- mss_l4len_idx |= sizeof(struct udp_hdr) << IXGBE_ADVTXD_L4LEN_SHIFT;
- offload_mask.l2_len = HW_OFFLOAD_L2_LEN_MASK;
- offload_mask.l3_len = HW_OFFLOAD_L3_LEN_MASK;
- break;
- case PKT_TX_TCP_CKSUM:
- type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_TCP |
+ mss_l4len_idx |= sizeof(struct udp_hdr) << IXGBE_ADVTXD_L4LEN_SHIFT;
+ offload_mask.l2_len = HW_OFFLOAD_L2_LEN_MASK;
+ offload_mask.l3_len = HW_OFFLOAD_L3_LEN_MASK;
+ break;
+ case PKT_TX_TCP_CKSUM:
+ type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_TCP |
IXGBE_ADVTXD_DTYP_CTXT | IXGBE_ADVTXD_DCMD_DEXT;
- mss_l4len_idx |= sizeof(struct tcp_hdr) << IXGBE_ADVTXD_L4LEN_SHIFT;
- offload_mask.l2_len = HW_OFFLOAD_L2_LEN_MASK;
- offload_mask.l3_len = HW_OFFLOAD_L3_LEN_MASK;
- break;
- case PKT_TX_SCTP_CKSUM:
- type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_SCTP |
+ mss_l4len_idx |= sizeof(struct tcp_hdr) << IXGBE_ADVTXD_L4LEN_SHIFT;
+ offload_mask.l2_len = HW_OFFLOAD_L2_LEN_MASK;
+ offload_mask.l3_len = HW_OFFLOAD_L3_LEN_MASK;
+ offload_mask.l4_len = HW_OFFLOAD_L4_LEN_MASK;
+ break;
+ case PKT_TX_SCTP_CKSUM:
+ type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_SCTP |
IXGBE_ADVTXD_DTYP_CTXT | IXGBE_ADVTXD_DCMD_DEXT;
- mss_l4len_idx |= sizeof(struct sctp_hdr) << IXGBE_ADVTXD_L4LEN_SHIFT;
- offload_mask.l2_len = HW_OFFLOAD_L2_LEN_MASK;
- offload_mask.l3_len = HW_OFFLOAD_L3_LEN_MASK;
- break;
- default:
- type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_RSV |
+ mss_l4len_idx |= sizeof(struct sctp_hdr) << IXGBE_ADVTXD_L4LEN_SHIFT;
+ offload_mask.l2_len = HW_OFFLOAD_L2_LEN_MASK;
+ offload_mask.l3_len = HW_OFFLOAD_L3_LEN_MASK;
+ break;
+ default:
+ type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_RSV |
IXGBE_ADVTXD_DTYP_CTXT | IXGBE_ADVTXD_DCMD_DEXT;
- break;
+ break;
+ }
}
txq->ctx_cache[ctx_idx].flags = ol_flags;
@@ -446,20 +509,25 @@ what_advctx_update(struct igb_tx_queue *txq, uint32_t flags,
static inline uint32_t
tx_desc_cksum_flags_to_olinfo(uint32_t ol_flags)
{
- static const uint32_t l4_olinfo[2] = {0, IXGBE_ADVTXD_POPTS_TXSM};
- static const uint32_t l3_olinfo[2] = {0, IXGBE_ADVTXD_POPTS_IXSM};
- uint32_t tmp;
-
- tmp = l4_olinfo[(ol_flags & PKT_TX_L4_MASK) != PKT_TX_L4_NO_CKSUM];
- tmp |= l3_olinfo[(ol_flags & PKT_TX_IP_CKSUM) != 0];
+ uint32_t tmp = 0;
+ if ((ol_flags & PKT_TX_L4_MASK) != PKT_TX_L4_NO_CKSUM)
+ tmp |= IXGBE_ADVTXD_POPTS_TXSM;
+ if (ol_flags & PKT_TX_IP_CKSUM)
+ tmp |= IXGBE_ADVTXD_POPTS_IXSM;
+ if (ol_flags & PKT_TX_TCP_SEG)
+ tmp |= IXGBE_ADVTXD_POPTS_TXSM | IXGBE_ADVTXD_POPTS_IXSM;
return tmp;
}
static inline uint32_t
-tx_desc_vlan_flags_to_cmdtype(uint32_t ol_flags)
+tx_desc_ol_flags_to_cmdtype(uint32_t ol_flags)
{
- static const uint32_t vlan_cmd[2] = {0, IXGBE_ADVTXD_DCMD_VLE};
- return vlan_cmd[(ol_flags & PKT_TX_VLAN_PKT) != 0];
+ uint32_t cmdtype = 0;
+ if (ol_flags & PKT_TX_VLAN_PKT)
+ cmdtype |= IXGBE_ADVTXD_DCMD_VLE;
+ if (ol_flags & PKT_TX_TCP_SEG)
+ cmdtype |= IXGBE_ADVTXD_DCMD_TSE;
+ return cmdtype;
}
/* Default RS bit threshold values */
@@ -583,7 +651,8 @@ ixgbe_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
/* If hardware offload required */
tx_ol_req = ol_flags &
- (PKT_TX_VLAN_PKT | PKT_TX_IP_CKSUM | PKT_TX_L4_MASK);
+ (PKT_TX_VLAN_PKT | PKT_TX_IP_CKSUM | PKT_TX_L4_MASK |
+ PKT_TX_TCP_SEG);
if (tx_ol_req) {
/* If new context need be built or reuse the exist ctx. */
ctx = what_advctx_update(txq, tx_ol_req,
@@ -702,13 +771,26 @@ ixgbe_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
*/
cmd_type_len = IXGBE_ADVTXD_DTYP_DATA |
IXGBE_ADVTXD_DCMD_IFCS | IXGBE_ADVTXD_DCMD_DEXT;
- olinfo_status = (pkt_len << IXGBE_ADVTXD_PAYLEN_SHIFT);
+
#ifdef RTE_LIBRTE_IEEE1588
if (ol_flags & PKT_TX_IEEE1588_TMST)
cmd_type_len |= IXGBE_ADVTXD_MAC_1588;
#endif
+ olinfo_status = 0;
if (tx_ol_req) {
+
+ if (ol_flags & PKT_TX_TCP_SEG) {
+ /* paylen in descriptor is not the packet
+ * len but the tcp payload len when TSO is on */
+ pkt_len -= (hw_offload.l2_len +
+ hw_offload.l3_len + hw_offload.l4_len);
+
+ /* the pseudo header checksum must be modified:
+ * it should not include the ip_len */
+ ixgbe_fix_tcp_phdr_cksum(tx_pkt);
+ }
+
/*
* Setup the TX Advanced Context Descriptor if required
*/
@@ -741,11 +823,13 @@ ixgbe_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
* This path will go through
* whatever new/reuse the context descriptor
*/
- cmd_type_len |= tx_desc_vlan_flags_to_cmdtype(ol_flags);
+ cmd_type_len |= tx_desc_ol_flags_to_cmdtype(ol_flags);
olinfo_status |= tx_desc_cksum_flags_to_olinfo(ol_flags);
olinfo_status |= ctx << IXGBE_ADVTXD_IDX_SHIFT;
}
+ olinfo_status |= (pkt_len << IXGBE_ADVTXD_PAYLEN_SHIFT);
+
m_seg = tx_pkt;
do {
txd = &txr[tx_id];
@@ -3420,9 +3504,10 @@ ixgbe_dev_tx_init(struct rte_eth_dev *dev)
PMD_INIT_FUNC_TRACE();
hw = IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
- /* Enable TX CRC (checksum offload requirement) */
+ /* Enable TX CRC (checksum offload requirement) and hw padding
+ * (TSO requirement) */
hlreg0 = IXGBE_READ_REG(hw, IXGBE_HLREG0);
- hlreg0 |= IXGBE_HLREG0_TXCRCEN;
+ hlreg0 |= (IXGBE_HLREG0_TXCRCEN | IXGBE_HLREG0_TXPADEN);
IXGBE_WRITE_REG(hw, IXGBE_HLREG0, hlreg0);
/* Setup the Base and Length of the Tx Descriptor Rings */
--
1.9.2