From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mails.dpdk.org (mails.dpdk.org [217.70.189.124]) by inbox.dpdk.org (Postfix) with ESMTP id B8308A0552; Fri, 21 Oct 2022 04:34:37 +0200 (CEST) Received: from [217.70.189.124] (localhost [127.0.0.1]) by mails.dpdk.org (Postfix) with ESMTP id 999EE42836; Fri, 21 Oct 2022 04:34:37 +0200 (CEST) Received: from mga17.intel.com (mga17.intel.com [192.55.52.151]) by mails.dpdk.org (Postfix) with ESMTP id 824C24282D; Fri, 21 Oct 2022 04:34:35 +0200 (CEST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=intel.com; i=@intel.com; q=dns/txt; s=Intel; t=1666319675; x=1697855675; h=from:to:cc:subject:date:message-id:mime-version: content-transfer-encoding; bh=/X23B2DJrblo9yGQpsMrZafmlUBFu3XXCKyJCKBhWzM=; b=kqeb9A5xQSaLVoRbuwI8ure4mJ2hCgZHh4EvmiStEO71y/RH3pnnR0xi G6qSgjUsYxQzlDRsktH71/xBYz8a1C31yvlC3v8KOsZ3v4h7FRCNsy5gA /N/wdSp3lsrLVsrWzGb6Hr2Uau1Vlm8KltDtEeEl1BBNBe99i9gUxRYVR 9RO4QVeThvT7Ngf585w+mx/NL/ZM7IRmdvRgBjwnEkLsY43Q3xuzU99gr YwchU+KIOe4lixWI3Pn+jy5ALZ1t5MHqpYerkdUrIHR1wpe+yvPeimrkV a58+lkmMml4SSpkUWeKAUkq1r/NyFMXnObvn2emKaYLGg0vg1NhmfSxAj Q==; X-IronPort-AV: E=McAfee;i="6500,9779,10506"; a="287292777" X-IronPort-AV: E=Sophos;i="5.95,200,1661842800"; d="scan'208";a="287292777" Received: from orsmga001.jf.intel.com ([10.7.209.18]) by fmsmga107.fm.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 20 Oct 2022 19:34:34 -0700 X-IronPort-AV: E=McAfee;i="6500,9779,10506"; a="663454211" X-IronPort-AV: E=Sophos;i="5.95,200,1661842800"; d="scan'208";a="663454211" Received: from unknown (HELO yemj..) ([10.239.252.253]) by orsmga001-auth.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 20 Oct 2022 19:34:23 -0700 From: Mingjin Ye To: dev@dpdk.org Cc: stable@dpdk.org, yidingx.zhou@intel.com, Mingjin Ye , Yuying Zhang , Beilei Xing , Bruce Richardson , Konstantin Ananyev , Jing Chen , Heqing Zhu , Jijiang Liu , Helin Zhang , Jingjing Wu , Ajit Khaparde , Andrew Rybchenko , Olivier Matz , Somnath Kotur , Leyi Rong , Wenzhuo Lu , Qi Zhang , Jeff Shaw , Damjan Marion , Jianbo Liu Subject: [PATCH] net/i40e: fix outer checksum flags Date: Fri, 21 Oct 2022 10:21:51 +0000 Message-Id: <20221021102151.1000400-1-mingjinx.ye@intel.com> X-Mailer: git-send-email 2.34.1 MIME-Version: 1.0 Content-Transfer-Encoding: 8bit X-BeenThere: dev@dpdk.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: DPDK patches and discussions List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: dev-bounces@dpdk.org When receiving tunnel packets, the testpmd terminal output log shows 'ol_flags' value always as 'RTE_MBUF_F_RX_OUTER_L4_CKSUM_UNKNOWN', but what we expected should be 'RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD' or 'RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD'. Adding 'RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD' and 'RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD' flags for normal path, which are also added to 'l3_l4_flags_shuf' for AVX2 and AVX512 vector path and 'cksum_flags' for SSE vector path to ensure the 'ol_flags' can match correct flags. Fixes: 4861cde46116 ("i40e: new poll mode driver") Fixes: daa02b5cddbb ("mbuf: add namespace to offload flags") Fixes: e6a6a138919f ("net/i40e: add AVX512 vector path") Fixes: dafadd73762e ("net/i40e: add AVX2 Rx function") Fixes: 9966a00a0688 ("net/i40e: enable bad checksum flags in vector Rx") Fixes: f3a85f4ce04d ("net/i40e: fix checksum flag in x86 vector Rx") Fixes: f4356d7ca168 ("net/i40e: eliminate mbuf write on rearm") Cc: stable@dpdk.org Signed-off-by: Mingjin Ye --- drivers/net/i40e/i40e_rxtx.c | 6 +- drivers/net/i40e/i40e_rxtx_vec_avx2.c | 119 +++++++++++++++++------- drivers/net/i40e/i40e_rxtx_vec_avx512.c | 113 ++++++++++++++++------ drivers/net/i40e/i40e_rxtx_vec_sse.c | 73 ++++++++++----- 4 files changed, 228 insertions(+), 83 deletions(-) diff --git a/drivers/net/i40e/i40e_rxtx.c b/drivers/net/i40e/i40e_rxtx.c index 788ffb51c2..e8903007f0 100644 --- a/drivers/net/i40e/i40e_rxtx.c +++ b/drivers/net/i40e/i40e_rxtx.c @@ -169,7 +169,9 @@ i40e_rxd_error_to_pkt_flags(uint64_t qword) #define I40E_RX_ERR_BITS 0x3f if (likely((error_bits & I40E_RX_ERR_BITS) == 0)) { - flags |= (RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_GOOD); + flags |= (RTE_MBUF_F_RX_IP_CKSUM_GOOD | + RTE_MBUF_F_RX_L4_CKSUM_GOOD | + RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD); return flags; } @@ -185,6 +187,8 @@ i40e_rxd_error_to_pkt_flags(uint64_t qword) if (unlikely(error_bits & (1 << I40E_RX_DESC_ERROR_EIPE_SHIFT))) flags |= RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD; + else + flags |= RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD; return flags; } diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx2.c b/drivers/net/i40e/i40e_rxtx_vec_avx2.c index 761edb9d20..348dad4293 100644 --- a/drivers/net/i40e/i40e_rxtx_vec_avx2.c +++ b/drivers/net/i40e/i40e_rxtx_vec_avx2.c @@ -202,7 +202,7 @@ _recv_raw_pkts_vec_avx2(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts, * field (bits 22-24) are for IP/L4 checksum errors */ const __m256i flags_mask = _mm256_set1_epi32( - (1 << 2) | (1 << 11) | (3 << 12) | (7 << 22)); + (0xF << 4) | (1 << 12) | (1 << 13)); /* * data to be shuffled by result of flag mask. If VLAN bit is set, * (bit 2), then position 4 in this array will be used in the @@ -228,39 +228,83 @@ _recv_raw_pkts_vec_avx2(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts, * data to be shuffled by the result of the flags mask shifted by 22 * bits. This gives use the l3_l4 flags. */ - const __m256i l3_l4_flags_shuf = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, - /* shift right 1 bit to make sure it not exceed 255 */ - (RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD | - RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, - (RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD | - RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, - (RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_GOOD | - RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, - (RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_GOOD | - RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, - (RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, - (RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, - (RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, - (RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, - /* second 128-bits */ - 0, 0, 0, 0, 0, 0, 0, 0, - (RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD | - RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, - (RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD | - RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, - (RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_GOOD | - RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, - (RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_GOOD | - RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, - (RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, - (RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, - (RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, - (RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1); + const __m256i l3_l4_flags_shuf = + _mm256_set_epi8((RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | + RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD | + RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | + RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | + RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | + RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD | + RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD | + RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD | + RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD | + RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | + RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | + RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | + RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | + RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD | + RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD | + RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD | + RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD | + RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, + /** + * second 128-bits + * shift right 20 bits to use the low two bits to indicate + * outer checksum status + * shift right 1 bit to make sure it not exceed 255 + */ + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | + RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | + RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | + RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | + RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD | + RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD | + RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD | + RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD | + RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | + RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | + RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | + RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | + RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD | + RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD | + RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD | + RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD | + RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1); const __m256i cksum_mask = _mm256_set1_epi32( - RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD | - RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_BAD | - RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD); + RTE_MBUF_F_RX_IP_CKSUM_MASK | RTE_MBUF_F_RX_L4_CKSUM_MASK | + RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | + RTE_MBUF_F_RX_OUTER_L4_CKSUM_MASK); RTE_SET_USED(avx_aligned); /* for 32B descriptors we don't use this */ @@ -407,6 +451,17 @@ _recv_raw_pkts_vec_avx2(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts, __m256i l3_l4_flags = _mm256_shuffle_epi8(l3_l4_flags_shuf, _mm256_srli_epi32(flag_bits, 22)); l3_l4_flags = _mm256_slli_epi32(l3_l4_flags, 1); + + __m256i l4_outer_mask = _mm256_set1_epi32(0x6); + __m256i l4_outer_flags = + _mm256_and_si256(l3_l4_flags, l4_outer_mask); + l4_outer_flags = _mm256_slli_epi32(l4_outer_flags, 20); + + __m256i l3_l4_mask = _mm256_set1_epi32(~0x6); + + l3_l4_flags = _mm256_and_si256(l3_l4_flags, l3_l4_mask); + l3_l4_flags = _mm256_or_si256(l3_l4_flags, l4_outer_flags); + l3_l4_flags = _mm256_and_si256(l3_l4_flags, cksum_mask); /* merge flags */ diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx512.c b/drivers/net/i40e/i40e_rxtx_vec_avx512.c index 60c97d5331..94fd309748 100644 --- a/drivers/net/i40e/i40e_rxtx_vec_avx512.c +++ b/drivers/net/i40e/i40e_rxtx_vec_avx512.c @@ -313,7 +313,7 @@ _recv_raw_pkts_vec_avx512(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts, * field (bits 22-24) are for IP/L4 checksum errors */ const __m256i flags_mask = _mm256_set1_epi32 - ((1 << 2) | (1 << 11) | (3 << 12) | (7 << 22)); + ((0xF << 4) | (1 << 12) | (1 << 13)); /* data to be shuffled by result of flag mask. If VLAN bit is set, * (bit 2), then position 4 in this array will be used in the @@ -339,35 +339,83 @@ _recv_raw_pkts_vec_avx512(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts, * bits. This gives use the l3_l4 flags. */ const __m256i l3_l4_flags_shuf = _mm256_set_epi8 - (0, 0, 0, 0, 0, 0, 0, 0, - /* shift right 1 bit to make sure it not exceed 255 */ - (RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD | - RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, - (RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | - RTE_MBUF_F_RX_L4_CKSUM_BAD) >> 1, - (RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, - (RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD) >> 1, - (RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, - (RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_BAD) >> 1, - RTE_MBUF_F_RX_IP_CKSUM_BAD >> 1, - (RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_GOOD) >> 1, - /* second 128-bits */ - 0, 0, 0, 0, 0, 0, 0, 0, - (RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD | - RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, - (RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | - RTE_MBUF_F_RX_L4_CKSUM_BAD) >> 1, - (RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, - (RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD) >> 1, - (RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, - (RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_BAD) >> 1, - RTE_MBUF_F_RX_IP_CKSUM_BAD >> 1, - (RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_GOOD) >> 1); + ((RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | + RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD | + RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | + RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | + RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | + RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD | + RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD | + RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD | + RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD | + RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | + RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | + RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | + RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | + RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD | + RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD | + RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD | + RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD | + RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, + /** + * second 128-bits + * shift right 20 bits to use the low two bits to indicate + * outer checksum status + * shift right 1 bit to make sure it not exceed 255 + */ + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | + RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | + RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | + RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | + RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD | + RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD | + RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD | + RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD | + RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | + RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | + RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | + RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | + RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD | + RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD | + RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD | + RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD | + RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1); const __m256i cksum_mask = _mm256_set1_epi32 - (RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD | - RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_BAD | - RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD); + (RTE_MBUF_F_RX_IP_CKSUM_MASK | + RTE_MBUF_F_RX_L4_CKSUM_MASK | + RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | + RTE_MBUF_F_RX_OUTER_L4_CKSUM_MASK); uint16_t i, received; @@ -535,6 +583,15 @@ _recv_raw_pkts_vec_avx512(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts, __m256i l3_l4_flags = _mm256_shuffle_epi8(l3_l4_flags_shuf, _mm256_srli_epi32(flag_bits, 22)); l3_l4_flags = _mm256_slli_epi32(l3_l4_flags, 1); + __m256i l4_outer_mask = _mm256_set1_epi32(0x6); + __m256i l4_outer_flags = + _mm256_and_si256(l3_l4_flags, l4_outer_mask); + l4_outer_flags = _mm256_slli_epi32(l4_outer_flags, 20); + + __m256i l3_l4_mask = _mm256_set1_epi32(~0x6); + + l3_l4_flags = _mm256_and_si256(l3_l4_flags, l3_l4_mask); + l3_l4_flags = _mm256_or_si256(l3_l4_flags, l4_outer_flags); l3_l4_flags = _mm256_and_si256(l3_l4_flags, cksum_mask); /* merge flags */ diff --git a/drivers/net/i40e/i40e_rxtx_vec_sse.c b/drivers/net/i40e/i40e_rxtx_vec_sse.c index bdc979a839..65630e40a2 100644 --- a/drivers/net/i40e/i40e_rxtx_vec_sse.c +++ b/drivers/net/i40e/i40e_rxtx_vec_sse.c @@ -230,16 +230,16 @@ desc_to_olflags_v(struct i40e_rx_queue *rxq, volatile union i40e_rx_desc *rxdp, const __m128i cksum_mask = _mm_set_epi32( RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_BAD | - RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD, + RTE_MBUF_F_RX_OUTER_L4_CKSUM_MASK | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD, RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_BAD | - RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD, + RTE_MBUF_F_RX_OUTER_L4_CKSUM_MASK | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD, RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_BAD | - RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD, + RTE_MBUF_F_RX_OUTER_L4_CKSUM_MASK | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD, RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_BAD | - RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD); + RTE_MBUF_F_RX_OUTER_L4_CKSUM_MASK | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD); /* map rss and vlan type to rss hash and vlan flag */ const __m128i vlan_flags = _mm_set_epi8(0, 0, 0, 0, @@ -252,20 +252,45 @@ desc_to_olflags_v(struct i40e_rx_queue *rxq, volatile union i40e_rx_desc *rxdp, RTE_MBUF_F_RX_RSS_HASH | RTE_MBUF_F_RX_FDIR, RTE_MBUF_F_RX_RSS_HASH, 0, 0, 0, 0, RTE_MBUF_F_RX_FDIR, 0); - const __m128i l3_l4e_flags = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, - /* shift right 1 bit to make sure it not exceed 255 */ - (RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD | - RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, - (RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD | - RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, - (RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_GOOD | - RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, - (RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_GOOD | - RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, - (RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, - (RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, - (RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, - (RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1); + const __m128i l3_l4e_flags = + _mm_set_epi8((RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | + RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD | + RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | + RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | + RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | + RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD | + RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD | + RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD | + RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD | + RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, + /** + * shift right 20 bits to use the low two bits to indicate + * outer checksum status + * shift right 1 bit to make sure it not exceed 255 + */ + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | + RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | + RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | + RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | + RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD | + RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD | + RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD | + RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD | + RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1); /* Unpack "status" from quadword 1, bits 0:32 */ vlan0 = _mm_unpackhi_epi32(descs[0], descs[1]); @@ -282,6 +307,10 @@ desc_to_olflags_v(struct i40e_rx_queue *rxq, volatile union i40e_rx_desc *rxdp, l3_l4e = _mm_shuffle_epi8(l3_l4e_flags, l3_l4e); /* then we shift left 1 bit */ l3_l4e = _mm_slli_epi32(l3_l4e, 1); + __m128i l4_outer_mask = _mm_set_epi32(0x6, 0x6, 0x6, 0x6); + __m128i l4_outer_flags = _mm_and_si128(l3_l4e, l4_outer_mask); + l4_outer_flags = _mm_slli_epi32(l4_outer_flags, 20); + vlan0 = _mm_or_si128(vlan0, l4_outer_flags); /* we need to mask out the redundant bits */ l3_l4e = _mm_and_si128(l3_l4e, cksum_mask); @@ -310,10 +339,10 @@ desc_to_olflags_v(struct i40e_rx_queue *rxq, volatile union i40e_rx_desc *rxdp, * appropriate flags means that we have to do a shift and blend for * each mbuf before we do the write. */ - rearm0 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(vlan0, 8), 0x10); - rearm1 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(vlan0, 4), 0x10); - rearm2 = _mm_blend_epi16(mbuf_init, vlan0, 0x10); - rearm3 = _mm_blend_epi16(mbuf_init, _mm_srli_si128(vlan0, 4), 0x10); + rearm0 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(vlan0, 8), 0x30); + rearm1 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(vlan0, 4), 0x30); + rearm2 = _mm_blend_epi16(mbuf_init, vlan0, 0x30); + rearm3 = _mm_blend_epi16(mbuf_init, _mm_srli_si128(vlan0, 4), 0x30); /* write the rearm data and the olflags in one write */ RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) != -- 2.34.1