From: Leyi Rong
To: jingjing.wu@intel.com, qi.z.zhang@intel.com, beilei.xing@intel.com, xiaolong.ye@intel.com
Cc: dev@dpdk.org, Leyi Rong
Date: Wed, 8 Apr 2020 14:22:04 +0800
Message-Id: <20200408062210.34003-6-leyi.rong@intel.com>
X-Mailer: git-send-email 2.17.1
In-Reply-To: <20200408062210.34003-1-leyi.rong@intel.com>
References: <20200316074603.10998-1-leyi.rong@intel.com>
	<20200408062210.34003-1-leyi.rong@intel.com>
Subject: [dpdk-dev] [PATCH v3 05/11] net/iavf: flexible Rx descriptor support in SSE path

Support flexible Rx descriptor format in SSE path of iAVF PMD.

Signed-off-by: Leyi Rong
---
 drivers/net/iavf/iavf_rxtx.c         |   4 +-
 drivers/net/iavf/iavf_rxtx.h         |   5 +
 drivers/net/iavf/iavf_rxtx_vec_sse.c | 414 +++++++++++++++++++++++++++
 3 files changed, 421 insertions(+), 2 deletions(-)

diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index 34c41d104..09ce5e3f3 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -2085,7 +2085,7 @@ iavf_set_rx_function(struct rte_eth_dev *dev)
 			    VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC)
 				dev->rx_pkt_burst = use_avx2 ?
 					iavf_recv_scattered_pkts_vec_avx2_flex_rxd :
-					iavf_recv_scattered_pkts_vec;
+					iavf_recv_scattered_pkts_vec_flex_rxd;
 			else
 				dev->rx_pkt_burst = use_avx2 ?
 					iavf_recv_scattered_pkts_vec_avx2 :
@@ -2098,7 +2098,7 @@ iavf_set_rx_function(struct rte_eth_dev *dev)
 			    VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC)
 				dev->rx_pkt_burst = use_avx2 ?
 					iavf_recv_pkts_vec_avx2_flex_rxd :
-					iavf_recv_pkts_vec;
+					iavf_recv_pkts_vec_flex_rxd;
 			else
 				dev->rx_pkt_burst = use_avx2 ?
 					iavf_recv_pkts_vec_avx2 :
diff --git a/drivers/net/iavf/iavf_rxtx.h b/drivers/net/iavf/iavf_rxtx.h
index 8e1db2588..290dd68c1 100644
--- a/drivers/net/iavf/iavf_rxtx.h
+++ b/drivers/net/iavf/iavf_rxtx.h
@@ -406,9 +406,14 @@ int iavf_dev_tx_desc_status(void *tx_queue, uint16_t offset);
 
 uint16_t iavf_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 			    uint16_t nb_pkts);
+uint16_t iavf_recv_pkts_vec_flex_rxd(void *rx_queue, struct rte_mbuf **rx_pkts,
+				     uint16_t nb_pkts);
 uint16_t iavf_recv_scattered_pkts_vec(void *rx_queue,
 				      struct rte_mbuf **rx_pkts,
 				      uint16_t nb_pkts);
+uint16_t iavf_recv_scattered_pkts_vec_flex_rxd(void *rx_queue,
+					       struct rte_mbuf **rx_pkts,
+					       uint16_t nb_pkts);
 uint16_t iavf_xmit_fixed_burst_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
 				   uint16_t nb_pkts);
 uint16_t iavf_recv_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
diff --git a/drivers/net/iavf/iavf_rxtx_vec_sse.c b/drivers/net/iavf/iavf_rxtx_vec_sse.c
index 0365c49e1..9c1f2a445 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_sse.c
+++ b/drivers/net/iavf/iavf_rxtx_vec_sse.c
@@ -189,6 +189,109 @@ desc_to_olflags_v(struct iavf_rx_queue *rxq, __m128i descs[4],
 	_mm_store_si128((__m128i *)&rx_pkts[3]->rearm_data, rearm3);
 }
 
+static inline void
+flex_desc_to_olflags_v(struct iavf_rx_queue *rxq, __m128i descs[4],
+		       struct rte_mbuf **rx_pkts)
+{
+	const __m128i mbuf_init = _mm_set_epi64x(0, rxq->mbuf_initializer);
+	__m128i rearm0, rearm1, rearm2, rearm3;
+
+	__m128i tmp_desc, flags, rss_vlan;
+
+	/* mask everything except checksum, RSS and VLAN flags.
+	 * bit6:4 for checksum.
+	 * bit12 for RSS indication.
+	 * bit13 for VLAN indication.
+	 */
+	const __m128i desc_mask = _mm_set_epi32(0x3070, 0x3070,
+						0x3070, 0x3070);
+
+	const __m128i cksum_mask = _mm_set_epi32(PKT_RX_IP_CKSUM_MASK |
+						 PKT_RX_L4_CKSUM_MASK |
+						 PKT_RX_EIP_CKSUM_BAD,
+						 PKT_RX_IP_CKSUM_MASK |
+						 PKT_RX_L4_CKSUM_MASK |
+						 PKT_RX_EIP_CKSUM_BAD,
+						 PKT_RX_IP_CKSUM_MASK |
+						 PKT_RX_L4_CKSUM_MASK |
+						 PKT_RX_EIP_CKSUM_BAD,
+						 PKT_RX_IP_CKSUM_MASK |
+						 PKT_RX_L4_CKSUM_MASK |
+						 PKT_RX_EIP_CKSUM_BAD);
+
+	/* map the checksum, rss and vlan fields to the checksum, rss
+	 * and vlan flag
+	 */
+	const __m128i cksum_flags = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
+			/* shift right 1 bit to make sure it not exceed 255 */
+			(PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
+			 PKT_RX_IP_CKSUM_BAD) >> 1,
+			(PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
+			 PKT_RX_IP_CKSUM_GOOD) >> 1,
+			(PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD |
+			 PKT_RX_IP_CKSUM_BAD) >> 1,
+			(PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD |
+			 PKT_RX_IP_CKSUM_GOOD) >> 1,
+			(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
+			(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_GOOD) >> 1,
+			(PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD) >> 1,
+			(PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_GOOD) >> 1);
+
+	const __m128i rss_vlan_flags = _mm_set_epi8(0, 0, 0, 0,
+			0, 0, 0, 0,
+			0, 0, 0, 0,
+			PKT_RX_RSS_HASH | PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
+			PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
+			PKT_RX_RSS_HASH, 0);
+
+	/* merge 4 descriptors */
+	flags = _mm_unpackhi_epi32(descs[0], descs[1]);
+	tmp_desc = _mm_unpackhi_epi32(descs[2], descs[3]);
+	tmp_desc = _mm_unpacklo_epi64(flags, tmp_desc);
+	tmp_desc = _mm_and_si128(tmp_desc, desc_mask);
+
+	/* checksum flags */
+	tmp_desc = _mm_srli_epi32(tmp_desc, 4);
+	flags = _mm_shuffle_epi8(cksum_flags, tmp_desc);
+	/* then we shift left 1 bit */
+	flags = _mm_slli_epi32(flags, 1);
+	/* we need to mask out the redundant bits introduced by RSS or
+	 * VLAN fields.
+	 */
+	flags = _mm_and_si128(flags, cksum_mask);
+
+	/* RSS, VLAN flag */
+	tmp_desc = _mm_srli_epi32(tmp_desc, 8);
+	rss_vlan = _mm_shuffle_epi8(rss_vlan_flags, tmp_desc);
+
+	/* merge the flags */
+	flags = _mm_or_si128(flags, rss_vlan);
+
+	/**
+	 * At this point, we have the 4 sets of flags in the low 16-bits
+	 * of each 32-bit value in flags.
+	 * We want to extract these, and merge them with the mbuf init data
+	 * so we can do a single 16-byte write to the mbuf to set the flags
+	 * and all the other initialization fields. Extracting the
+	 * appropriate flags means that we have to do a shift and blend for
+	 * each mbuf before we do the write.
+	 */
+	rearm0 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(flags, 8), 0x10);
+	rearm1 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(flags, 4), 0x10);
+	rearm2 = _mm_blend_epi16(mbuf_init, flags, 0x10);
+	rearm3 = _mm_blend_epi16(mbuf_init, _mm_srli_si128(flags, 4), 0x10);
+
+	/* write the rearm data and the olflags in one write */
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) !=
+			 offsetof(struct rte_mbuf, rearm_data) + 8);
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) !=
+			 RTE_ALIGN(offsetof(struct rte_mbuf, rearm_data), 16));
+	_mm_store_si128((__m128i *)&rx_pkts[0]->rearm_data, rearm0);
+	_mm_store_si128((__m128i *)&rx_pkts[1]->rearm_data, rearm1);
+	_mm_store_si128((__m128i *)&rx_pkts[2]->rearm_data, rearm2);
+	_mm_store_si128((__m128i *)&rx_pkts[3]->rearm_data, rearm3);
+}
+
 #define PKTLEN_SHIFT 10
 
 static inline void
@@ -207,6 +310,26 @@ desc_to_ptype_v(__m128i descs[4], struct rte_mbuf **rx_pkts,
 	rx_pkts[3]->packet_type = type_table[_mm_extract_epi8(ptype1, 8)];
 }
 
+static inline void
+flex_desc_to_ptype_v(__m128i descs[4], struct rte_mbuf **rx_pkts,
+		     const uint32_t *type_table)
+{
+	const __m128i ptype_mask = _mm_set_epi16(0, IAVF_RX_FLEX_DESC_PTYPE_M,
+						 0, IAVF_RX_FLEX_DESC_PTYPE_M,
+						 0, IAVF_RX_FLEX_DESC_PTYPE_M,
+						 0, IAVF_RX_FLEX_DESC_PTYPE_M);
+	__m128i ptype_01 = _mm_unpacklo_epi32(descs[0], descs[1]);
+	__m128i ptype_23 = _mm_unpacklo_epi32(descs[2], descs[3]);
+	__m128i ptype_all = _mm_unpacklo_epi64(ptype_01, ptype_23);
+
+	ptype_all = _mm_and_si128(ptype_all, ptype_mask);
+
+	rx_pkts[0]->packet_type = type_table[_mm_extract_epi16(ptype_all, 1)];
+	rx_pkts[1]->packet_type = type_table[_mm_extract_epi16(ptype_all, 3)];
+	rx_pkts[2]->packet_type = type_table[_mm_extract_epi16(ptype_all, 5)];
+	rx_pkts[3]->packet_type = type_table[_mm_extract_epi16(ptype_all, 7)];
+}
+
 /* Notice:
  * - nb_pkts < IAVF_VPMD_DESCS_PER_LOOP, just return no packet
  * - nb_pkts > IAVF_VPMD_RX_MAX_BURST, only scan IAVF_VPMD_RX_MAX_BURST
@@ -455,6 +578,243 @@ _recv_raw_pkts_vec(struct iavf_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 	return nb_pkts_recd;
 }
 
+/* Notice:
+ * - nb_pkts < IAVF_VPMD_DESCS_PER_LOOP, just return no packet
+ * - nb_pkts > IAVF_VPMD_RX_MAX_BURST, only scan IAVF_VPMD_RX_MAX_BURST
+ *   numbers of DD bits
+ */
+static inline uint16_t
+_recv_raw_pkts_vec_flex_rxd(struct iavf_rx_queue *rxq,
+			    struct rte_mbuf **rx_pkts,
+			    uint16_t nb_pkts, uint8_t *split_packet)
+{
+	volatile union iavf_rx_flex_desc *rxdp;
+	struct rte_mbuf **sw_ring;
+	uint16_t nb_pkts_recd;
+	int pos;
+	uint64_t var;
+	const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
+	__m128i crc_adjust = _mm_set_epi16
+				(0, 0, 0,       /* ignore non-length fields */
+				 -rxq->crc_len, /* sub crc on data_len */
+				 0,             /* ignore high-16bits of pkt_len */
+				 -rxq->crc_len, /* sub crc on pkt_len */
+				 0, 0           /* ignore pkt_type field */
+				);
+	const __m128i zero = _mm_setzero_si128();
+	/* mask to shuffle from desc. to mbuf */
+	const __m128i shuf_msk = _mm_set_epi8
+			(15, 14, 13, 12,  /* octet 12~15, 32 bits rss */
+			 11, 10,          /* octet 10~11, 16 bits vlan_macip */
+			 5, 4,            /* octet 4~5, 16 bits data_len */
+			 0xFF, 0xFF,      /* skip high 16 bits pkt_len, zero out */
+			 5, 4,            /* octet 4~5, low 16 bits pkt_len */
+			 0xFF, 0xFF,      /* pkt_type set as unknown */
+			 0xFF, 0xFF       /* pkt_type set as unknown */
+			);
+	const __m128i eop_shuf_mask = _mm_set_epi8(0xFF, 0xFF,
+						   0xFF, 0xFF,
+						   0xFF, 0xFF,
+						   0xFF, 0xFF,
+						   0xFF, 0xFF,
+						   0xFF, 0xFF,
+						   0x04, 0x0C,
+						   0x00, 0x08);
+
+	/**
+	 * compile-time check the above crc_adjust layout is correct.
+	 * NOTE: the first field (lowest address) is given last in set_epi16
+	 * call above.
+	 */
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
+			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
+			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
+
+	/* 4 packets DD mask */
+	const __m128i dd_check = _mm_set_epi64x(0x0000000100000001LL,
+						0x0000000100000001LL);
+	/* 4 packets EOP mask */
+	const __m128i eop_check = _mm_set_epi64x(0x0000000200000002LL,
+						 0x0000000200000002LL);
+
+	/* nb_pkts shall be less than or equal to IAVF_VPMD_RX_MAX_BURST */
+	nb_pkts = RTE_MIN(nb_pkts, IAVF_VPMD_RX_MAX_BURST);
+
+	/* nb_pkts has to be floor-aligned to IAVF_VPMD_DESCS_PER_LOOP */
+	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, IAVF_VPMD_DESCS_PER_LOOP);
+
+	/* Just the act of getting into the function from the application is
+	 * going to cost about 7 cycles
+	 */
+	rxdp = (union iavf_rx_flex_desc *)rxq->rx_ring + rxq->rx_tail;
+
+	rte_prefetch0(rxdp);
+
+	/* See if we need to rearm the RX queue - gives the prefetch a bit
+	 * of time to act
+	 */
+	if (rxq->rxrearm_nb > rxq->rx_free_thresh)
+		iavf_rxq_rearm(rxq);
+
+	/* Before we start moving massive data around, check to see if
+	 * there is actually a packet available
+	 */
+	if (!(rxdp->wb.status_error0 &
+	      rte_cpu_to_le_32(1 << IAVF_RX_FLEX_DESC_STATUS0_DD_S)))
+		return 0;
+
+	/**
+	 * Compile-time verify the shuffle mask
+	 * NOTE: some field positions already verified above, but duplicated
+	 * here for completeness in case of future modifications.
+	 */
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
+			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
+			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, vlan_tci) !=
+			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 10);
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
+			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);
+
+	/* Cache is empty -> need to scan the buffer rings, but first move
+	 * the next 'n' mbufs into the cache
+	 */
+	sw_ring = &rxq->sw_ring[rxq->rx_tail];
+
+	/* A. load 4 packets in one loop
+	 * [A*. mask out 4 unused dirty field in desc]
+	 * B. copy 4 mbuf pointers from sw_ring to rx_pkts
+	 * C. calc the number of DD bits among the 4 packets
+	 * [C*. extract the end-of-packet bit, if requested]
+	 * D. fill info. from desc to mbuf
+	 */
+
+	for (pos = 0, nb_pkts_recd = 0; pos < nb_pkts;
+	     pos += IAVF_VPMD_DESCS_PER_LOOP,
+	     rxdp += IAVF_VPMD_DESCS_PER_LOOP) {
+		__m128i descs[IAVF_VPMD_DESCS_PER_LOOP];
+		__m128i pkt_mb1, pkt_mb2, pkt_mb3, pkt_mb4;
+		__m128i staterr, sterr_tmp1, sterr_tmp2;
+		/* 2 64 bit or 4 32 bit mbuf pointers in one XMM reg. */
+		__m128i mbp1;
+#if defined(RTE_ARCH_X86_64)
+		__m128i mbp2;
+#endif
+
+		/* B.1 load 2 (64 bit) or 4 (32 bit) mbuf pointers */
+		mbp1 = _mm_loadu_si128((__m128i *)&sw_ring[pos]);
+		/* Read desc statuses backwards to avoid race condition */
+		/* A.1 load 4 pkts desc */
+		descs[3] = _mm_loadu_si128((__m128i *)(rxdp + 3));
+		rte_compiler_barrier();
+
+		/* B.2 copy 2 64 bit or 4 32 bit mbuf pointers into rx_pkts */
+		_mm_storeu_si128((__m128i *)&rx_pkts[pos], mbp1);
+
+#if defined(RTE_ARCH_X86_64)
+		/* B.1 load 2 64 bit mbuf pointers */
+		mbp2 = _mm_loadu_si128((__m128i *)&sw_ring[pos + 2]);
+#endif
+
+		descs[2] = _mm_loadu_si128((__m128i *)(rxdp + 2));
+		rte_compiler_barrier();
+		/* B.1 load 2 mbuf pointers */
+		descs[1] = _mm_loadu_si128((__m128i *)(rxdp + 1));
+		rte_compiler_barrier();
+		descs[0] = _mm_loadu_si128((__m128i *)(rxdp));
+
+#if defined(RTE_ARCH_X86_64)
+		/* B.2 copy 2 mbuf pointers into rx_pkts */
+		_mm_storeu_si128((__m128i *)&rx_pkts[pos + 2], mbp2);
+#endif
+
+		if (split_packet) {
+			rte_mbuf_prefetch_part2(rx_pkts[pos]);
+			rte_mbuf_prefetch_part2(rx_pkts[pos + 1]);
+			rte_mbuf_prefetch_part2(rx_pkts[pos + 2]);
+			rte_mbuf_prefetch_part2(rx_pkts[pos + 3]);
+		}
+
+		/* avoid compiler reorder optimization */
+		rte_compiler_barrier();
+
+		/* D.1 pkt 3,4 convert format from desc to pktmbuf */
+		pkt_mb4 = _mm_shuffle_epi8(descs[3], shuf_msk);
+		pkt_mb3 = _mm_shuffle_epi8(descs[2], shuf_msk);
+
+		/* C.1 4=>2 filter staterr info only */
+		sterr_tmp2 = _mm_unpackhi_epi32(descs[3], descs[2]);
+		/* C.1 4=>2 filter staterr info only */
+		sterr_tmp1 = _mm_unpackhi_epi32(descs[1], descs[0]);
+
+		flex_desc_to_olflags_v(rxq, descs, &rx_pkts[pos]);
+
+		/* D.2 pkt 3,4 set in_port/nb_seg and remove crc */
+		pkt_mb4 = _mm_add_epi16(pkt_mb4, crc_adjust);
+		pkt_mb3 = _mm_add_epi16(pkt_mb3, crc_adjust);
+
+		/* D.1 pkt 1,2 convert format from desc to pktmbuf */
+		pkt_mb2 = _mm_shuffle_epi8(descs[1], shuf_msk);
+		pkt_mb1 = _mm_shuffle_epi8(descs[0], shuf_msk);
+
+		/* C.2 get 4 pkts staterr value */
+		staterr = _mm_unpacklo_epi32(sterr_tmp1, sterr_tmp2);
+
+		/* D.3 copy final 3,4 data to rx_pkts */
+		_mm_storeu_si128
+			((void *)&rx_pkts[pos + 3]->rx_descriptor_fields1,
+			 pkt_mb4);
+		_mm_storeu_si128
+			((void *)&rx_pkts[pos + 2]->rx_descriptor_fields1,
+			 pkt_mb3);
+
+		/* D.2 pkt 1,2 set in_port/nb_seg and remove crc */
+		pkt_mb2 = _mm_add_epi16(pkt_mb2, crc_adjust);
+		pkt_mb1 = _mm_add_epi16(pkt_mb1, crc_adjust);
+
+		/* C* extract and record EOP bit */
+		if (split_packet) {
+			/* and with mask to extract bits, flipping 1-0 */
+			__m128i eop_bits = _mm_andnot_si128(staterr, eop_check);
+			/* the staterr values are not in order, as the count
+			 * of dd bits doesn't care. However, for end of
+			 * packet tracking, we do care, so shuffle. This also
+			 * compresses the 32-bit values to 8-bit
+			 */
+			eop_bits = _mm_shuffle_epi8(eop_bits, eop_shuf_mask);
+			/* store the resulting 32-bit value */
+			*(int *)split_packet = _mm_cvtsi128_si32(eop_bits);
+			split_packet += IAVF_VPMD_DESCS_PER_LOOP;
+		}
+
+		/* C.3 calc available number of desc */
+		staterr = _mm_and_si128(staterr, dd_check);
+		staterr = _mm_packs_epi32(staterr, zero);
+
+		/* D.3 copy final 1,2 data to rx_pkts */
+		_mm_storeu_si128
+			((void *)&rx_pkts[pos + 1]->rx_descriptor_fields1,
+			 pkt_mb2);
+		_mm_storeu_si128((void *)&rx_pkts[pos]->rx_descriptor_fields1,
+				 pkt_mb1);
+		flex_desc_to_ptype_v(descs, &rx_pkts[pos], ptype_tbl);
+		/* C.4 calc available number of desc */
+		var = __builtin_popcountll(_mm_cvtsi128_si64(staterr));
+		nb_pkts_recd += var;
+		if (likely(var != IAVF_VPMD_DESCS_PER_LOOP))
+			break;
+	}
+
+	/* Update our internal tail pointer */
+	rxq->rx_tail = (uint16_t)(rxq->rx_tail + nb_pkts_recd);
+	rxq->rx_tail = (uint16_t)(rxq->rx_tail & (rxq->nb_rx_desc - 1));
+	rxq->rxrearm_nb = (uint16_t)(rxq->rxrearm_nb + nb_pkts_recd);
+
+	return nb_pkts_recd;
+}
+
 /* Notice:
  * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet
  * - nb_pkts > IAVF_VPMD_RX_MAX_BURST, only scan IAVF_VPMD_RX_MAX_BURST
@@ -467,6 +827,18 @@ iavf_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 	return _recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);
 }
 
+/* Notice:
+ * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet
+ * - nb_pkts > IAVF_VPMD_RX_MAX_BURST, only scan IAVF_VPMD_RX_MAX_BURST
+ *   numbers of DD bits
+ */
+uint16_t
+iavf_recv_pkts_vec_flex_rxd(void *rx_queue, struct rte_mbuf **rx_pkts,
+			    uint16_t nb_pkts)
+{
+	return _recv_raw_pkts_vec_flex_rxd(rx_queue, rx_pkts, nb_pkts, NULL);
+}
+
 /* vPMD receive routine that reassembles scattered packets
  * Notice:
  * - nb_pkts < IAVF_VPMD_DESCS_PER_LOOP, just return no packet
@@ -508,6 +880,48 @@ iavf_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 				      &split_flags[i]);
 }
 
+/* vPMD receive routine that reassembles scattered packets for flex RxD
+ * Notice:
+ * - nb_pkts < IAVF_VPMD_DESCS_PER_LOOP, just return no packet
+ * - nb_pkts > IAVF_VPMD_RX_MAX_BURST, only scan IAVF_VPMD_RX_MAX_BURST
+ *   numbers of DD bits
+ */
+uint16_t
+iavf_recv_scattered_pkts_vec_flex_rxd(void *rx_queue,
+				      struct rte_mbuf **rx_pkts,
+				      uint16_t nb_pkts)
+{
+	struct iavf_rx_queue *rxq = rx_queue;
+	uint8_t split_flags[IAVF_VPMD_RX_MAX_BURST] = {0};
+	unsigned int i = 0;
+
+	/* get some new buffers */
+	uint16_t nb_bufs = _recv_raw_pkts_vec_flex_rxd(rxq, rx_pkts, nb_pkts,
+						       split_flags);
+	if (nb_bufs == 0)
+		return 0;
+
+	/* happy day case, full burst + no packets to be joined */
+	const uint64_t *split_fl64 = (uint64_t *)split_flags;
+
+	if (!rxq->pkt_first_seg &&
+	    split_fl64[0] == 0 && split_fl64[1] == 0 &&
+	    split_fl64[2] == 0 && split_fl64[3] == 0)
+		return nb_bufs;
+
+	/* reassemble any packets that need reassembly */
+	if (!rxq->pkt_first_seg) {
+		/* find the first split flag, and only reassemble then */
+		while (i < nb_bufs && !split_flags[i])
+			i++;
+		if (i == nb_bufs)
+			return nb_bufs;
+		rxq->pkt_first_seg = rx_pkts[i];
+	}
+	return i + reassemble_packets(rxq, &rx_pkts[i], nb_bufs - i,
+				      &split_flags[i]);
+}
+
 static inline void
 vtx1(volatile struct iavf_tx_desc *txdp, struct rte_mbuf *pkt, uint64_t flags)
 {
-- 
2.17.1