DPDK patches and discussions
* [dpdk-dev] [PATCH 1/4] ixgbe: rearrange vector PMD code for x86
@ 2016-04-20 13:44 Jianbo Liu
  2016-04-20 13:45 ` [dpdk-dev] [PATCH 2/4] ixgbe: implement vector PMD for arm architecture Jianbo Liu
                   ` (8 more replies)
  0 siblings, 9 replies; 25+ messages in thread
From: Jianbo Liu @ 2016-04-20 13:44 UTC (permalink / raw)
  To: dev, jerin.jacob, helin.zhang, konstantin.ananyev; +Cc: Jianbo Liu

Move SSE-dependent code to a new file, "ixgbe_rxtx_vec_sse.h".

Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>
---
 drivers/net/ixgbe/ixgbe_rxtx_vec.c     | 369 +----------------------------
 drivers/net/ixgbe/ixgbe_rxtx_vec_sse.h | 408 +++++++++++++++++++++++++++++++++
 2 files changed, 409 insertions(+), 368 deletions(-)
 create mode 100644 drivers/net/ixgbe/ixgbe_rxtx_vec_sse.h

diff --git a/drivers/net/ixgbe/ixgbe_rxtx_vec.c b/drivers/net/ixgbe/ixgbe_rxtx_vec.c
index 5040704..064a00b 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx_vec.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx_vec.c
@@ -38,364 +38,7 @@
 #include "ixgbe_ethdev.h"
 #include "ixgbe_rxtx.h"
 
-#include <tmmintrin.h>
-
-#ifndef __INTEL_COMPILER
-#pragma GCC diagnostic ignored "-Wcast-qual"
-#endif
-
-static inline void
-ixgbe_rxq_rearm(struct ixgbe_rx_queue *rxq)
-{
-	int i;
-	uint16_t rx_id;
-	volatile union ixgbe_adv_rx_desc *rxdp;
-	struct ixgbe_rx_entry *rxep = &rxq->sw_ring[rxq->rxrearm_start];
-	struct rte_mbuf *mb0, *mb1;
-	__m128i hdr_room = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM,
-			RTE_PKTMBUF_HEADROOM);
-	__m128i dma_addr0, dma_addr1;
-
-	const __m128i hba_msk = _mm_set_epi64x(0, UINT64_MAX);
-
-	rxdp = rxq->rx_ring + rxq->rxrearm_start;
-
-	/* Pull 'n' more MBUFs into the software ring */
-	if (rte_mempool_get_bulk(rxq->mb_pool,
-				 (void *)rxep,
-				 RTE_IXGBE_RXQ_REARM_THRESH) < 0) {
-		if (rxq->rxrearm_nb + RTE_IXGBE_RXQ_REARM_THRESH >=
-		    rxq->nb_rx_desc) {
-			dma_addr0 = _mm_setzero_si128();
-			for (i = 0; i < RTE_IXGBE_DESCS_PER_LOOP; i++) {
-				rxep[i].mbuf = &rxq->fake_mbuf;
-				_mm_store_si128((__m128i *)&rxdp[i].read,
-						dma_addr0);
-			}
-		}
-		rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=
-			RTE_IXGBE_RXQ_REARM_THRESH;
-		return;
-	}
-
-	/* Initialize the mbufs in vector, process 2 mbufs in one loop */
-	for (i = 0; i < RTE_IXGBE_RXQ_REARM_THRESH; i += 2, rxep += 2) {
-		__m128i vaddr0, vaddr1;
-		uintptr_t p0, p1;
-
-		mb0 = rxep[0].mbuf;
-		mb1 = rxep[1].mbuf;
-
-		/*
-		 * Flush mbuf with pkt template.
-		 * Data to be rearmed is 6 bytes long.
-		 * Though, RX will overwrite ol_flags that are coming next
-		 * anyway. So overwrite whole 8 bytes with one load:
-		 * 6 bytes of rearm_data plus first 2 bytes of ol_flags.
-		 */
-		p0 = (uintptr_t)&mb0->rearm_data;
-		*(uint64_t *)p0 = rxq->mbuf_initializer;
-		p1 = (uintptr_t)&mb1->rearm_data;
-		*(uint64_t *)p1 = rxq->mbuf_initializer;
-
-		/* load buf_addr(lo 64bit) and buf_physaddr(hi 64bit) */
-		vaddr0 = _mm_loadu_si128((__m128i *)&(mb0->buf_addr));
-		vaddr1 = _mm_loadu_si128((__m128i *)&(mb1->buf_addr));
-
-		/* convert pa to dma_addr hdr/data */
-		dma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0);
-		dma_addr1 = _mm_unpackhi_epi64(vaddr1, vaddr1);
-
-		/* add headroom to pa values */
-		dma_addr0 = _mm_add_epi64(dma_addr0, hdr_room);
-		dma_addr1 = _mm_add_epi64(dma_addr1, hdr_room);
-
-		/* set Header Buffer Address to zero */
-		dma_addr0 =  _mm_and_si128(dma_addr0, hba_msk);
-		dma_addr1 =  _mm_and_si128(dma_addr1, hba_msk);
-
-		/* flush desc with pa dma_addr */
-		_mm_store_si128((__m128i *)&rxdp++->read, dma_addr0);
-		_mm_store_si128((__m128i *)&rxdp++->read, dma_addr1);
-	}
-
-	rxq->rxrearm_start += RTE_IXGBE_RXQ_REARM_THRESH;
-	if (rxq->rxrearm_start >= rxq->nb_rx_desc)
-		rxq->rxrearm_start = 0;
-
-	rxq->rxrearm_nb -= RTE_IXGBE_RXQ_REARM_THRESH;
-
-	rx_id = (uint16_t) ((rxq->rxrearm_start == 0) ?
-			     (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));
-
-	/* Update the tail pointer on the NIC */
-	IXGBE_PCI_REG_WRITE(rxq->rdt_reg_addr, rx_id);
-}
-
-/* Handling the offload flags (olflags) field takes computation
- * time when receiving packets. Therefore we provide a flag to disable
- * the processing of the olflags field when they are not needed. This
- * gives improved performance, at the cost of losing the offload info
- * in the received packet
- */
-#ifdef RTE_IXGBE_RX_OLFLAGS_ENABLE
-
-#define VTAG_SHIFT     (3)
-
-static inline void
-desc_to_olflags_v(__m128i descs[4], struct rte_mbuf **rx_pkts)
-{
-	__m128i ptype0, ptype1, vtag0, vtag1;
-	union {
-		uint16_t e[4];
-		uint64_t dword;
-	} vol;
-
-	/* pkt type + vlan olflags mask */
-	const __m128i pkttype_msk = _mm_set_epi16(
-			0x0000, 0x0000, 0x0000, 0x0000,
-			PKT_RX_VLAN_PKT, PKT_RX_VLAN_PKT, PKT_RX_VLAN_PKT, PKT_RX_VLAN_PKT);
-
-	/* mask everything except rss type */
-	const __m128i rsstype_msk = _mm_set_epi16(
-			0x0000, 0x0000, 0x0000, 0x0000,
-			0x000F, 0x000F, 0x000F, 0x000F);
-
-	/* map rss type to rss hash flag */
-	const __m128i rss_flags = _mm_set_epi8(PKT_RX_FDIR, 0, 0, 0,
-			0, 0, 0, PKT_RX_RSS_HASH,
-			PKT_RX_RSS_HASH, 0, PKT_RX_RSS_HASH, 0,
-			PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, 0);
-
-	ptype0 = _mm_unpacklo_epi16(descs[0], descs[1]);
-	ptype1 = _mm_unpacklo_epi16(descs[2], descs[3]);
-	vtag0 = _mm_unpackhi_epi16(descs[0], descs[1]);
-	vtag1 = _mm_unpackhi_epi16(descs[2], descs[3]);
-
-	ptype0 = _mm_unpacklo_epi32(ptype0, ptype1);
-	ptype0 = _mm_and_si128(ptype0, rsstype_msk);
-	ptype0 = _mm_shuffle_epi8(rss_flags, ptype0);
-
-	vtag1 = _mm_unpacklo_epi32(vtag0, vtag1);
-	vtag1 = _mm_srli_epi16(vtag1, VTAG_SHIFT);
-	vtag1 = _mm_and_si128(vtag1, pkttype_msk);
-
-	vtag1 = _mm_or_si128(ptype0, vtag1);
-	vol.dword = _mm_cvtsi128_si64(vtag1);
-
-	rx_pkts[0]->ol_flags = vol.e[0];
-	rx_pkts[1]->ol_flags = vol.e[1];
-	rx_pkts[2]->ol_flags = vol.e[2];
-	rx_pkts[3]->ol_flags = vol.e[3];
-}
-#else
-#define desc_to_olflags_v(desc, rx_pkts) do {} while (0)
-#endif
-
-/*
- * vPMD raw receive routine, only accepts (nb_pkts >= RTE_IXGBE_DESCS_PER_LOOP)
- *
- * Notice:
- * - nb_pkts < RTE_IXGBE_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > RTE_IXGBE_MAX_RX_BURST, only scan RTE_IXGBE_MAX_RX_BURST
- *   numbers of DD bit
- * - floor align nb_pkts to a RTE_IXGBE_DESC_PER_LOOP power-of-two
- * - don't support ol_flags for rss and csum err
- */
-static inline uint16_t
-_recv_raw_pkts_vec(struct ixgbe_rx_queue *rxq, struct rte_mbuf **rx_pkts,
-		uint16_t nb_pkts, uint8_t *split_packet)
-{
-	volatile union ixgbe_adv_rx_desc *rxdp;
-	struct ixgbe_rx_entry *sw_ring;
-	uint16_t nb_pkts_recd;
-	int pos;
-	uint64_t var;
-	__m128i shuf_msk;
-	__m128i crc_adjust = _mm_set_epi16(
-				0, 0, 0,    /* ignore non-length fields */
-				-rxq->crc_len, /* sub crc on data_len */
-				0,          /* ignore high-16bits of pkt_len */
-				-rxq->crc_len, /* sub crc on pkt_len */
-				0, 0            /* ignore pkt_type field */
-			);
-	__m128i dd_check, eop_check;
-
-	/* nb_pkts shall be less than or equal to RTE_IXGBE_MAX_RX_BURST */
-	nb_pkts = RTE_MIN(nb_pkts, RTE_IXGBE_MAX_RX_BURST);
-
-	/* nb_pkts has to be floor-aligned to RTE_IXGBE_DESCS_PER_LOOP */
-	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_IXGBE_DESCS_PER_LOOP);
-
-	/* Just the act of getting into the function from the application is
-	 * going to cost about 7 cycles */
-	rxdp = rxq->rx_ring + rxq->rx_tail;
-
-	_mm_prefetch((const void *)rxdp, _MM_HINT_T0);
-
-	/* See if we need to rearm the RX queue - gives the prefetch a bit
-	 * of time to act */
-	if (rxq->rxrearm_nb > RTE_IXGBE_RXQ_REARM_THRESH)
-		ixgbe_rxq_rearm(rxq);
-
-	/* Before we start moving massive data around, check to see if
-	 * there is actually a packet available */
-	if (!(rxdp->wb.upper.status_error &
-				rte_cpu_to_le_32(IXGBE_RXDADV_STAT_DD)))
-		return 0;
-
-	/* 4 packets DD mask */
-	dd_check = _mm_set_epi64x(0x0000000100000001LL, 0x0000000100000001LL);
-
-	/* 4 packets EOP mask */
-	eop_check = _mm_set_epi64x(0x0000000200000002LL, 0x0000000200000002LL);
-
-	/* mask to shuffle from desc. to mbuf */
-	shuf_msk = _mm_set_epi8(
-		7, 6, 5, 4,  /* octet 4~7, 32bits rss */
-		15, 14,      /* octet 14~15, low 16 bits vlan_macip */
-		13, 12,      /* octet 12~13, 16 bits data_len */
-		0xFF, 0xFF,  /* skip high 16 bits pkt_len, zero out */
-		13, 12,      /* octet 12~13, low 16 bits pkt_len */
-		0xFF, 0xFF,  /* skip 32 bit pkt_type */
-		0xFF, 0xFF
-		);
-
-	/* Cache is empty -> need to scan the buffer rings, but first move
-	 * the next 'n' mbufs into the cache */
-	sw_ring = &rxq->sw_ring[rxq->rx_tail];
-
-	/* A. load 4 packet in one loop
-	 * [A*. mask out 4 unused dirty field in desc]
-	 * B. copy 4 mbuf point from swring to rx_pkts
-	 * C. calc the number of DD bits among the 4 packets
-	 * [C*. extract the end-of-packet bit, if requested]
-	 * D. fill info. from desc to mbuf
-	 */
-	for (pos = 0, nb_pkts_recd = 0; pos < nb_pkts;
-			pos += RTE_IXGBE_DESCS_PER_LOOP,
-			rxdp += RTE_IXGBE_DESCS_PER_LOOP) {
-		__m128i descs[RTE_IXGBE_DESCS_PER_LOOP];
-		__m128i pkt_mb1, pkt_mb2, pkt_mb3, pkt_mb4;
-		__m128i zero, staterr, sterr_tmp1, sterr_tmp2;
-		__m128i mbp1, mbp2; /* two mbuf pointer in one XMM reg. */
-
-		/* B.1 load 1 mbuf point */
-		mbp1 = _mm_loadu_si128((__m128i *)&sw_ring[pos]);
-
-		/* Read desc statuses backwards to avoid race condition */
-		/* A.1 load 4 pkts desc */
-		descs[3] = _mm_loadu_si128((__m128i *)(rxdp + 3));
-
-		/* B.2 copy 2 mbuf point into rx_pkts  */
-		_mm_storeu_si128((__m128i *)&rx_pkts[pos], mbp1);
-
-		/* B.1 load 1 mbuf point */
-		mbp2 = _mm_loadu_si128((__m128i *)&sw_ring[pos+2]);
-
-		descs[2] = _mm_loadu_si128((__m128i *)(rxdp + 2));
-		/* B.1 load 2 mbuf point */
-		descs[1] = _mm_loadu_si128((__m128i *)(rxdp + 1));
-		descs[0] = _mm_loadu_si128((__m128i *)(rxdp));
-
-		/* B.2 copy 2 mbuf point into rx_pkts  */
-		_mm_storeu_si128((__m128i *)&rx_pkts[pos+2], mbp2);
-
-		if (split_packet) {
-			rte_prefetch0(&rx_pkts[pos]->cacheline1);
-			rte_prefetch0(&rx_pkts[pos + 1]->cacheline1);
-			rte_prefetch0(&rx_pkts[pos + 2]->cacheline1);
-			rte_prefetch0(&rx_pkts[pos + 3]->cacheline1);
-		}
-
-		/* avoid compiler reorder optimization */
-		rte_compiler_barrier();
-
-		/* D.1 pkt 3,4 convert format from desc to pktmbuf */
-		pkt_mb4 = _mm_shuffle_epi8(descs[3], shuf_msk);
-		pkt_mb3 = _mm_shuffle_epi8(descs[2], shuf_msk);
-
-		/* D.1 pkt 1,2 convert format from desc to pktmbuf */
-		pkt_mb2 = _mm_shuffle_epi8(descs[1], shuf_msk);
-		pkt_mb1 = _mm_shuffle_epi8(descs[0], shuf_msk);
-
-		/* C.1 4=>2 filter staterr info only */
-		sterr_tmp2 = _mm_unpackhi_epi32(descs[3], descs[2]);
-		/* C.1 4=>2 filter staterr info only */
-		sterr_tmp1 = _mm_unpackhi_epi32(descs[1], descs[0]);
-
-		/* set ol_flags with vlan packet type */
-		desc_to_olflags_v(descs, &rx_pkts[pos]);
-
-		/* D.2 pkt 3,4 set in_port/nb_seg and remove crc */
-		pkt_mb4 = _mm_add_epi16(pkt_mb4, crc_adjust);
-		pkt_mb3 = _mm_add_epi16(pkt_mb3, crc_adjust);
-
-		/* C.2 get 4 pkts staterr value  */
-		zero = _mm_xor_si128(dd_check, dd_check);
-		staterr = _mm_unpacklo_epi32(sterr_tmp1, sterr_tmp2);
-
-		/* D.3 copy final 3,4 data to rx_pkts */
-		_mm_storeu_si128((void *)&rx_pkts[pos+3]->rx_descriptor_fields1,
-				pkt_mb4);
-		_mm_storeu_si128((void *)&rx_pkts[pos+2]->rx_descriptor_fields1,
-				pkt_mb3);
-
-		/* D.2 pkt 1,2 set in_port/nb_seg and remove crc */
-		pkt_mb2 = _mm_add_epi16(pkt_mb2, crc_adjust);
-		pkt_mb1 = _mm_add_epi16(pkt_mb1, crc_adjust);
-
-		/* C* extract and record EOP bit */
-		if (split_packet) {
-			__m128i eop_shuf_mask = _mm_set_epi8(
-					0xFF, 0xFF, 0xFF, 0xFF,
-					0xFF, 0xFF, 0xFF, 0xFF,
-					0xFF, 0xFF, 0xFF, 0xFF,
-					0x04, 0x0C, 0x00, 0x08
-					);
-
-			/* and with mask to extract bits, flipping 1-0 */
-			__m128i eop_bits = _mm_andnot_si128(staterr, eop_check);
-			/* the staterr values are not in order, as the count
-			 * of DD bits doesn't care. However, for end-of-packet
-			 * tracking we do care, so shuffle. This also
-			 * compresses the 32-bit values to 8-bit */
-			eop_bits = _mm_shuffle_epi8(eop_bits, eop_shuf_mask);
-			/* store the resulting 32-bit value */
-			*(int *)split_packet = _mm_cvtsi128_si32(eop_bits);
-			split_packet += RTE_IXGBE_DESCS_PER_LOOP;
-
-			/* zero-out next pointers */
-			rx_pkts[pos]->next = NULL;
-			rx_pkts[pos + 1]->next = NULL;
-			rx_pkts[pos + 2]->next = NULL;
-			rx_pkts[pos + 3]->next = NULL;
-		}
-
-		/* C.3 calc available number of desc */
-		staterr = _mm_and_si128(staterr, dd_check);
-		staterr = _mm_packs_epi32(staterr, zero);
-
-		/* D.3 copy final 1,2 data to rx_pkts */
-		_mm_storeu_si128((void *)&rx_pkts[pos+1]->rx_descriptor_fields1,
-				pkt_mb2);
-		_mm_storeu_si128((void *)&rx_pkts[pos]->rx_descriptor_fields1,
-				pkt_mb1);
-
-		/* C.4 calc available number of desc */
-		var = __builtin_popcountll(_mm_cvtsi128_si64(staterr));
-		nb_pkts_recd += var;
-		if (likely(var != RTE_IXGBE_DESCS_PER_LOOP))
-			break;
-	}
-
-	/* Update our internal tail pointer */
-	rxq->rx_tail = (uint16_t)(rxq->rx_tail + nb_pkts_recd);
-	rxq->rx_tail = (uint16_t)(rxq->rx_tail & (rxq->nb_rx_desc - 1));
-	rxq->rxrearm_nb = (uint16_t)(rxq->rxrearm_nb + nb_pkts_recd);
-
-	return nb_pkts_recd;
-}
+#include "ixgbe_rxtx_vec_sse.h"
 
 /*
 * vPMD receive routine, only accepts (nb_pkts >= RTE_IXGBE_DESCS_PER_LOOP)
@@ -521,16 +164,6 @@ ixgbe_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 }
 
 static inline void
-vtx1(volatile union ixgbe_adv_tx_desc *txdp,
-		struct rte_mbuf *pkt, uint64_t flags)
-{
-	__m128i descriptor = _mm_set_epi64x((uint64_t)pkt->pkt_len << 46 |
-			flags | pkt->data_len,
-			pkt->buf_physaddr + pkt->data_off);
-	_mm_store_si128((__m128i *)&txdp->read, descriptor);
-}
-
-static inline void
 vtx(volatile union ixgbe_adv_tx_desc *txdp,
 		struct rte_mbuf **pkt, uint16_t nb_pkts,  uint64_t flags)
 {
diff --git a/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.h b/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.h
new file mode 100644
index 0000000..8f52778
--- /dev/null
+++ b/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.h
@@ -0,0 +1,408 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdint.h>
+#include <rte_ethdev.h>
+#include <rte_malloc.h>
+
+#include "ixgbe_ethdev.h"
+#include "ixgbe_rxtx.h"
+
+#include <tmmintrin.h>
+
+#ifndef __INTEL_COMPILER
+#pragma GCC diagnostic ignored "-Wcast-qual"
+#endif
+
+static inline void
+ixgbe_rxq_rearm(struct ixgbe_rx_queue *rxq)
+{
+	int i;
+	uint16_t rx_id;
+	volatile union ixgbe_adv_rx_desc *rxdp;
+	struct ixgbe_rx_entry *rxep = &rxq->sw_ring[rxq->rxrearm_start];
+	struct rte_mbuf *mb0, *mb1;
+	__m128i hdr_room = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM,
+			RTE_PKTMBUF_HEADROOM);
+	__m128i dma_addr0, dma_addr1;
+
+	const __m128i hba_msk = _mm_set_epi64x(0, UINT64_MAX);
+
+	rxdp = rxq->rx_ring + rxq->rxrearm_start;
+
+	/* Pull 'n' more MBUFs into the software ring */
+	if (rte_mempool_get_bulk(rxq->mb_pool,
+				 (void *)rxep,
+				 RTE_IXGBE_RXQ_REARM_THRESH) < 0) {
+		if (rxq->rxrearm_nb + RTE_IXGBE_RXQ_REARM_THRESH >=
+		    rxq->nb_rx_desc) {
+			dma_addr0 = _mm_setzero_si128();
+			for (i = 0; i < RTE_IXGBE_DESCS_PER_LOOP; i++) {
+				rxep[i].mbuf = &rxq->fake_mbuf;
+				_mm_store_si128((__m128i *)&rxdp[i].read,
+						dma_addr0);
+			}
+		}
+		rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=
+			RTE_IXGBE_RXQ_REARM_THRESH;
+		return;
+	}
+
+	/* Initialize the mbufs in vector, process 2 mbufs in one loop */
+	for (i = 0; i < RTE_IXGBE_RXQ_REARM_THRESH; i += 2, rxep += 2) {
+		__m128i vaddr0, vaddr1;
+		uintptr_t p0, p1;
+
+		mb0 = rxep[0].mbuf;
+		mb1 = rxep[1].mbuf;
+
+		/*
+		 * Flush mbuf with pkt template.
+		 * Data to be rearmed is 6 bytes long.
+		 * Though, RX will overwrite ol_flags that are coming next
+		 * anyway. So overwrite whole 8 bytes with one load:
+		 * 6 bytes of rearm_data plus first 2 bytes of ol_flags.
+		 */
+		p0 = (uintptr_t)&mb0->rearm_data;
+		*(uint64_t *)p0 = rxq->mbuf_initializer;
+		p1 = (uintptr_t)&mb1->rearm_data;
+		*(uint64_t *)p1 = rxq->mbuf_initializer;
+
+		/* load buf_addr(lo 64bit) and buf_physaddr(hi 64bit) */
+		vaddr0 = _mm_loadu_si128((__m128i *)&(mb0->buf_addr));
+		vaddr1 = _mm_loadu_si128((__m128i *)&(mb1->buf_addr));
+
+		/* convert pa to dma_addr hdr/data */
+		dma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0);
+		dma_addr1 = _mm_unpackhi_epi64(vaddr1, vaddr1);
+
+		/* add headroom to pa values */
+		dma_addr0 = _mm_add_epi64(dma_addr0, hdr_room);
+		dma_addr1 = _mm_add_epi64(dma_addr1, hdr_room);
+
+		/* set Header Buffer Address to zero */
+		dma_addr0 =  _mm_and_si128(dma_addr0, hba_msk);
+		dma_addr1 =  _mm_and_si128(dma_addr1, hba_msk);
+
+		/* flush desc with pa dma_addr */
+		_mm_store_si128((__m128i *)&rxdp++->read, dma_addr0);
+		_mm_store_si128((__m128i *)&rxdp++->read, dma_addr1);
+	}
+
+	rxq->rxrearm_start += RTE_IXGBE_RXQ_REARM_THRESH;
+	if (rxq->rxrearm_start >= rxq->nb_rx_desc)
+		rxq->rxrearm_start = 0;
+
+	rxq->rxrearm_nb -= RTE_IXGBE_RXQ_REARM_THRESH;
+
+	rx_id = (uint16_t) ((rxq->rxrearm_start == 0) ?
+			     (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));
+
+	/* Update the tail pointer on the NIC */
+	IXGBE_PCI_REG_WRITE(rxq->rdt_reg_addr, rx_id);
+}
+
+/* Handling the offload flags (olflags) field takes computation
+ * time when receiving packets. Therefore we provide a flag to disable
+ * the processing of the olflags field when they are not needed. This
+ * gives improved performance, at the cost of losing the offload info
+ * in the received packet
+ */
+#ifdef RTE_IXGBE_RX_OLFLAGS_ENABLE
+
+#define VTAG_SHIFT     (3)
+
+static inline void
+desc_to_olflags_v(__m128i descs[4], struct rte_mbuf **rx_pkts)
+{
+	__m128i ptype0, ptype1, vtag0, vtag1;
+	union {
+		uint16_t e[4];
+		uint64_t dword;
+	} vol;
+
+	/* pkt type + vlan olflags mask */
+	const __m128i pkttype_msk = _mm_set_epi16(
+			0x0000, 0x0000, 0x0000, 0x0000,
+			PKT_RX_VLAN_PKT, PKT_RX_VLAN_PKT, PKT_RX_VLAN_PKT, PKT_RX_VLAN_PKT);
+
+	/* mask everything except rss type */
+	const __m128i rsstype_msk = _mm_set_epi16(
+			0x0000, 0x0000, 0x0000, 0x0000,
+			0x000F, 0x000F, 0x000F, 0x000F);
+
+	/* map rss type to rss hash flag */
+	const __m128i rss_flags = _mm_set_epi8(PKT_RX_FDIR, 0, 0, 0,
+			0, 0, 0, PKT_RX_RSS_HASH,
+			PKT_RX_RSS_HASH, 0, PKT_RX_RSS_HASH, 0,
+			PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, 0);
+
+	ptype0 = _mm_unpacklo_epi16(descs[0], descs[1]);
+	ptype1 = _mm_unpacklo_epi16(descs[2], descs[3]);
+	vtag0 = _mm_unpackhi_epi16(descs[0], descs[1]);
+	vtag1 = _mm_unpackhi_epi16(descs[2], descs[3]);
+
+	ptype0 = _mm_unpacklo_epi32(ptype0, ptype1);
+	ptype0 = _mm_and_si128(ptype0, rsstype_msk);
+	ptype0 = _mm_shuffle_epi8(rss_flags, ptype0);
+
+	vtag1 = _mm_unpacklo_epi32(vtag0, vtag1);
+	vtag1 = _mm_srli_epi16(vtag1, VTAG_SHIFT);
+	vtag1 = _mm_and_si128(vtag1, pkttype_msk);
+
+	vtag1 = _mm_or_si128(ptype0, vtag1);
+	vol.dword = _mm_cvtsi128_si64(vtag1);
+
+	rx_pkts[0]->ol_flags = vol.e[0];
+	rx_pkts[1]->ol_flags = vol.e[1];
+	rx_pkts[2]->ol_flags = vol.e[2];
+	rx_pkts[3]->ol_flags = vol.e[3];
+}
+#else
+#define desc_to_olflags_v(desc, rx_pkts) do {} while (0)
+#endif
+
+/*
+ * vPMD raw receive routine, only accepts (nb_pkts >= RTE_IXGBE_DESCS_PER_LOOP)
+ *
+ * Notice:
+ * - nb_pkts < RTE_IXGBE_DESCS_PER_LOOP, just return no packet
+ * - nb_pkts > RTE_IXGBE_MAX_RX_BURST, only scan RTE_IXGBE_MAX_RX_BURST
+ *   numbers of DD bit
+ * - floor align nb_pkts to a RTE_IXGBE_DESC_PER_LOOP power-of-two
+ * - don't support ol_flags for rss and csum err
+ */
+static inline uint16_t
+_recv_raw_pkts_vec(struct ixgbe_rx_queue *rxq, struct rte_mbuf **rx_pkts,
+		uint16_t nb_pkts, uint8_t *split_packet)
+{
+	volatile union ixgbe_adv_rx_desc *rxdp;
+	struct ixgbe_rx_entry *sw_ring;
+	uint16_t nb_pkts_recd;
+	int pos;
+	uint64_t var;
+	__m128i shuf_msk;
+	__m128i crc_adjust = _mm_set_epi16(
+				0, 0, 0,    /* ignore non-length fields */
+				-rxq->crc_len, /* sub crc on data_len */
+				0,          /* ignore high-16bits of pkt_len */
+				-rxq->crc_len, /* sub crc on pkt_len */
+				0, 0            /* ignore pkt_type field */
+			);
+	__m128i dd_check, eop_check;
+
+	/* nb_pkts shall be less than or equal to RTE_IXGBE_MAX_RX_BURST */
+	nb_pkts = RTE_MIN(nb_pkts, RTE_IXGBE_MAX_RX_BURST);
+
+	/* nb_pkts has to be floor-aligned to RTE_IXGBE_DESCS_PER_LOOP */
+	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_IXGBE_DESCS_PER_LOOP);
+
+	/* Just the act of getting into the function from the application is
+	 * going to cost about 7 cycles */
+	rxdp = rxq->rx_ring + rxq->rx_tail;
+
+	_mm_prefetch((const void *)rxdp, _MM_HINT_T0);
+
+	/* See if we need to rearm the RX queue - gives the prefetch a bit
+	 * of time to act */
+	if (rxq->rxrearm_nb > RTE_IXGBE_RXQ_REARM_THRESH)
+		ixgbe_rxq_rearm(rxq);
+
+	/* Before we start moving massive data around, check to see if
+	 * there is actually a packet available */
+	if (!(rxdp->wb.upper.status_error &
+				rte_cpu_to_le_32(IXGBE_RXDADV_STAT_DD)))
+		return 0;
+
+	/* 4 packets DD mask */
+	dd_check = _mm_set_epi64x(0x0000000100000001LL, 0x0000000100000001LL);
+
+	/* 4 packets EOP mask */
+	eop_check = _mm_set_epi64x(0x0000000200000002LL, 0x0000000200000002LL);
+
+	/* mask to shuffle from desc. to mbuf */
+	shuf_msk = _mm_set_epi8(
+		7, 6, 5, 4,  /* octet 4~7, 32bits rss */
+		15, 14,      /* octet 14~15, low 16 bits vlan_macip */
+		13, 12,      /* octet 12~13, 16 bits data_len */
+		0xFF, 0xFF,  /* skip high 16 bits pkt_len, zero out */
+		13, 12,      /* octet 12~13, low 16 bits pkt_len */
+		0xFF, 0xFF,  /* skip 32 bit pkt_type */
+		0xFF, 0xFF
+		);
+
+	/* Cache is empty -> need to scan the buffer rings, but first move
+	 * the next 'n' mbufs into the cache */
+	sw_ring = &rxq->sw_ring[rxq->rx_tail];
+
+	/* A. load 4 packet in one loop
+	 * [A*. mask out 4 unused dirty field in desc]
+	 * B. copy 4 mbuf point from swring to rx_pkts
+	 * C. calc the number of DD bits among the 4 packets
+	 * [C*. extract the end-of-packet bit, if requested]
+	 * D. fill info. from desc to mbuf
+	 */
+	for (pos = 0, nb_pkts_recd = 0; pos < nb_pkts;
+			pos += RTE_IXGBE_DESCS_PER_LOOP,
+			rxdp += RTE_IXGBE_DESCS_PER_LOOP) {
+		__m128i descs[RTE_IXGBE_DESCS_PER_LOOP];
+		__m128i pkt_mb1, pkt_mb2, pkt_mb3, pkt_mb4;
+		__m128i zero, staterr, sterr_tmp1, sterr_tmp2;
+		__m128i mbp1, mbp2; /* two mbuf pointer in one XMM reg. */
+
+		/* B.1 load 1 mbuf point */
+		mbp1 = _mm_loadu_si128((__m128i *)&sw_ring[pos]);
+
+		/* Read desc statuses backwards to avoid race condition */
+		/* A.1 load 4 pkts desc */
+		descs[3] = _mm_loadu_si128((__m128i *)(rxdp + 3));
+
+		/* B.2 copy 2 mbuf point into rx_pkts  */
+		_mm_storeu_si128((__m128i *)&rx_pkts[pos], mbp1);
+
+		/* B.1 load 1 mbuf point */
+		mbp2 = _mm_loadu_si128((__m128i *)&sw_ring[pos+2]);
+
+		descs[2] = _mm_loadu_si128((__m128i *)(rxdp + 2));
+		/* B.1 load 2 mbuf point */
+		descs[1] = _mm_loadu_si128((__m128i *)(rxdp + 1));
+		descs[0] = _mm_loadu_si128((__m128i *)(rxdp));
+
+		/* B.2 copy 2 mbuf point into rx_pkts  */
+		_mm_storeu_si128((__m128i *)&rx_pkts[pos+2], mbp2);
+
+		if (split_packet) {
+			rte_prefetch0(&rx_pkts[pos]->cacheline1);
+			rte_prefetch0(&rx_pkts[pos + 1]->cacheline1);
+			rte_prefetch0(&rx_pkts[pos + 2]->cacheline1);
+			rte_prefetch0(&rx_pkts[pos + 3]->cacheline1);
+		}
+
+		/* avoid compiler reorder optimization */
+		rte_compiler_barrier();
+
+		/* D.1 pkt 3,4 convert format from desc to pktmbuf */
+		pkt_mb4 = _mm_shuffle_epi8(descs[3], shuf_msk);
+		pkt_mb3 = _mm_shuffle_epi8(descs[2], shuf_msk);
+
+		/* D.1 pkt 1,2 convert format from desc to pktmbuf */
+		pkt_mb2 = _mm_shuffle_epi8(descs[1], shuf_msk);
+		pkt_mb1 = _mm_shuffle_epi8(descs[0], shuf_msk);
+
+		/* C.1 4=>2 filter staterr info only */
+		sterr_tmp2 = _mm_unpackhi_epi32(descs[3], descs[2]);
+		/* C.1 4=>2 filter staterr info only */
+		sterr_tmp1 = _mm_unpackhi_epi32(descs[1], descs[0]);
+
+		/* set ol_flags with vlan packet type */
+		desc_to_olflags_v(descs, &rx_pkts[pos]);
+
+		/* D.2 pkt 3,4 set in_port/nb_seg and remove crc */
+		pkt_mb4 = _mm_add_epi16(pkt_mb4, crc_adjust);
+		pkt_mb3 = _mm_add_epi16(pkt_mb3, crc_adjust);
+
+		/* C.2 get 4 pkts staterr value  */
+		zero = _mm_xor_si128(dd_check, dd_check);
+		staterr = _mm_unpacklo_epi32(sterr_tmp1, sterr_tmp2);
+
+		/* D.3 copy final 3,4 data to rx_pkts */
+		_mm_storeu_si128((void *)&rx_pkts[pos+3]->rx_descriptor_fields1,
+				pkt_mb4);
+		_mm_storeu_si128((void *)&rx_pkts[pos+2]->rx_descriptor_fields1,
+				pkt_mb3);
+
+		/* D.2 pkt 1,2 set in_port/nb_seg and remove crc */
+		pkt_mb2 = _mm_add_epi16(pkt_mb2, crc_adjust);
+		pkt_mb1 = _mm_add_epi16(pkt_mb1, crc_adjust);
+
+		/* C* extract and record EOP bit */
+		if (split_packet) {
+			__m128i eop_shuf_mask = _mm_set_epi8(
+					0xFF, 0xFF, 0xFF, 0xFF,
+					0xFF, 0xFF, 0xFF, 0xFF,
+					0xFF, 0xFF, 0xFF, 0xFF,
+					0x04, 0x0C, 0x00, 0x08
+					);
+
+			/* and with mask to extract bits, flipping 1-0 */
+			__m128i eop_bits = _mm_andnot_si128(staterr, eop_check);
+			/* the staterr values are not in order, as the count
+			 * of DD bits doesn't care. However, for end-of-packet
+			 * tracking we do care, so shuffle. This also
+			 * compresses the 32-bit values to 8-bit */
+			eop_bits = _mm_shuffle_epi8(eop_bits, eop_shuf_mask);
+			/* store the resulting 32-bit value */
+			*(int *)split_packet = _mm_cvtsi128_si32(eop_bits);
+			split_packet += RTE_IXGBE_DESCS_PER_LOOP;
+
+			/* zero-out next pointers */
+			rx_pkts[pos]->next = NULL;
+			rx_pkts[pos + 1]->next = NULL;
+			rx_pkts[pos + 2]->next = NULL;
+			rx_pkts[pos + 3]->next = NULL;
+		}
+
+		/* C.3 calc available number of desc */
+		staterr = _mm_and_si128(staterr, dd_check);
+		staterr = _mm_packs_epi32(staterr, zero);
+
+		/* D.3 copy final 1,2 data to rx_pkts */
+		_mm_storeu_si128((void *)&rx_pkts[pos+1]->rx_descriptor_fields1,
+				pkt_mb2);
+		_mm_storeu_si128((void *)&rx_pkts[pos]->rx_descriptor_fields1,
+				pkt_mb1);
+
+		/* C.4 calc available number of desc */
+		var = __builtin_popcountll(_mm_cvtsi128_si64(staterr));
+		nb_pkts_recd += var;
+		if (likely(var != RTE_IXGBE_DESCS_PER_LOOP))
+			break;
+	}
+
+	/* Update our internal tail pointer */
+	rxq->rx_tail = (uint16_t)(rxq->rx_tail + nb_pkts_recd);
+	rxq->rx_tail = (uint16_t)(rxq->rx_tail & (rxq->nb_rx_desc - 1));
+	rxq->rxrearm_nb = (uint16_t)(rxq->rxrearm_nb + nb_pkts_recd);
+
+	return nb_pkts_recd;
+}
+
+static inline void
+vtx1(volatile union ixgbe_adv_tx_desc *txdp,
+		struct rte_mbuf *pkt, uint64_t flags)
+{
+	__m128i descriptor = _mm_set_epi64x((uint64_t)pkt->pkt_len << 46 |
+			flags | pkt->data_len,
+			pkt->buf_physaddr + pkt->data_off);
+	_mm_store_si128((__m128i *)&txdp->read, descriptor);
+}
-- 
1.8.3.1
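
A scalar sketch of the per-mbuf work done by the SSE rearm loop above may help
when reading the intrinsics; the function name is hypothetical and the body is
illustrative only, not the driver's actual code path:

	static inline void
	ixgbe_rearm_one_sketch(struct ixgbe_rx_queue *rxq,
			       volatile union ixgbe_adv_rx_desc *rxdp,
			       struct rte_mbuf *mb)
	{
		/* one 64-bit store covers the 6 bytes of rearm_data plus
		 * the first 2 bytes of ol_flags, which RX overwrites anyway */
		*(uint64_t *)(uintptr_t)&mb->rearm_data = rxq->mbuf_initializer;

		/* DMA address of the packet data, past the headroom */
		rxdp->read.pkt_addr = mb->buf_physaddr + RTE_PKTMBUF_HEADROOM;

		/* the Header Buffer Address must stay zero; this is what
		 * the hba_msk AND achieves in the vector version */
		rxdp->read.hdr_addr = 0;
	}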

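Likewise, the shuf_msk/crc_adjust pair amounts, per packet, to the scalar
assignments below; the desc_octets_* names are hypothetical placeholders for
the descriptor byte ranges named in the mask comments:

	mb->pkt_len  = desc_octets_12_13 - rxq->crc_len; /* low 16 bits, high zeroed */
	mb->data_len = desc_octets_12_13 - rxq->crc_len;
	mb->vlan_tci = desc_octets_14_15;
	mb->hash.rss = desc_octets_4_7;                  /* 32-bit RSS value */
	/* the pkt_type lanes are 0xFF in the mask, i.e. zeroed in the mbuf */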

* [dpdk-dev] [PATCH 2/4] ixgbe: implement vector PMD for arm architecture
  2016-04-20 13:44 [dpdk-dev] [PATCH 1/4] ixgbe: rearrange vector PMD code for x86 Jianbo Liu
@ 2016-04-20 13:45 ` Jianbo Liu
  2016-04-20 13:45 ` [dpdk-dev] [PATCH 3/4] ixgbe: enable ixgbe vector PMD on ARMv8a platform Jianbo Liu
                   ` (7 subsequent siblings)
  8 siblings, 0 replies; 25+ messages in thread
From: Jianbo Liu @ 2016-04-20 13:45 UTC (permalink / raw)
  To: dev, jerin.jacob, helin.zhang, konstantin.ananyev; +Cc: Jianbo Liu

Use ARM NEON intrinsics to implement the ixgbe vPMD.

Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>
---
 drivers/net/ixgbe/ixgbe_rxtx_vec.c      |   4 +
 drivers/net/ixgbe/ixgbe_rxtx_vec_neon.h | 371 ++++++++++++++++++++++++++++++++
 2 files changed, 375 insertions(+)
 create mode 100644 drivers/net/ixgbe/ixgbe_rxtx_vec_neon.h

diff --git a/drivers/net/ixgbe/ixgbe_rxtx_vec.c b/drivers/net/ixgbe/ixgbe_rxtx_vec.c
index 064a00b..9fcc956 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx_vec.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx_vec.c
@@ -38,7 +38,11 @@
 #include "ixgbe_ethdev.h"
 #include "ixgbe_rxtx.h"
 
+#ifdef RTE_ARCH_ARM64
+#include "ixgbe_rxtx_vec_neon.h"
+#else
 #include "ixgbe_rxtx_vec_sse.h"
+#endif
 
 /*
 * vPMD receive routine, only accepts (nb_pkts >= RTE_IXGBE_DESCS_PER_LOOP)
diff --git a/drivers/net/ixgbe/ixgbe_rxtx_vec_neon.h b/drivers/net/ixgbe/ixgbe_rxtx_vec_neon.h
new file mode 100644
index 0000000..2f1e1ce
--- /dev/null
+++ b/drivers/net/ixgbe/ixgbe_rxtx_vec_neon.h
@@ -0,0 +1,371 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdint.h>
+#include <rte_ethdev.h>
+#include <rte_malloc.h>
+
+#include "ixgbe_ethdev.h"
+#include "ixgbe_rxtx.h"
+
+#include <arm_neon.h>
+
+#pragma GCC diagnostic ignored "-Wcast-qual"
+
+static inline void
+ixgbe_rxq_rearm(struct ixgbe_rx_queue *rxq)
+{
+	int i;
+	uint16_t rx_id;
+	volatile union ixgbe_adv_rx_desc *rxdp;
+	struct ixgbe_rx_entry *rxep = &rxq->sw_ring[rxq->rxrearm_start];
+	struct rte_mbuf *mb0, *mb1;
+	uint64x2_t dma_addr0, dma_addr1;
+	uint64x2_t zero = vdupq_n_u64(0);
+	uint64_t paddr;
+	uint8x8_t p;
+
+	rxdp = rxq->rx_ring + rxq->rxrearm_start;
+
+	/* Pull 'n' more MBUFs into the software ring */
+	if (unlikely(rte_mempool_get_bulk(rxq->mb_pool,
+					  (void *)rxep,
+					  RTE_IXGBE_RXQ_REARM_THRESH) < 0)) {
+		if (rxq->rxrearm_nb + RTE_IXGBE_RXQ_REARM_THRESH >=
+		    rxq->nb_rx_desc) {
+			for (i = 0; i < RTE_IXGBE_DESCS_PER_LOOP; i++) {
+				rxep[i].mbuf = &rxq->fake_mbuf;
+				vst1q_u64((uint64_t *)&rxdp[i].read,
+					  zero);
+			}
+		}
+		rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=
+			RTE_IXGBE_RXQ_REARM_THRESH;
+		return;
+	}
+
+	p = vld1_u8((uint8_t *)&rxq->mbuf_initializer);
+
+	/* Initialize the mbufs in vector, process 2 mbufs in one loop */
+	for (i = 0; i < RTE_IXGBE_RXQ_REARM_THRESH; i += 2, rxep += 2) {
+		mb0 = rxep[0].mbuf;
+		mb1 = rxep[1].mbuf;
+
+		/*
+		 * Flush mbuf with pkt template.
+		 * Data to be rearmed is 6 bytes long.
+		 * Though, RX will overwrite ol_flags that are coming next
+		 * anyway. So overwrite whole 8 bytes with one load:
+		 * 6 bytes of rearm_data plus first 2 bytes of ol_flags.
+		 */
+		vst1_u8((uint8_t *)&mb0->rearm_data, p);
+		paddr = mb0->buf_physaddr + RTE_PKTMBUF_HEADROOM;
+		dma_addr0 = vsetq_lane_u64(paddr, zero, 0);
+		/* flush desc with pa dma_addr */
+		vst1q_u64((uint64_t *)&rxdp++->read, dma_addr0);
+
+		vst1_u8((uint8_t *)&mb1->rearm_data, p);
+		paddr = mb1->buf_physaddr + RTE_PKTMBUF_HEADROOM;
+		dma_addr1 = vsetq_lane_u64(paddr, zero, 0);
+		vst1q_u64((uint64_t *)&rxdp++->read, dma_addr1);
+	}
+
+	rxq->rxrearm_start += RTE_IXGBE_RXQ_REARM_THRESH;
+	if (rxq->rxrearm_start >= rxq->nb_rx_desc)
+		rxq->rxrearm_start = 0;
+
+	rxq->rxrearm_nb -= RTE_IXGBE_RXQ_REARM_THRESH;
+
+	rx_id = (uint16_t) ((rxq->rxrearm_start == 0) ?
+			     (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));
+
+	/* Update the tail pointer on the NIC */
+	IXGBE_PCI_REG_WRITE(rxq->rdt_reg_addr, rx_id);
+}
+
+/* Handling the offload flags (olflags) field takes computation
+ * time when receiving packets. Therefore we provide a flag to disable
+ * the processing of the olflags field when they are not needed. This
+ * gives improved performance, at the cost of losing the offload info
+ * in the received packet
+ */
+#ifdef RTE_IXGBE_RX_OLFLAGS_ENABLE
+
+#define VTAG_SHIFT     (3)
+
+static inline void
+desc_to_olflags_v(uint8x16x2_t sterr_tmp1, uint8x16x2_t sterr_tmp2,
+		  uint8x16_t staterr, struct rte_mbuf **rx_pkts)
+{
+	uint8x16_t ptype;
+	uint8x16_t vtag;
+
+	union {
+		uint8_t e[4];
+		uint32_t word;
+	} vol;
+
+	const uint8x16_t pkttype_msk = {
+			PKT_RX_VLAN_PKT, PKT_RX_VLAN_PKT,
+			PKT_RX_VLAN_PKT, PKT_RX_VLAN_PKT,
+			0x00, 0x00, 0x00, 0x00,
+			0x00, 0x00, 0x00, 0x00,
+			0x00, 0x00, 0x00, 0x00};
+
+	const uint8x16_t rsstype_msk = {
+			0x0F, 0x0F, 0x0F, 0x0F,
+			0x00, 0x00, 0x00, 0x00,
+			0x00, 0x00, 0x00, 0x00,
+			0x00, 0x00, 0x00, 0x00};
+
+	const uint8x16_t rss_flags = {
+			0, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
+			0, PKT_RX_RSS_HASH, 0, PKT_RX_RSS_HASH,
+			PKT_RX_RSS_HASH, 0, 0, 0,
+			0, 0, 0, PKT_RX_FDIR};
+
+	ptype = vzipq_u8(sterr_tmp1.val[0], sterr_tmp2.val[0]).val[0];
+	ptype = vandq_u8(ptype, rsstype_msk);
+	ptype = vqtbl1q_u8(rss_flags, ptype);
+
+	vtag = vshrq_n_u8(staterr, VTAG_SHIFT);
+	vtag = vandq_u8(vtag, pkttype_msk);
+	vtag = vorrq_u8(ptype, vtag);
+
+	vol.word = vgetq_lane_u32(vreinterpretq_u32_u8(vtag), 0);
+
+	rx_pkts[0]->ol_flags = vol.e[0];
+	rx_pkts[1]->ol_flags = vol.e[1];
+	rx_pkts[2]->ol_flags = vol.e[2];
+	rx_pkts[3]->ol_flags = vol.e[3];
+}
+#else
+#define desc_to_olflags_v(sterr_tmp1, sterr_tmp2, staterr, rx_pkts)
+#endif
+
+/*
+ * vPMD raw receive routine, only accepts (nb_pkts >= RTE_IXGBE_DESCS_PER_LOOP)
+ *
+ * Notice:
+ * - nb_pkts < RTE_IXGBE_DESCS_PER_LOOP, just return no packet
+ * - nb_pkts > RTE_IXGBE_MAX_RX_BURST, only scan RTE_IXGBE_MAX_RX_BURST
+ *   numbers of DD bit
+ * - floor align nb_pkts to a RTE_IXGBE_DESC_PER_LOOP power-of-two
+ * - don't support ol_flags for rss and csum err
+ */
+
+#define IXGBE_VPMD_DESC_DD_MASK		0x01010101
+#define IXGBE_VPMD_DESC_EOP_MASK	0x02020202
+
+static inline uint16_t
+_recv_raw_pkts_vec(struct ixgbe_rx_queue *rxq, struct rte_mbuf **rx_pkts,
+		   uint16_t nb_pkts, uint8_t *split_packet)
+{
+	volatile union ixgbe_adv_rx_desc *rxdp;
+	struct ixgbe_rx_entry *sw_ring;
+	uint16_t nb_pkts_recd;
+	int pos;
+	uint64_t var;
+	uint8x16_t shuf_msk = {
+		0xFF, 0xFF,
+		0xFF, 0xFF,  /* skip 32 bits pkt_type */
+		12, 13,      /* octet 12~13, low 16 bits pkt_len */
+		0xFF, 0xFF,  /* skip high 16 bits pkt_len, zero out */
+		12, 13,      /* octet 12~13, 16 bits data_len */
+		14, 15,      /* octet 14~15, low 16 bits vlan_macip */
+		4, 5, 6, 7  /* octet 4~7, 32bits rss */
+		};
+	uint16x8_t crc_adjust = {0, 0, rxq->crc_len, 0,
+				 rxq->crc_len, 0, 0, 0};
+
+	/* nb_pkts shall be less than or equal to RTE_IXGBE_MAX_RX_BURST */
+	nb_pkts = RTE_MIN(nb_pkts, RTE_IXGBE_MAX_RX_BURST);
+
+	/* nb_pkts has to be floor-aligned to RTE_IXGBE_DESCS_PER_LOOP */
+	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_IXGBE_DESCS_PER_LOOP);
+
+	/* Just the act of getting into the function from the application is
+	 * going to cost about 7 cycles */
+	rxdp = rxq->rx_ring + rxq->rx_tail;
+
+	rte_prefetch_non_temporal(rxdp);
+
+	/* See if we need to rearm the RX queue - gives the prefetch a bit
+	 * of time to act */
+	if (rxq->rxrearm_nb > RTE_IXGBE_RXQ_REARM_THRESH)
+		ixgbe_rxq_rearm(rxq);
+
+	/* Before we start moving massive data around, check to see if
+	 * there is actually a packet available */
+	if (!(rxdp->wb.upper.status_error &
+				rte_cpu_to_le_32(IXGBE_RXDADV_STAT_DD)))
+		return 0;
+
+	/* Cache is empty -> need to scan the buffer rings, but first move
+	 * the next 'n' mbufs into the cache */
+	sw_ring = &rxq->sw_ring[rxq->rx_tail];
+
+	/* A. load 4 packet in one loop
+	 * B. copy 4 mbuf point from swring to rx_pkts
+	 * C. calc the number of DD bits among the 4 packets
+	 * [C*. extract the end-of-packet bit, if requested]
+	 * D. fill info. from desc to mbuf
+	 */
+	for (pos = 0, nb_pkts_recd = 0; pos < nb_pkts;
+			pos += RTE_IXGBE_DESCS_PER_LOOP,
+			rxdp += RTE_IXGBE_DESCS_PER_LOOP) {
+		uint64x2_t descs[RTE_IXGBE_DESCS_PER_LOOP];
+		uint8x16_t pkt_mb1, pkt_mb2, pkt_mb3, pkt_mb4;
+		uint8x16x2_t sterr_tmp1, sterr_tmp2;
+		uint64x2_t mbp1, mbp2;
+		uint8x16_t staterr;
+		uint16x8_t tmp;
+		uint32_t stat;
+
+		/* B.1 load 1 mbuf point */
+		mbp1 = vld1q_u64((uint64_t *)&sw_ring[pos]);
+
+		/* Read desc statuses backwards to avoid race condition */
+		/* A.1 load 4 pkts desc */
+		descs[3] =  vld1q_u64((uint64_t *)(rxdp + 3));
+		rte_rmb();
+
+		/* B.2 copy 2 mbuf point into rx_pkts  */
+		vst1q_u64((uint64_t *)&rx_pkts[pos], mbp1);
+
+		/* B.1 load 1 mbuf point */
+		mbp2 = vld1q_u64((uint64_t *)&sw_ring[pos + 2]);
+
+		descs[2] =  vld1q_u64((uint64_t *)(rxdp + 2));
+		/* B.1 load 2 mbuf point */
+		descs[1] =  vld1q_u64((uint64_t *)(rxdp + 1));
+		descs[0] =  vld1q_u64((uint64_t *)(rxdp));
+
+		/* B.2 copy 2 mbuf point into rx_pkts  */
+		vst1q_u64((uint64_t *)&rx_pkts[pos + 2], mbp2);
+
+		if (split_packet) {
+			rte_prefetch_non_temporal(&rx_pkts[pos]->cacheline1);
+			rte_prefetch_non_temporal(&rx_pkts[pos+1]->cacheline1);
+			rte_prefetch_non_temporal(&rx_pkts[pos+2]->cacheline1);
+			rte_prefetch_non_temporal(&rx_pkts[pos+3]->cacheline1);
+		}
+
+		/* D.1 pkt 3,4 convert format from desc to pktmbuf */
+		pkt_mb4 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[3]), shuf_msk);
+		pkt_mb3 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[2]), shuf_msk);
+
+		/* D.1 pkt 1,2 convert format from desc to pktmbuf */
+		pkt_mb2 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[1]), shuf_msk);
+		pkt_mb1 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[0]), shuf_msk);
+
+		/* C.1 4=>2 filter staterr info only */
+		sterr_tmp2 = vzipq_u8(vreinterpretq_u8_u64(descs[1]),
+				      vreinterpretq_u8_u64(descs[3]));
+		/* C.1 4=>2 filter staterr info only */
+		sterr_tmp1 = vzipq_u8(vreinterpretq_u8_u64(descs[0]),
+				      vreinterpretq_u8_u64(descs[2]));
+
+		/* C.2 get 4 pkts staterr value  */
+		staterr = vzipq_u8(sterr_tmp1.val[1], sterr_tmp2.val[1]).val[0];
+		stat = vgetq_lane_u32(vreinterpretq_u32_u8(staterr), 0);
+
+		/* set ol_flags with vlan packet type */
+		desc_to_olflags_v(sterr_tmp1, sterr_tmp2, staterr,
+				  &rx_pkts[pos]);
+
+		/* D.2 pkt 3,4 set in_port/nb_seg and remove crc */
+		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb4), crc_adjust);
+		pkt_mb4 = vreinterpretq_u8_u16(tmp);
+		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb3), crc_adjust);
+		pkt_mb3 = vreinterpretq_u8_u16(tmp);
+
+		/* D.3 copy final 3,4 data to rx_pkts */
+		vst1q_u8((void *)&rx_pkts[pos+3]->rx_descriptor_fields1,
+			 pkt_mb4);
+		vst1q_u8((void *)&rx_pkts[pos+2]->rx_descriptor_fields1,
+			 pkt_mb3);
+
+		/* D.2 pkt 1,2 set in_port/nb_seg and remove crc */
+		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb2), crc_adjust);
+		pkt_mb2 = vreinterpretq_u8_u16(tmp);
+		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb1), crc_adjust);
+		pkt_mb1 = vreinterpretq_u8_u16(tmp);
+
+		/* C* extract and record EOP bit */
+		if (split_packet) {
+			/* and with mask to extract bits, flipping 1-0 */
+			*(int *)split_packet = ~stat & IXGBE_VPMD_DESC_EOP_MASK;
+
+			split_packet += RTE_IXGBE_DESCS_PER_LOOP;
+
+			/* zero-out next pointers */
+			rx_pkts[pos]->next = NULL;
+			rx_pkts[pos + 1]->next = NULL;
+			rx_pkts[pos + 2]->next = NULL;
+			rx_pkts[pos + 3]->next = NULL;
+		}
+
+		rte_prefetch_non_temporal(rxdp + RTE_IXGBE_DESCS_PER_LOOP);
+
+		/* D.3 copy final 1,2 data to rx_pkts */
+		vst1q_u8((uint8_t *)&rx_pkts[pos+1]->rx_descriptor_fields1,
+			 pkt_mb2);
+		vst1q_u8((uint8_t *)&rx_pkts[pos]->rx_descriptor_fields1,
+			 pkt_mb1);
+
+		/* C.4 calc available number of desc */
+		var =  __builtin_popcount(stat & IXGBE_VPMD_DESC_DD_MASK);
+		nb_pkts_recd += var;
+		if (likely(var != RTE_IXGBE_DESCS_PER_LOOP))
+			break;
+	}
+
+	/* Update our internal tail pointer */
+	rxq->rx_tail = (uint16_t)(rxq->rx_tail + nb_pkts_recd);
+	rxq->rx_tail = (uint16_t)(rxq->rx_tail & (rxq->nb_rx_desc - 1));
+	rxq->rxrearm_nb = (uint16_t)(rxq->rxrearm_nb + nb_pkts_recd);
+
+	return nb_pkts_recd;
+}
+
+static inline void
+vtx1(volatile union ixgbe_adv_tx_desc *txdp,
+	struct rte_mbuf *pkt, uint64_t flags)
+{
+	uint64x2_t descriptor = {
+			pkt->buf_physaddr + pkt->data_off,
+			(uint64_t)pkt->pkt_len << 46 | flags | pkt->data_len};
+
+	vst1q_u64((uint64_t *)&txdp->read, descriptor);
+}
-- 
1.8.3.1
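
The two vtx1() implementations (SSE in patch 1/4, NEON above) build the same
128-bit TX descriptor; only the operand order differs, which is easy to
misread. A side-by-side sketch with illustrative lo/hi locals:

	uint64_t lo = pkt->buf_physaddr + pkt->data_off;   /* buffer address */
	uint64_t hi = (uint64_t)pkt->pkt_len << 46 | flags | pkt->data_len;

	__m128i    d_sse  = _mm_set_epi64x(hi, lo); /* args are (high, low) */
	uint64x2_t d_neon = {lo, hi};               /* init is {lane0, lane1} */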

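The NEON receive loop counts completed descriptors straight off the packed
per-packet status bytes. A worked example with an illustrative value:

	/* bit 0 of each status byte is DD; on little-endian arm64, byte 0
	 * of 'stat' belongs to the first packet of the group of four */
	uint32_t stat = 0x00010101;	/* DD set for packets 0..2 only */
	uint64_t var = __builtin_popcount(stat & IXGBE_VPMD_DESC_DD_MASK);
	/* var == 3; descriptors complete in order, so var < 4 means the
	 * ring is drained and the loop breaks */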

* [dpdk-dev] [PATCH 3/4] ixgbe: enable ixgbe vector PMD on ARMv8a platform
  2016-04-20 13:44 [dpdk-dev] [PATCH 1/4] ixgbe: rearrange vector PMD code for x86 Jianbo Liu
  2016-04-20 13:45 ` [dpdk-dev] [PATCH 2/4] ixgbe: implement vector PMD for arm architecture Jianbo Liu
@ 2016-04-20 13:45 ` Jianbo Liu
  2016-04-20 13:45 ` [dpdk-dev] [PATCH 4/4] maintainers: claim responsibility for ixgbe vector PMD on ARM Jianbo Liu
                   ` (6 subsequent siblings)
  8 siblings, 0 replies; 25+ messages in thread
From: Jianbo Liu @ 2016-04-20 13:45 UTC (permalink / raw)
  To: dev, jerin.jacob, helin.zhang, konstantin.ananyev; +Cc: Jianbo Liu

Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>
---
 config/defconfig_arm64-armv8a-linuxapp-gcc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/config/defconfig_arm64-armv8a-linuxapp-gcc b/config/defconfig_arm64-armv8a-linuxapp-gcc
index 9abeca4..98cc054 100644
--- a/config/defconfig_arm64-armv8a-linuxapp-gcc
+++ b/config/defconfig_arm64-armv8a-linuxapp-gcc
@@ -42,7 +42,6 @@ CONFIG_RTE_FORCE_INTRINSICS=y
 CONFIG_RTE_TOOLCHAIN="gcc"
 CONFIG_RTE_TOOLCHAIN_GCC=y
 
-CONFIG_RTE_IXGBE_INC_VECTOR=n
 CONFIG_RTE_LIBRTE_IVSHMEM=n
 CONFIG_RTE_LIBRTE_FM10K_PMD=n
 CONFIG_RTE_LIBRTE_I40E_PMD=n
-- 
1.8.3.1


* [dpdk-dev] [PATCH 4/4] maintainers: claim responsibility for ixgbe vector PMD on ARM
  2016-04-20 13:44 [dpdk-dev] [PATCH 1/4] ixgbe: rearrange vector PMD code for x86 Jianbo Liu
  2016-04-20 13:45 ` [dpdk-dev] [PATCH 2/4] ixgbe: implement vector PMD for arm architecture Jianbo Liu
  2016-04-20 13:45 ` [dpdk-dev] [PATCH 3/4] ixgbe: enable ixgbe vector PMD on ARMv8a platform Jianbo Liu
@ 2016-04-20 13:45 ` Jianbo Liu
  2016-04-25 16:35 ` [dpdk-dev] [PATCH 1/4] ixgbe: rearrange vector PMD code for x86 Bruce Richardson
                   ` (5 subsequent siblings)
  8 siblings, 0 replies; 25+ messages in thread
From: Jianbo Liu @ 2016-04-20 13:45 UTC (permalink / raw)
  To: dev, jerin.jacob, helin.zhang, konstantin.ananyev; +Cc: Jianbo Liu

Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 1953ea2..07a9a44 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -142,6 +142,7 @@ F: lib/librte_eal/common/include/arch/arm/*_64.h
 F: lib/librte_acl/acl_run_neon.*
 F: lib/librte_lpm/rte_lpm_neon.h
 F: lib/librte_hash/rte*_arm64.h
+F: drivers/net/ixgbe/ixgbe_rxtx_vec_neon.h
 
 EZchip TILE-Gx
 M: Zhigang Lu <zlu@ezchip.com>
-- 
1.8.3.1


* Re: [dpdk-dev] [PATCH 1/4] ixgbe: rearrange vector PMD code for x86
  2016-04-20 13:44 [dpdk-dev] [PATCH 1/4] ixgbe: rearrange vector PMD code for x86 Jianbo Liu
                   ` (2 preceding siblings ...)
  2016-04-20 13:45 ` [dpdk-dev] [PATCH 4/4] maintainers: claim responsibility for ixgbe vector PMD on ARM Jianbo Liu
@ 2016-04-25 16:35 ` Bruce Richardson
  2016-04-26  8:23   ` Jianbo Liu
  2016-04-26 13:50 ` [dpdk-dev] [PATCH v2 " Jianbo Liu
                   ` (4 subsequent siblings)
  8 siblings, 1 reply; 25+ messages in thread
From: Bruce Richardson @ 2016-04-25 16:35 UTC (permalink / raw)
  To: Jianbo Liu; +Cc: dev, jerin.jacob, helin.zhang, konstantin.ananyev

On Wed, Apr 20, 2016 at 09:44:59PM +0800, Jianbo Liu wrote:
> Move SSE-dependent code to a new file, "ixgbe_rxtx_vec_sse.h".
> 
> Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>
> ---
>  drivers/net/ixgbe/ixgbe_rxtx_vec.c     | 369 +----------------------------
>  drivers/net/ixgbe/ixgbe_rxtx_vec_sse.h | 408 +++++++++++++++++++++++++++++++++
>  2 files changed, 409 insertions(+), 368 deletions(-)
>  create mode 100644 drivers/net/ixgbe/ixgbe_rxtx_vec_sse.h
> 
Hi Jianbo,

Functionally, I've given this a quick sanity test and see no issues with performance
on the x86(_64) side of things.

However, in terms of how the driver split is done in this set of patches, I think
it might be better to reverse what goes in the header files and in the .c files.
Rather than having the common code in the .c file and the arch specific code in
the header file, I think the common code should be in a header file and the
arch specific code in a .c file.

The reason for this is the need for possibly different compiler flags to be
passed for the vector drivers from the makefile e.g. as is done by my patchset
for i40e [http://dpdk.org/dev/patchwork/patch/12082/]. This would be a bit more
awkward if that one C file is shared by multiple architectures, as we'd have
architecture specific branches in both makefile and C file. As well as that,
the possibility exists of multiple vector drivers for one architecture, e.g.
an SSE and AVX driver for x86_64 with selection of code path at runtime as done
by the ACL library. In that case, you want multiple vector code paths compiled
with different CFLAG overrides, which necessitates different C files.

Therefore, I think using a C file per instruction set/architecture, rather than
a header file per arch may be more expandable in future.

Regards,
/Bruce

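A sketch of the run-time code path selection mentioned above; apart from
rte_cpu_get_flag_enabled() and the eth_rx_burst_t typedef, every name below is
hypothetical:

	/* one burst function per ISA-specific .c file, each file compiled
	 * with its own CFLAGS override in the makefile */
	uint16_t ixgbe_recv_pkts_vec_sse(void *rxq, struct rte_mbuf **pkts,
					 uint16_t nb_pkts);
	uint16_t ixgbe_recv_pkts_vec_avx2(void *rxq, struct rte_mbuf **pkts,
					  uint16_t nb_pkts);

	static eth_rx_burst_t
	ixgbe_select_rx_burst_vec(void)
	{
		if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2))
			return ixgbe_recv_pkts_vec_avx2;
		return ixgbe_recv_pkts_vec_sse;
	}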

* Re: [dpdk-dev] [PATCH 1/4] ixgbe: rearrange vector PMD code for x86
  2016-04-25 16:35 ` [dpdk-dev] [PATCH 1/4] ixgbe: rearrange vector PMD code for x86 Bruce Richardson
@ 2016-04-26  8:23   ` Jianbo Liu
  0 siblings, 0 replies; 25+ messages in thread
From: Jianbo Liu @ 2016-04-26  8:23 UTC (permalink / raw)
  To: Bruce Richardson; +Cc: dev, Jerin Jacob, Zhang, Helin, Ananyev, Konstantin

On 26 April 2016 at 00:35, Bruce Richardson <bruce.richardson@intel.com> wrote:
> On Wed, Apr 20, 2016 at 09:44:59PM +0800, Jianbo Liu wrote:
>> Move SSE-dependent code to a new file, "ixgbe_rxtx_vec_sse.h".
>>
>> Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>
>> ---
>>  drivers/net/ixgbe/ixgbe_rxtx_vec.c     | 369 +----------------------------
>>  drivers/net/ixgbe/ixgbe_rxtx_vec_sse.h | 408 +++++++++++++++++++++++++++++++++
>>  2 files changed, 409 insertions(+), 368 deletions(-)
>>  create mode 100644 drivers/net/ixgbe/ixgbe_rxtx_vec_sse.h
>>
> Hi Jianbo,
>
> Functionally, I've given this a quick sanity test and see no issues with performance
> on the x86(_64) side of things.
>
> However, in terms of how the driver split is done in this set of patches, I think
> it might be better to reverse what goes in the header files and in the .c files.
> Rather than having the common code in the .c file and the arch specific code in
> the header file, I think the common code should be in a header file and the
> arch specific code in a .c file.
>
> The reason for this is the need for possibly different compiler flags to be
> passed for the vector drivers from the makefile e.g. as is done by my patchset
> for i40e [http://dpdk.org/dev/patchwork/patch/12082/]. This would be a bit more
> awkward if that one C file is shared by multiple architectures, as we'd have
> architecture specific branches in both makefile and C file. As well as that,
> the possibility exists of multiple vector drivers for one architecture, e.g.
> an SSE and AVX driver for x86_64 with selection of code path at runtime as done
> by the ACL library. In that case, you want multiple vector code paths compiled
> with different CFLAG overrides, which necessitates different C files.
>
> Therefore, I think using a C file per instruction set/architecture, rather than
> a header file per arch may be more expandable in future.
>

Good suggestion. I will submit v2 later.

Thanks!
Jianbo


* [dpdk-dev] [PATCH v2 1/4] ixgbe: rearrange vector PMD code for x86
  2016-04-20 13:44 [dpdk-dev] [PATCH 1/4] ixgbe: rearrange vector PMD code for x86 Jianbo Liu
                   ` (3 preceding siblings ...)
  2016-04-25 16:35 ` [dpdk-dev] [PATCH 1/4] ixgbe: rearrange vector PMD code for x86 Bruce Richardson
@ 2016-04-26 13:50 ` Jianbo Liu
  2016-05-03  5:51   ` Jianbo Liu
  2016-05-03 16:29   ` Bruce Richardson
  2016-04-26 13:55 ` [dpdk-dev] [PATCH v2 2/4] ixgbe: implement vector PMD for arm architecture Jianbo Liu
                   ` (3 subsequent siblings)
  8 siblings, 2 replies; 25+ messages in thread
From: Jianbo Liu @ 2016-04-26 13:50 UTC (permalink / raw)
  To: dev, jerin.jacob, helin.zhang, konstantin.ananyev; +Cc: Jianbo Liu

Move common code to a new file, "ixgbe_rxtx_vec_common.h";
the vPMD for x86 remains implemented in ixgbe_rxtx_vec.c.

Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>
Suggested-by: Bruce Richardson <bruce.richardson@intel.com>
---
 drivers/net/ixgbe/ixgbe_rxtx_vec.c        | 256 +----------------------
 drivers/net/ixgbe/ixgbe_rxtx_vec_common.h | 325 ++++++++++++++++++++++++++++++
 2 files changed, 333 insertions(+), 248 deletions(-)
 create mode 100644 drivers/net/ixgbe/ixgbe_rxtx_vec_common.h

diff --git a/drivers/net/ixgbe/ixgbe_rxtx_vec.c b/drivers/net/ixgbe/ixgbe_rxtx_vec.c
index 5040704..b704a57 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx_vec.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx_vec.c
@@ -37,6 +37,7 @@
 
 #include "ixgbe_ethdev.h"
 #include "ixgbe_rxtx.h"
+#include "ixgbe_rxtx_vec_common.h"
 
 #include <tmmintrin.h>
 
@@ -414,69 +415,6 @@ ixgbe_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 	return _recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);
 }
 
-static inline uint16_t
-reassemble_packets(struct ixgbe_rx_queue *rxq, struct rte_mbuf **rx_bufs,
-		uint16_t nb_bufs, uint8_t *split_flags)
-{
-	struct rte_mbuf *pkts[nb_bufs]; /*finished pkts*/
-	struct rte_mbuf *start = rxq->pkt_first_seg;
-	struct rte_mbuf *end =  rxq->pkt_last_seg;
-	unsigned pkt_idx, buf_idx;
-
-	for (buf_idx = 0, pkt_idx = 0; buf_idx < nb_bufs; buf_idx++) {
-		if (end != NULL) {
-			/* processing a split packet */
-			end->next = rx_bufs[buf_idx];
-			rx_bufs[buf_idx]->data_len += rxq->crc_len;
-
-			start->nb_segs++;
-			start->pkt_len += rx_bufs[buf_idx]->data_len;
-			end = end->next;
-
-			if (!split_flags[buf_idx]) {
-				/* it's the last packet of the set */
-				start->hash = end->hash;
-				start->ol_flags = end->ol_flags;
-				/* we need to strip crc for the whole packet */
-				start->pkt_len -= rxq->crc_len;
-				if (end->data_len > rxq->crc_len)
-					end->data_len -= rxq->crc_len;
-				else {
-					/* free up last mbuf */
-					struct rte_mbuf *secondlast = start;
-
-					start->nb_segs--;
-					while (secondlast->next != end)
-						secondlast = secondlast->next;
-					secondlast->data_len -= (rxq->crc_len -
-							end->data_len);
-					secondlast->next = NULL;
-					rte_pktmbuf_free_seg(end);
-					end = secondlast;
-				}
-				pkts[pkt_idx++] = start;
-				start = end = NULL;
-			}
-		} else {
-			/* not processing a split packet */
-			if (!split_flags[buf_idx]) {
-				/* not a split packet, save and skip */
-				pkts[pkt_idx++] = rx_bufs[buf_idx];
-				continue;
-			}
-			end = start = rx_bufs[buf_idx];
-			rx_bufs[buf_idx]->data_len += rxq->crc_len;
-			rx_bufs[buf_idx]->pkt_len += rxq->crc_len;
-		}
-	}
-
-	/* save the partial packet for next time */
-	rxq->pkt_first_seg = start;
-	rxq->pkt_last_seg = end;
-	memcpy(rx_bufs, pkts, pkt_idx * (sizeof(*pkts)));
-	return pkt_idx;
-}
-
 /*
  * vPMD receive routine that reassembles scattered packets
  *
@@ -539,72 +477,6 @@ vtx(volatile union ixgbe_adv_tx_desc *txdp,
 		vtx1(txdp, *pkt, flags);
 }
 
-static inline int __attribute__((always_inline))
-ixgbe_tx_free_bufs(struct ixgbe_tx_queue *txq)
-{
-	struct ixgbe_tx_entry_v *txep;
-	uint32_t status;
-	uint32_t n;
-	uint32_t i;
-	int nb_free = 0;
-	struct rte_mbuf *m, *free[RTE_IXGBE_TX_MAX_FREE_BUF_SZ];
-
-	/* check DD bit on threshold descriptor */
-	status = txq->tx_ring[txq->tx_next_dd].wb.status;
-	if (!(status & IXGBE_ADVTXD_STAT_DD))
-		return 0;
-
-	n = txq->tx_rs_thresh;
-
-	/*
-	 * first buffer to free from S/W ring is at index
-	 * tx_next_dd - (tx_rs_thresh-1)
-	 */
-	txep = &txq->sw_ring_v[txq->tx_next_dd - (n - 1)];
-	m = __rte_pktmbuf_prefree_seg(txep[0].mbuf);
-	if (likely(m != NULL)) {
-		free[0] = m;
-		nb_free = 1;
-		for (i = 1; i < n; i++) {
-			m = __rte_pktmbuf_prefree_seg(txep[i].mbuf);
-			if (likely(m != NULL)) {
-				if (likely(m->pool == free[0]->pool))
-					free[nb_free++] = m;
-				else {
-					rte_mempool_put_bulk(free[0]->pool,
-							(void *)free, nb_free);
-					free[0] = m;
-					nb_free = 1;
-				}
-			}
-		}
-		rte_mempool_put_bulk(free[0]->pool, (void **)free, nb_free);
-	} else {
-		for (i = 1; i < n; i++) {
-			m = __rte_pktmbuf_prefree_seg(txep[i].mbuf);
-			if (m != NULL)
-				rte_mempool_put(m->pool, m);
-		}
-	}
-
-	/* buffers were freed, update counters */
-	txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh);
-	txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh);
-	if (txq->tx_next_dd >= txq->nb_tx_desc)
-		txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1);
-
-	return txq->tx_rs_thresh;
-}
-
-static inline void __attribute__((always_inline))
-tx_backlog_entry(struct ixgbe_tx_entry_v *txep,
-		 struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
-{
-	int i;
-	for (i = 0; i < (int)nb_pkts; ++i)
-		txep[i].mbuf = tx_pkts[i];
-}
-
 uint16_t
 ixgbe_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
 		       uint16_t nb_pkts)
@@ -675,91 +547,25 @@ ixgbe_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
 static void __attribute__((cold))
 ixgbe_tx_queue_release_mbufs_vec(struct ixgbe_tx_queue *txq)
 {
-	unsigned i;
-	struct ixgbe_tx_entry_v *txe;
-	const uint16_t max_desc = (uint16_t)(txq->nb_tx_desc - 1);
-
-	if (txq->sw_ring == NULL || txq->nb_tx_free == max_desc)
-		return;
-
-	/* release the used mbufs in sw_ring */
-	for (i = txq->tx_next_dd - (txq->tx_rs_thresh - 1);
-	     i != txq->tx_tail;
-	     i = (i + 1) & max_desc) {
-		txe = &txq->sw_ring_v[i];
-		rte_pktmbuf_free_seg(txe->mbuf);
-	}
-	txq->nb_tx_free = max_desc;
-
-	/* reset tx_entry */
-	for (i = 0; i < txq->nb_tx_desc; i++) {
-		txe = &txq->sw_ring_v[i];
-		txe->mbuf = NULL;
-	}
+	_ixgbe_tx_queue_release_mbufs_vec(txq);
 }
 
 void __attribute__((cold))
 ixgbe_rx_queue_release_mbufs_vec(struct ixgbe_rx_queue *rxq)
 {
-	const unsigned mask = rxq->nb_rx_desc - 1;
-	unsigned i;
-
-	if (rxq->sw_ring == NULL || rxq->rxrearm_nb >= rxq->nb_rx_desc)
-		return;
-
-	/* free all mbufs that are valid in the ring */
-	for (i = rxq->rx_tail; i != rxq->rxrearm_start; i = (i + 1) & mask)
-		rte_pktmbuf_free_seg(rxq->sw_ring[i].mbuf);
-	rxq->rxrearm_nb = rxq->nb_rx_desc;
-
-	/* set all entries to NULL */
-	memset(rxq->sw_ring, 0, sizeof(rxq->sw_ring[0]) * rxq->nb_rx_desc);
+	_ixgbe_rx_queue_release_mbufs_vec(rxq);
 }
 
 static void __attribute__((cold))
 ixgbe_tx_free_swring(struct ixgbe_tx_queue *txq)
 {
-	if (txq == NULL)
-		return;
-
-	if (txq->sw_ring != NULL) {
-		rte_free(txq->sw_ring_v - 1);
-		txq->sw_ring_v = NULL;
-	}
+	_ixgbe_tx_free_swring_vec(txq);
 }
 
 static void __attribute__((cold))
 ixgbe_reset_tx_queue(struct ixgbe_tx_queue *txq)
 {
-	static const union ixgbe_adv_tx_desc zeroed_desc = {{0}};
-	struct ixgbe_tx_entry_v *txe = txq->sw_ring_v;
-	uint16_t i;
-
-	/* Zero out HW ring memory */
-	for (i = 0; i < txq->nb_tx_desc; i++)
-		txq->tx_ring[i] = zeroed_desc;
-
-	/* Initialize SW ring entries */
-	for (i = 0; i < txq->nb_tx_desc; i++) {
-		volatile union ixgbe_adv_tx_desc *txd = &txq->tx_ring[i];
-		txd->wb.status = IXGBE_TXD_STAT_DD;
-		txe[i].mbuf = NULL;
-	}
-
-	txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1);
-	txq->tx_next_rs = (uint16_t)(txq->tx_rs_thresh - 1);
-
-	txq->tx_tail = 0;
-	txq->nb_tx_used = 0;
-	/*
-	 * Always allow 1 descriptor to be un-allocated to avoid
-	 * a H/W race condition
-	 */
-	txq->last_desc_cleaned = (uint16_t)(txq->nb_tx_desc - 1);
-	txq->nb_tx_free = (uint16_t)(txq->nb_tx_desc - 1);
-	txq->ctx_curr = 0;
-	memset((void *)&txq->ctx_cache, 0,
-		IXGBE_CTX_NUM * sizeof(struct ixgbe_advctx_info));
+	_ixgbe_reset_tx_queue_vec(txq);
 }
 
 static const struct ixgbe_txq_ops vec_txq_ops = {
@@ -771,63 +577,17 @@ static const struct ixgbe_txq_ops vec_txq_ops = {
 int __attribute__((cold))
 ixgbe_rxq_vec_setup(struct ixgbe_rx_queue *rxq)
 {
-	uintptr_t p;
-	struct rte_mbuf mb_def = { .buf_addr = 0 }; /* zeroed mbuf */
-
-	mb_def.nb_segs = 1;
-	mb_def.data_off = RTE_PKTMBUF_HEADROOM;
-	mb_def.port = rxq->port_id;
-	rte_mbuf_refcnt_set(&mb_def, 1);
-
-	/* prevent compiler reordering: rearm_data covers previous fields */
-	rte_compiler_barrier();
-	p = (uintptr_t)&mb_def.rearm_data;
-	rxq->mbuf_initializer = *(uint64_t *)p;
-	return 0;
+	return ixgbe_rxq_vec_setup_default(rxq);
 }
 
 int __attribute__((cold))
 ixgbe_txq_vec_setup(struct ixgbe_tx_queue *txq)
 {
-	if (txq->sw_ring_v == NULL)
-		return -1;
-
-	/* leave the first one for overflow */
-	txq->sw_ring_v = txq->sw_ring_v + 1;
-	txq->ops = &vec_txq_ops;
-
-	return 0;
+	return ixgbe_txq_vec_setup_default(txq, &vec_txq_ops);
 }
 
 int __attribute__((cold))
 ixgbe_rx_vec_dev_conf_condition_check(struct rte_eth_dev *dev)
 {
-#ifndef RTE_LIBRTE_IEEE1588
-	struct rte_eth_rxmode *rxmode = &dev->data->dev_conf.rxmode;
-	struct rte_fdir_conf *fconf = &dev->data->dev_conf.fdir_conf;
-
-#ifndef RTE_IXGBE_RX_OLFLAGS_ENABLE
-	/* without rx ol_flags, no VP flag report */
-	if (rxmode->hw_vlan_strip != 0 ||
-	    rxmode->hw_vlan_extend != 0)
-		return -1;
-#endif
-
-	/* no fdir support */
-	if (fconf->mode != RTE_FDIR_MODE_NONE)
-		return -1;
-
-	/*
-	 * - no csum error report support
-	 * - no header split support
-	 */
-	if (rxmode->hw_ip_checksum == 1 ||
-	    rxmode->header_split == 1)
-		return -1;
-
-	return 0;
-#else
-	RTE_SET_USED(dev);
-	return -1;
-#endif
+	return ixgbe_rx_vec_dev_conf_condition_check_default(dev);
 }
diff --git a/drivers/net/ixgbe/ixgbe_rxtx_vec_common.h b/drivers/net/ixgbe/ixgbe_rxtx_vec_common.h
new file mode 100644
index 0000000..e664439
--- /dev/null
+++ b/drivers/net/ixgbe/ixgbe_rxtx_vec_common.h
@@ -0,0 +1,325 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _IXGBE_RXTX_VEC_COMMON_H_
+#define _IXGBE_RXTX_VEC_COMMON_H_
+#include <stdint.h>
+#include <rte_ethdev.h>
+
+#include "ixgbe_ethdev.h"
+#include "ixgbe_rxtx.h"
+
+static inline uint16_t
+reassemble_packets(struct ixgbe_rx_queue *rxq, struct rte_mbuf **rx_bufs,
+		uint16_t nb_bufs, uint8_t *split_flags)
+{
+	struct rte_mbuf *pkts[nb_bufs]; /* finished pkts */
+	struct rte_mbuf *start = rxq->pkt_first_seg;
+	struct rte_mbuf *end =  rxq->pkt_last_seg;
+	unsigned pkt_idx, buf_idx;
+
+	for (buf_idx = 0, pkt_idx = 0; buf_idx < nb_bufs; buf_idx++) {
+		if (end != NULL) {
+			/* processing a split packet */
+			end->next = rx_bufs[buf_idx];
+			rx_bufs[buf_idx]->data_len += rxq->crc_len;
+
+			start->nb_segs++;
+			start->pkt_len += rx_bufs[buf_idx]->data_len;
+			end = end->next;
+
+			if (!split_flags[buf_idx]) {
+				/* it's the last packet of the set */
+				start->hash = end->hash;
+				start->ol_flags = end->ol_flags;
+				/* we need to strip crc for the whole packet */
+				start->pkt_len -= rxq->crc_len;
+				if (end->data_len > rxq->crc_len)
+					end->data_len -= rxq->crc_len;
+				else {
+					/* free up last mbuf */
+					struct rte_mbuf *secondlast = start;
+
+					start->nb_segs--;
+					while (secondlast->next != end)
+						secondlast = secondlast->next;
+					secondlast->data_len -= (rxq->crc_len -
+							end->data_len);
+					secondlast->next = NULL;
+					rte_pktmbuf_free_seg(end);
+					end = secondlast;
+				}
+				pkts[pkt_idx++] = start;
+				start = end = NULL;
+			}
+		} else {
+			/* not processing a split packet */
+			if (!split_flags[buf_idx]) {
+				/* not a split packet, save and skip */
+				pkts[pkt_idx++] = rx_bufs[buf_idx];
+				continue;
+			}
+			end = start = rx_bufs[buf_idx];
+			rx_bufs[buf_idx]->data_len += rxq->crc_len;
+			rx_bufs[buf_idx]->pkt_len += rxq->crc_len;
+		}
+	}
+
+	/* save the partial packet for next time */
+	rxq->pkt_first_seg = start;
+	rxq->pkt_last_seg = end;
+	memcpy(rx_bufs, pkts, pkt_idx * (sizeof(*pkts)));
+	return pkt_idx;
+}
+
+static inline int __attribute__((always_inline))
+ixgbe_tx_free_bufs(struct ixgbe_tx_queue *txq)
+{
+	struct ixgbe_tx_entry_v *txep;
+	uint32_t status;
+	uint32_t n;
+	uint32_t i;
+	int nb_free = 0;
+	struct rte_mbuf *m, *free[RTE_IXGBE_TX_MAX_FREE_BUF_SZ];
+
+	/* check DD bit on threshold descriptor */
+	status = txq->tx_ring[txq->tx_next_dd].wb.status;
+	if (!(status & IXGBE_ADVTXD_STAT_DD))
+		return 0;
+
+	n = txq->tx_rs_thresh;
+
+	/*
+	 * first buffer to free from S/W ring is at index
+	 * tx_next_dd - (tx_rs_thresh-1)
+	 */
+	txep = &txq->sw_ring_v[txq->tx_next_dd - (n - 1)];
+	m = __rte_pktmbuf_prefree_seg(txep[0].mbuf);
+	if (likely(m != NULL)) {
+		free[0] = m;
+		nb_free = 1;
+		for (i = 1; i < n; i++) {
+			m = __rte_pktmbuf_prefree_seg(txep[i].mbuf);
+			if (likely(m != NULL)) {
+				if (likely(m->pool == free[0]->pool))
+					free[nb_free++] = m;
+				else {
+					rte_mempool_put_bulk(free[0]->pool,
+							(void *)free, nb_free);
+					free[0] = m;
+					nb_free = 1;
+				}
+			}
+		}
+		rte_mempool_put_bulk(free[0]->pool, (void **)free, nb_free);
+	} else {
+		for (i = 1; i < n; i++) {
+			m = __rte_pktmbuf_prefree_seg(txep[i].mbuf);
+			if (m != NULL)
+				rte_mempool_put(m->pool, m);
+		}
+	}
+
+	/* buffers were freed, update counters */
+	txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh);
+	txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh);
+	if (txq->tx_next_dd >= txq->nb_tx_desc)
+		txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1);
+
+	return txq->tx_rs_thresh;
+}
+
+static inline void __attribute__((always_inline))
+tx_backlog_entry(struct ixgbe_tx_entry_v *txep,
+		 struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
+{
+	int i;
+	for (i = 0; i < (int)nb_pkts; ++i)
+		txep[i].mbuf = tx_pkts[i];
+}
+
+static inline void
+_ixgbe_tx_queue_release_mbufs_vec(struct ixgbe_tx_queue *txq)
+{
+	unsigned i;
+	struct ixgbe_tx_entry_v *txe;
+	const uint16_t max_desc = (uint16_t)(txq->nb_tx_desc - 1);
+
+	if (txq->sw_ring == NULL || txq->nb_tx_free == max_desc)
+		return;
+
+	/* release the used mbufs in sw_ring */
+	for (i = txq->tx_next_dd - (txq->tx_rs_thresh - 1);
+	     i != txq->tx_tail;
+	     i = (i + 1) & max_desc) {
+		txe = &txq->sw_ring_v[i];
+		rte_pktmbuf_free_seg(txe->mbuf);
+	}
+	txq->nb_tx_free = max_desc;
+
+	/* reset tx_entry */
+	for (i = 0; i < txq->nb_tx_desc; i++) {
+		txe = &txq->sw_ring_v[i];
+		txe->mbuf = NULL;
+	}
+}
+
+static inline void
+_ixgbe_rx_queue_release_mbufs_vec(struct ixgbe_rx_queue *rxq)
+{
+	const unsigned mask = rxq->nb_rx_desc - 1;
+	unsigned i;
+
+	if (rxq->sw_ring == NULL || rxq->rxrearm_nb >= rxq->nb_rx_desc)
+		return;
+
+	/* free all mbufs that are valid in the ring */
+	for (i = rxq->rx_tail; i != rxq->rxrearm_start; i = (i + 1) & mask)
+		rte_pktmbuf_free_seg(rxq->sw_ring[i].mbuf);
+	rxq->rxrearm_nb = rxq->nb_rx_desc;
+
+	/* set all entries to NULL */
+	memset(rxq->sw_ring, 0, sizeof(rxq->sw_ring[0]) * rxq->nb_rx_desc);
+}
+
+static inline void
+_ixgbe_tx_free_swring_vec(struct ixgbe_tx_queue *txq)
+{
+	if (txq == NULL)
+		return;
+
+	if (txq->sw_ring != NULL) {
+		rte_free(txq->sw_ring_v - 1);
+		txq->sw_ring_v = NULL;
+	}
+}
+
+static inline void
+_ixgbe_reset_tx_queue_vec(struct ixgbe_tx_queue *txq)
+{
+	static const union ixgbe_adv_tx_desc zeroed_desc = {{0}};
+	struct ixgbe_tx_entry_v *txe = txq->sw_ring_v;
+	uint16_t i;
+
+	/* Zero out HW ring memory */
+	for (i = 0; i < txq->nb_tx_desc; i++)
+		txq->tx_ring[i] = zeroed_desc;
+
+	/* Initialize SW ring entries */
+	for (i = 0; i < txq->nb_tx_desc; i++) {
+		volatile union ixgbe_adv_tx_desc *txd = &txq->tx_ring[i];
+		txd->wb.status = IXGBE_TXD_STAT_DD;
+		txe[i].mbuf = NULL;
+	}
+
+	txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1);
+	txq->tx_next_rs = (uint16_t)(txq->tx_rs_thresh - 1);
+
+	txq->tx_tail = 0;
+	txq->nb_tx_used = 0;
+	/*
+	 * Always allow 1 descriptor to be un-allocated to avoid
+	 * a H/W race condition
+	 */
+	txq->last_desc_cleaned = (uint16_t)(txq->nb_tx_desc - 1);
+	txq->nb_tx_free = (uint16_t)(txq->nb_tx_desc - 1);
+	txq->ctx_curr = 0;
+	memset((void *)&txq->ctx_cache, 0,
+		IXGBE_CTX_NUM * sizeof(struct ixgbe_advctx_info));
+}
+
+static inline int
+ixgbe_rxq_vec_setup_default(struct ixgbe_rx_queue *rxq)
+{
+	uintptr_t p;
+	struct rte_mbuf mb_def = { .buf_addr = 0 }; /* zeroed mbuf */
+
+	mb_def.nb_segs = 1;
+	mb_def.data_off = RTE_PKTMBUF_HEADROOM;
+	mb_def.port = rxq->port_id;
+	rte_mbuf_refcnt_set(&mb_def, 1);
+
+	/* prevent compiler reordering: rearm_data covers previous fields */
+	rte_compiler_barrier();
+	p = (uintptr_t)&mb_def.rearm_data;
+	rxq->mbuf_initializer = *(uint64_t *)p;
+	return 0;
+}
+
+static inline int
+ixgbe_txq_vec_setup_default(struct ixgbe_tx_queue *txq,
+			    const struct ixgbe_txq_ops *txq_ops)
+{
+	if (txq->sw_ring_v == NULL)
+		return -1;
+
+	/* leave the first one for overflow */
+	txq->sw_ring_v = txq->sw_ring_v + 1;
+	txq->ops = txq_ops;
+
+	return 0;
+}
+
+static inline int
+ixgbe_rx_vec_dev_conf_condition_check_default(struct rte_eth_dev *dev)
+{
+#ifndef RTE_LIBRTE_IEEE1588
+	struct rte_eth_rxmode *rxmode = &dev->data->dev_conf.rxmode;
+	struct rte_fdir_conf *fconf = &dev->data->dev_conf.fdir_conf;
+
+#ifndef RTE_IXGBE_RX_OLFLAGS_ENABLE
+	/* without rx ol_flags, no VP flag report */
+	if (rxmode->hw_vlan_strip != 0 ||
+	    rxmode->hw_vlan_extend != 0)
+		return -1;
+#endif
+
+	/* no fdir support */
+	if (fconf->mode != RTE_FDIR_MODE_NONE)
+		return -1;
+
+	/*
+	 * - no csum error report support
+	 * - no header split support
+	 */
+	if (rxmode->hw_ip_checksum == 1 ||
+	    rxmode->header_split == 1)
+		return -1;
+
+	return 0;
+#else
+	RTE_SET_USED(dev);
+	return -1;
+#endif
+}
+#endif
-- 
1.8.3.1

^ permalink raw reply	[flat|nested] 25+ messages in thread
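
The restructuring above follows a simple pattern: arch-neutral helpers
become static inline functions in a shared header, and each
per-architecture source file includes that header alongside its own
intrinsics. A minimal sketch of the same layout in plain C, using
illustrative names (vec_common.h, clamp_burst, recv_burst) that are not
taken from the patch:

	/* vec_common.h - arch-neutral helpers shared by all vector paths */
	#ifndef VEC_COMMON_H
	#define VEC_COMMON_H
	#include <stdint.h>

	static inline uint16_t
	clamp_burst(uint16_t nb_pkts, uint16_t max_burst)
	{
		/* identical logic on every architecture */
		return nb_pkts < max_burst ? nb_pkts : max_burst;
	}
	#endif

	/* vec_sse.c or vec_neon.c - only the intrinsics differ */
	#include "vec_common.h"

	uint16_t recv_burst(uint16_t nb_pkts)
	{
		/* arch-specific descriptor handling would go here */
		return clamp_burst(nb_pkts, 32);
	}

Because the helpers are static inline in a header, each translation unit
gets its own copy and there is no cross-file linkage to manage; the
trade-off is that a fix in the header requires rebuilding every vector
implementation that includes it.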

* [dpdk-dev] [PATCH v2 2/4] ixgbe: implement vector PMD for arm architecture
  2016-04-20 13:44 [dpdk-dev] [PATCH 1/4] ixgbe: rearrange vector PMD code for x86 Jianbo Liu
                   ` (4 preceding siblings ...)
  2016-04-26 13:50 ` [dpdk-dev] [PATCH v2 " Jianbo Liu
@ 2016-04-26 13:55 ` Jianbo Liu
  2016-04-26 13:55 ` [dpdk-dev] [PATCH v2 3/4] ixgbe: enable ixgbe vector PMD on ARMv8a platform Jianbo Liu
                   ` (2 subsequent siblings)
  8 siblings, 0 replies; 25+ messages in thread
From: Jianbo Liu @ 2016-04-26 13:55 UTC (permalink / raw)
  To: dev, jerin.jacob, helin.zhang, konstantin.ananyev; +Cc: Jianbo Liu

use ARM NEON intrinsics to implement the ixgbe vPMD

Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>
---
 drivers/net/ixgbe/Makefile              |   4 +
 drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c | 556 ++++++++++++++++++++++++++++++++
 2 files changed, 560 insertions(+)
 create mode 100644 drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c

diff --git a/drivers/net/ixgbe/Makefile b/drivers/net/ixgbe/Makefile
index 50bf51c..b1c7a60 100644
--- a/drivers/net/ixgbe/Makefile
+++ b/drivers/net/ixgbe/Makefile
@@ -108,7 +108,11 @@ SRCS-$(CONFIG_RTE_LIBRTE_IXGBE_PMD) += ixgbe_rxtx.c
 SRCS-$(CONFIG_RTE_LIBRTE_IXGBE_PMD) += ixgbe_ethdev.c
 SRCS-$(CONFIG_RTE_LIBRTE_IXGBE_PMD) += ixgbe_fdir.c
 SRCS-$(CONFIG_RTE_LIBRTE_IXGBE_PMD) += ixgbe_pf.c
+ifeq ($(CONFIG_RTE_ARCH_ARM64),y)
+SRCS-$(CONFIG_RTE_IXGBE_INC_VECTOR) += ixgbe_rxtx_vec_neon.c
+else
 SRCS-$(CONFIG_RTE_IXGBE_INC_VECTOR) += ixgbe_rxtx_vec.c
+endif
 
 ifeq ($(CONFIG_RTE_NIC_BYPASS),y)
 SRCS-$(CONFIG_RTE_LIBRTE_IXGBE_PMD) += ixgbe_bypass.c
diff --git a/drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c b/drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
new file mode 100644
index 0000000..2d63490
--- /dev/null
+++ b/drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
@@ -0,0 +1,556 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdint.h>
+#include <rte_ethdev.h>
+#include <rte_malloc.h>
+
+#include "ixgbe_ethdev.h"
+#include "ixgbe_rxtx.h"
+#include "ixgbe_rxtx_vec_common.h"
+
+#include <arm_neon.h>
+
+#pragma GCC diagnostic ignored "-Wcast-qual"
+
+static inline void
+ixgbe_rxq_rearm(struct ixgbe_rx_queue *rxq)
+{
+	int i;
+	uint16_t rx_id;
+	volatile union ixgbe_adv_rx_desc *rxdp;
+	struct ixgbe_rx_entry *rxep = &rxq->sw_ring[rxq->rxrearm_start];
+	struct rte_mbuf *mb0, *mb1;
+	uint64x2_t dma_addr0, dma_addr1;
+	uint64x2_t zero = vdupq_n_u64(0);
+	uint64_t paddr;
+	uint8x8_t p;
+
+	rxdp = rxq->rx_ring + rxq->rxrearm_start;
+
+	/* Pull 'n' more MBUFs into the software ring */
+	if (unlikely(rte_mempool_get_bulk(rxq->mb_pool,
+					  (void *)rxep,
+					  RTE_IXGBE_RXQ_REARM_THRESH) < 0)) {
+		if (rxq->rxrearm_nb + RTE_IXGBE_RXQ_REARM_THRESH >=
+		    rxq->nb_rx_desc) {
+			for (i = 0; i < RTE_IXGBE_DESCS_PER_LOOP; i++) {
+				rxep[i].mbuf = &rxq->fake_mbuf;
+				vst1q_u64((uint64_t *)&rxdp[i].read,
+					  zero);
+			}
+		}
+		rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=
+			RTE_IXGBE_RXQ_REARM_THRESH;
+		return;
+	}
+
+	p = vld1_u8((uint8_t *)&rxq->mbuf_initializer);
+
+	/* Initialize the mbufs in vector, process 2 mbufs in one loop */
+	for (i = 0; i < RTE_IXGBE_RXQ_REARM_THRESH; i += 2, rxep += 2) {
+		mb0 = rxep[0].mbuf;
+		mb1 = rxep[1].mbuf;
+
+		/*
+		 * Flush mbuf with pkt template.
+		 * Data to be rearmed is 6 bytes long.
+		 * Though, RX will overwrite ol_flags that are coming next
+		 * anyway. So overwrite whole 8 bytes with one load:
+		 * 6 bytes of rearm_data plus first 2 bytes of ol_flags.
+		 */
+		vst1_u8((uint8_t *)&mb0->rearm_data, p);
+		paddr = mb0->buf_physaddr + RTE_PKTMBUF_HEADROOM;
+		dma_addr0 = vsetq_lane_u64(paddr, zero, 0);
+		/* flush desc with pa dma_addr */
+		vst1q_u64((uint64_t *)&rxdp++->read, dma_addr0);
+
+		vst1_u8((uint8_t *)&mb1->rearm_data, p);
+		paddr = mb1->buf_physaddr + RTE_PKTMBUF_HEADROOM;
+		dma_addr1 = vsetq_lane_u64(paddr, zero, 0);
+		vst1q_u64((uint64_t *)&rxdp++->read, dma_addr1);
+	}
+
+	rxq->rxrearm_start += RTE_IXGBE_RXQ_REARM_THRESH;
+	if (rxq->rxrearm_start >= rxq->nb_rx_desc)
+		rxq->rxrearm_start = 0;
+
+	rxq->rxrearm_nb -= RTE_IXGBE_RXQ_REARM_THRESH;
+
+	rx_id = (uint16_t) ((rxq->rxrearm_start == 0) ?
+			     (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));
+
+	/* Update the tail pointer on the NIC */
+	IXGBE_PCI_REG_WRITE(rxq->rdt_reg_addr, rx_id);
+}
+
+/* Handling the offload flags (olflags) field takes computation
+ * time when receiving packets. Therefore we provide a flag to disable
+ * the processing of the olflags field when it is not needed. This
+ * gives improved performance, at the cost of losing the offload info
+ * in the received packets.
+ */
+#ifdef RTE_IXGBE_RX_OLFLAGS_ENABLE
+
+#define VTAG_SHIFT     (3)
+
+static inline void
+desc_to_olflags_v(uint8x16x2_t sterr_tmp1, uint8x16x2_t sterr_tmp2,
+		  uint8x16_t staterr, struct rte_mbuf **rx_pkts)
+{
+	uint8x16_t ptype;
+	uint8x16_t vtag;
+
+	union {
+		uint8_t e[4];
+		uint32_t word;
+	} vol;
+
+	const uint8x16_t pkttype_msk = {
+			PKT_RX_VLAN_PKT, PKT_RX_VLAN_PKT,
+			PKT_RX_VLAN_PKT, PKT_RX_VLAN_PKT,
+			0x00, 0x00, 0x00, 0x00,
+			0x00, 0x00, 0x00, 0x00,
+			0x00, 0x00, 0x00, 0x00};
+
+	const uint8x16_t rsstype_msk = {
+			0x0F, 0x0F, 0x0F, 0x0F,
+			0x00, 0x00, 0x00, 0x00,
+			0x00, 0x00, 0x00, 0x00,
+			0x00, 0x00, 0x00, 0x00};
+
+	const uint8x16_t rss_flags = {
+			0, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
+			0, PKT_RX_RSS_HASH, 0, PKT_RX_RSS_HASH,
+			PKT_RX_RSS_HASH, 0, 0, 0,
+			0, 0, 0, PKT_RX_FDIR};
+
+	ptype = vzipq_u8(sterr_tmp1.val[0], sterr_tmp2.val[0]).val[0];
+	ptype = vandq_u8(ptype, rsstype_msk);
+	ptype = vqtbl1q_u8(rss_flags, ptype);
+
+	vtag = vshrq_n_u8(staterr, VTAG_SHIFT);
+	vtag = vandq_u8(vtag, pkttype_msk);
+	vtag = vorrq_u8(ptype, vtag);
+
+	vol.word = vgetq_lane_u32(vreinterpretq_u32_u8(vtag), 0);
+
+	rx_pkts[0]->ol_flags = vol.e[0];
+	rx_pkts[1]->ol_flags = vol.e[1];
+	rx_pkts[2]->ol_flags = vol.e[2];
+	rx_pkts[3]->ol_flags = vol.e[3];
+}
+#else
+#define desc_to_olflags_v(sterr_tmp1, sterr_tmp2, staterr, rx_pkts)
+#endif
+
+/*
+ * vPMD raw receive routine; only accepts nb_pkts >= RTE_IXGBE_DESCS_PER_LOOP
+ *
+ * Notice:
+ * - if nb_pkts < RTE_IXGBE_DESCS_PER_LOOP, no packets are returned
+ * - if nb_pkts > RTE_IXGBE_MAX_RX_BURST, only RTE_IXGBE_MAX_RX_BURST
+ *   DD bits are scanned
+ * - nb_pkts is floor-aligned to a multiple of RTE_IXGBE_DESCS_PER_LOOP
+ * - ol_flags for rss and csum errors are not supported
+ */
+
+#define IXGBE_VPMD_DESC_DD_MASK		0x01010101
+#define IXGBE_VPMD_DESC_EOP_MASK	0x02020202
+
+static inline uint16_t
+_recv_raw_pkts_vec(struct ixgbe_rx_queue *rxq, struct rte_mbuf **rx_pkts,
+		   uint16_t nb_pkts, uint8_t *split_packet)
+{
+	volatile union ixgbe_adv_rx_desc *rxdp;
+	struct ixgbe_rx_entry *sw_ring;
+	uint16_t nb_pkts_recd;
+	int pos;
+	uint64_t var;
+	uint8x16_t shuf_msk = {
+		0xFF, 0xFF,
+		0xFF, 0xFF,  /* skip 32 bits pkt_type */
+		12, 13,      /* octet 12~13, low 16 bits pkt_len */
+		0xFF, 0xFF,  /* skip high 16 bits pkt_len, zero out */
+		12, 13,      /* octet 12~13, 16 bits data_len */
+		14, 15,      /* octet 14~15, low 16 bits vlan_macip */
+		4, 5, 6, 7  /* octet 4~7, 32bits rss */
+		};
+	uint16x8_t crc_adjust = {0, 0, rxq->crc_len, 0,
+				 rxq->crc_len, 0, 0, 0};
+
+	/* nb_pkts shall be less than or equal to RTE_IXGBE_MAX_RX_BURST */
+	nb_pkts = RTE_MIN(nb_pkts, RTE_IXGBE_MAX_RX_BURST);
+
+	/* nb_pkts has to be floor-aligned to RTE_IXGBE_DESCS_PER_LOOP */
+	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_IXGBE_DESCS_PER_LOOP);
+
+	/* Just the act of getting into the function from the application is
+	 * going to cost about 7 cycles */
+	rxdp = rxq->rx_ring + rxq->rx_tail;
+
+	rte_prefetch_non_temporal(rxdp);
+
+	/* See if we need to rearm the RX queue - gives the prefetch a bit
+	 * of time to act */
+	if (rxq->rxrearm_nb > RTE_IXGBE_RXQ_REARM_THRESH)
+		ixgbe_rxq_rearm(rxq);
+
+	/* Before we start moving massive data around, check to see if
+	 * there is actually a packet available */
+	if (!(rxdp->wb.upper.status_error &
+				rte_cpu_to_le_32(IXGBE_RXDADV_STAT_DD)))
+		return 0;
+
+	/* Cache is empty -> need to scan the buffer rings, but first move
+	 * the next 'n' mbufs into the cache */
+	sw_ring = &rxq->sw_ring[rxq->rx_tail];
+
+	/* A. load 4 packet descriptors in one loop
+	 * B. copy 4 mbuf pointers from sw_ring to rx_pkts
+	 * C. calc the number of DD bits among the 4 packets
+	 * [C*. extract the end-of-packet bit, if requested]
+	 * D. fill info. from desc to mbuf
+	 */
+	for (pos = 0, nb_pkts_recd = 0; pos < nb_pkts;
+			pos += RTE_IXGBE_DESCS_PER_LOOP,
+			rxdp += RTE_IXGBE_DESCS_PER_LOOP) {
+		uint64x2_t descs[RTE_IXGBE_DESCS_PER_LOOP];
+		uint8x16_t pkt_mb1, pkt_mb2, pkt_mb3, pkt_mb4;
+		uint8x16x2_t sterr_tmp1, sterr_tmp2;
+		uint64x2_t mbp1, mbp2;
+		uint8x16_t staterr;
+		uint16x8_t tmp;
+		uint32_t stat;
+
+		/* B.1 load 2 mbuf pointers */
+		mbp1 = vld1q_u64((uint64_t *)&sw_ring[pos]);
+
+		/* Read desc statuses backwards to avoid race condition */
+		/* A.1 load 4 pkts desc */
+		descs[3] =  vld1q_u64((uint64_t *)(rxdp + 3));
+		rte_rmb();
+
+		/* B.2 copy 2 mbuf pointers into rx_pkts */
+		vst1q_u64((uint64_t *)&rx_pkts[pos], mbp1);
+
+		/* B.1 load 2 more mbuf pointers */
+		mbp2 = vld1q_u64((uint64_t *)&sw_ring[pos + 2]);
+
+		descs[2] =  vld1q_u64((uint64_t *)(rxdp + 2));
+		/* A.1 load 2 more pkts desc */
+		descs[1] =  vld1q_u64((uint64_t *)(rxdp + 1));
+		descs[0] =  vld1q_u64((uint64_t *)(rxdp));
+
+		/* B.2 copy 2 mbuf pointers into rx_pkts */
+		vst1q_u64((uint64_t *)&rx_pkts[pos + 2], mbp2);
+
+		if (split_packet) {
+			rte_prefetch_non_temporal(&rx_pkts[pos]->cacheline1);
+			rte_prefetch_non_temporal(&rx_pkts[pos+1]->cacheline1);
+			rte_prefetch_non_temporal(&rx_pkts[pos+2]->cacheline1);
+			rte_prefetch_non_temporal(&rx_pkts[pos+3]->cacheline1);
+		}
+
+		/* D.1 pkt 3,4 convert format from desc to pktmbuf */
+		pkt_mb4 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[3]), shuf_msk);
+		pkt_mb3 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[2]), shuf_msk);
+
+		/* D.1 pkt 1,2 convert format from desc to pktmbuf */
+		pkt_mb2 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[1]), shuf_msk);
+		pkt_mb1 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[0]), shuf_msk);
+
+		/* C.1 4=>2 filter staterr info only */
+		sterr_tmp2 = vzipq_u8(vreinterpretq_u8_u64(descs[1]),
+				      vreinterpretq_u8_u64(descs[3]));
+		/* C.1 4=>2 filter staterr info only */
+		sterr_tmp1 = vzipq_u8(vreinterpretq_u8_u64(descs[0]),
+				      vreinterpretq_u8_u64(descs[2]));
+
+		/* C.2 get 4 pkts staterr value  */
+		staterr = vzipq_u8(sterr_tmp1.val[1], sterr_tmp2.val[1]).val[0];
+		stat = vgetq_lane_u32(vreinterpretq_u32_u8(staterr), 0);
+
+		/* set ol_flags with vlan packet type */
+		desc_to_olflags_v(sterr_tmp1, sterr_tmp2, staterr,
+				  &rx_pkts[pos]);
+
+		/* D.2 pkt 3,4 set in_port/nb_seg and remove crc */
+		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb4), crc_adjust);
+		pkt_mb4 = vreinterpretq_u8_u16(tmp);
+		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb3), crc_adjust);
+		pkt_mb3 = vreinterpretq_u8_u16(tmp);
+
+		/* D.3 copy final 3,4 data to rx_pkts */
+		vst1q_u8((void *)&rx_pkts[pos+3]->rx_descriptor_fields1,
+			 pkt_mb4);
+		vst1q_u8((void *)&rx_pkts[pos+2]->rx_descriptor_fields1,
+			 pkt_mb3);
+
+		/* D.2 pkt 1,2 set in_port/nb_seg and remove crc */
+		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb2), crc_adjust);
+		pkt_mb2 = vreinterpretq_u8_u16(tmp);
+		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb1), crc_adjust);
+		pkt_mb1 = vreinterpretq_u8_u16(tmp);
+
+		/* C* extract and record EOP bit */
+		if (split_packet) {
+			/* and with mask to extract bits, flipping 1-0 */
+			*(int *)split_packet = ~stat & IXGBE_VPMD_DESC_EOP_MASK;
+
+			split_packet += RTE_IXGBE_DESCS_PER_LOOP;
+
+			/* zero-out next pointers */
+			rx_pkts[pos]->next = NULL;
+			rx_pkts[pos + 1]->next = NULL;
+			rx_pkts[pos + 2]->next = NULL;
+			rx_pkts[pos + 3]->next = NULL;
+		}
+
+		rte_prefetch_non_temporal(rxdp + RTE_IXGBE_DESCS_PER_LOOP);
+
+		/* D.3 copy final 1,2 data to rx_pkts */
+		vst1q_u8((uint8_t *)&rx_pkts[pos+1]->rx_descriptor_fields1,
+			 pkt_mb2);
+		vst1q_u8((uint8_t *)&rx_pkts[pos]->rx_descriptor_fields1,
+			 pkt_mb1);
+
+		/* C.4 calc available number of desc */
+		var =  __builtin_popcount(stat & IXGBE_VPMD_DESC_DD_MASK);
+		nb_pkts_recd += var;
+		if (likely(var != RTE_IXGBE_DESCS_PER_LOOP))
+			break;
+	}
+
+	/* Update our internal tail pointer */
+	rxq->rx_tail = (uint16_t)(rxq->rx_tail + nb_pkts_recd);
+	rxq->rx_tail = (uint16_t)(rxq->rx_tail & (rxq->nb_rx_desc - 1));
+	rxq->rxrearm_nb = (uint16_t)(rxq->rxrearm_nb + nb_pkts_recd);
+
+	return nb_pkts_recd;
+}
+
+/*
+ * vPMD receive routine; only accepts nb_pkts >= RTE_IXGBE_DESCS_PER_LOOP
+ *
+ * Notice:
+ * - if nb_pkts < RTE_IXGBE_DESCS_PER_LOOP, no packets are returned
+ * - if nb_pkts > RTE_IXGBE_MAX_RX_BURST, only RTE_IXGBE_MAX_RX_BURST
+ *   DD bits are scanned
+ * - nb_pkts is floor-aligned to a multiple of RTE_IXGBE_DESCS_PER_LOOP
+ * - ol_flags for rss and csum errors are not supported
+ */
+uint16_t
+ixgbe_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+		uint16_t nb_pkts)
+{
+	return _recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);
+}
+
+/*
+ * vPMD receive routine that reassembles scattered packets
+ *
+ * Notice:
+ * - ol_flags for rss and csum errors are not supported
+ * - if nb_pkts < RTE_IXGBE_DESCS_PER_LOOP, no packets are returned
+ * - if nb_pkts > RTE_IXGBE_MAX_RX_BURST, only RTE_IXGBE_MAX_RX_BURST
+ *   DD bits are scanned
+ * - nb_pkts is floor-aligned to a multiple of RTE_IXGBE_DESCS_PER_LOOP
+ */
+uint16_t
+ixgbe_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+		uint16_t nb_pkts)
+{
+	struct ixgbe_rx_queue *rxq = rx_queue;
+	uint8_t split_flags[RTE_IXGBE_MAX_RX_BURST] = {0};
+
+	/* get some new buffers */
+	uint16_t nb_bufs = _recv_raw_pkts_vec(rxq, rx_pkts, nb_pkts,
+			split_flags);
+	if (nb_bufs == 0)
+		return 0;
+
+	/* happy day case, full burst + no packets to be joined */
+	const uint64_t *split_fl64 = (uint64_t *)split_flags;
+	if (rxq->pkt_first_seg == NULL &&
+			split_fl64[0] == 0 && split_fl64[1] == 0 &&
+			split_fl64[2] == 0 && split_fl64[3] == 0)
+		return nb_bufs;
+
+	/* reassemble any packets that need reassembly */
+	unsigned i = 0;
+	if (rxq->pkt_first_seg == NULL) {
+		/* find the first split flag, and only reassemble from there */
+		while (i < nb_bufs && !split_flags[i])
+			i++;
+		if (i == nb_bufs)
+			return nb_bufs;
+	}
+	return i + reassemble_packets(rxq, &rx_pkts[i], nb_bufs - i,
+		&split_flags[i]);
+}
+
+static inline void
+vtx1(volatile union ixgbe_adv_tx_desc *txdp,
+		struct rte_mbuf *pkt, uint64_t flags)
+{
+	uint64x2_t descriptor = {
+			pkt->buf_physaddr + pkt->data_off,
+			(uint64_t)pkt->pkt_len << 46 | flags | pkt->data_len};
+
+	vst1q_u64((uint64_t *)&txdp->read, descriptor);
+}
+
+static inline void
+vtx(volatile union ixgbe_adv_tx_desc *txdp,
+		struct rte_mbuf **pkt, uint16_t nb_pkts,  uint64_t flags)
+{
+	int i;
+	for (i = 0; i < nb_pkts; ++i, ++txdp, ++pkt)
+		vtx1(txdp, *pkt, flags);
+}
+
+uint16_t
+ixgbe_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
+		       uint16_t nb_pkts)
+{
+	struct ixgbe_tx_queue *txq = (struct ixgbe_tx_queue *)tx_queue;
+	volatile union ixgbe_adv_tx_desc *txdp;
+	struct ixgbe_tx_entry_v *txep;
+	uint16_t n, nb_commit, tx_id;
+	uint64_t flags = DCMD_DTYP_FLAGS;
+	uint64_t rs = IXGBE_ADVTXD_DCMD_RS|DCMD_DTYP_FLAGS;
+	int i;
+
+	/* crossing tx_rs_thresh boundary is not allowed */
+	nb_pkts = RTE_MIN(nb_pkts, txq->tx_rs_thresh);
+
+	if (txq->nb_tx_free < txq->tx_free_thresh)
+		ixgbe_tx_free_bufs(txq);
+
+	nb_commit = nb_pkts = (uint16_t)RTE_MIN(txq->nb_tx_free, nb_pkts);
+	if (unlikely(nb_pkts == 0))
+		return 0;
+
+	tx_id = txq->tx_tail;
+	txdp = &txq->tx_ring[tx_id];
+	txep = &txq->sw_ring_v[tx_id];
+
+	txq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_pkts);
+
+	n = (uint16_t)(txq->nb_tx_desc - tx_id);
+	if (nb_commit >= n) {
+
+		tx_backlog_entry(txep, tx_pkts, n);
+
+		for (i = 0; i < n - 1; ++i, ++tx_pkts, ++txdp)
+			vtx1(txdp, *tx_pkts, flags);
+
+		vtx1(txdp, *tx_pkts++, rs);
+
+		nb_commit = (uint16_t)(nb_commit - n);
+
+		tx_id = 0;
+		txq->tx_next_rs = (uint16_t)(txq->tx_rs_thresh - 1);
+
+		/* avoid reaching the end of the ring */
+		txdp = &(txq->tx_ring[tx_id]);
+		txep = &txq->sw_ring_v[tx_id];
+	}
+
+	tx_backlog_entry(txep, tx_pkts, nb_commit);
+
+	vtx(txdp, tx_pkts, nb_commit, flags);
+
+	tx_id = (uint16_t)(tx_id + nb_commit);
+	if (tx_id > txq->tx_next_rs) {
+		txq->tx_ring[txq->tx_next_rs].read.cmd_type_len |=
+			rte_cpu_to_le_32(IXGBE_ADVTXD_DCMD_RS);
+		txq->tx_next_rs = (uint16_t)(txq->tx_next_rs +
+			txq->tx_rs_thresh);
+	}
+
+	txq->tx_tail = tx_id;
+
+	IXGBE_PCI_REG_WRITE(txq->tdt_reg_addr, txq->tx_tail);
+
+	return nb_pkts;
+}
+
+static void __attribute__((cold))
+ixgbe_tx_queue_release_mbufs_vec(struct ixgbe_tx_queue *txq)
+{
+	_ixgbe_tx_queue_release_mbufs_vec(txq);
+}
+
+void __attribute__((cold))
+ixgbe_rx_queue_release_mbufs_vec(struct ixgbe_rx_queue *rxq)
+{
+	_ixgbe_rx_queue_release_mbufs_vec(rxq);
+}
+
+static void __attribute__((cold))
+ixgbe_tx_free_swring(struct ixgbe_tx_queue *txq)
+{
+	_ixgbe_tx_free_swring_vec(txq);
+}
+
+static void __attribute__((cold))
+ixgbe_reset_tx_queue(struct ixgbe_tx_queue *txq)
+{
+	_ixgbe_reset_tx_queue_vec(txq);
+}
+
+static const struct ixgbe_txq_ops vec_txq_ops = {
+	.release_mbufs = ixgbe_tx_queue_release_mbufs_vec,
+	.free_swring = ixgbe_tx_free_swring,
+	.reset = ixgbe_reset_tx_queue,
+};
+
+int __attribute__((cold))
+ixgbe_rxq_vec_setup(struct ixgbe_rx_queue *rxq)
+{
+	return ixgbe_rxq_vec_setup_default(rxq);
+}
+
+int __attribute__((cold))
+ixgbe_txq_vec_setup(struct ixgbe_tx_queue *txq)
+{
+	return ixgbe_txq_vec_setup_default(txq, &vec_txq_ops);
+}
+
+int __attribute__((cold))
+ixgbe_rx_vec_dev_conf_condition_check(struct rte_eth_dev *dev)
+{
+	return ixgbe_rx_vec_dev_conf_condition_check_default(dev);
+}
-- 
1.8.3.1

^ permalink raw reply	[flat|nested] 25+ messages in thread
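
The heart of the port above is a one-for-one substitution of SSE
intrinsics with their NEON counterparts: vld1q_u64/vst1q_u64 stand in
for _mm_loadu_si128/_mm_store_si128, and the descriptor-to-mbuf field
shuffle moves from _mm_shuffle_epi8 to vqtbl1q_u8. A minimal sketch of
that last, least obvious correspondence, assuming AArch64 for the NEON
side; shuffle_desc is an illustrative name, and the actual series keeps
the two paths in separate files rather than behind one #ifdef:

	#include <stdint.h>
	#ifdef __aarch64__
	#include <arm_neon.h>
	static inline uint8x16_t
	shuffle_desc(uint8x16_t desc, uint8x16_t mask)
	{
		/* byte-wise table lookup; index bytes >= 16 (such as the
		 * 0xFF entries in shuf_msk) produce zero in the output */
		return vqtbl1q_u8(desc, mask);
	}
	#else
	#include <tmmintrin.h>
	static inline __m128i
	shuffle_desc(__m128i desc, __m128i mask)
	{
		/* mask bytes with the high bit set (e.g. 0xFF) zero the
		 * corresponding output byte */
		return _mm_shuffle_epi8(desc, mask);
	}
	#endif

Both instructions zero the output byte for the "skip" positions of the
shuffle mask, which is why the same 0xFF-laden shuf_msk layout works
unchanged on both architectures.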

* [dpdk-dev] [PATCH v2 3/4] ixgbe: enable ixgbe vector PMD on ARMv8a platform
  2016-04-20 13:44 [dpdk-dev] [PATCH 1/4] ixgbe: rearrange vector PMD code for x86 Jianbo Liu
                   ` (5 preceding siblings ...)
  2016-04-26 13:55 ` [dpdk-dev] [PATCH v2 2/4] ixgbe: implement vector PMD for arm architecture Jianbo Liu
@ 2016-04-26 13:55 ` Jianbo Liu
  2016-04-26 13:55 ` [dpdk-dev] [PATCH v2 4/4] maintainers: claim responsibility for ixgbe vector PMD on ARM Jianbo Liu
  2016-05-06  6:25 ` [dpdk-dev] [PATCH v3 0/4] ixgbe: enable " Jianbo Liu
  8 siblings, 0 replies; 25+ messages in thread
From: Jianbo Liu @ 2016-04-26 13:55 UTC (permalink / raw)
  To: dev, jerin.jacob, helin.zhang, konstantin.ananyev; +Cc: Jianbo Liu

Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>
---
 config/defconfig_arm64-armv8a-linuxapp-gcc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/config/defconfig_arm64-armv8a-linuxapp-gcc b/config/defconfig_arm64-armv8a-linuxapp-gcc
index 9abeca4..98cc054 100644
--- a/config/defconfig_arm64-armv8a-linuxapp-gcc
+++ b/config/defconfig_arm64-armv8a-linuxapp-gcc
@@ -42,7 +42,6 @@ CONFIG_RTE_FORCE_INTRINSICS=y
 CONFIG_RTE_TOOLCHAIN="gcc"
 CONFIG_RTE_TOOLCHAIN_GCC=y
 
-CONFIG_RTE_IXGBE_INC_VECTOR=n
 CONFIG_RTE_LIBRTE_IVSHMEM=n
 CONFIG_RTE_LIBRTE_FM10K_PMD=n
 CONFIG_RTE_LIBRTE_I40E_PMD=n
-- 
1.8.3.1

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [dpdk-dev] [PATCH v2 4/4] maintainers: claim responsibility for ixgbe vector PMD on ARM
  2016-04-20 13:44 [dpdk-dev] [PATCH 1/4] ixgbe: rearrange vector PMD code for x86 Jianbo Liu
                   ` (6 preceding siblings ...)
  2016-04-26 13:55 ` [dpdk-dev] [PATCH v2 3/4] ixgbe: enable ixgbe vector PMD on ARMv8a platform Jianbo Liu
@ 2016-04-26 13:55 ` Jianbo Liu
  2016-05-06  6:25 ` [dpdk-dev] [PATCH v3 0/4] ixgbe: enable " Jianbo Liu
  8 siblings, 0 replies; 25+ messages in thread
From: Jianbo Liu @ 2016-04-26 13:55 UTC (permalink / raw)
  To: dev, jerin.jacob, helin.zhang, konstantin.ananyev; +Cc: Jianbo Liu

Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 1953ea2..20158e3 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -142,6 +142,7 @@ F: lib/librte_eal/common/include/arch/arm/*_64.h
 F: lib/librte_acl/acl_run_neon.*
 F: lib/librte_lpm/rte_lpm_neon.h
 F: lib/librte_hash/rte*_arm64.h
+F: drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
 
 EZchip TILE-Gx
 M: Zhigang Lu <zlu@ezchip.com>
-- 
1.8.3.1

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [dpdk-dev] [PATCH v2 1/4] ixgbe: rearrange vector PMD code for x86
  2016-04-26 13:50 ` [dpdk-dev] [PATCH v2 " Jianbo Liu
@ 2016-05-03  5:51   ` Jianbo Liu
  2016-05-03 16:29   ` Bruce Richardson
  1 sibling, 0 replies; 25+ messages in thread
From: Jianbo Liu @ 2016-05-03  5:51 UTC (permalink / raw)
  To: dev, Jerin Jacob, Zhang, Helin, Ananyev, Konstantin

Ping, please review this updated version...

On 26 April 2016 at 21:50, Jianbo Liu <jianbo.liu@linaro.org> wrote:
> move common code to the new file "ixgbe_rxtx_vec_common.h";
> the vPMD for x86 remains implemented in ixgbe_rxtx_vec.c
>
> Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>
> Suggested-by: Bruce Richardson <bruce.richardson@intel.com>
> ---
>  drivers/net/ixgbe/ixgbe_rxtx_vec.c        | 256 +----------------------
>  drivers/net/ixgbe/ixgbe_rxtx_vec_common.h | 325 ++++++++++++++++++++++++++++++
>  2 files changed, 333 insertions(+), 248 deletions(-)
>  create mode 100644 drivers/net/ixgbe/ixgbe_rxtx_vec_common.h
>

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [dpdk-dev] [PATCH v2 1/4] ixgbe: rearrange vector PMD code for x86
  2016-04-26 13:50 ` [dpdk-dev] [PATCH v2 " Jianbo Liu
  2016-05-03  5:51   ` Jianbo Liu
@ 2016-05-03 16:29   ` Bruce Richardson
  1 sibling, 0 replies; 25+ messages in thread
From: Bruce Richardson @ 2016-05-03 16:29 UTC (permalink / raw)
  To: Jianbo Liu; +Cc: dev, jerin.jacob, helin.zhang, konstantin.ananyev

On Tue, Apr 26, 2016 at 09:50:40PM +0800, Jianbo Liu wrote:
> move common code to the new file "ixgbe_rxtx_vec_common.h";
> the vPMD for x86 remains implemented in ixgbe_rxtx_vec.c
> 
> Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>
> Suggested-by: Bruce Richardson <bruce.richardson@intel.com>
> ---
>  drivers/net/ixgbe/ixgbe_rxtx_vec.c        | 256 +----------------------
>  drivers/net/ixgbe/ixgbe_rxtx_vec_common.h | 325 ++++++++++++++++++++++++++++++
>  2 files changed, 333 insertions(+), 248 deletions(-)
>  create mode 100644 drivers/net/ixgbe/ixgbe_rxtx_vec_common.h
> 
Hi Jianbo,

thanks for this, it looks a better approach to me.

However, be aware that the following commit for ixgbe code cleanup has already
been applied to the next-net/rel_16_07 branch and your patches need to take it
into account:
http://dpdk.org/browse/next/dpdk-next-net/commit/?h=rel_16_07&id=3b060a97b18a88652bd401240c004f604739f6d2

Patch 1 does not apply cleanly because of this cleanup change, and, more importantly,
the cleanups are not preserved when moving the code to the new header file.
Can you please rework the patches off the top of the dpdk-next-net/rel_16_07 branch
so that we don't lose these cleanups.

Thanks,
/Bruce

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [dpdk-dev] [PATCH v3 0/4] ixgbe: enable ixgbe vector PMD on ARM
  2016-04-20 13:44 [dpdk-dev] [PATCH 1/4] ixgbe: rearrange vector PMD code for x86 Jianbo Liu
                   ` (7 preceding siblings ...)
  2016-04-26 13:55 ` [dpdk-dev] [PATCH v2 4/4] maintainers: claim responsibility for ixgbe vector PMD on ARM Jianbo Liu
@ 2016-05-06  6:25 ` Jianbo Liu
  2016-05-06  6:25   ` [dpdk-dev] [PATCH v3 1/4] ixgbe: rearrange vector PMD code for x86 Jianbo Liu
                     ` (4 more replies)
  8 siblings, 5 replies; 25+ messages in thread
From: Jianbo Liu @ 2016-05-06  6:25 UTC (permalink / raw)
  To: dev, bruce.richardson, jerin.jacob, helin.zhang, konstantin.ananyev
  Cc: Jianbo Liu

Implement the ixgbe vPMD on ARM with NEON intrinsics.

v3:
 - rebase to rel_16_07 branch on dpdk-next-net.

v2:
 - move the common code to new header file.

Jianbo Liu (4):
  ixgbe: rearrange vector PMD code for x86
  ixgbe: implement vector PMD for arm architecture
  ixgbe: enable ixgbe vector PMD on ARMv8a platform
  maintainers: claim responsibility for ixgbe vector PMD on ARM

 MAINTAINERS                                |   1 +
 config/defconfig_arm64-armv8a-linuxapp-gcc |   1 -
 drivers/net/ixgbe/Makefile                 |   4 +
 drivers/net/ixgbe/ixgbe_rxtx_vec.c         | 258 +------------
 drivers/net/ixgbe/ixgbe_rxtx_vec_common.h  | 327 +++++++++++++++++
 drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c    | 561 +++++++++++++++++++++++++++++
 6 files changed, 901 insertions(+), 251 deletions(-)
 create mode 100644 drivers/net/ixgbe/ixgbe_rxtx_vec_common.h
 create mode 100644 drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c

-- 
2.4.11

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [dpdk-dev] [PATCH v3 1/4] ixgbe: rearrange vector PMD code for x86
  2016-05-06  6:25 ` [dpdk-dev] [PATCH v3 0/4] ixgbe: enable " Jianbo Liu
@ 2016-05-06  6:25   ` Jianbo Liu
  2016-05-06  6:25   ` [dpdk-dev] [PATCH v3 2/4] ixgbe: implement vector PMD for arm architecture Jianbo Liu
                     ` (3 subsequent siblings)
  4 siblings, 0 replies; 25+ messages in thread
From: Jianbo Liu @ 2016-05-06  6:25 UTC (permalink / raw)
  To: dev, bruce.richardson, jerin.jacob, helin.zhang, konstantin.ananyev
  Cc: Jianbo Liu

move common code to the new file "ixgbe_rxtx_vec_common.h";
the vPMD for x86 remains implemented in ixgbe_rxtx_vec.c

Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>
Suggested-by: Bruce Richardson <bruce.richardson@intel.com>
---
 drivers/net/ixgbe/ixgbe_rxtx_vec.c        | 258 +----------------------
 drivers/net/ixgbe/ixgbe_rxtx_vec_common.h | 327 ++++++++++++++++++++++++++++++
 2 files changed, 335 insertions(+), 250 deletions(-)
 create mode 100644 drivers/net/ixgbe/ixgbe_rxtx_vec_common.h

diff --git a/drivers/net/ixgbe/ixgbe_rxtx_vec.c b/drivers/net/ixgbe/ixgbe_rxtx_vec.c
index c4d709b..5e2d621 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx_vec.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx_vec.c
@@ -37,6 +37,7 @@
 
 #include "ixgbe_ethdev.h"
 #include "ixgbe_rxtx.h"
+#include "ixgbe_rxtx_vec_common.h"
 
 #include <tmmintrin.h>
 
@@ -420,69 +421,6 @@ ixgbe_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 	return _recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);
 }
 
-static inline uint16_t
-reassemble_packets(struct ixgbe_rx_queue *rxq, struct rte_mbuf **rx_bufs,
-		   uint16_t nb_bufs, uint8_t *split_flags)
-{
-	struct rte_mbuf *pkts[nb_bufs]; /* finished pkts */
-	struct rte_mbuf *start = rxq->pkt_first_seg;
-	struct rte_mbuf *end =  rxq->pkt_last_seg;
-	unsigned int pkt_idx, buf_idx;
-
-	for (buf_idx = 0, pkt_idx = 0; buf_idx < nb_bufs; buf_idx++) {
-		if (end != NULL) {
-			/* processing a split packet */
-			end->next = rx_bufs[buf_idx];
-			rx_bufs[buf_idx]->data_len += rxq->crc_len;
-
-			start->nb_segs++;
-			start->pkt_len += rx_bufs[buf_idx]->data_len;
-			end = end->next;
-
-			if (!split_flags[buf_idx]) {
-				/* it's the last packet of the set */
-				start->hash = end->hash;
-				start->ol_flags = end->ol_flags;
-				/* we need to strip crc for the whole packet */
-				start->pkt_len -= rxq->crc_len;
-				if (end->data_len > rxq->crc_len)
-					end->data_len -= rxq->crc_len;
-				else {
-					/* free up last mbuf */
-					struct rte_mbuf *secondlast = start;
-
-					start->nb_segs--;
-					while (secondlast->next != end)
-						secondlast = secondlast->next;
-					secondlast->data_len -= (rxq->crc_len -
-							end->data_len);
-					secondlast->next = NULL;
-					rte_pktmbuf_free_seg(end);
-					end = secondlast;
-				}
-				pkts[pkt_idx++] = start;
-				start = end = NULL;
-			}
-		} else {
-			/* not processing a split packet */
-			if (!split_flags[buf_idx]) {
-				/* not a split packet, save and skip */
-				pkts[pkt_idx++] = rx_bufs[buf_idx];
-				continue;
-			}
-			end = start = rx_bufs[buf_idx];
-			rx_bufs[buf_idx]->data_len += rxq->crc_len;
-			rx_bufs[buf_idx]->pkt_len += rxq->crc_len;
-		}
-	}
-
-	/* save the partial packet for next time */
-	rxq->pkt_first_seg = start;
-	rxq->pkt_last_seg = end;
-	memcpy(rx_bufs, pkts, pkt_idx * (sizeof(*pkts)));
-	return pkt_idx;
-}
-
 /*
  * vPMD receive routine that reassembles scattered packets
  *
@@ -546,73 +484,6 @@ vtx(volatile union ixgbe_adv_tx_desc *txdp,
 		vtx1(txdp, *pkt, flags);
 }
 
-static inline int __attribute__((always_inline))
-ixgbe_tx_free_bufs(struct ixgbe_tx_queue *txq)
-{
-	struct ixgbe_tx_entry_v *txep;
-	uint32_t status;
-	uint32_t n;
-	uint32_t i;
-	int nb_free = 0;
-	struct rte_mbuf *m, *free[RTE_IXGBE_TX_MAX_FREE_BUF_SZ];
-
-	/* check DD bit on threshold descriptor */
-	status = txq->tx_ring[txq->tx_next_dd].wb.status;
-	if (!(status & IXGBE_ADVTXD_STAT_DD))
-		return 0;
-
-	n = txq->tx_rs_thresh;
-
-	/*
-	 * first buffer to free from S/W ring is at index
-	 * tx_next_dd - (tx_rs_thresh-1)
-	 */
-	txep = &txq->sw_ring_v[txq->tx_next_dd - (n - 1)];
-	m = __rte_pktmbuf_prefree_seg(txep[0].mbuf);
-	if (likely(m != NULL)) {
-		free[0] = m;
-		nb_free = 1;
-		for (i = 1; i < n; i++) {
-			m = __rte_pktmbuf_prefree_seg(txep[i].mbuf);
-			if (likely(m != NULL)) {
-				if (likely(m->pool == free[0]->pool))
-					free[nb_free++] = m;
-				else {
-					rte_mempool_put_bulk(free[0]->pool,
-							(void *)free, nb_free);
-					free[0] = m;
-					nb_free = 1;
-				}
-			}
-		}
-		rte_mempool_put_bulk(free[0]->pool, (void **)free, nb_free);
-	} else {
-		for (i = 1; i < n; i++) {
-			m = __rte_pktmbuf_prefree_seg(txep[i].mbuf);
-			if (m != NULL)
-				rte_mempool_put(m->pool, m);
-		}
-	}
-
-	/* buffers were freed, update counters */
-	txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh);
-	txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh);
-	if (txq->tx_next_dd >= txq->nb_tx_desc)
-		txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1);
-
-	return txq->tx_rs_thresh;
-}
-
-static inline void __attribute__((always_inline))
-tx_backlog_entry(struct ixgbe_tx_entry_v *txep,
-		 struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
-{
-	int i;
-
-	for (i = 0; i < (int)nb_pkts; ++i)
-		txep[i].mbuf = tx_pkts[i];
-}
-
 uint16_t
 ixgbe_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
 		       uint16_t nb_pkts)
@@ -683,92 +554,25 @@ ixgbe_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
 static void __attribute__((cold))
 ixgbe_tx_queue_release_mbufs_vec(struct ixgbe_tx_queue *txq)
 {
-	unsigned int i;
-	struct ixgbe_tx_entry_v *txe;
-	const uint16_t max_desc = (uint16_t)(txq->nb_tx_desc - 1);
-
-	if (txq->sw_ring == NULL || txq->nb_tx_free == max_desc)
-		return;
-
-	/* release the used mbufs in sw_ring */
-	for (i = txq->tx_next_dd - (txq->tx_rs_thresh - 1);
-	     i != txq->tx_tail;
-	     i = (i + 1) & max_desc) {
-		txe = &txq->sw_ring_v[i];
-		rte_pktmbuf_free_seg(txe->mbuf);
-	}
-	txq->nb_tx_free = max_desc;
-
-	/* reset tx_entry */
-	for (i = 0; i < txq->nb_tx_desc; i++) {
-		txe = &txq->sw_ring_v[i];
-		txe->mbuf = NULL;
-	}
+	_ixgbe_tx_queue_release_mbufs_vec(txq);
 }
 
 void __attribute__((cold))
 ixgbe_rx_queue_release_mbufs_vec(struct ixgbe_rx_queue *rxq)
 {
-	const unsigned int mask = rxq->nb_rx_desc - 1;
-	unsigned int i;
-
-	if (rxq->sw_ring == NULL || rxq->rxrearm_nb >= rxq->nb_rx_desc)
-		return;
-
-	/* free all mbufs that are valid in the ring */
-	for (i = rxq->rx_tail; i != rxq->rxrearm_start; i = (i + 1) & mask)
-		rte_pktmbuf_free_seg(rxq->sw_ring[i].mbuf);
-	rxq->rxrearm_nb = rxq->nb_rx_desc;
-
-	/* set all entries to NULL */
-	memset(rxq->sw_ring, 0, sizeof(rxq->sw_ring[0]) * rxq->nb_rx_desc);
+	_ixgbe_rx_queue_release_mbufs_vec(rxq);
 }
 
 static void __attribute__((cold))
 ixgbe_tx_free_swring(struct ixgbe_tx_queue *txq)
 {
-	if (txq == NULL)
-		return;
-
-	if (txq->sw_ring != NULL) {
-		rte_free(txq->sw_ring_v - 1);
-		txq->sw_ring_v = NULL;
-	}
+	_ixgbe_tx_free_swring_vec(txq);
 }
 
 static void __attribute__((cold))
 ixgbe_reset_tx_queue(struct ixgbe_tx_queue *txq)
 {
-	static const union ixgbe_adv_tx_desc zeroed_desc = { { 0 } };
-	struct ixgbe_tx_entry_v *txe = txq->sw_ring_v;
-	uint16_t i;
-
-	/* Zero out HW ring memory */
-	for (i = 0; i < txq->nb_tx_desc; i++)
-		txq->tx_ring[i] = zeroed_desc;
-
-	/* Initialize SW ring entries */
-	for (i = 0; i < txq->nb_tx_desc; i++) {
-		volatile union ixgbe_adv_tx_desc *txd = &txq->tx_ring[i];
-
-		txd->wb.status = IXGBE_TXD_STAT_DD;
-		txe[i].mbuf = NULL;
-	}
-
-	txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1);
-	txq->tx_next_rs = (uint16_t)(txq->tx_rs_thresh - 1);
-
-	txq->tx_tail = 0;
-	txq->nb_tx_used = 0;
-	/*
-	 * Always allow 1 descriptor to be un-allocated to avoid
-	 * a H/W race condition
-	 */
-	txq->last_desc_cleaned = (uint16_t)(txq->nb_tx_desc - 1);
-	txq->nb_tx_free = (uint16_t)(txq->nb_tx_desc - 1);
-	txq->ctx_curr = 0;
-	memset((void *)&txq->ctx_cache, 0,
-		IXGBE_CTX_NUM * sizeof(struct ixgbe_advctx_info));
+	_ixgbe_reset_tx_queue_vec(txq);
 }
 
 static const struct ixgbe_txq_ops vec_txq_ops = {
@@ -780,63 +584,17 @@ static const struct ixgbe_txq_ops vec_txq_ops = {
 int __attribute__((cold))
 ixgbe_rxq_vec_setup(struct ixgbe_rx_queue *rxq)
 {
-	uintptr_t p;
-	struct rte_mbuf mb_def = { .buf_addr = 0 }; /* zeroed mbuf */
-
-	mb_def.nb_segs = 1;
-	mb_def.data_off = RTE_PKTMBUF_HEADROOM;
-	mb_def.port = rxq->port_id;
-	rte_mbuf_refcnt_set(&mb_def, 1);
-
-	/* prevent compiler reordering: rearm_data covers previous fields */
-	rte_compiler_barrier();
-	p = (uintptr_t)&mb_def.rearm_data;
-	rxq->mbuf_initializer = *(uint64_t *)p;
-	return 0;
+	return ixgbe_rxq_vec_setup_default(rxq);
 }
 
 int __attribute__((cold))
 ixgbe_txq_vec_setup(struct ixgbe_tx_queue *txq)
 {
-	if (txq->sw_ring_v == NULL)
-		return -1;
-
-	/* leave the first one for overflow */
-	txq->sw_ring_v = txq->sw_ring_v + 1;
-	txq->ops = &vec_txq_ops;
-
-	return 0;
+	return ixgbe_txq_vec_setup_default(txq, &vec_txq_ops);
 }
 
 int __attribute__((cold))
 ixgbe_rx_vec_dev_conf_condition_check(struct rte_eth_dev *dev)
 {
-#ifndef RTE_LIBRTE_IEEE1588
-	struct rte_eth_rxmode *rxmode = &dev->data->dev_conf.rxmode;
-	struct rte_fdir_conf *fconf = &dev->data->dev_conf.fdir_conf;
-
-#ifndef RTE_IXGBE_RX_OLFLAGS_ENABLE
-	/* without rx ol_flags, no VP flag report */
-	if (rxmode->hw_vlan_strip != 0 ||
-	    rxmode->hw_vlan_extend != 0)
-		return -1;
-#endif
-
-	/* no fdir support */
-	if (fconf->mode != RTE_FDIR_MODE_NONE)
-		return -1;
-
-	/*
-	 * - no csum error report support
-	 * - no header split support
-	 */
-	if (rxmode->hw_ip_checksum == 1 ||
-	    rxmode->header_split == 1)
-		return -1;
-
-	return 0;
-#else
-	RTE_SET_USED(dev);
-	return -1;
-#endif
+	return ixgbe_rx_vec_dev_conf_condition_check_default(dev);
 }
diff --git a/drivers/net/ixgbe/ixgbe_rxtx_vec_common.h b/drivers/net/ixgbe/ixgbe_rxtx_vec_common.h
new file mode 100644
index 0000000..e98fb9d
--- /dev/null
+++ b/drivers/net/ixgbe/ixgbe_rxtx_vec_common.h
@@ -0,0 +1,327 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _IXGBE_RXTX_VEC_COMMON_H_
+#define _IXGBE_RXTX_VEC_COMMON_H_
+#include <stdint.h>
+#include <rte_ethdev.h>
+
+#include "ixgbe_ethdev.h"
+#include "ixgbe_rxtx.h"
+
+static inline uint16_t
+reassemble_packets(struct ixgbe_rx_queue *rxq, struct rte_mbuf **rx_bufs,
+		   uint16_t nb_bufs, uint8_t *split_flags)
+{
+	struct rte_mbuf *pkts[nb_bufs]; /* finished pkts */
+	struct rte_mbuf *start = rxq->pkt_first_seg;
+	struct rte_mbuf *end =  rxq->pkt_last_seg;
+	unsigned int pkt_idx, buf_idx;
+
+	for (buf_idx = 0, pkt_idx = 0; buf_idx < nb_bufs; buf_idx++) {
+		if (end != NULL) {
+			/* processing a split packet */
+			end->next = rx_bufs[buf_idx];
+			rx_bufs[buf_idx]->data_len += rxq->crc_len;
+
+			start->nb_segs++;
+			start->pkt_len += rx_bufs[buf_idx]->data_len;
+			end = end->next;
+
+			if (!split_flags[buf_idx]) {
+				/* it's the last packet of the set */
+				start->hash = end->hash;
+				start->ol_flags = end->ol_flags;
+				/* we need to strip crc for the whole packet */
+				start->pkt_len -= rxq->crc_len;
+				if (end->data_len > rxq->crc_len)
+					end->data_len -= rxq->crc_len;
+				else {
+					/* free up last mbuf */
+					struct rte_mbuf *secondlast = start;
+
+					start->nb_segs--;
+					while (secondlast->next != end)
+						secondlast = secondlast->next;
+					secondlast->data_len -= (rxq->crc_len -
+							end->data_len);
+					secondlast->next = NULL;
+					rte_pktmbuf_free_seg(end);
+					end = secondlast;
+				}
+				pkts[pkt_idx++] = start;
+				start = end = NULL;
+			}
+		} else {
+			/* not processing a split packet */
+			if (!split_flags[buf_idx]) {
+				/* not a split packet, save and skip */
+				pkts[pkt_idx++] = rx_bufs[buf_idx];
+				continue;
+			}
+			end = start = rx_bufs[buf_idx];
+			rx_bufs[buf_idx]->data_len += rxq->crc_len;
+			rx_bufs[buf_idx]->pkt_len += rxq->crc_len;
+		}
+	}
+
+	/* save the partial packet for next time */
+	rxq->pkt_first_seg = start;
+	rxq->pkt_last_seg = end;
+	memcpy(rx_bufs, pkts, pkt_idx * (sizeof(*pkts)));
+	return pkt_idx;
+}
+
+static inline int __attribute__((always_inline))
+ixgbe_tx_free_bufs(struct ixgbe_tx_queue *txq)
+{
+	struct ixgbe_tx_entry_v *txep;
+	uint32_t status;
+	uint32_t n;
+	uint32_t i;
+	int nb_free = 0;
+	struct rte_mbuf *m, *free[RTE_IXGBE_TX_MAX_FREE_BUF_SZ];
+
+	/* check DD bit on threshold descriptor */
+	status = txq->tx_ring[txq->tx_next_dd].wb.status;
+	if (!(status & IXGBE_ADVTXD_STAT_DD))
+		return 0;
+
+	n = txq->tx_rs_thresh;
+
+	/*
+	 * first buffer to free from S/W ring is at index
+	 * tx_next_dd - (tx_rs_thresh-1)
+	 */
+	txep = &txq->sw_ring_v[txq->tx_next_dd - (n - 1)];
+	m = __rte_pktmbuf_prefree_seg(txep[0].mbuf);
+	if (likely(m != NULL)) {
+		free[0] = m;
+		nb_free = 1;
+		for (i = 1; i < n; i++) {
+			m = __rte_pktmbuf_prefree_seg(txep[i].mbuf);
+			if (likely(m != NULL)) {
+				if (likely(m->pool == free[0]->pool))
+					free[nb_free++] = m;
+				else {
+					rte_mempool_put_bulk(free[0]->pool,
+							(void *)free, nb_free);
+					free[0] = m;
+					nb_free = 1;
+				}
+			}
+		}
+		rte_mempool_put_bulk(free[0]->pool, (void **)free, nb_free);
+	} else {
+		for (i = 1; i < n; i++) {
+			m = __rte_pktmbuf_prefree_seg(txep[i].mbuf);
+			if (m != NULL)
+				rte_mempool_put(m->pool, m);
+		}
+	}
+
+	/* buffers were freed, update counters */
+	txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh);
+	txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh);
+	if (txq->tx_next_dd >= txq->nb_tx_desc)
+		txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1);
+
+	return txq->tx_rs_thresh;
+}
+
+static inline void __attribute__((always_inline))
+tx_backlog_entry(struct ixgbe_tx_entry_v *txep,
+		 struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
+{
+	int i;
+
+	for (i = 0; i < (int)nb_pkts; ++i)
+		txep[i].mbuf = tx_pkts[i];
+}
+
+static inline void
+_ixgbe_tx_queue_release_mbufs_vec(struct ixgbe_tx_queue *txq)
+{
+	unsigned int i;
+	struct ixgbe_tx_entry_v *txe;
+	const uint16_t max_desc = (uint16_t)(txq->nb_tx_desc - 1);
+
+	if (txq->sw_ring == NULL || txq->nb_tx_free == max_desc)
+		return;
+
+	/* release the used mbufs in sw_ring */
+	for (i = txq->tx_next_dd - (txq->tx_rs_thresh - 1);
+	     i != txq->tx_tail;
+	     i = (i + 1) & max_desc) {
+		txe = &txq->sw_ring_v[i];
+		rte_pktmbuf_free_seg(txe->mbuf);
+	}
+	txq->nb_tx_free = max_desc;
+
+	/* reset tx_entry */
+	for (i = 0; i < txq->nb_tx_desc; i++) {
+		txe = &txq->sw_ring_v[i];
+		txe->mbuf = NULL;
+	}
+}
+
+static inline void
+_ixgbe_rx_queue_release_mbufs_vec(struct ixgbe_rx_queue *rxq)
+{
+	const unsigned int mask = rxq->nb_rx_desc - 1;
+	unsigned int i;
+
+	if (rxq->sw_ring == NULL || rxq->rxrearm_nb >= rxq->nb_rx_desc)
+		return;
+
+	/* free all mbufs that are valid in the ring */
+	for (i = rxq->rx_tail; i != rxq->rxrearm_start; i = (i + 1) & mask)
+		rte_pktmbuf_free_seg(rxq->sw_ring[i].mbuf);
+	rxq->rxrearm_nb = rxq->nb_rx_desc;
+
+	/* set all entries to NULL */
+	memset(rxq->sw_ring, 0, sizeof(rxq->sw_ring[0]) * rxq->nb_rx_desc);
+}
+
+static inline void
+_ixgbe_tx_free_swring_vec(struct ixgbe_tx_queue *txq)
+{
+	if (txq == NULL)
+		return;
+
+	if (txq->sw_ring != NULL) {
+		rte_free(txq->sw_ring_v - 1);
+		txq->sw_ring_v = NULL;
+	}
+}
+
+static inline void
+_ixgbe_reset_tx_queue_vec(struct ixgbe_tx_queue *txq)
+{
+	static const union ixgbe_adv_tx_desc zeroed_desc = { { 0 } };
+	struct ixgbe_tx_entry_v *txe = txq->sw_ring_v;
+	uint16_t i;
+
+	/* Zero out HW ring memory */
+	for (i = 0; i < txq->nb_tx_desc; i++)
+		txq->tx_ring[i] = zeroed_desc;
+
+	/* Initialize SW ring entries and mark all descriptors as done */
+	for (i = 0; i < txq->nb_tx_desc; i++) {
+		volatile union ixgbe_adv_tx_desc *txd = &txq->tx_ring[i];
+
+		txd->wb.status = IXGBE_TXD_STAT_DD;
+		txe[i].mbuf = NULL;
+	}
+
+	txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1);
+	txq->tx_next_rs = (uint16_t)(txq->tx_rs_thresh - 1);
+
+	txq->tx_tail = 0;
+	txq->nb_tx_used = 0;
+	/*
+	 * Always allow 1 descriptor to be un-allocated to avoid
+	 * a H/W race condition
+	 */
+	txq->last_desc_cleaned = (uint16_t)(txq->nb_tx_desc - 1);
+	txq->nb_tx_free = (uint16_t)(txq->nb_tx_desc - 1);
+	txq->ctx_curr = 0;
+	memset((void *)&txq->ctx_cache, 0,
+		IXGBE_CTX_NUM * sizeof(struct ixgbe_advctx_info));
+}
+
+static inline int
+ixgbe_rxq_vec_setup_default(struct ixgbe_rx_queue *rxq)
+{
+	uintptr_t p;
+	struct rte_mbuf mb_def = { .buf_addr = 0 }; /* zeroed mbuf */
+
+	mb_def.nb_segs = 1;
+	mb_def.data_off = RTE_PKTMBUF_HEADROOM;
+	mb_def.port = rxq->port_id;
+	rte_mbuf_refcnt_set(&mb_def, 1);
+
+	/* prevent compiler reordering: rearm_data covers previous fields */
+	rte_compiler_barrier();
+	p = (uintptr_t)&mb_def.rearm_data;
+	rxq->mbuf_initializer = *(uint64_t *)p;
+	return 0;
+}
+
+static inline int
+ixgbe_txq_vec_setup_default(struct ixgbe_tx_queue *txq,
+			    const struct ixgbe_txq_ops *txq_ops)
+{
+	if (txq->sw_ring_v == NULL)
+		return -1;
+
+	/* leave the first one for overflow */
+	txq->sw_ring_v = txq->sw_ring_v + 1;
+	txq->ops = txq_ops;
+
+	return 0;
+}
+
+static inline int
+ixgbe_rx_vec_dev_conf_condition_check_default(struct rte_eth_dev *dev)
+{
+#ifndef RTE_LIBRTE_IEEE1588
+	struct rte_eth_rxmode *rxmode = &dev->data->dev_conf.rxmode;
+	struct rte_fdir_conf *fconf = &dev->data->dev_conf.fdir_conf;
+
+#ifndef RTE_IXGBE_RX_OLFLAGS_ENABLE
+	/* without rx ol_flags, no VLAN (VP) flag report */
+	if (rxmode->hw_vlan_strip != 0 ||
+	    rxmode->hw_vlan_extend != 0)
+		return -1;
+#endif
+
+	/* no fdir support */
+	if (fconf->mode != RTE_FDIR_MODE_NONE)
+		return -1;
+
+	/*
+	 * - no csum error report support
+	 * - no header split support
+	 */
+	if (rxmode->hw_ip_checksum == 1 ||
+	    rxmode->header_split == 1)
+		return -1;
+
+	return 0;
+#else
+	RTE_SET_USED(dev);
+	return -1;
+#endif
+}
+#endif
-- 
2.4.11

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [dpdk-dev] [PATCH v3 2/4] ixgbe: implement vector PMD for arm architecture
  2016-05-06  6:25 ` [dpdk-dev] [PATCH v3 0/4] ixgbe: enable " Jianbo Liu
  2016-05-06  6:25   ` [dpdk-dev] [PATCH v3 1/4] ixgbe: rearrange vector PMD code for x86 Jianbo Liu
@ 2016-05-06  6:25   ` Jianbo Liu
  2016-05-10 14:49     ` Bruce Richardson
  2016-05-25 12:29     ` Jerin Jacob
  2016-05-06  6:25   ` [dpdk-dev] [PATCH v3 3/4] ixgbe: enable ixgbe vector PMD on ARMv8a platform Jianbo Liu
                     ` (2 subsequent siblings)
  4 siblings, 2 replies; 25+ messages in thread
From: Jianbo Liu @ 2016-05-06  6:25 UTC (permalink / raw)
  To: dev, bruce.richardson, jerin.jacob, helin.zhang, konstantin.ananyev
  Cc: Jianbo Liu

use ARM NEON intrinsics to implement the ixgbe vPMD

Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>
---
 drivers/net/ixgbe/Makefile              |   4 +
 drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c | 561 ++++++++++++++++++++++++++++++++
 2 files changed, 565 insertions(+)
 create mode 100644 drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c

diff --git a/drivers/net/ixgbe/Makefile b/drivers/net/ixgbe/Makefile
index 50bf51c..b1c7a60 100644
--- a/drivers/net/ixgbe/Makefile
+++ b/drivers/net/ixgbe/Makefile
@@ -108,7 +108,11 @@ SRCS-$(CONFIG_RTE_LIBRTE_IXGBE_PMD) += ixgbe_rxtx.c
 SRCS-$(CONFIG_RTE_LIBRTE_IXGBE_PMD) += ixgbe_ethdev.c
 SRCS-$(CONFIG_RTE_LIBRTE_IXGBE_PMD) += ixgbe_fdir.c
 SRCS-$(CONFIG_RTE_LIBRTE_IXGBE_PMD) += ixgbe_pf.c
+ifeq ($(CONFIG_RTE_ARCH_ARM64),y)
+SRCS-$(CONFIG_RTE_IXGBE_INC_VECTOR) += ixgbe_rxtx_vec_neon.c
+else
 SRCS-$(CONFIG_RTE_IXGBE_INC_VECTOR) += ixgbe_rxtx_vec.c
+endif
 
 ifeq ($(CONFIG_RTE_NIC_BYPASS),y)
 SRCS-$(CONFIG_RTE_LIBRTE_IXGBE_PMD) += ixgbe_bypass.c
diff --git a/drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c b/drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
new file mode 100644
index 0000000..11a6115
--- /dev/null
+++ b/drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
@@ -0,0 +1,561 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdint.h>
+#include <rte_ethdev.h>
+#include <rte_malloc.h>
+
+#include "ixgbe_ethdev.h"
+#include "ixgbe_rxtx.h"
+#include "ixgbe_rxtx_vec_common.h"
+
+#include <arm_neon.h>
+
+#pragma GCC diagnostic ignored "-Wcast-qual"
+
+static inline void
+ixgbe_rxq_rearm(struct ixgbe_rx_queue *rxq)
+{
+	int i;
+	uint16_t rx_id;
+	volatile union ixgbe_adv_rx_desc *rxdp;
+	struct ixgbe_rx_entry *rxep = &rxq->sw_ring[rxq->rxrearm_start];
+	struct rte_mbuf *mb0, *mb1;
+	uint64x2_t dma_addr0, dma_addr1;
+	uint64x2_t zero = vdupq_n_u64(0);
+	uint64_t paddr;
+	uint8x8_t p;
+
+	rxdp = rxq->rx_ring + rxq->rxrearm_start;
+
+	/* Pull 'n' more MBUFs into the software ring */
+	if (unlikely(rte_mempool_get_bulk(rxq->mb_pool,
+					  (void *)rxep,
+					  RTE_IXGBE_RXQ_REARM_THRESH) < 0)) {
+		if (rxq->rxrearm_nb + RTE_IXGBE_RXQ_REARM_THRESH >=
+		    rxq->nb_rx_desc) {
+			for (i = 0; i < RTE_IXGBE_DESCS_PER_LOOP; i++) {
+				rxep[i].mbuf = &rxq->fake_mbuf;
+				vst1q_u64((uint64_t *)&rxdp[i].read,
+					  zero);
+			}
+		}
+		rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=
+			RTE_IXGBE_RXQ_REARM_THRESH;
+		return;
+	}
+
+	p = vld1_u8((uint8_t *)&rxq->mbuf_initializer);
+
+	/* Initialize the mbufs in vector, process 2 mbufs in one loop */
+	for (i = 0; i < RTE_IXGBE_RXQ_REARM_THRESH; i += 2, rxep += 2) {
+		mb0 = rxep[0].mbuf;
+		mb1 = rxep[1].mbuf;
+
+		/*
+		 * Flush mbuf with pkt template.
+		 * Data to be rearmed is 6 bytes long.
+		 * Though, RX will overwrite ol_flags that are coming next
+		 * anyway. So overwrite whole 8 bytes with one load:
+		 * 6 bytes of rearm_data plus first 2 bytes of ol_flags.
+		 */
+		vst1_u8((uint8_t *)&mb0->rearm_data, p);
+		paddr = mb0->buf_physaddr + RTE_PKTMBUF_HEADROOM;
+		dma_addr0 = vsetq_lane_u64(paddr, zero, 0);
+		/* flush desc with pa dma_addr */
+		vst1q_u64((uint64_t *)&rxdp++->read, dma_addr0);
+
+		vst1_u8((uint8_t *)&mb1->rearm_data, p);
+		paddr = mb1->buf_physaddr + RTE_PKTMBUF_HEADROOM;
+		dma_addr1 = vsetq_lane_u64(paddr, zero, 0);
+		vst1q_u64((uint64_t *)&rxdp++->read, dma_addr1);
+	}
+
+	rxq->rxrearm_start += RTE_IXGBE_RXQ_REARM_THRESH;
+	if (rxq->rxrearm_start >= rxq->nb_rx_desc)
+		rxq->rxrearm_start = 0;
+
+	rxq->rxrearm_nb -= RTE_IXGBE_RXQ_REARM_THRESH;
+
+	rx_id = (uint16_t) ((rxq->rxrearm_start == 0) ?
+			     (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));
+
+	/* Update the tail pointer on the NIC */
+	IXGBE_PCI_REG_WRITE(rxq->rdt_reg_addr, rx_id);
+}
+
+/* Handling the offload flags (olflags) field takes computation
+ * time when receiving packets. Therefore we provide a flag to disable
+ * the processing of the olflags field when it is not needed. This
+ * gives improved performance, at the cost of losing the offload info
+ * in the received packet.
+ */
+#ifdef RTE_IXGBE_RX_OLFLAGS_ENABLE
+
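+/* The VP (VLAN present) flag is bit 3 of each per-packet status byte;
+ * shifting right by VTAG_SHIFT lines it up with PKT_RX_VLAN_PKT (bit 0).
+ */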
+#define VTAG_SHIFT     (3)
+
+static inline void
+desc_to_olflags_v(uint8x16x2_t sterr_tmp1, uint8x16x2_t sterr_tmp2,
+		  uint8x16_t staterr, struct rte_mbuf **rx_pkts)
+{
+	uint8x16_t ptype;
+	uint8x16_t vtag;
+
+	union {
+		uint8_t e[4];
+		uint32_t word;
+	} vol;
+
+	const uint8x16_t pkttype_msk = {
+			PKT_RX_VLAN_PKT, PKT_RX_VLAN_PKT,
+			PKT_RX_VLAN_PKT, PKT_RX_VLAN_PKT,
+			0x00, 0x00, 0x00, 0x00,
+			0x00, 0x00, 0x00, 0x00,
+			0x00, 0x00, 0x00, 0x00};
+
+	const uint8x16_t rsstype_msk = {
+			0x0F, 0x0F, 0x0F, 0x0F,
+			0x00, 0x00, 0x00, 0x00,
+			0x00, 0x00, 0x00, 0x00,
+			0x00, 0x00, 0x00, 0x00};
+
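+	/* LUT mapping the 4-bit RSS type from the descriptor to ol_flags */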
+	const uint8x16_t rss_flags = {
+			0, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
+			0, PKT_RX_RSS_HASH, 0, PKT_RX_RSS_HASH,
+			PKT_RX_RSS_HASH, 0, 0, 0,
+			0, 0, 0, PKT_RX_FDIR};
+
+	ptype = vzipq_u8(sterr_tmp1.val[0], sterr_tmp2.val[0]).val[0];
+	ptype = vandq_u8(ptype, rsstype_msk);
+	ptype = vqtbl1q_u8(rss_flags, ptype);
+
+	vtag = vshrq_n_u8(staterr, VTAG_SHIFT);
+	vtag = vandq_u8(vtag, pkttype_msk);
+	vtag = vorrq_u8(ptype, vtag);
+
+	vol.word = vgetq_lane_u32(vreinterpretq_u32_u8(vtag), 0);
+
+	rx_pkts[0]->ol_flags = vol.e[0];
+	rx_pkts[1]->ol_flags = vol.e[1];
+	rx_pkts[2]->ol_flags = vol.e[2];
+	rx_pkts[3]->ol_flags = vol.e[3];
+}
+#else
+#define desc_to_olflags_v(sterr_tmp1, sterr_tmp2, staterr, rx_pkts)
+#endif
+
+/*
+ * vPMD raw receive routine, only accepts (nb_pkts >= RTE_IXGBE_DESCS_PER_LOOP)
+ *
+ * Notice:
+ * - if nb_pkts < RTE_IXGBE_DESCS_PER_LOOP, just return no packet
+ * - if nb_pkts > RTE_IXGBE_MAX_RX_BURST, only scan RTE_IXGBE_MAX_RX_BURST
+ *   DD bits
+ * - nb_pkts is floor-aligned to a multiple of RTE_IXGBE_DESCS_PER_LOOP
+ * - doesn't support ol_flags for rss and csum err
+ */
+
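+/* DD and EOP are bits 0 and 1 of each packet's status byte in 'stat' */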
+#define IXGBE_VPMD_DESC_DD_MASK		0x01010101
+#define IXGBE_VPMD_DESC_EOP_MASK	0x02020202
+
+static inline uint16_t
+_recv_raw_pkts_vec(struct ixgbe_rx_queue *rxq, struct rte_mbuf **rx_pkts,
+		   uint16_t nb_pkts, uint8_t *split_packet)
+{
+	volatile union ixgbe_adv_rx_desc *rxdp;
+	struct ixgbe_rx_entry *sw_ring;
+	uint16_t nb_pkts_recd;
+	int pos;
+	uint64_t var;
+	uint8x16_t shuf_msk = {
+		0xFF, 0xFF,
+		0xFF, 0xFF,  /* skip 32 bits pkt_type */
+		12, 13,      /* octet 12~13, low 16 bits pkt_len */
+		0xFF, 0xFF,  /* skip high 16 bits pkt_len, zero out */
+		12, 13,      /* octet 12~13, 16 bits data_len */
+		14, 15,      /* octet 14~15, low 16 bits vlan_macip */
+		4, 5, 6, 7  /* octet 4~7, 32bits rss */
+		};
+	uint16x8_t crc_adjust = {0, 0, rxq->crc_len, 0,
+				 rxq->crc_len, 0, 0, 0};
+
+	/* nb_pkts must be less than or equal to RTE_IXGBE_MAX_RX_BURST */
+	nb_pkts = RTE_MIN(nb_pkts, RTE_IXGBE_MAX_RX_BURST);
+
+	/* nb_pkts has to be floor-aligned to RTE_IXGBE_DESCS_PER_LOOP */
+	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_IXGBE_DESCS_PER_LOOP);
+
+	/* Just the act of getting into the function from the application is
+	 * going to cost about 7 cycles
+	 */
+	rxdp = rxq->rx_ring + rxq->rx_tail;
+
+	rte_prefetch_non_temporal(rxdp);
+
+	/* See if we need to rearm the RX queue - gives the prefetch a bit
+	 * of time to act
+	 */
+	if (rxq->rxrearm_nb > RTE_IXGBE_RXQ_REARM_THRESH)
+		ixgbe_rxq_rearm(rxq);
+
+	/* Before we start moving massive data around, check to see if
+	 * there is actually a packet available
+	 */
+	if (!(rxdp->wb.upper.status_error &
+				rte_cpu_to_le_32(IXGBE_RXDADV_STAT_DD)))
+		return 0;
+
+	/* Cache is empty -> need to scan the buffer rings, but first move
+	 * the next 'n' mbufs into the cache
+	 */
+	sw_ring = &rxq->sw_ring[rxq->rx_tail];
+
+	/* A. load 4 packets' descriptors in one loop
+	 * B. copy 4 mbuf pointers from sw_ring to rx_pkts
+	 * C. calc the number of DD bits among the 4 packets
+	 * [C*. extract the end-of-packet bit, if requested]
+	 * D. fill info from desc to mbuf
+	 */
+	for (pos = 0, nb_pkts_recd = 0; pos < nb_pkts;
+			pos += RTE_IXGBE_DESCS_PER_LOOP,
+			rxdp += RTE_IXGBE_DESCS_PER_LOOP) {
+		uint64x2_t descs[RTE_IXGBE_DESCS_PER_LOOP];
+		uint8x16_t pkt_mb1, pkt_mb2, pkt_mb3, pkt_mb4;
+		uint8x16x2_t sterr_tmp1, sterr_tmp2;
+		uint64x2_t mbp1, mbp2;
+		uint8x16_t staterr;
+		uint16x8_t tmp;
+		uint32_t stat;
+
+		/* B.1 load 2 mbuf pointers */
+		mbp1 = vld1q_u64((uint64_t *)&sw_ring[pos]);
+
+		/* Read desc statuses backwards to avoid race condition */
+		/* A.1 load 4 pkts desc */
+		descs[3] =  vld1q_u64((uint64_t *)(rxdp + 3));
+		rte_rmb();
+
+		/* B.2 copy 2 mbuf pointers into rx_pkts */
+		vst1q_u64((uint64_t *)&rx_pkts[pos], mbp1);
+
+		/* B.1 load 2 more mbuf pointers */
+		mbp2 = vld1q_u64((uint64_t *)&sw_ring[pos + 2]);
+
+		descs[2] =  vld1q_u64((uint64_t *)(rxdp + 2));
+		/* A.1 load the remaining 2 pkts desc */
+		descs[1] =  vld1q_u64((uint64_t *)(rxdp + 1));
+		descs[0] =  vld1q_u64((uint64_t *)(rxdp));
+
+		/* B.2 copy 2 mbuf pointers into rx_pkts */
+		vst1q_u64((uint64_t *)&rx_pkts[pos + 2], mbp2);
+
+		if (split_packet) {
+			rte_prefetch_non_temporal(&rx_pkts[pos]->cacheline1);
+			rte_prefetch_non_temporal(&rx_pkts[pos+1]->cacheline1);
+			rte_prefetch_non_temporal(&rx_pkts[pos+2]->cacheline1);
+			rte_prefetch_non_temporal(&rx_pkts[pos+3]->cacheline1);
+		}
+
+		/* D.1 pkt 3,4 convert format from desc to pktmbuf */
+		pkt_mb4 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[3]), shuf_msk);
+		pkt_mb3 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[2]), shuf_msk);
+
+		/* D.1 pkt 1,2 convert format from desc to pktmbuf */
+		pkt_mb2 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[1]), shuf_msk);
+		pkt_mb1 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[0]), shuf_msk);
+
+		/* C.1 4=>2 filter staterr info only */
+		sterr_tmp2 = vzipq_u8(vreinterpretq_u8_u64(descs[1]),
+				      vreinterpretq_u8_u64(descs[3]));
+		/* C.1 4=>2 filter staterr info only */
+		sterr_tmp1 = vzipq_u8(vreinterpretq_u8_u64(descs[0]),
+				      vreinterpretq_u8_u64(descs[2]));
+
+		/* C.2 get 4 pkts staterr value  */
+		staterr = vzipq_u8(sterr_tmp1.val[1], sterr_tmp2.val[1]).val[0];
+		stat = vgetq_lane_u32(vreinterpretq_u32_u8(staterr), 0);
+
+		/* set ol_flags with vlan packet type */
+		desc_to_olflags_v(sterr_tmp1, sterr_tmp2, staterr,
+				  &rx_pkts[pos]);
+
+		/* D.2 pkt 3,4 set in_port/nb_seg and remove crc */
+		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb4), crc_adjust);
+		pkt_mb4 = vreinterpretq_u8_u16(tmp);
+		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb3), crc_adjust);
+		pkt_mb3 = vreinterpretq_u8_u16(tmp);
+
+		/* D.3 copy final 3,4 data to rx_pkts */
+		vst1q_u8((void *)&rx_pkts[pos+3]->rx_descriptor_fields1,
+			 pkt_mb4);
+		vst1q_u8((void *)&rx_pkts[pos+2]->rx_descriptor_fields1,
+			 pkt_mb3);
+
+		/* D.2 pkt 1,2 set in_port/nb_seg and remove crc */
+		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb2), crc_adjust);
+		pkt_mb2 = vreinterpretq_u8_u16(tmp);
+		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb1), crc_adjust);
+		pkt_mb1 = vreinterpretq_u8_u16(tmp);
+
+		/* C* extract and record EOP bit */
+		if (split_packet) {
+			/* and with mask to extract bits, flipping 1-0 */
+			*(int *)split_packet = ~stat & IXGBE_VPMD_DESC_EOP_MASK;
+
+			split_packet += RTE_IXGBE_DESCS_PER_LOOP;
+
+			/* zero-out next pointers */
+			rx_pkts[pos]->next = NULL;
+			rx_pkts[pos + 1]->next = NULL;
+			rx_pkts[pos + 2]->next = NULL;
+			rx_pkts[pos + 3]->next = NULL;
+		}
+
+		rte_prefetch_non_temporal(rxdp + RTE_IXGBE_DESCS_PER_LOOP);
+
+		/* D.3 copy final 1,2 data to rx_pkts */
+		vst1q_u8((uint8_t *)&rx_pkts[pos+1]->rx_descriptor_fields1,
+			 pkt_mb2);
+		vst1q_u8((uint8_t *)&rx_pkts[pos]->rx_descriptor_fields1,
+			 pkt_mb1);
+
+		/* C.4 calc available number of desc */
+		var =  __builtin_popcount(stat & IXGBE_VPMD_DESC_DD_MASK);
+		nb_pkts_recd += var;
+		if (likely(var != RTE_IXGBE_DESCS_PER_LOOP))
+			break;
+	}
+
+	/* Update our internal tail pointer */
+	rxq->rx_tail = (uint16_t)(rxq->rx_tail + nb_pkts_recd);
+	rxq->rx_tail = (uint16_t)(rxq->rx_tail & (rxq->nb_rx_desc - 1));
+	rxq->rxrearm_nb = (uint16_t)(rxq->rxrearm_nb + nb_pkts_recd);
+
+	return nb_pkts_recd;
+}
+
+/*
+ * vPMD receive routine, only accepts (nb_pkts >= RTE_IXGBE_DESCS_PER_LOOP)
+ *
+ * Notice:
+ * - if nb_pkts < RTE_IXGBE_DESCS_PER_LOOP, just return no packet
+ * - if nb_pkts > RTE_IXGBE_MAX_RX_BURST, only scan RTE_IXGBE_MAX_RX_BURST
+ *   DD bits
+ * - nb_pkts is floor-aligned to a multiple of RTE_IXGBE_DESCS_PER_LOOP
+ * - doesn't support ol_flags for rss and csum err
+ */
+uint16_t
+ixgbe_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+		uint16_t nb_pkts)
+{
+	return _recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);
+}
+
+/*
+ * vPMD receive routine that reassembles scattered packets
+ *
+ * Notice:
+ * - doesn't support ol_flags for rss and csum err
+ * - if nb_pkts < RTE_IXGBE_DESCS_PER_LOOP, just return no packet
+ * - if nb_pkts > RTE_IXGBE_MAX_RX_BURST, only scan RTE_IXGBE_MAX_RX_BURST
+ *   DD bits
+ * - nb_pkts is floor-aligned to a multiple of RTE_IXGBE_DESCS_PER_LOOP
+ */
+uint16_t
+ixgbe_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+		uint16_t nb_pkts)
+{
+	struct ixgbe_rx_queue *rxq = rx_queue;
+	uint8_t split_flags[RTE_IXGBE_MAX_RX_BURST] = {0};
+
+	/* get some new buffers */
+	uint16_t nb_bufs = _recv_raw_pkts_vec(rxq, rx_pkts, nb_pkts,
+			split_flags);
+	if (nb_bufs == 0)
+		return 0;
+
+	/* happy day case, full burst + no packets to be joined */
+	const uint64_t *split_fl64 = (uint64_t *)split_flags;
+	if (rxq->pkt_first_seg == NULL &&
+			split_fl64[0] == 0 && split_fl64[1] == 0 &&
+			split_fl64[2] == 0 && split_fl64[3] == 0)
+		return nb_bufs;
+
+	/* reassemble any packets that need reassembly */
+	unsigned i = 0;
+	if (rxq->pkt_first_seg == NULL) {
+		/* find the first split flag, and only reassemble from there */
+		while (i < nb_bufs && !split_flags[i])
+			i++;
+		if (i == nb_bufs)
+			return nb_bufs;
+	}
+	return i + reassemble_packets(rxq, &rx_pkts[i], nb_bufs - i,
+		&split_flags[i]);
+}
+
+static inline void
+vtx1(volatile union ixgbe_adv_tx_desc *txdp,
+		struct rte_mbuf *pkt, uint64_t flags)
+{
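+	/*
+	 * pkt_len is shifted by 46 because PAYLEN lives in the upper 32-bit
+	 * word of the descriptor, IXGBE_ADVTXD_PAYLEN_SHIFT (14) bits up:
+	 * 32 + 14 = 46.
+	 */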
+	uint64x2_t descriptor = {
+			pkt->buf_physaddr + pkt->data_off,
+			(uint64_t)pkt->pkt_len << 46 | flags | pkt->data_len};
+
+	vst1q_u64((uint64_t *)&txdp->read, descriptor);
+}
+
+static inline void
+vtx(volatile union ixgbe_adv_tx_desc *txdp,
+		struct rte_mbuf **pkt, uint16_t nb_pkts,  uint64_t flags)
+{
+	int i;
+
+	for (i = 0; i < nb_pkts; ++i, ++txdp, ++pkt)
+		vtx1(txdp, *pkt, flags);
+}
+
+uint16_t
+ixgbe_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
+		       uint16_t nb_pkts)
+{
+	struct ixgbe_tx_queue *txq = (struct ixgbe_tx_queue *)tx_queue;
+	volatile union ixgbe_adv_tx_desc *txdp;
+	struct ixgbe_tx_entry_v *txep;
+	uint16_t n, nb_commit, tx_id;
+	uint64_t flags = DCMD_DTYP_FLAGS;
+	uint64_t rs = IXGBE_ADVTXD_DCMD_RS|DCMD_DTYP_FLAGS;
+	int i;
+
+	/* crossing tx_rs_thresh boundary is not allowed */
+	nb_pkts = RTE_MIN(nb_pkts, txq->tx_rs_thresh);
+
+	if (txq->nb_tx_free < txq->tx_free_thresh)
+		ixgbe_tx_free_bufs(txq);
+
+	nb_commit = nb_pkts = (uint16_t)RTE_MIN(txq->nb_tx_free, nb_pkts);
+	if (unlikely(nb_pkts == 0))
+		return 0;
+
+	tx_id = txq->tx_tail;
+	txdp = &txq->tx_ring[tx_id];
+	txep = &txq->sw_ring_v[tx_id];
+
+	txq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_pkts);
+
+	n = (uint16_t)(txq->nb_tx_desc - tx_id);
+	if (nb_commit >= n) {
+
+		tx_backlog_entry(txep, tx_pkts, n);
+
+		for (i = 0; i < n - 1; ++i, ++tx_pkts, ++txdp)
+			vtx1(txdp, *tx_pkts, flags);
+
+		vtx1(txdp, *tx_pkts++, rs);
+
+		nb_commit = (uint16_t)(nb_commit - n);
+
+		tx_id = 0;
+		txq->tx_next_rs = (uint16_t)(txq->tx_rs_thresh - 1);
+
+		/* avoid reaching the end of the ring */
+		txdp = &(txq->tx_ring[tx_id]);
+		txep = &txq->sw_ring_v[tx_id];
+	}
+
+	tx_backlog_entry(txep, tx_pkts, nb_commit);
+
+	vtx(txdp, tx_pkts, nb_commit, flags);
+
+	tx_id = (uint16_t)(tx_id + nb_commit);
+	if (tx_id > txq->tx_next_rs) {
+		txq->tx_ring[txq->tx_next_rs].read.cmd_type_len |=
+			rte_cpu_to_le_32(IXGBE_ADVTXD_DCMD_RS);
+		txq->tx_next_rs = (uint16_t)(txq->tx_next_rs +
+			txq->tx_rs_thresh);
+	}
+
+	txq->tx_tail = tx_id;
+
+	IXGBE_PCI_REG_WRITE(txq->tdt_reg_addr, txq->tx_tail);
+
+	return nb_pkts;
+}
+
+static void __attribute__((cold))
+ixgbe_tx_queue_release_mbufs_vec(struct ixgbe_tx_queue *txq)
+{
+	_ixgbe_tx_queue_release_mbufs_vec(txq);
+}
+
+void __attribute__((cold))
+ixgbe_rx_queue_release_mbufs_vec(struct ixgbe_rx_queue *rxq)
+{
+	_ixgbe_rx_queue_release_mbufs_vec(rxq);
+}
+
+static void __attribute__((cold))
+ixgbe_tx_free_swring(struct ixgbe_tx_queue *txq)
+{
+	_ixgbe_tx_free_swring_vec(txq);
+}
+
+static void __attribute__((cold))
+ixgbe_reset_tx_queue(struct ixgbe_tx_queue *txq)
+{
+	_ixgbe_reset_tx_queue_vec(txq);
+}
+
+static const struct ixgbe_txq_ops vec_txq_ops = {
+	.release_mbufs = ixgbe_tx_queue_release_mbufs_vec,
+	.free_swring = ixgbe_tx_free_swring,
+	.reset = ixgbe_reset_tx_queue,
+};
+
+int __attribute__((cold))
+ixgbe_rxq_vec_setup(struct ixgbe_rx_queue *rxq)
+{
+	return ixgbe_rxq_vec_setup_default(rxq);
+}
+
+int __attribute__((cold))
+ixgbe_txq_vec_setup(struct ixgbe_tx_queue *txq)
+{
+	return ixgbe_txq_vec_setup_default(txq, &vec_txq_ops);
+}
+
+int __attribute__((cold))
+ixgbe_rx_vec_dev_conf_condition_check(struct rte_eth_dev *dev)
+{
+	return ixgbe_rx_vec_dev_conf_condition_check_default(dev);
+}
-- 
2.4.11

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [dpdk-dev] [PATCH v3 3/4] ixgbe: enable ixgbe vector PMD on ARMv8a platform
  2016-05-06  6:25 ` [dpdk-dev] [PATCH v3 0/4] ixgbe: enable " Jianbo Liu
  2016-05-06  6:25   ` [dpdk-dev] [PATCH v3 1/4] ixgbe: rearrange vector PMD code for x86 Jianbo Liu
  2016-05-06  6:25   ` [dpdk-dev] [PATCH v3 2/4] ixgbe: implement vector PMD for arm architecture Jianbo Liu
@ 2016-05-06  6:25   ` Jianbo Liu
  2016-05-06  6:25   ` [dpdk-dev] [PATCH v3 4/4] maintainers: claim responsibility for ixgbe vector PMD on ARM Jianbo Liu
  2016-05-24 16:10   ` [dpdk-dev] [PATCH v3 0/4] ixgbe: enable " Bruce Richardson
  4 siblings, 0 replies; 25+ messages in thread
From: Jianbo Liu @ 2016-05-06  6:25 UTC (permalink / raw)
  To: dev, bruce.richardson, jerin.jacob, helin.zhang, konstantin.ananyev
  Cc: Jianbo Liu

Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>
---
 config/defconfig_arm64-armv8a-linuxapp-gcc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/config/defconfig_arm64-armv8a-linuxapp-gcc b/config/defconfig_arm64-armv8a-linuxapp-gcc
index 9abeca4..98cc054 100644
--- a/config/defconfig_arm64-armv8a-linuxapp-gcc
+++ b/config/defconfig_arm64-armv8a-linuxapp-gcc
@@ -42,7 +42,6 @@ CONFIG_RTE_FORCE_INTRINSICS=y
 CONFIG_RTE_TOOLCHAIN="gcc"
 CONFIG_RTE_TOOLCHAIN_GCC=y
 
-CONFIG_RTE_IXGBE_INC_VECTOR=n
 CONFIG_RTE_LIBRTE_IVSHMEM=n
 CONFIG_RTE_LIBRTE_FM10K_PMD=n
 CONFIG_RTE_LIBRTE_I40E_PMD=n
-- 
2.4.11

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [dpdk-dev] [PATCH v3 4/4] maintainers: claim responsibility for ixgbe vector PMD on ARM
  2016-05-06  6:25 ` [dpdk-dev] [PATCH v3 0/4] ixgbe: enable " Jianbo Liu
                     ` (2 preceding siblings ...)
  2016-05-06  6:25   ` [dpdk-dev] [PATCH v3 3/4] ixgbe: enable ixgbe vector PMD on ARMv8a platform Jianbo Liu
@ 2016-05-06  6:25   ` Jianbo Liu
  2016-05-24 16:10   ` [dpdk-dev] [PATCH v3 0/4] ixgbe: enable " Bruce Richardson
  4 siblings, 0 replies; 25+ messages in thread
From: Jianbo Liu @ 2016-05-06  6:25 UTC (permalink / raw)
  To: dev, bruce.richardson, jerin.jacob, helin.zhang, konstantin.ananyev
  Cc: Jianbo Liu

Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index ba4053a..78b46e2 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -142,6 +142,7 @@ F: lib/librte_eal/common/include/arch/arm/*_64.h
 F: lib/librte_acl/acl_run_neon.*
 F: lib/librte_lpm/rte_lpm_neon.h
 F: lib/librte_hash/rte*_arm64.h
+F: drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
 
 EZchip TILE-Gx
 M: Zhigang Lu <zlu@ezchip.com>
-- 
2.4.11

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [dpdk-dev] [PATCH v3 2/4] ixgbe: implement vector PMD for arm architecture
  2016-05-06  6:25   ` [dpdk-dev] [PATCH v3 2/4] ixgbe: implement vector PMD for arm architecture Jianbo Liu
@ 2016-05-10 14:49     ` Bruce Richardson
  2016-05-11  2:40       ` Jianbo Liu
  2016-05-25 12:29     ` Jerin Jacob
  1 sibling, 1 reply; 25+ messages in thread
From: Bruce Richardson @ 2016-05-10 14:49 UTC (permalink / raw)
  To: Jianbo Liu; +Cc: dev, jerin.jacob, helin.zhang, konstantin.ananyev

On Fri, May 06, 2016 at 11:55:46AM +0530, Jianbo Liu wrote:
> use ARM NEON intrinsic to implement ixgbe vPMD
> 
> Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>
> ---
>  drivers/net/ixgbe/Makefile              |   4 +
>  drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c | 561 ++++++++++++++++++++++++++++++++
>  2 files changed, 565 insertions(+)
>  create mode 100644 drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
> 
> diff --git a/drivers/net/ixgbe/Makefile b/drivers/net/ixgbe/Makefile
> index 50bf51c..b1c7a60 100644
> --- a/drivers/net/ixgbe/Makefile
> +++ b/drivers/net/ixgbe/Makefile
> @@ -108,7 +108,11 @@ SRCS-$(CONFIG_RTE_LIBRTE_IXGBE_PMD) += ixgbe_rxtx.c
>  SRCS-$(CONFIG_RTE_LIBRTE_IXGBE_PMD) += ixgbe_ethdev.c
>  SRCS-$(CONFIG_RTE_LIBRTE_IXGBE_PMD) += ixgbe_fdir.c
>  SRCS-$(CONFIG_RTE_LIBRTE_IXGBE_PMD) += ixgbe_pf.c
> +ifeq ($(CONFIG_RTE_ARCH_ARM64),y)
> +SRCS-$(CONFIG_RTE_IXGBE_INC_VECTOR) += ixgbe_rxtx_vec_neon.c
> +else
>  SRCS-$(CONFIG_RTE_IXGBE_INC_VECTOR) += ixgbe_rxtx_vec.c
> +endif
>  
Since you are adding ixgbe_rxtx_vec_neon.c here, it might be worthwhile adding
in an extra patch to rename ixgbe_rxtx_vec.c to ixgbe_rxtx_vec_sse.c for 
consistency.
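
Something like the below, keeping the Makefile selection symmetric (a sketch
only, assuming the rename is done as suggested):

	ifeq ($(CONFIG_RTE_ARCH_ARM64),y)
	SRCS-$(CONFIG_RTE_IXGBE_INC_VECTOR) += ixgbe_rxtx_vec_neon.c
	else
	SRCS-$(CONFIG_RTE_IXGBE_INC_VECTOR) += ixgbe_rxtx_vec_sse.c
	endif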

Regards,
/Bruce

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [dpdk-dev] [PATCH v3 2/4] ixgbe: implement vector PMD for arm architecture
  2016-05-10 14:49     ` Bruce Richardson
@ 2016-05-11  2:40       ` Jianbo Liu
  0 siblings, 0 replies; 25+ messages in thread
From: Jianbo Liu @ 2016-05-11  2:40 UTC (permalink / raw)
  To: Bruce Richardson; +Cc: dev, Jerin Jacob, Zhang, Helin, Ananyev, Konstantin

On 10 May 2016 at 22:49, Bruce Richardson <bruce.richardson@intel.com> wrote:
> On Fri, May 06, 2016 at 11:55:46AM +0530, Jianbo Liu wrote:
>> use ARM NEON intrinsic to implement ixgbe vPMD
>>
>> Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>
>> ---
>>  drivers/net/ixgbe/Makefile              |   4 +
>>  drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c | 561 ++++++++++++++++++++++++++++++++
>>  2 files changed, 565 insertions(+)
>>  create mode 100644 drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
>>
>> diff --git a/drivers/net/ixgbe/Makefile b/drivers/net/ixgbe/Makefile
>> index 50bf51c..b1c7a60 100644
>> --- a/drivers/net/ixgbe/Makefile
>> +++ b/drivers/net/ixgbe/Makefile
>> @@ -108,7 +108,11 @@ SRCS-$(CONFIG_RTE_LIBRTE_IXGBE_PMD) += ixgbe_rxtx.c
>>  SRCS-$(CONFIG_RTE_LIBRTE_IXGBE_PMD) += ixgbe_ethdev.c
>>  SRCS-$(CONFIG_RTE_LIBRTE_IXGBE_PMD) += ixgbe_fdir.c
>>  SRCS-$(CONFIG_RTE_LIBRTE_IXGBE_PMD) += ixgbe_pf.c
>> +ifeq ($(CONFIG_RTE_ARCH_ARM64),y)
>> +SRCS-$(CONFIG_RTE_IXGBE_INC_VECTOR) += ixgbe_rxtx_vec_neon.c
>> +else
>>  SRCS-$(CONFIG_RTE_IXGBE_INC_VECTOR) += ixgbe_rxtx_vec.c
>> +endif
>>
> Since you are adding ixgbe_rxtx_vec_neon.c here, it might be worthwhile adding
> in an extra patch to rename ixgbe_rxtx_vec.c to ixgbe_rxtx_vec_sse.c for
> consistency.
>
OK, I'll do that.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [dpdk-dev] [PATCH v3 0/4] ixgbe: enable ixgbe vector PMD on ARM
  2016-05-06  6:25 ` [dpdk-dev] [PATCH v3 0/4] ixgbe: enable " Jianbo Liu
                     ` (3 preceding siblings ...)
  2016-05-06  6:25   ` [dpdk-dev] [PATCH v3 4/4] maintainers: claim responsibility for ixgbe vector PMD on ARM Jianbo Liu
@ 2016-05-24 16:10   ` Bruce Richardson
  2016-05-24 16:12     ` Bruce Richardson
  4 siblings, 1 reply; 25+ messages in thread
From: Bruce Richardson @ 2016-05-24 16:10 UTC (permalink / raw)
  To: Jianbo Liu; +Cc: dev, jerin.jacob, helin.zhang, konstantin.ananyev

On Fri, May 06, 2016 at 11:55:44AM +0530, Jianbo Liu wrote:
> Implement ixgbe vPMD on ARM with NEON intrinsic.
> 
> v3:
>  - rebase to rel_16_07 branch on dpdk-next-net.
> 
> v2:
>  - move the common code to new header file.
> 
> Jianbo Liu (4):
>   ixgbe: rearrange vector PMD code for x86
>   ixgbe: implement vector PMD for arm architecture
>   ixgbe: enable ixgbe vector PMD on ARMv8a platform
>   maintainers: claim responsibility for ixgbe vector PMD on ARM
> 
Acked-by: Bruce Richardson <bruce.richardson@intel.com>

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [dpdk-dev] [PATCH v3 0/4] ixgbe: enable ixgbe vector PMD on ARM
  2016-05-24 16:10   ` [dpdk-dev] [PATCH v3 0/4] ixgbe: enable " Bruce Richardson
@ 2016-05-24 16:12     ` Bruce Richardson
  2016-05-27 10:44       ` Jianbo Liu
  0 siblings, 1 reply; 25+ messages in thread
From: Bruce Richardson @ 2016-05-24 16:12 UTC (permalink / raw)
  To: Jianbo Liu; +Cc: dev, jerin.jacob, helin.zhang, konstantin.ananyev

On Tue, May 24, 2016 at 05:10:01PM +0100, Bruce Richardson wrote:
> On Fri, May 06, 2016 at 11:55:44AM +0530, Jianbo Liu wrote:
> > Implement ixgbe vPMD on ARM with NEON intrinsic.
> > 
> > v3:
> >  - rebase to rel_16_07 branch on dpdk-next-net.
> > 
> > v2:
> >  - move the common code to new header file.
> > 
> > Jianbo Liu (4):
> >   ixgbe: rearrange vector PMD code for x86
> >   ixgbe: implement vector PMD for arm architecture
> >   ixgbe: enable ixgbe vector PMD on ARMv8a platform
> >   maintainers: claim responsibility for ixgbe vector PMD on ARM
> > 
> Acked-by: Bruce Richardson <bruce.richardson@intel.com>
> 
Applied to dpdk-next-net/rel_16_07

Jianbo, I've fixed some checkpatch issues in patch 2, and updated the NIC features
overview table as part of patch 3 when applying them. Please verify all is ok
with you on the 16.07 branch, since I don't have ARM platforms to check things
on.

Regards,
/Bruce

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [dpdk-dev] [PATCH v3 2/4] ixgbe: implement vector PMD for arm architecture
  2016-05-06  6:25   ` [dpdk-dev] [PATCH v3 2/4] ixgbe: implement vector PMD for arm architecture Jianbo Liu
  2016-05-10 14:49     ` Bruce Richardson
@ 2016-05-25 12:29     ` Jerin Jacob
  2016-05-25 12:53       ` Bruce Richardson
  2016-05-26  1:37       ` Jianbo Liu
  1 sibling, 2 replies; 25+ messages in thread
From: Jerin Jacob @ 2016-05-25 12:29 UTC (permalink / raw)
  To: Jianbo Liu; +Cc: dev, bruce.richardson, helin.zhang, konstantin.ananyev

On Fri, May 06, 2016 at 11:55:46AM +0530, Jianbo Liu wrote:
> use ARM NEON intrinsic to implement ixgbe vPMD
> 
> Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>
> ---
>  drivers/net/ixgbe/Makefile              |   4 +
>  drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c | 561 ++++++++++++++++++++++++++++++++
>  2 files changed, 565 insertions(+)
>  create mode 100644 drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
> 
> diff --git a/drivers/net/ixgbe/Makefile b/drivers/net/ixgbe/Makefile
> index 50bf51c..b1c7a60 100644
> --- a/drivers/net/ixgbe/Makefile
> +++ b/drivers/net/ixgbe/Makefile
> @@ -108,7 +108,11 @@ SRCS-$(CONFIG_RTE_LIBRTE_IXGBE_PMD) += ixgbe_rxtx.c
>  SRCS-$(CONFIG_RTE_LIBRTE_IXGBE_PMD) += ixgbe_ethdev.c
>  SRCS-$(CONFIG_RTE_LIBRTE_IXGBE_PMD) += ixgbe_fdir.c
>  SRCS-$(CONFIG_RTE_LIBRTE_IXGBE_PMD) += ixgbe_pf.c
> +ifeq ($(CONFIG_RTE_ARCH_ARM64),y)
> +SRCS-$(CONFIG_RTE_IXGBE_INC_VECTOR) += ixgbe_rxtx_vec_neon.c
> +else
>  SRCS-$(CONFIG_RTE_IXGBE_INC_VECTOR) += ixgbe_rxtx_vec.c
> +endif
>  
>  ifeq ($(CONFIG_RTE_NIC_BYPASS),y)
>  SRCS-$(CONFIG_RTE_LIBRTE_IXGBE_PMD) += ixgbe_bypass.c
> diff --git a/drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c b/drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
> new file mode 100644
> index 0000000..11a6115
> --- /dev/null
> +++ b/drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
> @@ -0,0 +1,561 @@
> +/*-
> + *   BSD LICENSE
> + *
> + *   Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
> + *   All rights reserved.
> + *
> + *   Redistribution and use in source and binary forms, with or without
> + *   modification, are permitted provided that the following conditions
> + *   are met:
> + *
> + *     * Redistributions of source code must retain the above copyright
> + *       notice, this list of conditions and the following disclaimer.
> + *     * Redistributions in binary form must reproduce the above copyright
> + *       notice, this list of conditions and the following disclaimer in
> + *       the documentation and/or other materials provided with the
> + *       distribution.
> + *     * Neither the name of Intel Corporation nor the names of its
> + *       contributors may be used to endorse or promote products derived
> + *       from this software without specific prior written permission.
> + *
> + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> + */
> +
> +#include <stdint.h>
> +#include <rte_ethdev.h>
> +#include <rte_malloc.h>
> +
> +#include "ixgbe_ethdev.h"
> +#include "ixgbe_rxtx.h"
> +#include "ixgbe_rxtx_vec_common.h"
> +
> +#include <arm_neon.h>
> +
> +#pragma GCC diagnostic ignored "-Wcast-qual"
> +
> +static inline void
> +ixgbe_rxq_rearm(struct ixgbe_rx_queue *rxq)
> +{
> +	int i;
> +	uint16_t rx_id;
> +	volatile union ixgbe_adv_rx_desc *rxdp;
> +	struct ixgbe_rx_entry *rxep = &rxq->sw_ring[rxq->rxrearm_start];
> +	struct rte_mbuf *mb0, *mb1;
> +	uint64x2_t dma_addr0, dma_addr1;
> +	uint64x2_t zero = vdupq_n_u64(0);
> +	uint64_t paddr;
> +	uint8x8_t p;
> +
> +	rxdp = rxq->rx_ring + rxq->rxrearm_start;
> +
> +	/* Pull 'n' more MBUFs into the software ring */
> +	if (unlikely(rte_mempool_get_bulk(rxq->mb_pool,
> +					  (void *)rxep,
> +					  RTE_IXGBE_RXQ_REARM_THRESH) < 0)) {
> +		if (rxq->rxrearm_nb + RTE_IXGBE_RXQ_REARM_THRESH >=
> +		    rxq->nb_rx_desc) {
> +			for (i = 0; i < RTE_IXGBE_DESCS_PER_LOOP; i++) {
> +				rxep[i].mbuf = &rxq->fake_mbuf;
> +				vst1q_u64((uint64_t *)&rxdp[i].read,
> +					  zero);
> +			}
> +		}
> +		rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=
> +			RTE_IXGBE_RXQ_REARM_THRESH;
> +		return;
> +	}
> +
> +	p = vld1_u8((uint8_t *)&rxq->mbuf_initializer);
> +
> +	/* Initialize the mbufs in vector, process 2 mbufs in one loop */
> +	for (i = 0; i < RTE_IXGBE_RXQ_REARM_THRESH; i += 2, rxep += 2) {
> +		mb0 = rxep[0].mbuf;
> +		mb1 = rxep[1].mbuf;
> +
> +		/*
> +		 * Flush mbuf with pkt template.
> +		 * Data to be rearmed is 6 bytes long.
> +		 * Though, RX will overwrite ol_flags that are coming next
> +		 * anyway. So overwrite whole 8 bytes with one load:
> +		 * 6 bytes of rearm_data plus first 2 bytes of ol_flags.
> +		 */
> +		vst1_u8((uint8_t *)&mb0->rearm_data, p);
> +		paddr = mb0->buf_physaddr + RTE_PKTMBUF_HEADROOM;
> +		dma_addr0 = vsetq_lane_u64(paddr, zero, 0);
> +		/* flush desc with pa dma_addr */
> +		vst1q_u64((uint64_t *)&rxdp++->read, dma_addr0);
> +
> +		vst1_u8((uint8_t *)&mb1->rearm_data, p);
> +		paddr = mb1->buf_physaddr + RTE_PKTMBUF_HEADROOM;
> +		dma_addr1 = vsetq_lane_u64(paddr, zero, 0);
> +		vst1q_u64((uint64_t *)&rxdp++->read, dma_addr1);
> +	}
> +
> +	rxq->rxrearm_start += RTE_IXGBE_RXQ_REARM_THRESH;
> +	if (rxq->rxrearm_start >= rxq->nb_rx_desc)
> +		rxq->rxrearm_start = 0;
> +
> +	rxq->rxrearm_nb -= RTE_IXGBE_RXQ_REARM_THRESH;
> +
> +	rx_id = (uint16_t) ((rxq->rxrearm_start == 0) ?
> +			     (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));
> +
> +	/* Update the tail pointer on the NIC */
> +	IXGBE_PCI_REG_WRITE(rxq->rdt_reg_addr, rx_id);
> +}
> +
> +/* Handling the offload flags (olflags) field takes computation
> + * time when receiving packets. Therefore we provide a flag to disable
> + * the processing of the olflags field when they are not needed. This
> + * gives improved performance, at the cost of losing the offload info
> + * in the received packet
> + */
> +#ifdef RTE_IXGBE_RX_OLFLAGS_ENABLE
> +
> +#define VTAG_SHIFT     (3)
> +
> +static inline void
> +desc_to_olflags_v(uint8x16x2_t sterr_tmp1, uint8x16x2_t sterr_tmp2,
> +		  uint8x16_t staterr, struct rte_mbuf **rx_pkts)
> +{
> +	uint8x16_t ptype;
> +	uint8x16_t vtag;
> +
> +	union {
> +		uint8_t e[4];
> +		uint32_t word;
> +	} vol;
> +
> +	const uint8x16_t pkttype_msk = {
> +			PKT_RX_VLAN_PKT, PKT_RX_VLAN_PKT,
> +			PKT_RX_VLAN_PKT, PKT_RX_VLAN_PKT,
> +			0x00, 0x00, 0x00, 0x00,
> +			0x00, 0x00, 0x00, 0x00,
> +			0x00, 0x00, 0x00, 0x00};
> +
> +	const uint8x16_t rsstype_msk = {
> +			0x0F, 0x0F, 0x0F, 0x0F,
> +			0x00, 0x00, 0x00, 0x00,
> +			0x00, 0x00, 0x00, 0x00,
> +			0x00, 0x00, 0x00, 0x00};
> +
> +	const uint8x16_t rss_flags = {
> +			0, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
> +			0, PKT_RX_RSS_HASH, 0, PKT_RX_RSS_HASH,
> +			PKT_RX_RSS_HASH, 0, 0, 0,
> +			0, 0, 0, PKT_RX_FDIR};
> +
> +	ptype = vzipq_u8(sterr_tmp1.val[0], sterr_tmp2.val[0]).val[0];
> +	ptype = vandq_u8(ptype, rsstype_msk);
> +	ptype = vqtbl1q_u8(rss_flags, ptype);
> +
> +	vtag = vshrq_n_u8(staterr, VTAG_SHIFT);
> +	vtag = vandq_u8(vtag, pkttype_msk);
> +	vtag = vorrq_u8(ptype, vtag);
> +
> +	vol.word = vgetq_lane_u32(vreinterpretq_u32_u8(vtag), 0);
> +
> +	rx_pkts[0]->ol_flags = vol.e[0];
> +	rx_pkts[1]->ol_flags = vol.e[1];
> +	rx_pkts[2]->ol_flags = vol.e[2];
> +	rx_pkts[3]->ol_flags = vol.e[3];
> +}
> +#else
> +#define desc_to_olflags_v(sterr_tmp1, sterr_tmp2, staterr, rx_pkts)
> +#endif
> +
> +/*
> + * vPMD raw receive routine, only accept(nb_pkts >= RTE_IXGBE_DESCS_PER_LOOP)
> + *
> + * Notice:
> + * - nb_pkts < RTE_IXGBE_DESCS_PER_LOOP, just return no packet
> + * - nb_pkts > RTE_IXGBE_MAX_RX_BURST, only scan RTE_IXGBE_MAX_RX_BURST
> + *   numbers of DD bit
> + * - floor align nb_pkts to a RTE_IXGBE_DESC_PER_LOOP power-of-two
> + * - don't support ol_flags for rss and csum err
> + */
> +
> +#define IXGBE_VPMD_DESC_DD_MASK		0x01010101
> +#define IXGBE_VPMD_DESC_EOP_MASK	0x02020202
> +
> +static inline uint16_t
> +_recv_raw_pkts_vec(struct ixgbe_rx_queue *rxq, struct rte_mbuf **rx_pkts,
> +		   uint16_t nb_pkts, uint8_t *split_packet)
> +{
> +	volatile union ixgbe_adv_rx_desc *rxdp;
> +	struct ixgbe_rx_entry *sw_ring;
> +	uint16_t nb_pkts_recd;
> +	int pos;
> +	uint64_t var;
> +	uint8x16_t shuf_msk = {
> +		0xFF, 0xFF,
> +		0xFF, 0xFF,  /* skip 32 bits pkt_type */
> +		12, 13,      /* octet 12~13, low 16 bits pkt_len */
> +		0xFF, 0xFF,  /* skip high 16 bits pkt_len, zero out */
> +		12, 13,      /* octet 12~13, 16 bits data_len */
> +		14, 15,      /* octet 14~15, low 16 bits vlan_macip */
> +		4, 5, 6, 7  /* octet 4~7, 32bits rss */
> +		};
> +	uint16x8_t crc_adjust = {0, 0, rxq->crc_len, 0,
> +				 rxq->crc_len, 0, 0, 0};
> +
> +	/* nb_pkts shall be less equal than RTE_IXGBE_MAX_RX_BURST */
> +	nb_pkts = RTE_MIN(nb_pkts, RTE_IXGBE_MAX_RX_BURST);
> +
> +	/* nb_pkts has to be floor-aligned to RTE_IXGBE_DESCS_PER_LOOP */
> +	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_IXGBE_DESCS_PER_LOOP);
> +
> +	/* Just the act of getting into the function from the application is
> +	 * going to cost about 7 cycles
> +	 */
> +	rxdp = rxq->rx_ring + rxq->rx_tail;
> +
> +	rte_prefetch_non_temporal(rxdp);
> +
> +	/* See if we need to rearm the RX queue - gives the prefetch a bit
> +	 * of time to act
> +	 */
> +	if (rxq->rxrearm_nb > RTE_IXGBE_RXQ_REARM_THRESH)
> +		ixgbe_rxq_rearm(rxq);
> +
> +	/* Before we start moving massive data around, check to see if
> +	 * there is actually a packet available
> +	 */
> +	if (!(rxdp->wb.upper.status_error &
> +				rte_cpu_to_le_32(IXGBE_RXDADV_STAT_DD)))
> +		return 0;
> +
> +	/* Cache is empty -> need to scan the buffer rings, but first move
> +	 * the next 'n' mbufs into the cache
> +	 */
> +	sw_ring = &rxq->sw_ring[rxq->rx_tail];
> +
> +	/* A. load 4 packet in one loop
> +	 * B. copy 4 mbuf point from swring to rx_pkts
> +	 * C. calc the number of DD bits among the 4 packets
> +	 * [C*. extract the end-of-packet bit, if requested]
> +	 * D. fill info. from desc to mbuf
> +	 */
> +	for (pos = 0, nb_pkts_recd = 0; pos < nb_pkts;
> +			pos += RTE_IXGBE_DESCS_PER_LOOP,
> +			rxdp += RTE_IXGBE_DESCS_PER_LOOP) {
> +		uint64x2_t descs[RTE_IXGBE_DESCS_PER_LOOP];
> +		uint8x16_t pkt_mb1, pkt_mb2, pkt_mb3, pkt_mb4;
> +		uint8x16x2_t sterr_tmp1, sterr_tmp2;
> +		uint64x2_t mbp1, mbp2;
> +		uint8x16_t staterr;
> +		uint16x8_t tmp;
> +		uint32_t stat;
> +
> +		/* B.1 load 1 mbuf point */
> +		mbp1 = vld1q_u64((uint64_t *)&sw_ring[pos]);
> +
> +		/* Read desc statuses backwards to avoid race condition */
> +		/* A.1 load 4 pkts desc */
> +		descs[3] =  vld1q_u64((uint64_t *)(rxdp + 3));
> +		rte_rmb();

Any specific reason to add rte_rmb() here? If there is no performance
drop then it makes sense to add it before descs[3] is used, i.e.
at the rte_compiler_barrier() place in the x86 code.

> +
> +		/* B.2 copy 2 mbuf point into rx_pkts  */
> +		vst1q_u64((uint64_t *)&rx_pkts[pos], mbp1);
> +
> +		/* B.1 load 1 mbuf point */
> +		mbp2 = vld1q_u64((uint64_t *)&sw_ring[pos + 2]);
> +
> +		descs[2] =  vld1q_u64((uint64_t *)(rxdp + 2));
> +		/* B.1 load 2 mbuf point */
> +		descs[1] =  vld1q_u64((uint64_t *)(rxdp + 1));
> +		descs[0] =  vld1q_u64((uint64_t *)(rxdp));
> +
> +		/* B.2 copy 2 mbuf point into rx_pkts  */
> +		vst1q_u64((uint64_t *)&rx_pkts[pos + 2], mbp2);
> +
> +		if (split_packet) {
> +			rte_prefetch_non_temporal(&rx_pkts[pos]->cacheline1);
> +			rte_prefetch_non_temporal(&rx_pkts[pos+1]->cacheline1);
> +			rte_prefetch_non_temporal(&rx_pkts[pos+2]->cacheline1);
> +			rte_prefetch_non_temporal(&rx_pkts[pos+3]->cacheline1);

replace with rte_mbuf_prefetch_part2 or equivalent

> +		}
> +
> +		/* D.1 pkt 3,4 convert format from desc to pktmbuf */
> +		pkt_mb4 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[3]), shuf_msk);
> +		pkt_mb3 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[2]), shuf_msk);
> +
> +		/* D.1 pkt 1,2 convert format from desc to pktmbuf */
> +		pkt_mb2 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[1]), shuf_msk);
> +		pkt_mb1 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[0]), shuf_msk);
> +
> +		/* C.1 4=>2 filter staterr info only */
> +		sterr_tmp2 = vzipq_u8(vreinterpretq_u8_u64(descs[1]),
> +				      vreinterpretq_u8_u64(descs[3]));
> +		/* C.1 4=>2 filter staterr info only */
> +		sterr_tmp1 = vzipq_u8(vreinterpretq_u8_u64(descs[0]),
> +				      vreinterpretq_u8_u64(descs[2]));
> +
> +		/* C.2 get 4 pkts staterr value  */
> +		staterr = vzipq_u8(sterr_tmp1.val[1], sterr_tmp2.val[1]).val[0];
> +		stat = vgetq_lane_u32(vreinterpretq_u32_u8(staterr), 0);
> +
> +		/* set ol_flags with vlan packet type */
> +		desc_to_olflags_v(sterr_tmp1, sterr_tmp2, staterr,
> +				  &rx_pkts[pos]);
> +
> +		/* D.2 pkt 3,4 set in_port/nb_seg and remove crc */
> +		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb4), crc_adjust);
> +		pkt_mb4 = vreinterpretq_u8_u16(tmp);
> +		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb3), crc_adjust);
> +		pkt_mb3 = vreinterpretq_u8_u16(tmp);
> +
> +		/* D.3 copy final 3,4 data to rx_pkts */
> +		vst1q_u8((void *)&rx_pkts[pos+3]->rx_descriptor_fields1,
> +			 pkt_mb4);
> +		vst1q_u8((void *)&rx_pkts[pos+2]->rx_descriptor_fields1,
> +			 pkt_mb3);
> +
> +		/* D.2 pkt 1,2 set in_port/nb_seg and remove crc */
> +		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb2), crc_adjust);
> +		pkt_mb2 = vreinterpretq_u8_u16(tmp);
> +		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb1), crc_adjust);
> +		pkt_mb1 = vreinterpretq_u8_u16(tmp);
> +
> +		/* C* extract and record EOP bit */
> +		if (split_packet) {
> +			/* and with mask to extract bits, flipping 1-0 */
> +			*(int *)split_packet = ~stat & IXGBE_VPMD_DESC_EOP_MASK;
> +
> +			split_packet += RTE_IXGBE_DESCS_PER_LOOP;
> +
> +			/* zero-out next pointers */
> +			rx_pkts[pos]->next = NULL;
> +			rx_pkts[pos + 1]->next = NULL;
> +			rx_pkts[pos + 2]->next = NULL;
> +			rx_pkts[pos + 3]->next = NULL;
> +		}
> +
> +		rte_prefetch_non_temporal(rxdp + RTE_IXGBE_DESCS_PER_LOOP);
> +
> +		/* D.3 copy final 1,2 data to rx_pkts */
> +		vst1q_u8((uint8_t *)&rx_pkts[pos+1]->rx_descriptor_fields1,
> +			 pkt_mb2);
> +		vst1q_u8((uint8_t *)&rx_pkts[pos]->rx_descriptor_fields1,
> +			 pkt_mb1);
> +
> +		/* C.4 calc avaialbe number of desc */
> +		var =  __builtin_popcount(stat & IXGBE_VPMD_DESC_DD_MASK);
> +		nb_pkts_recd += var;
> +		if (likely(var != RTE_IXGBE_DESCS_PER_LOOP))
> +			break;
> +	}
> +
> +	/* Update our internal tail pointer */
> +	rxq->rx_tail = (uint16_t)(rxq->rx_tail + nb_pkts_recd);
> +	rxq->rx_tail = (uint16_t)(rxq->rx_tail & (rxq->nb_rx_desc - 1));
> +	rxq->rxrearm_nb = (uint16_t)(rxq->rxrearm_nb + nb_pkts_recd);
> +
> +	return nb_pkts_recd;
> +}
> +
> +/*
> + * vPMD receive routine, only accept(nb_pkts >= RTE_IXGBE_DESCS_PER_LOOP)
> + *
> + * Notice:
> + * - nb_pkts < RTE_IXGBE_DESCS_PER_LOOP, just return no packet
> + * - nb_pkts > RTE_IXGBE_MAX_RX_BURST, only scan RTE_IXGBE_MAX_RX_BURST
> + *   numbers of DD bit
> + * - floor align nb_pkts to a RTE_IXGBE_DESC_PER_LOOP power-of-two
> + * - don't support ol_flags for rss and csum err
> + */
> +uint16_t
> +ixgbe_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
> +		uint16_t nb_pkts)
> +{
> +	return _recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);
> +}
> +
> +/*
> + * vPMD receive routine that reassembles scattered packets
> + *
> + * Notice:
> + * - don't support ol_flags for rss and csum err
> + * - nb_pkts < RTE_IXGBE_DESCS_PER_LOOP, just return no packet
> + * - nb_pkts > RTE_IXGBE_MAX_RX_BURST, only scan RTE_IXGBE_MAX_RX_BURST
> + *   numbers of DD bit
> + * - floor align nb_pkts to a RTE_IXGBE_DESC_PER_LOOP power-of-two
> + */
> +uint16_t
> +ixgbe_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
> +		uint16_t nb_pkts)
> +{
> +	struct ixgbe_rx_queue *rxq = rx_queue;
> +	uint8_t split_flags[RTE_IXGBE_MAX_RX_BURST] = {0};
> +
> +	/* get some new buffers */
> +	uint16_t nb_bufs = _recv_raw_pkts_vec(rxq, rx_pkts, nb_pkts,
> +			split_flags);
> +	if (nb_bufs == 0)
> +		return 0;
> +
> +	/* happy day case, full burst + no packets to be joined */
> +	const uint64_t *split_fl64 = (uint64_t *)split_flags;
> +	if (rxq->pkt_first_seg == NULL &&
> +			split_fl64[0] == 0 && split_fl64[1] == 0 &&
> +			split_fl64[2] == 0 && split_fl64[3] == 0)
> +		return nb_bufs;
> +
> +	/* reassemble any packets that need reassembly*/
> +	unsigned i = 0;
> +	if (rxq->pkt_first_seg == NULL) {
> +		/* find the first split flag, and only reassemble then*/
> +		while (i < nb_bufs && !split_flags[i])
> +			i++;
> +		if (i == nb_bufs)
> +			return nb_bufs;
> +	}
> +	return i + reassemble_packets(rxq, &rx_pkts[i], nb_bufs - i,
> +		&split_flags[i]);
> +}
> +
> +static inline void
> +vtx1(volatile union ixgbe_adv_tx_desc *txdp,
> +		struct rte_mbuf *pkt, uint64_t flags)
> +{
> +	uint64x2_t descriptor = {
> +			pkt->buf_physaddr + pkt->data_off,
> +			(uint64_t)pkt->pkt_len << 46 | flags | pkt->data_len};
> +
> +	vst1q_u64((uint64_t *)&txdp->read, descriptor);
> +}
> +
> +static inline void
> +vtx(volatile union ixgbe_adv_tx_desc *txdp,
> +		struct rte_mbuf **pkt, uint16_t nb_pkts,  uint64_t flags)
> +{
> +	int i;
> +
> +	for (i = 0; i < nb_pkts; ++i, ++txdp, ++pkt)
> +		vtx1(txdp, *pkt, flags);
> +}
> +
> +uint16_t
> +ixgbe_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
> +		       uint16_t nb_pkts)
> +{
> +	struct ixgbe_tx_queue *txq = (struct ixgbe_tx_queue *)tx_queue;
> +	volatile union ixgbe_adv_tx_desc *txdp;
> +	struct ixgbe_tx_entry_v *txep;
> +	uint16_t n, nb_commit, tx_id;
> +	uint64_t flags = DCMD_DTYP_FLAGS;
> +	uint64_t rs = IXGBE_ADVTXD_DCMD_RS|DCMD_DTYP_FLAGS;
> +	int i;
> +
> +	/* cross rx_thresh boundary is not allowed */
> +	nb_pkts = RTE_MIN(nb_pkts, txq->tx_rs_thresh);
> +
> +	if (txq->nb_tx_free < txq->tx_free_thresh)
> +		ixgbe_tx_free_bufs(txq);
> +
> +	nb_commit = nb_pkts = (uint16_t)RTE_MIN(txq->nb_tx_free, nb_pkts);
> +	if (unlikely(nb_pkts == 0))
> +		return 0;
> +
> +	tx_id = txq->tx_tail;
> +	txdp = &txq->tx_ring[tx_id];
> +	txep = &txq->sw_ring_v[tx_id];
> +
> +	txq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_pkts);
> +
> +	n = (uint16_t)(txq->nb_tx_desc - tx_id);
> +	if (nb_commit >= n) {
> +
> +		tx_backlog_entry(txep, tx_pkts, n);
> +
> +		for (i = 0; i < n - 1; ++i, ++tx_pkts, ++txdp)
> +			vtx1(txdp, *tx_pkts, flags);
> +
> +		vtx1(txdp, *tx_pkts++, rs);
> +
> +		nb_commit = (uint16_t)(nb_commit - n);
> +
> +		tx_id = 0;
> +		txq->tx_next_rs = (uint16_t)(txq->tx_rs_thresh - 1);
> +
> +		/* avoid reach the end of ring */
> +		txdp = &(txq->tx_ring[tx_id]);
> +		txep = &txq->sw_ring_v[tx_id];
> +	}
> +
> +	tx_backlog_entry(txep, tx_pkts, nb_commit);
> +
> +	vtx(txdp, tx_pkts, nb_commit, flags);
> +
> +	tx_id = (uint16_t)(tx_id + nb_commit);
> +	if (tx_id > txq->tx_next_rs) {
> +		txq->tx_ring[txq->tx_next_rs].read.cmd_type_len |=
> +			rte_cpu_to_le_32(IXGBE_ADVTXD_DCMD_RS);
> +		txq->tx_next_rs = (uint16_t)(txq->tx_next_rs +
> +			txq->tx_rs_thresh);
> +	}
> +
> +	txq->tx_tail = tx_id;
> +
> +	IXGBE_PCI_REG_WRITE(txq->tdt_reg_addr, txq->tx_tail);
> +
> +	return nb_pkts;
> +}
> +
> +static void __attribute__((cold))
> +ixgbe_tx_queue_release_mbufs_vec(struct ixgbe_tx_queue *txq)
> +{
> +	_ixgbe_tx_queue_release_mbufs_vec(txq);
> +}
> +
> +void __attribute__((cold))
> +ixgbe_rx_queue_release_mbufs_vec(struct ixgbe_rx_queue *rxq)
> +{
> +	_ixgbe_rx_queue_release_mbufs_vec(rxq);
> +}
> +
> +static void __attribute__((cold))
> +ixgbe_tx_free_swring(struct ixgbe_tx_queue *txq)
> +{
> +	_ixgbe_tx_free_swring_vec(txq);
> +}
> +
> +static void __attribute__((cold))
> +ixgbe_reset_tx_queue(struct ixgbe_tx_queue *txq)
> +{
> +	_ixgbe_reset_tx_queue_vec(txq);
> +}
> +
> +static const struct ixgbe_txq_ops vec_txq_ops = {
> +	.release_mbufs = ixgbe_tx_queue_release_mbufs_vec,
> +	.free_swring = ixgbe_tx_free_swring,
> +	.reset = ixgbe_reset_tx_queue,
> +};
> +
> +int __attribute__((cold))
> +ixgbe_rxq_vec_setup(struct ixgbe_rx_queue *rxq)
> +{
> +	return ixgbe_rxq_vec_setup_default(rxq);
> +}
> +
> +int __attribute__((cold))
> +ixgbe_txq_vec_setup(struct ixgbe_tx_queue *txq)
> +{
> +	return ixgbe_txq_vec_setup_default(txq, &vec_txq_ops);
> +}
> +
> +int __attribute__((cold))
> +ixgbe_rx_vec_dev_conf_condition_check(struct rte_eth_dev *dev)
> +{
> +	return ixgbe_rx_vec_dev_conf_condition_check_default(dev);
> +}
> -- 
> 2.4.11
> 

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [dpdk-dev] [PATCH v3 2/4] ixgbe: implement vector PMD for arm architecture
  2016-05-25 12:29     ` Jerin Jacob
@ 2016-05-25 12:53       ` Bruce Richardson
  2016-05-26  1:37       ` Jianbo Liu
  1 sibling, 0 replies; 25+ messages in thread
From: Bruce Richardson @ 2016-05-25 12:53 UTC (permalink / raw)
  To: Jerin Jacob; +Cc: Jianbo Liu, dev, helin.zhang, konstantin.ananyev

On Wed, May 25, 2016 at 05:59:38PM +0530, Jerin Jacob wrote:
> On Fri, May 06, 2016 at 11:55:46AM +0530, Jianbo Liu wrote:
> > use ARM NEON intrinsic to implement ixgbe vPMD
> > 
> > Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>
> > ---
> >  drivers/net/ixgbe/Makefile              |   4 +
> >  drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c | 561 ++++++++++++++++++++++++++++++++
> >  2 files changed, 565 insertions(+)
> >  create mode 100644 drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
> > 
<snip>
> > +	for (pos = 0, nb_pkts_recd = 0; pos < nb_pkts;
> > +			pos += RTE_IXGBE_DESCS_PER_LOOP,
> > +			rxdp += RTE_IXGBE_DESCS_PER_LOOP) {
> > +		uint64x2_t descs[RTE_IXGBE_DESCS_PER_LOOP];
> > +		uint8x16_t pkt_mb1, pkt_mb2, pkt_mb3, pkt_mb4;
> > +		uint8x16x2_t sterr_tmp1, sterr_tmp2;
> > +		uint64x2_t mbp1, mbp2;
> > +		uint8x16_t staterr;
> > +		uint16x8_t tmp;
> > +		uint32_t stat;
> > +
> > +		/* B.1 load 1 mbuf point */
> > +		mbp1 = vld1q_u64((uint64_t *)&sw_ring[pos]);
> > +
> > +		/* Read desc statuses backwards to avoid race condition */
> > +		/* A.1 load 4 pkts desc */
> > +		descs[3] =  vld1q_u64((uint64_t *)(rxdp + 3));
> > +		rte_rmb();
> 
> Any specific reason to add rte_rmb() here, If there is no performance
> drop then it makes sense to add before descs[3] uses it.i.e
> at rte_compiler_barrier() place in x86 code.
> 
> > +
> > +		/* B.2 copy 2 mbuf point into rx_pkts  */
> > +		vst1q_u64((uint64_t *)&rx_pkts[pos], mbp1);
> > +
> > +		/* B.1 load 1 mbuf point */
> > +		mbp2 = vld1q_u64((uint64_t *)&sw_ring[pos + 2]);
> > +
> > +		descs[2] =  vld1q_u64((uint64_t *)(rxdp + 2));
> > +		/* B.1 load 2 mbuf point */
> > +		descs[1] =  vld1q_u64((uint64_t *)(rxdp + 1));
> > +		descs[0] =  vld1q_u64((uint64_t *)(rxdp));
> > +
> > +		/* B.2 copy 2 mbuf point into rx_pkts  */
> > +		vst1q_u64((uint64_t *)&rx_pkts[pos + 2], mbp2);
> > +
> > +		if (split_packet) {
> > +			rte_prefetch_non_temporal(&rx_pkts[pos]->cacheline1);
> > +			rte_prefetch_non_temporal(&rx_pkts[pos+1]->cacheline1);
> > +			rte_prefetch_non_temporal(&rx_pkts[pos+2]->cacheline1);
> > +			rte_prefetch_non_temporal(&rx_pkts[pos+3]->cacheline1);
> 
> replace with rte_mbuf_prefetch_part2 or equivalent
> 
Hi Jerin, Jianbo,

since this patch has already been applied and these are not critical issues with
it, can a new patch please be submitted to propose these additional changes on
top of what's on next-net now.

Thanks,
/Bruce

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [dpdk-dev] [PATCH v3 2/4] ixgbe: implement vector PMD for arm architecture
  2016-05-25 12:29     ` Jerin Jacob
  2016-05-25 12:53       ` Bruce Richardson
@ 2016-05-26  1:37       ` Jianbo Liu
  1 sibling, 0 replies; 25+ messages in thread
From: Jianbo Liu @ 2016-05-26  1:37 UTC (permalink / raw)
  To: Jerin Jacob; +Cc: dev, Bruce Richardson, Zhang, Helin, Ananyev, Konstantin

On 25 May 2016 at 20:29, Jerin Jacob <jerin.jacob@caviumnetworks.com> wrote:
> On Fri, May 06, 2016 at 11:55:46AM +0530, Jianbo Liu wrote:
>> use ARM NEON intrinsic to implement ixgbe vPMD
>>
>> Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>
>> ---
>>  drivers/net/ixgbe/Makefile              |   4 +
>>  drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c | 561 ++++++++++++++++++++++++++++++++
>>  2 files changed, 565 insertions(+)
>>  create mode 100644 drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c

>> +             /* Read desc statuses backwards to avoid race condition */
>> +             /* A.1 load 4 pkts desc */
>> +             descs[3] =  vld1q_u64((uint64_t *)(rxdp + 3));
>> +             rte_rmb();
>
> Any specific reason to add rte_rmb() here, If there is no performance
> drop then it makes sense to add before descs[3] uses it.i.e
> at rte_compiler_barrier() place in x86 code.
>
To avoid the desc statuses becoming inconsistent, since they are read backwards.
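
For example, a minimal sketch of the ordering we rely on (a hypothetical
helper, with the DD bit assumed to be 0x1 just for illustration):

	#include <stdint.h>
	#include <rte_atomic.h>	/* rte_rmb() */

	/* read the newest descriptor status first, then the older ones */
	static inline int all_four_done(volatile const uint32_t *status)
	{
		uint32_t newest = status[3];
		rte_rmb();	/* the older loads below must not be done first */
		uint32_t older = status[0] & status[1] & status[2];
		return (newest & older & 0x1) != 0;
	}

Without the barrier, the loads of status[0..2] could be reordered before the
load of status[3], and the four DD bits would not be a consistent snapshot.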

>> +
>> +             /* B.2 copy 2 mbuf point into rx_pkts  */
>> +             vst1q_u64((uint64_t *)&rx_pkts[pos], mbp1);
>> +
>> +             /* B.1 load 1 mbuf point */
>> +             mbp2 = vld1q_u64((uint64_t *)&sw_ring[pos + 2]);
>> +
>> +             descs[2] =  vld1q_u64((uint64_t *)(rxdp + 2));
>> +             /* B.1 load 2 mbuf point */
>> +             descs[1] =  vld1q_u64((uint64_t *)(rxdp + 1));
>> +             descs[0] =  vld1q_u64((uint64_t *)(rxdp));
>> +
>> +             /* B.2 copy 2 mbuf point into rx_pkts  */
>> +             vst1q_u64((uint64_t *)&rx_pkts[pos + 2], mbp2);
>> +
>> +             if (split_packet) {
>> +                     rte_prefetch_non_temporal(&rx_pkts[pos]->cacheline1);
>> +                     rte_prefetch_non_temporal(&rx_pkts[pos+1]->cacheline1);
>> +                     rte_prefetch_non_temporal(&rx_pkts[pos+2]->cacheline1);
>> +                     rte_prefetch_non_temporal(&rx_pkts[pos+3]->cacheline1);
>
> replace with rte_mbuf_prefetch_part2 or equivalent
>
rte_mbuf_prefetch_part2 is a new function introduced after this
patchset, so it's better to submit a new patch, as Bruce said.
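
For reference, that follow-up change would look roughly like this (a
sketch against the quoted hunk, not applied code; it needs rte_mbuf.h,
and note that rte_mbuf_prefetch_part2() issues a regular rte_prefetch0
rather than the non-temporal prefetch used here, hence "or equivalent"):

	if (split_packet) {
		/* prefetch the second cache line of each mbuf, replacing
		 * the open-coded &rx_pkts[pos]->cacheline1 prefetches */
		rte_mbuf_prefetch_part2(rx_pkts[pos]);
		rte_mbuf_prefetch_part2(rx_pkts[pos + 1]);
		rte_mbuf_prefetch_part2(rx_pkts[pos + 2]);
		rte_mbuf_prefetch_part2(rx_pkts[pos + 3]);
	}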

* Re: [dpdk-dev] [PATCH v3 0/4] ixgbe: enable ixgbe vector PMD on ARM
  2016-05-24 16:12     ` Bruce Richardson
@ 2016-05-27 10:44       ` Jianbo Liu
  0 siblings, 0 replies; 25+ messages in thread
From: Jianbo Liu @ 2016-05-27 10:44 UTC (permalink / raw)
  To: Bruce Richardson; +Cc: dev, Jerin Jacob, Zhang, Helin, Ananyev, Konstantin

On 25 May 2016 at 00:12, Bruce Richardson <bruce.richardson@intel.com> wrote:
> On Tue, May 24, 2016 at 05:10:01PM +0100, Bruce Richardson wrote:
>> On Fri, May 06, 2016 at 11:55:44AM +0530, Jianbo Liu wrote:
>> > Implement ixgbe vPMD on ARM with NEON intrinsic.
>> >
>> > v3:
>> >  - rebase to rel_16_07 branch on dpdk-next-net.
>> >
>> > v2:
>> >  - move the common code to new header file.
>> >
>> > Jianbo Liu (4):
>> >   ixgbe: rearrange vector PMD code for x86
>> >   ixgbe: implement vector PMD for arm architecture
>> >   ixgbe: enable ixgbe vector PMD on ARMv8a platform
>> >   maintainers: claim responsibility for ixgbe vector PMD on ARM
>> >
>> Acked-by: Bruce Richardson <bruce.richardson@intel.com>
>>
> Applied to dpdk-next-net/rel_16_07
>
> Jianbo, I've fixed some checkpatch issues in patch 2, and updated the NIC features
> overview table as part of patch 3 when applying them. Please verify all is ok
> with you on the 16.07 branch, since I don't have ARM platforms to check things
> on.
>
Thanks, Bruce.
No need to change that list. I have verified the ixgbe VF PMD and vPMD
on an ARMv8a platform.

Jianbo

end of thread

Thread overview: 25+ messages
2016-04-20 13:44 [dpdk-dev] [PATCH 1/4] ixgbe: rearrange vector PMD code for x86 Jianbo Liu
2016-04-20 13:45 ` [dpdk-dev] [PATCH 2/4] ixgbe: implement vector PMD for arm architecture Jianbo Liu
2016-04-20 13:45 ` [dpdk-dev] [PATCH 3/4] ixgbe: enable ixgbe vector PMD on ARMv8a platform Jianbo Liu
2016-04-20 13:45 ` [dpdk-dev] [PATCH 4/4] maintainers: claim responsibility for ixgbe vector PMD on ARM Jianbo Liu
2016-04-25 16:35 ` [dpdk-dev] [PATCH 1/4] ixgbe: rearrange vector PMD code for x86 Bruce Richardson
2016-04-26  8:23   ` Jianbo Liu
2016-04-26 13:50 ` [dpdk-dev] [PATCH v2 " Jianbo Liu
2016-05-03  5:51   ` Jianbo Liu
2016-05-03 16:29   ` Bruce Richardson
2016-04-26 13:55 ` [dpdk-dev] [PATCH v2 2/4] ixgbe: implement vector PMD for arm architecture Jianbo Liu
2016-04-26 13:55 ` [dpdk-dev] [PATCH v2 3/4] ixgbe: enable ixgbe vector PMD on ARMv8a platform Jianbo Liu
2016-04-26 13:55 ` [dpdk-dev] [PATCH v2 4/4] maintainers: claim responsibility for ixgbe vector PMD on ARM Jianbo Liu
2016-05-06  6:25 ` [dpdk-dev] [PATCH v3 0/4] ixgbe: enable " Jianbo Liu
2016-05-06  6:25   ` [dpdk-dev] [PATCH v3 1/4] ixgbe: rearrange vector PMD code for x86 Jianbo Liu
2016-05-06  6:25   ` [dpdk-dev] [PATCH v3 2/4] ixgbe: implement vector PMD for arm architecture Jianbo Liu
2016-05-10 14:49     ` Bruce Richardson
2016-05-11  2:40       ` Jianbo Liu
2016-05-25 12:29     ` Jerin Jacob
2016-05-25 12:53       ` Bruce Richardson
2016-05-26  1:37       ` Jianbo Liu
2016-05-06  6:25   ` [dpdk-dev] [PATCH v3 3/4] ixgbe: enable ixgbe vector PMD on ARMv8a platform Jianbo Liu
2016-05-06  6:25   ` [dpdk-dev] [PATCH v3 4/4] maintainers: claim responsibility for ixgbe vector PMD on ARM Jianbo Liu
2016-05-24 16:10   ` [dpdk-dev] [PATCH v3 0/4] ixgbe: enable " Bruce Richardson
2016-05-24 16:12     ` Bruce Richardson
2016-05-27 10:44       ` Jianbo Liu
