DPDK patches and discussions
 help / color / mirror / Atom feed
* [dpdk-dev] [PATCH 0/3] net/i40e: add FDIR ID to vector rx
@ 2019-10-07  9:07 Harry van Haaren
  2019-10-07  9:07 ` [dpdk-dev] [PATCH 1/3] net/i40e: cache fdir enable value in rx queue Harry van Haaren
                   ` (3 more replies)
  0 siblings, 4 replies; 11+ messages in thread
From: Harry van Haaren @ 2019-10-07  9:07 UTC (permalink / raw)
  To: dev; +Cc: qi.z.zhang, mesut.a.ergin, Harry van Haaren

Hey All,

This series adds FDIR ID support to the vector Rx routines.
Support for both the SSE and AVX drivers is added, and both
the 16 byte and 32 byte descriptors are implemented.

A v1/RFC for this functionality was sent in 19.08 timeframe,
refer to the patch as archived on patchwork here:
http://patches.dpdk.org/patch/53969/

Regards, -Harry

Harry van Haaren (3):
  net/i40e: cache fdir enable value in rx queue
  net/i40e: add flow mark capability to SSE vector routine
  net/i40e: add flow director support to avx rx path

 drivers/net/i40e/i40e_ethdev.h        |   1 +
 drivers/net/i40e/i40e_fdir.c          |  20 +++
 drivers/net/i40e/i40e_flow.c          |   4 +
 drivers/net/i40e/i40e_rxtx.c          |   6 -
 drivers/net/i40e/i40e_rxtx.h          |   8 ++
 drivers/net/i40e/i40e_rxtx_vec_avx2.c | 188 +++++++++++++++++++++++++-
 drivers/net/i40e/i40e_rxtx_vec_sse.c  | 126 ++++++++++++++++-
 7 files changed, 338 insertions(+), 15 deletions(-)

-- 
2.17.1


^ permalink raw reply	[flat|nested] 11+ messages in thread

* [dpdk-dev] [PATCH 1/3] net/i40e: cache fdir enable value in rx queue
  2019-10-07  9:07 [dpdk-dev] [PATCH 0/3] net/i40e: add FDIR ID to vector rx Harry van Haaren
@ 2019-10-07  9:07 ` Harry van Haaren
  2019-10-07  9:07 ` [dpdk-dev] [PATCH 2/3] net/i40e: add flow mark capability to SSE vector routine Harry van Haaren
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 11+ messages in thread
From: Harry van Haaren @ 2019-10-07  9:07 UTC (permalink / raw)
  To: dev; +Cc: qi.z.zhang, mesut.a.ergin, Harry van Haaren

This commit adds a fdir_enable flag in a uint8_t sized hole
the rx queue structure The flag enables the rx code path to
easily identify if fdir is active. This can be used to skip
fdir id processing when it is not required.

The flag is zero by default (as rxq is zmalloc-ed at startup),
and the flag is set to 1 on configuration of a flow director rule.

Signed-off-by: Harry van Haaren <harry.van.haaren@intel.com>
---
 drivers/net/i40e/i40e_ethdev.h |  1 +
 drivers/net/i40e/i40e_fdir.c   | 20 ++++++++++++++++++++
 drivers/net/i40e/i40e_flow.c   |  4 ++++
 drivers/net/i40e/i40e_rxtx.h   |  1 +
 4 files changed, 26 insertions(+)

diff --git a/drivers/net/i40e/i40e_ethdev.h b/drivers/net/i40e/i40e_ethdev.h
index 38ac3ead6..743bead25 100644
--- a/drivers/net/i40e/i40e_ethdev.h
+++ b/drivers/net/i40e/i40e_ethdev.h
@@ -1153,6 +1153,7 @@ const struct rte_memzone *i40e_memzone_reserve(const char *name,
 					uint32_t len,
 					int socket_id);
 int i40e_fdir_configure(struct rte_eth_dev *dev);
+void i40e_fdir_rx_proc_enable(struct rte_eth_dev *dev, bool on);
 void i40e_fdir_teardown(struct i40e_pf *pf);
 enum i40e_filter_pctype
 	i40e_flowtype_to_pctype(const struct i40e_adapter *adapter,
diff --git a/drivers/net/i40e/i40e_fdir.c b/drivers/net/i40e/i40e_fdir.c
index b3e893a5b..dee007daa 100644
--- a/drivers/net/i40e/i40e_fdir.c
+++ b/drivers/net/i40e/i40e_fdir.c
@@ -608,6 +608,23 @@ i40e_set_flex_mask_on_pctype(struct i40e_pf *pf,
 	}
 }
 
+/*
+ * Enable/disable flow director RX processing in vector routines.
+ */
+void
+i40e_fdir_rx_proc_enable(struct rte_eth_dev *dev, bool on)
+{
+	int32_t i;
+
+	for (i = 0; i < dev->data->nb_rx_queues; i++) {
+		struct i40e_rx_queue *rxq = dev->data->rx_queues[i];
+		if (!rxq)
+			continue;
+		rxq->fdir_enabled = on;
+	}
+	PMD_DRV_LOG(DEBUG, "Flow Director processing on RX set to %d", on);
+}
+
 /*
  * Configure flow director related setting
  */
@@ -675,6 +692,9 @@ i40e_fdir_configure(struct rte_eth_dev *dev)
 		PMD_DRV_LOG(ERR, "Not support flexible payload.");
 	}
 
+	/* Enable FDIR processing in RX routines */
+	i40e_fdir_rx_proc_enable(dev, 1);
+
 	return ret;
 }
 
diff --git a/drivers/net/i40e/i40e_flow.c b/drivers/net/i40e/i40e_flow.c
index e902a35d7..9e038fa48 100644
--- a/drivers/net/i40e/i40e_flow.c
+++ b/drivers/net/i40e/i40e_flow.c
@@ -4775,6 +4775,7 @@ i40e_flow_destroy(struct rte_eth_dev *dev,
 			i40e_fdir_teardown(pf);
 			dev->data->dev_conf.fdir_conf.mode =
 				   RTE_FDIR_MODE_NONE;
+			i40e_fdir_rx_proc_enable(dev, 0);
 		}
 		break;
 	case RTE_ETH_FILTER_HASH:
@@ -4931,6 +4932,9 @@ i40e_flow_flush(struct rte_eth_dev *dev, struct rte_flow_error *error)
 		return -rte_errno;
 	}
 
+	/* Disable FDIR processing as all FDIR rules are now flushed */
+	i40e_fdir_rx_proc_enable(dev, 0);
+
 	return ret;
 }
 
diff --git a/drivers/net/i40e/i40e_rxtx.h b/drivers/net/i40e/i40e_rxtx.h
index 3fc619af9..1028e8b68 100644
--- a/drivers/net/i40e/i40e_rxtx.h
+++ b/drivers/net/i40e/i40e_rxtx.h
@@ -96,6 +96,7 @@ struct i40e_rx_queue {
 
 	uint16_t port_id; /**< device port ID */
 	uint8_t crc_len; /**< 0 if CRC stripped, 4 otherwise */
+	uint8_t fdir_enabled; /**< 0 if FDIR disabled, 1 when enabled */
 	uint16_t queue_id; /**< RX queue index */
 	uint16_t reg_idx; /**< RX queue register index */
 	uint8_t drop_en; /**< if not 0, set register bit */
-- 
2.17.1


^ permalink raw reply	[flat|nested] 11+ messages in thread

* [dpdk-dev] [PATCH 2/3] net/i40e: add flow mark capability to SSE vector routine
  2019-10-07  9:07 [dpdk-dev] [PATCH 0/3] net/i40e: add FDIR ID to vector rx Harry van Haaren
  2019-10-07  9:07 ` [dpdk-dev] [PATCH 1/3] net/i40e: cache fdir enable value in rx queue Harry van Haaren
@ 2019-10-07  9:07 ` Harry van Haaren
  2019-10-07  9:07 ` [dpdk-dev] [PATCH 3/3] net/i40e: add flow director support to avx rx path Harry van Haaren
  2019-10-09 15:20 ` [dpdk-dev] [PATCH v2 0/3] net/i40e: add FDIR ID to vector rx Harry van Haaren
  3 siblings, 0 replies; 11+ messages in thread
From: Harry van Haaren @ 2019-10-07  9:07 UTC (permalink / raw)
  To: dev; +Cc: qi.z.zhang, mesut.a.ergin, Harry van Haaren

This commit adds an implementation to the SSE vector implementation of
RX routine and moves some common defines from a c file to the header file.

I40e can have 16 and 32 byte descriptors, and the Flow
Director ID data and indication-bit are in different locations
for each size descriptor. The support is implemented in two
seperate functions as they require vastly different operations.

The 16B descriptor re-purposes the "filter-status" u32 field
to indicate FDIR ID when the FLM bit is set. No extra loads
are required, however we do have to store to mbuf->fdir.hi,
which is not stored to in the RX path before this patch.

The 32B descriptor requires loading the 2nd 16 bytes of each
descriptor, to get the FLEXBH_STAT and FD Filter ID from
qword3. The resulting data must also be stored to mbuf->fdir.hi,
same as the 16B code path.

Suggested-by: Mesut Ergin <mesut.a.ergin@intel.com>
Signed-off-by: Harry van Haaren <harry.van.haaren@intel.com>
---
 drivers/net/i40e/i40e_rxtx.c         |   6 --
 drivers/net/i40e/i40e_rxtx.h         |   7 ++
 drivers/net/i40e/i40e_rxtx_vec_sse.c | 126 +++++++++++++++++++++++++--
 3 files changed, 128 insertions(+), 11 deletions(-)

diff --git a/drivers/net/i40e/i40e_rxtx.c b/drivers/net/i40e/i40e_rxtx.c
index 692c3bab4..5fffb34ee 100644
--- a/drivers/net/i40e/i40e_rxtx.c
+++ b/drivers/net/i40e/i40e_rxtx.c
@@ -172,12 +172,6 @@ i40e_get_iee15888_flags(struct rte_mbuf *mb, uint64_t qword)
 }
 #endif
 
-#define I40E_RX_DESC_EXT_STATUS_FLEXBH_MASK   0x03
-#define I40E_RX_DESC_EXT_STATUS_FLEXBH_FD_ID  0x01
-#define I40E_RX_DESC_EXT_STATUS_FLEXBH_FLEX   0x02
-#define I40E_RX_DESC_EXT_STATUS_FLEXBL_MASK   0x03
-#define I40E_RX_DESC_EXT_STATUS_FLEXBL_FLEX   0x01
-
 static inline uint64_t
 i40e_rxd_build_fdir(volatile union i40e_rx_desc *rxdp, struct rte_mbuf *mb)
 {
diff --git a/drivers/net/i40e/i40e_rxtx.h b/drivers/net/i40e/i40e_rxtx.h
index 1028e8b68..2106bb355 100644
--- a/drivers/net/i40e/i40e_rxtx.h
+++ b/drivers/net/i40e/i40e_rxtx.h
@@ -32,6 +32,13 @@
 
 #define I40E_TX_MIN_PKT_LEN 17
 
+/* Shared FDIR masks between scalar / vector drivers */
+#define I40E_RX_DESC_EXT_STATUS_FLEXBH_MASK   0x03
+#define I40E_RX_DESC_EXT_STATUS_FLEXBH_FD_ID  0x01
+#define I40E_RX_DESC_EXT_STATUS_FLEXBH_FLEX   0x02
+#define I40E_RX_DESC_EXT_STATUS_FLEXBL_MASK   0x03
+#define I40E_RX_DESC_EXT_STATUS_FLEXBL_FLEX   0x01
+
 #undef container_of
 #define container_of(ptr, type, member) ({ \
 		typeof(((type *)0)->member)(*__mptr) = (ptr); \
diff --git a/drivers/net/i40e/i40e_rxtx_vec_sse.c b/drivers/net/i40e/i40e_rxtx_vec_sse.c
index 1fc66b781..73ccf2d2d 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_sse.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_sse.c
@@ -89,9 +89,111 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq)
 	I40E_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
 }
 
+#ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
+/* SSE version of FDIR mark extraction for 4 32B descriptors at a time */
+static inline __m128i
+descs_to_fdir_32b(volatile union i40e_rx_desc *rxdp, struct rte_mbuf **rx_pkt)
+{
+	/* 32B descriptors: Load 2nd half of descriptors for FDIR ID data */
+	__m128i desc0_qw23, desc1_qw23, desc2_qw23, desc3_qw23;
+	desc0_qw23 = _mm_loadu_si128((__m128i *)&(rxdp + 0)->wb.qword2);
+	desc1_qw23 = _mm_loadu_si128((__m128i *)&(rxdp + 1)->wb.qword2);
+	desc2_qw23 = _mm_loadu_si128((__m128i *)&(rxdp + 2)->wb.qword2);
+	desc3_qw23 = _mm_loadu_si128((__m128i *)&(rxdp + 3)->wb.qword2);
+
+	/* FDIR ID data: move last u32 of each desc to 4 u32 lanes */
+	__m128i v_unpack_01, v_unpack_23;
+	v_unpack_01 = _mm_unpackhi_epi32(desc0_qw23, desc1_qw23);
+	v_unpack_23 = _mm_unpackhi_epi32(desc2_qw23, desc3_qw23);
+	__m128i v_fdir_ids = _mm_unpackhi_epi64(v_unpack_01, v_unpack_23);
+
+	/* Extended Status: extract from each lower 32 bits, to u32 lanes */
+	v_unpack_01 = _mm_unpacklo_epi32(desc0_qw23, desc1_qw23);
+	v_unpack_23 = _mm_unpacklo_epi32(desc2_qw23, desc3_qw23);
+	__m128i v_flt_status = _mm_unpacklo_epi64(v_unpack_01, v_unpack_23);
+
+	/* Shift u32 left and right to "mask away" bits not required.
+	 * Data required is 4:5 (zero based), so left shift by 26 (32-6)
+	 * and then right shift by 30 (32 - 2 bits required).
+	 */
+	v_flt_status = _mm_slli_epi32(v_flt_status, 26);
+	v_flt_status = _mm_srli_epi32(v_flt_status, 30);
+
+	/* Generate constant 1 in all u32 lanes and compare */
+	RTE_BUILD_BUG_ON(I40E_RX_DESC_EXT_STATUS_FLEXBH_FD_ID != 1);
+	__m128i v_zeros = _mm_setzero_si128();
+	__m128i v_ffff = _mm_cmpeq_epi32(v_zeros, v_zeros);
+	__m128i v_u32_one = _mm_srli_epi32(v_ffff, 31);
+
+	/* per desc mask, bits set if FDIR ID is valid */
+	__m128i v_fd_id_mask = _mm_cmpeq_epi32(v_flt_status, v_u32_one);
+
+	/* Mask ID data to zero if the FD_ID bit not set in desc */
+	v_fdir_ids = _mm_and_si128(v_fdir_ids, v_fd_id_mask);
+
+	/* Extract and store as u32. No advantage to combining into SSE
+	 * stores, there are no surrounding stores to around fdir.hi
+	 */
+	rx_pkt[0]->hash.fdir.hi = _mm_extract_epi32(v_fdir_ids, 0);
+	rx_pkt[1]->hash.fdir.hi = _mm_extract_epi32(v_fdir_ids, 1);
+	rx_pkt[2]->hash.fdir.hi = _mm_extract_epi32(v_fdir_ids, 2);
+	rx_pkt[3]->hash.fdir.hi = _mm_extract_epi32(v_fdir_ids, 3);
+
+	/* convert fdir_id_mask into a single bit, then shift as required for
+	 * correct location in the mbuf->olflags
+	 */
+	const uint32_t FDIR_ID_BIT_SHIFT = 13;
+	RTE_BUILD_BUG_ON(PKT_RX_FDIR_ID != (1 << FDIR_ID_BIT_SHIFT));
+	v_fd_id_mask = _mm_srli_epi32(v_fd_id_mask, 31);
+	v_fd_id_mask = _mm_slli_epi32(v_fd_id_mask, FDIR_ID_BIT_SHIFT);
+
+	/* The returned value must be combined into each mbuf. This is already
+	 * being done for RSS and VLAN mbuf olflags, so return bits to OR in.
+	 */
+	return v_fd_id_mask;
+}
+
+#else /* 32 or 16B FDIR ID handling */
+
+/* Handle 16B descriptor FDIR ID flag setting based on FLM. See scalar driver
+ * for scalar implementation of the same functionality.
+ */
+static inline __m128i
+descs_to_fdir_16b(__m128i fltstat, __m128i descs[4], struct rte_mbuf **rx_pkt)
+{
+	/* unpack filter-status data from descriptors */
+	__m128i v_tmp_01 = _mm_unpacklo_epi32(descs[0], descs[1]);
+	__m128i v_tmp_23 = _mm_unpacklo_epi32(descs[2], descs[3]);
+	__m128i v_fdir_ids = _mm_unpackhi_epi64(v_tmp_01, v_tmp_23);
+
+	/* Generate one bit in each u32 lane */
+	__m128i v_zeros = _mm_setzero_si128();
+	__m128i v_ffff = _mm_cmpeq_epi32(v_zeros, v_zeros);
+	__m128i v_111_mask = _mm_srli_epi32(v_ffff, 29);
+	__m128i v_11_mask = _mm_srli_epi32(v_ffff, 30);
+
+	/* Compare and mask away FDIR ID data if bit not set */
+	__m128i v_u32_bits = _mm_and_si128(v_111_mask, fltstat);
+	__m128i v_fdir_id_mask = _mm_cmpeq_epi32(v_u32_bits, v_11_mask);
+	v_fdir_ids = _mm_and_si128(v_fdir_id_mask, v_fdir_ids);
+
+	/* Store data to fdir.hi in mbuf */
+	rx_pkt[0]->hash.fdir.hi = _mm_extract_epi32(v_fdir_ids, 0);
+	rx_pkt[1]->hash.fdir.hi = _mm_extract_epi32(v_fdir_ids, 1);
+	rx_pkt[2]->hash.fdir.hi = _mm_extract_epi32(v_fdir_ids, 2);
+	rx_pkt[3]->hash.fdir.hi = _mm_extract_epi32(v_fdir_ids, 3);
+
+	/* Shift to 1 or 0 bit per u32 lane, then to PKT_RX_FDIR_ID offset */
+	const uint32_t FDIR_ID_BIT_SHIFT = 13;
+	RTE_BUILD_BUG_ON(PKT_RX_FDIR_ID != (1 << FDIR_ID_BIT_SHIFT));
+	__m128i v_mask_one_bit = _mm_srli_epi32(v_fdir_id_mask, 31);
+	return _mm_slli_epi32(v_mask_one_bit, FDIR_ID_BIT_SHIFT);
+}
+#endif
+
 static inline void
-desc_to_olflags_v(struct i40e_rx_queue *rxq, __m128i descs[4],
-	struct rte_mbuf **rx_pkts)
+desc_to_olflags_v(struct i40e_rx_queue *rxq, volatile union i40e_rx_desc *rxdp,
+		  __m128i descs[4], struct rte_mbuf **rx_pkts)
 {
 	const __m128i mbuf_init = _mm_set_epi64x(0, rxq->mbuf_initializer);
 	__m128i rearm0, rearm1, rearm2, rearm3;
@@ -143,6 +245,7 @@ desc_to_olflags_v(struct i40e_rx_queue *rxq, __m128i descs[4],
 			PKT_RX_IP_CKSUM_BAD >> 1,
 			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1);
 
+	/* Unpack "status" from quadword 1, bits 0:32 */
 	vlan0 = _mm_unpackhi_epi32(descs[0], descs[1]);
 	vlan1 = _mm_unpackhi_epi32(descs[2], descs[3]);
 	vlan0 = _mm_unpacklo_epi64(vlan0, vlan1);
@@ -150,8 +253,8 @@ desc_to_olflags_v(struct i40e_rx_queue *rxq, __m128i descs[4],
 	vlan1 = _mm_and_si128(vlan0, rss_vlan_msk);
 	vlan0 = _mm_shuffle_epi8(vlan_flags, vlan1);
 
-	rss = _mm_srli_epi32(vlan1, 11);
-	rss = _mm_shuffle_epi8(rss_flags, rss);
+	const __m128i desc_fltstat = _mm_srli_epi32(vlan1, 11);
+	rss = _mm_shuffle_epi8(rss_flags, desc_fltstat);
 
 	l3_l4e = _mm_srli_epi32(vlan1, 22);
 	l3_l4e = _mm_shuffle_epi8(l3_l4e_flags, l3_l4e);
@@ -163,6 +266,19 @@ desc_to_olflags_v(struct i40e_rx_queue *rxq, __m128i descs[4],
 	vlan0 = _mm_or_si128(vlan0, rss);
 	vlan0 = _mm_or_si128(vlan0, l3_l4e);
 
+	/* Extract FDIR ID only if FDIR is enabled to avoid useless work */
+	if (rxq->fdir_enabled) {
+#ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
+		__m128i v_fdir_ol_flags = descs_to_fdir_32b(rxdp, rx_pkts);
+#else
+		(void)rxdp; /* rxdp not required for 16B desc mode */
+		__m128i v_fdir_ol_flags = descs_to_fdir_16b(desc_fltstat,
+							    descs, rx_pkts);
+#endif
+		/* OR in ol_flag bits after descriptor speicific extraction */
+		vlan0 = _mm_or_si128(vlan0, v_fdir_ol_flags);
+	}
+
 	/*
 	 * At this point, we have the 4 sets of flags in the low 16-bits
 	 * of each 32-bit value in vlan0.
@@ -377,7 +493,7 @@ _recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 		/* C.1 4=>2 filter staterr info only */
 		sterr_tmp1 = _mm_unpackhi_epi32(descs[1], descs[0]);
 
-		desc_to_olflags_v(rxq, descs, &rx_pkts[pos]);
+		desc_to_olflags_v(rxq, rxdp, descs, &rx_pkts[pos]);
 
 		/* D.2 pkt 3,4 set in_port/nb_seg and remove crc */
 		pkt_mb4 = _mm_add_epi16(pkt_mb4, crc_adjust);
-- 
2.17.1


^ permalink raw reply	[flat|nested] 11+ messages in thread

* [dpdk-dev] [PATCH 3/3] net/i40e: add flow director support to avx rx path
  2019-10-07  9:07 [dpdk-dev] [PATCH 0/3] net/i40e: add FDIR ID to vector rx Harry van Haaren
  2019-10-07  9:07 ` [dpdk-dev] [PATCH 1/3] net/i40e: cache fdir enable value in rx queue Harry van Haaren
  2019-10-07  9:07 ` [dpdk-dev] [PATCH 2/3] net/i40e: add flow mark capability to SSE vector routine Harry van Haaren
@ 2019-10-07  9:07 ` Harry van Haaren
  2019-10-09 15:20 ` [dpdk-dev] [PATCH v2 0/3] net/i40e: add FDIR ID to vector rx Harry van Haaren
  3 siblings, 0 replies; 11+ messages in thread
From: Harry van Haaren @ 2019-10-07  9:07 UTC (permalink / raw)
  To: dev; +Cc: qi.z.zhang, mesut.a.ergin, Harry van Haaren

This commit adds FDIR ID support to the AVX2 based recieve
path routine. Support for both 16B and 32B descriptors is
implemented.

Suggested-by: Mesut Ergin <mesut.a.ergin@intel.com>
Signed-off-by: Harry van Haaren <harry.van.haaren@intel.com>
---
 drivers/net/i40e/i40e_rxtx_vec_avx2.c | 188 +++++++++++++++++++++++++-
 1 file changed, 184 insertions(+), 4 deletions(-)

diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx2.c b/drivers/net/i40e/i40e_rxtx_vec_avx2.c
index 6f3278960..1c70e564e 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_avx2.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_avx2.c
@@ -137,9 +137,90 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq)
 	I40E_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
 }
 
+#ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
+/* Handles 32B descriptor FDIR ID processing:
+ * rxdp: recieve descriptor ring, required to load 2nd 16B half of each desc
+ * rx_pkts: required to store metadata back to mbufs
+ * pkt_idx: offset into the burst, increments in vector widths
+ * desc_idx: required to select the correct shift at compile time
+ */
+static inline __m256i
+desc_fdir_processing_32b(volatile union i40e_rx_desc *rxdp,
+			 struct rte_mbuf **rx_pkts,
+			 const uint32_t pkt_idx,
+			 const uint32_t desc_idx)
+{
+	/* 32B desc path: load rxdp.wb.qword2 for EXT_STATUS and FLEXBH_STAT */
+	__m128i *rxdp_desc_0 = (void *)(&rxdp[desc_idx + 0].wb.qword2);
+	__m128i *rxdp_desc_1 = (void *)(&rxdp[desc_idx + 1].wb.qword2);
+	const __m128i desc_qw2_0 = _mm_load_si128(rxdp_desc_0);
+	const __m128i desc_qw2_1 = _mm_load_si128(rxdp_desc_1);
+
+	/* Mask for FLEXBH_STAT, and the FDIR_ID value to compare against. The
+	 * remaining data is set to all 1's to pass through data.
+	 */
+	const __m256i flexbh_mask = _mm256_set_epi32(-1, -1, -1, 3 << 4,
+						     -1, -1, -1, 3 << 4);
+	const __m256i flexbh_id   = _mm256_set_epi32(-1, -1, -1, 1 << 4,
+						     -1, -1, -1, 1 << 4);
+
+	/* Load descriptor, check for FLEXBH bits, generate a mask for both
+	 * packets in the register.
+	 */
+	__m256i desc_qw2_0_1 =
+		_mm256_inserti128_si256(_mm256_castsi128_si256(desc_qw2_0),
+					desc_qw2_1, 1);
+	__m256i desc_tmp_msk = _mm256_and_si256(flexbh_mask, desc_qw2_0_1);
+	__m256i fdir_mask = _mm256_cmpeq_epi32(flexbh_id, desc_tmp_msk);
+	__m256i fdir_data = _mm256_alignr_epi8(desc_qw2_0_1, desc_qw2_0_1, 12);
+	__m256i desc_fdir_data = _mm256_and_si256(fdir_mask, fdir_data);
+
+	/* Write data out to the mbuf. There is no store to this area of the
+	 * mbuf today, so we cannot combine it with another store.
+	 */
+	const uint32_t idx_0 = pkt_idx + desc_idx;
+	const uint32_t idx_1 = pkt_idx + desc_idx + 1;
+	rx_pkts[idx_0]->hash.fdir.hi = _mm256_extract_epi32(desc_fdir_data, 0);
+	rx_pkts[idx_1]->hash.fdir.hi = _mm256_extract_epi32(desc_fdir_data, 4);
+
+	/* Create mbuf flags as required for mbuf_flags layout
+	 *  (That's high lane [1,3,5,7, 0,2,4,6] as u32 lanes).
+	 * Approach:
+	 * - Mask away bits not required from the fdir_mask
+	 * - Leave the PKT_FDIR_ID bit (1 << 13)
+	 * - Position that bit correctly based on packet number
+	 * - OR in the resulting bit to mbuf_flags
+	 */
+	RTE_BUILD_BUG_ON(PKT_RX_FDIR_ID != (1 << 13));
+	__m256i mbuf_flag_mask = _mm256_set_epi32(0, 0, 0, 1 << 13,
+						  0, 0, 0, 1 << 13);
+	__m256i desc_flag_bit =  _mm256_and_si256(mbuf_flag_mask, fdir_mask);
+
+	/* For static-inline function, this will be stripped out
+	 * as the desc_idx is a hard-coded constant.
+	 */
+	switch (desc_idx) {
+	case 0:
+		return _mm256_alignr_epi8(desc_flag_bit, desc_flag_bit,  4);
+	case 2:
+		return _mm256_alignr_epi8(desc_flag_bit, desc_flag_bit,  8);
+	case 4:
+		return _mm256_alignr_epi8(desc_flag_bit, desc_flag_bit, 12);
+	case 6:
+		return desc_flag_bit;
+	default:
+		break;
+	}
+
+	/* NOT REACHED, see above switch returns */
+	return _mm256_setzero_si256();
+}
+#endif /* RTE_LIBRTE_I40E_16BYTE_RX_DESC */
+
 #define PKTLEN_SHIFT     10
 
-static inline uint16_t
+/* Force inline as some compilers will not inline by default. */
+static __rte_always_inline uint16_t
 _recv_raw_pkts_vec_avx2(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 		uint16_t nb_pkts, uint8_t *split_packet)
 {
@@ -419,8 +500,10 @@ _recv_raw_pkts_vec_avx2(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 		/* set vlan and rss flags */
 		const __m256i vlan_flags = _mm256_shuffle_epi8(
 				vlan_flags_shuf, flag_bits);
-		const __m256i rss_flags = _mm256_shuffle_epi8(
-				rss_flags_shuf, _mm256_srli_epi32(flag_bits, 11));
+		const __m256i rss_fdir_bits = _mm256_srli_epi32(flag_bits, 11);
+		const __m256i rss_flags = _mm256_shuffle_epi8(rss_flags_shuf,
+							      rss_fdir_bits);
+
 		/*
 		 * l3_l4_error flags, shuffle, then shift to correct adjustment
 		 * of flags in flags_shuf, and finally mask out extra bits
@@ -431,8 +514,105 @@ _recv_raw_pkts_vec_avx2(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 		l3_l4_flags = _mm256_and_si256(l3_l4_flags, cksum_mask);
 
 		/* merge flags */
-		const __m256i mbuf_flags = _mm256_or_si256(l3_l4_flags,
+		__m256i mbuf_flags = _mm256_or_si256(l3_l4_flags,
 				_mm256_or_si256(rss_flags, vlan_flags));
+
+		/* If the rxq has FDIR enabled, read and process the FDIR info
+		 * from the descriptor. This can cause more loads/stores, so is
+		 * not always performed. Branch over the code when not enabled.
+		 */
+		if (rxq->fdir_enabled) {
+#ifdef RTE_LIBRTE_I40E_16BYTE_RX_DESC
+			/* 16B descriptor code path:
+			 * RSS and FDIR ID use the same offset in the desc, so
+			 * only one can be present at a time. The code below
+			 * identifies an FDIR ID match, and zeros the RSS value
+			 * in the mbuf on FDIR match to keep mbuf data clean.
+			 */
+
+			/* Flags:
+			 * - Take flags, shift bits to null out
+			 * - CMPEQ with known FDIR ID, to get 0xFFFF or 0 mask
+			 * - Strip bits from mask, leaving 0 or 1 for FDIR ID
+			 * - Merge with mbuf_flags
+			 */
+			/* FLM = 1, FLTSTAT = 0b01, (FLM | FLTSTAT) == 3.
+			 * Shift left by 28 to avoid having to mask.
+			 */
+			const __m256i fdir = _mm256_slli_epi32(rss_fdir_bits, 28);
+			const __m256i fdir_id = _mm256_set1_epi32(3 << 28);
+
+			/* As above, the fdir_mask to packet mapping is this:
+			 * order (hi->lo): [1, 3, 5, 7, 0, 2, 4, 6]
+			 * Then OR FDIR flags to mbuf_flags on FDIR ID hit.
+			 */
+			RTE_BUILD_BUG_ON(PKT_RX_FDIR_ID != (1 << 13));
+			const __m256i pkt_fdir_bit = _mm256_set1_epi32(1 << 13);
+			const __m256i fdir_mask = _mm256_cmpeq_epi32(fdir, fdir_id);
+			__m256i fdir_bits = _mm256_and_si256(fdir_mask, pkt_fdir_bit);
+			mbuf_flags = _mm256_or_si256(mbuf_flags, fdir_bits);
+
+			/* Based on FDIR_MASK, clear the RSS or FDIR value.
+			 * The FDIR ID value is masked to zero if not a hit,
+			 * otherwise the mb0_1 register RSS field is zeroed.
+			 */
+			const __m256i fdir_zero_mask = _mm256_setzero_si256();
+			__m256i tmp0_1 = _mm256_or_si256(fdir_mask, fdir_zero_mask);
+			__m256i fdir_mb0_1 = _mm256_and_si256(mb0_1, fdir_mask);
+			mb0_1 = _mm256_andnot_si256(tmp0_1, mb0_1);
+
+			/* Write to mbuf: no stores to combine with, so just a
+			 * scalar store to push data here.
+			 */
+			rx_pkts[i + 0]->hash.fdir.hi = _mm256_extract_epi32(fdir_mb0_1, 3);
+			rx_pkts[i + 1]->hash.fdir.hi = _mm256_extract_epi32(fdir_mb0_1, 7);
+
+			/* Same as above, only shift the fdir_mask to align
+			 * the packet FDIR mask with the FDIR_ID desc lane.
+			 */
+			__m256i tmp2_3 = _mm256_alignr_epi8(fdir_mask, fdir_mask, 12);
+			__m256i fdir_mb2_3 = _mm256_and_si256(mb2_3, tmp2_3);
+			tmp2_3 = _mm256_or_si256(tmp2_3, fdir_zero_mask);
+			mb2_3 = _mm256_andnot_si256(tmp2_3, mb2_3);
+			rx_pkts[i + 2]->hash.fdir.hi = _mm256_extract_epi32(fdir_mb2_3, 3);
+			rx_pkts[i + 3]->hash.fdir.hi = _mm256_extract_epi32(fdir_mb2_3, 7);
+
+			__m256i tmp4_5 = _mm256_alignr_epi8(fdir_mask, fdir_mask, 8);
+			__m256i fdir_mb4_5 = _mm256_and_si256(mb4_5, tmp4_5);
+			tmp4_5 = _mm256_or_si256(tmp4_5, fdir_zero_mask);
+			mb4_5 = _mm256_andnot_si256(tmp4_5, mb4_5);
+			rx_pkts[i + 4]->hash.fdir.hi = _mm256_extract_epi32(fdir_mb4_5, 3);
+			rx_pkts[i + 5]->hash.fdir.hi = _mm256_extract_epi32(fdir_mb4_5, 7);
+
+			__m256i tmp6_7 = _mm256_alignr_epi8(fdir_mask, fdir_mask, 4);
+			__m256i fdir_mb6_7 = _mm256_and_si256(mb6_7, tmp6_7);
+			tmp6_7 = _mm256_or_si256(tmp6_7, fdir_zero_mask);
+			mb6_7 = _mm256_andnot_si256(tmp6_7, mb6_7);
+			rx_pkts[i + 6]->hash.fdir.hi = _mm256_extract_epi32(fdir_mb6_7, 3);
+			rx_pkts[i + 7]->hash.fdir.hi = _mm256_extract_epi32(fdir_mb6_7, 7);
+
+			/* End of 16B descriptor handling */
+#else
+			/* 32B descriptor FDIR ID mark handling. Returns bits
+			 * to be OR-ed into the mbuf olflags.
+			 */
+			__m256i fdir_add_flags;
+			fdir_add_flags = desc_fdir_processing_32b(rxdp, rx_pkts, i, 0);
+			mbuf_flags = _mm256_or_si256(mbuf_flags, fdir_add_flags);
+
+			fdir_add_flags = desc_fdir_processing_32b(rxdp, rx_pkts, i, 2);
+			mbuf_flags = _mm256_or_si256(mbuf_flags, fdir_add_flags);
+
+			fdir_add_flags = desc_fdir_processing_32b(rxdp, rx_pkts, i, 4);
+			mbuf_flags = _mm256_or_si256(mbuf_flags, fdir_add_flags);
+
+			fdir_add_flags = desc_fdir_processing_32b(rxdp, rx_pkts, i, 6);
+			mbuf_flags = _mm256_or_si256(mbuf_flags, fdir_add_flags);
+			/* End 32B desc handling */
+#endif /* RTE_LIBRTE_I40E_16BYTE_RX_DESC */
+
+		} /* if() on FDIR enabled */
+
 		/*
 		 * At this point, we have the 8 sets of flags in the low 16-bits
 		 * of each 32-bit value in vlan0.
-- 
2.17.1


^ permalink raw reply	[flat|nested] 11+ messages in thread

* [dpdk-dev] [PATCH v2 0/3] net/i40e: add FDIR ID to vector rx
  2019-10-07  9:07 [dpdk-dev] [PATCH 0/3] net/i40e: add FDIR ID to vector rx Harry van Haaren
                   ` (2 preceding siblings ...)
  2019-10-07  9:07 ` [dpdk-dev] [PATCH 3/3] net/i40e: add flow director support to avx rx path Harry van Haaren
@ 2019-10-09 15:20 ` Harry van Haaren
  2019-10-09 15:20   ` [dpdk-dev] [PATCH v2 1/3] net/i40e: cache fdir enable value in rx queue Harry van Haaren
                     ` (4 more replies)
  3 siblings, 5 replies; 11+ messages in thread
From: Harry van Haaren @ 2019-10-09 15:20 UTC (permalink / raw)
  To: dev; +Cc: qi.z.zhang, mesut.a.ergin, Harry van Haaren

v2:
- Improve RSS clearning in SSE 16B descriptor
- Fix AVX 16B descriptor data handling

---

Hey All,

This series adds FDIR ID support to the vector Rx routines.
Support for both the SSE and AVX drivers is added, and both
the 16 byte and 32 byte descriptors are implemented.

A v1/RFC for this functionality was sent in 19.08 timeframe,
refer to the patch as archived on patchwork here:
http://patches.dpdk.org/patch/53969/

Regards, -Harry

Harry van Haaren (3):
  net/i40e: cache fdir enable value in rx queue
  net/i40e: add flow mark capability to SSE vector routine
  net/i40e: add flow director support to avx rx path

 drivers/net/i40e/i40e_ethdev.h        |   1 +
 drivers/net/i40e/i40e_fdir.c          |  20 +++
 drivers/net/i40e/i40e_flow.c          |   4 +
 drivers/net/i40e/i40e_rxtx.c          |   6 -
 drivers/net/i40e/i40e_rxtx.h          |   8 ++
 drivers/net/i40e/i40e_rxtx_vec_avx2.c | 193 +++++++++++++++++++++++++-
 drivers/net/i40e/i40e_rxtx_vec_sse.c  | 154 ++++++++++++++++++--
 7 files changed, 367 insertions(+), 19 deletions(-)

-- 
2.17.1


^ permalink raw reply	[flat|nested] 11+ messages in thread

* [dpdk-dev] [PATCH v2 1/3] net/i40e: cache fdir enable value in rx queue
  2019-10-09 15:20 ` [dpdk-dev] [PATCH v2 0/3] net/i40e: add FDIR ID to vector rx Harry van Haaren
@ 2019-10-09 15:20   ` Harry van Haaren
  2019-10-09 15:20   ` [dpdk-dev] [PATCH v2 2/3] net/i40e: add flow mark capability to SSE vector routine Harry van Haaren
                     ` (3 subsequent siblings)
  4 siblings, 0 replies; 11+ messages in thread
From: Harry van Haaren @ 2019-10-09 15:20 UTC (permalink / raw)
  To: dev; +Cc: qi.z.zhang, mesut.a.ergin, Harry van Haaren

This commit adds a fdir_enable flag in a uint8_t sized hole
the rx queue structure The flag enables the rx code path to
easily identify if fdir is active. This can be used to skip
fdir id processing when it is not required.

The flag is zero by default (as rxq is zmalloc-ed at startup),
and the flag is set to 1 on configuration of a flow director rule.

Signed-off-by: Harry van Haaren <harry.van.haaren@intel.com>

---

v2:
- Disable FDIR processing on flush (Qi)
- Disable FDIR processing on last rule deletion (Qi)
- Moved enable/disable to seperate function to avoid code duplication
- Added PMD_LOG INFO level print for FDIR RX Processing enable/disable
---
 drivers/net/i40e/i40e_ethdev.h |  1 +
 drivers/net/i40e/i40e_fdir.c   | 20 ++++++++++++++++++++
 drivers/net/i40e/i40e_flow.c   |  4 ++++
 drivers/net/i40e/i40e_rxtx.h   |  1 +
 4 files changed, 26 insertions(+)

diff --git a/drivers/net/i40e/i40e_ethdev.h b/drivers/net/i40e/i40e_ethdev.h
index 38ac3ead6..743bead25 100644
--- a/drivers/net/i40e/i40e_ethdev.h
+++ b/drivers/net/i40e/i40e_ethdev.h
@@ -1153,6 +1153,7 @@ const struct rte_memzone *i40e_memzone_reserve(const char *name,
 					uint32_t len,
 					int socket_id);
 int i40e_fdir_configure(struct rte_eth_dev *dev);
+void i40e_fdir_rx_proc_enable(struct rte_eth_dev *dev, bool on);
 void i40e_fdir_teardown(struct i40e_pf *pf);
 enum i40e_filter_pctype
 	i40e_flowtype_to_pctype(const struct i40e_adapter *adapter,
diff --git a/drivers/net/i40e/i40e_fdir.c b/drivers/net/i40e/i40e_fdir.c
index b3e893a5b..dee007daa 100644
--- a/drivers/net/i40e/i40e_fdir.c
+++ b/drivers/net/i40e/i40e_fdir.c
@@ -608,6 +608,23 @@ i40e_set_flex_mask_on_pctype(struct i40e_pf *pf,
 	}
 }
 
+/*
+ * Enable/disable flow director RX processing in vector routines.
+ */
+void
+i40e_fdir_rx_proc_enable(struct rte_eth_dev *dev, bool on)
+{
+	int32_t i;
+
+	for (i = 0; i < dev->data->nb_rx_queues; i++) {
+		struct i40e_rx_queue *rxq = dev->data->rx_queues[i];
+		if (!rxq)
+			continue;
+		rxq->fdir_enabled = on;
+	}
+	PMD_DRV_LOG(DEBUG, "Flow Director processing on RX set to %d", on);
+}
+
 /*
  * Configure flow director related setting
  */
@@ -675,6 +692,9 @@ i40e_fdir_configure(struct rte_eth_dev *dev)
 		PMD_DRV_LOG(ERR, "Not support flexible payload.");
 	}
 
+	/* Enable FDIR processing in RX routines */
+	i40e_fdir_rx_proc_enable(dev, 1);
+
 	return ret;
 }
 
diff --git a/drivers/net/i40e/i40e_flow.c b/drivers/net/i40e/i40e_flow.c
index e902a35d7..9e038fa48 100644
--- a/drivers/net/i40e/i40e_flow.c
+++ b/drivers/net/i40e/i40e_flow.c
@@ -4775,6 +4775,7 @@ i40e_flow_destroy(struct rte_eth_dev *dev,
 			i40e_fdir_teardown(pf);
 			dev->data->dev_conf.fdir_conf.mode =
 				   RTE_FDIR_MODE_NONE;
+			i40e_fdir_rx_proc_enable(dev, 0);
 		}
 		break;
 	case RTE_ETH_FILTER_HASH:
@@ -4931,6 +4932,9 @@ i40e_flow_flush(struct rte_eth_dev *dev, struct rte_flow_error *error)
 		return -rte_errno;
 	}
 
+	/* Disable FDIR processing as all FDIR rules are now flushed */
+	i40e_fdir_rx_proc_enable(dev, 0);
+
 	return ret;
 }
 
diff --git a/drivers/net/i40e/i40e_rxtx.h b/drivers/net/i40e/i40e_rxtx.h
index 3fc619af9..1028e8b68 100644
--- a/drivers/net/i40e/i40e_rxtx.h
+++ b/drivers/net/i40e/i40e_rxtx.h
@@ -96,6 +96,7 @@ struct i40e_rx_queue {
 
 	uint16_t port_id; /**< device port ID */
 	uint8_t crc_len; /**< 0 if CRC stripped, 4 otherwise */
+	uint8_t fdir_enabled; /**< 0 if FDIR disabled, 1 when enabled */
 	uint16_t queue_id; /**< RX queue index */
 	uint16_t reg_idx; /**< RX queue register index */
 	uint8_t drop_en; /**< if not 0, set register bit */
-- 
2.17.1


^ permalink raw reply	[flat|nested] 11+ messages in thread

* [dpdk-dev] [PATCH v2 2/3] net/i40e: add flow mark capability to SSE vector routine
  2019-10-09 15:20 ` [dpdk-dev] [PATCH v2 0/3] net/i40e: add FDIR ID to vector rx Harry van Haaren
  2019-10-09 15:20   ` [dpdk-dev] [PATCH v2 1/3] net/i40e: cache fdir enable value in rx queue Harry van Haaren
@ 2019-10-09 15:20   ` Harry van Haaren
  2019-10-09 15:20   ` [dpdk-dev] [PATCH v2 3/3] net/i40e: add flow director support to avx rx path Harry van Haaren
                     ` (2 subsequent siblings)
  4 siblings, 0 replies; 11+ messages in thread
From: Harry van Haaren @ 2019-10-09 15:20 UTC (permalink / raw)
  To: dev; +Cc: qi.z.zhang, mesut.a.ergin, Harry van Haaren

This commit adds an implementation to the SSE vector implementation of
RX routine and moves some common defines from a c file to the header file.

I40e can have 16 and 32 byte descriptors, and the Flow
Director ID data and indication-bit are in different locations
for each size descriptor. The support is implemented in two
seperate functions as they require vastly different operations.

The 16B descriptor re-purposes the "filter-status" u32 field
to indicate FDIR ID when the FLM bit is set. No extra loads
are required, however we do have to store to mbuf->fdir.hi,
which is not stored to in the RX path before this patch.

The 32B descriptor requires loading the 2nd 16 bytes of each
descriptor, to get the FLEXBH_STAT and FD Filter ID from
qword3. The resulting data must also be stored to mbuf->fdir.hi,
same as the 16B code path.

Signed-off-by: Harry van Haaren <harry.van.haaren@intel.com>

---

v2;
- Fix unused variable warning on 16B desc path
- Fix RSS clearing for 16B descs. Note, this requires delaying the
  desc[] to mbuf shuffle, to clear the RSS data while in desc[] array.
---
 drivers/net/i40e/i40e_rxtx.c         |   6 --
 drivers/net/i40e/i40e_rxtx.h         |   7 ++
 drivers/net/i40e/i40e_rxtx_vec_sse.c | 154 +++++++++++++++++++++++++--
 3 files changed, 152 insertions(+), 15 deletions(-)

diff --git a/drivers/net/i40e/i40e_rxtx.c b/drivers/net/i40e/i40e_rxtx.c
index 692c3bab4..5fffb34ee 100644
--- a/drivers/net/i40e/i40e_rxtx.c
+++ b/drivers/net/i40e/i40e_rxtx.c
@@ -172,12 +172,6 @@ i40e_get_iee15888_flags(struct rte_mbuf *mb, uint64_t qword)
 }
 #endif
 
-#define I40E_RX_DESC_EXT_STATUS_FLEXBH_MASK   0x03
-#define I40E_RX_DESC_EXT_STATUS_FLEXBH_FD_ID  0x01
-#define I40E_RX_DESC_EXT_STATUS_FLEXBH_FLEX   0x02
-#define I40E_RX_DESC_EXT_STATUS_FLEXBL_MASK   0x03
-#define I40E_RX_DESC_EXT_STATUS_FLEXBL_FLEX   0x01
-
 static inline uint64_t
 i40e_rxd_build_fdir(volatile union i40e_rx_desc *rxdp, struct rte_mbuf *mb)
 {
diff --git a/drivers/net/i40e/i40e_rxtx.h b/drivers/net/i40e/i40e_rxtx.h
index 1028e8b68..2106bb355 100644
--- a/drivers/net/i40e/i40e_rxtx.h
+++ b/drivers/net/i40e/i40e_rxtx.h
@@ -32,6 +32,13 @@
 
 #define I40E_TX_MIN_PKT_LEN 17
 
+/* Shared FDIR masks between scalar / vector drivers */
+#define I40E_RX_DESC_EXT_STATUS_FLEXBH_MASK   0x03
+#define I40E_RX_DESC_EXT_STATUS_FLEXBH_FD_ID  0x01
+#define I40E_RX_DESC_EXT_STATUS_FLEXBH_FLEX   0x02
+#define I40E_RX_DESC_EXT_STATUS_FLEXBL_MASK   0x03
+#define I40E_RX_DESC_EXT_STATUS_FLEXBL_FLEX   0x01
+
 #undef container_of
 #define container_of(ptr, type, member) ({ \
 		typeof(((type *)0)->member)(*__mptr) = (ptr); \
diff --git a/drivers/net/i40e/i40e_rxtx_vec_sse.c b/drivers/net/i40e/i40e_rxtx_vec_sse.c
index 1fc66b781..6ab0bb0d3 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_sse.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_sse.c
@@ -89,9 +89,131 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq)
 	I40E_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
 }
 
+#ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
+/* SSE version of FDIR mark extraction for 4 32B descriptors at a time */
+static inline __m128i
+descs_to_fdir_32b(volatile union i40e_rx_desc *rxdp, struct rte_mbuf **rx_pkt)
+{
+	/* 32B descriptors: Load 2nd half of descriptors for FDIR ID data */
+	__m128i desc0_qw23, desc1_qw23, desc2_qw23, desc3_qw23;
+	desc0_qw23 = _mm_loadu_si128((__m128i *)&(rxdp + 0)->wb.qword2);
+	desc1_qw23 = _mm_loadu_si128((__m128i *)&(rxdp + 1)->wb.qword2);
+	desc2_qw23 = _mm_loadu_si128((__m128i *)&(rxdp + 2)->wb.qword2);
+	desc3_qw23 = _mm_loadu_si128((__m128i *)&(rxdp + 3)->wb.qword2);
+
+	/* FDIR ID data: move last u32 of each desc to 4 u32 lanes */
+	__m128i v_unpack_01, v_unpack_23;
+	v_unpack_01 = _mm_unpackhi_epi32(desc0_qw23, desc1_qw23);
+	v_unpack_23 = _mm_unpackhi_epi32(desc2_qw23, desc3_qw23);
+	__m128i v_fdir_ids = _mm_unpackhi_epi64(v_unpack_01, v_unpack_23);
+
+	/* Extended Status: extract from each lower 32 bits, to u32 lanes */
+	v_unpack_01 = _mm_unpacklo_epi32(desc0_qw23, desc1_qw23);
+	v_unpack_23 = _mm_unpacklo_epi32(desc2_qw23, desc3_qw23);
+	__m128i v_flt_status = _mm_unpacklo_epi64(v_unpack_01, v_unpack_23);
+
+	/* Shift u32 left and right to "mask away" bits not required.
+	 * Data required is 4:5 (zero based), so left shift by 26 (32-6)
+	 * and then right shift by 30 (32 - 2 bits required).
+	 */
+	v_flt_status = _mm_slli_epi32(v_flt_status, 26);
+	v_flt_status = _mm_srli_epi32(v_flt_status, 30);
+
+	/* Generate constant 1 in all u32 lanes and compare */
+	RTE_BUILD_BUG_ON(I40E_RX_DESC_EXT_STATUS_FLEXBH_FD_ID != 1);
+	__m128i v_zeros = _mm_setzero_si128();
+	__m128i v_ffff = _mm_cmpeq_epi32(v_zeros, v_zeros);
+	__m128i v_u32_one = _mm_srli_epi32(v_ffff, 31);
+
+	/* per desc mask, bits set if FDIR ID is valid */
+	__m128i v_fd_id_mask = _mm_cmpeq_epi32(v_flt_status, v_u32_one);
+
+	/* Mask ID data to zero if the FD_ID bit not set in desc */
+	v_fdir_ids = _mm_and_si128(v_fdir_ids, v_fd_id_mask);
+
+	/* Extract and store as u32. No advantage to combining into SSE
+	 * stores, there are no surrounding stores to around fdir.hi
+	 */
+	rx_pkt[0]->hash.fdir.hi = _mm_extract_epi32(v_fdir_ids, 0);
+	rx_pkt[1]->hash.fdir.hi = _mm_extract_epi32(v_fdir_ids, 1);
+	rx_pkt[2]->hash.fdir.hi = _mm_extract_epi32(v_fdir_ids, 2);
+	rx_pkt[3]->hash.fdir.hi = _mm_extract_epi32(v_fdir_ids, 3);
+
+	/* convert fdir_id_mask into a single bit, then shift as required for
+	 * correct location in the mbuf->olflags
+	 */
+	const uint32_t FDIR_ID_BIT_SHIFT = 13;
+	RTE_BUILD_BUG_ON(PKT_RX_FDIR_ID != (1 << FDIR_ID_BIT_SHIFT));
+	v_fd_id_mask = _mm_srli_epi32(v_fd_id_mask, 31);
+	v_fd_id_mask = _mm_slli_epi32(v_fd_id_mask, FDIR_ID_BIT_SHIFT);
+
+	/* The returned value must be combined into each mbuf. This is already
+	 * being done for RSS and VLAN mbuf olflags, so return bits to OR in.
+	 */
+	return v_fd_id_mask;
+}
+
+#else /* 32 or 16B FDIR ID handling */
+
+/* Handle 16B descriptor FDIR ID flag setting based on FLM. See scalar driver
+ * for scalar implementation of the same functionality.
+ */
+static inline __m128i
+descs_to_fdir_16b(__m128i fltstat, __m128i descs[4], struct rte_mbuf **rx_pkt)
+{
+	/* unpack filter-status data from descriptors */
+	__m128i v_tmp_01 = _mm_unpacklo_epi32(descs[0], descs[1]);
+	__m128i v_tmp_23 = _mm_unpacklo_epi32(descs[2], descs[3]);
+	__m128i v_fdir_ids = _mm_unpackhi_epi64(v_tmp_01, v_tmp_23);
+
+	/* Generate one bit in each u32 lane */
+	__m128i v_zeros = _mm_setzero_si128();
+	__m128i v_ffff = _mm_cmpeq_epi32(v_zeros, v_zeros);
+	__m128i v_111_mask = _mm_srli_epi32(v_ffff, 29);
+	__m128i v_11_mask = _mm_srli_epi32(v_ffff, 30);
+
+	/* Top lane ones mask for FDIR isolation */
+	__m128i v_desc_fdir_mask = _mm_insert_epi32(v_zeros, UINT32_MAX, 1);
+
+	/* Compare and mask away FDIR ID data if bit not set */
+	__m128i v_u32_bits = _mm_and_si128(v_111_mask, fltstat);
+	__m128i v_fdir_id_mask = _mm_cmpeq_epi32(v_u32_bits, v_11_mask);
+	v_fdir_ids = _mm_and_si128(v_fdir_id_mask, v_fdir_ids);
+
+	/* Store data to fdir.hi in mbuf */
+	rx_pkt[0]->hash.fdir.hi = _mm_extract_epi32(v_fdir_ids, 0);
+	rx_pkt[1]->hash.fdir.hi = _mm_extract_epi32(v_fdir_ids, 1);
+	rx_pkt[2]->hash.fdir.hi = _mm_extract_epi32(v_fdir_ids, 2);
+	rx_pkt[3]->hash.fdir.hi = _mm_extract_epi32(v_fdir_ids, 3);
+
+	/* Move fdir_id_mask to correct lane, blend RSS to zero on hits */
+	__m128i v_desc3_shift = _mm_alignr_epi8(v_zeros, v_fdir_id_mask, 8);
+	__m128i v_desc3_mask = _mm_and_si128(v_desc_fdir_mask, v_desc3_shift);
+	descs[3] = _mm_blendv_epi8(descs[3], _mm_setzero_si128(), v_desc3_mask);
+
+	__m128i v_desc2_shift = _mm_alignr_epi8(v_zeros, v_fdir_id_mask, 4);
+	__m128i v_desc2_mask = _mm_and_si128(v_desc_fdir_mask, v_desc2_shift);
+	descs[2] = _mm_blendv_epi8(descs[2], _mm_setzero_si128(), v_desc2_mask);
+
+	__m128i v_desc1_shift = v_fdir_id_mask;
+	__m128i v_desc1_mask = _mm_and_si128(v_desc_fdir_mask, v_desc1_shift);
+	descs[1] = _mm_blendv_epi8(descs[1], _mm_setzero_si128(), v_desc1_mask);
+
+	__m128i v_desc0_shift = _mm_alignr_epi8(v_fdir_id_mask, v_zeros, 12);
+	__m128i v_desc0_mask = _mm_and_si128(v_desc_fdir_mask, v_desc0_shift);
+	descs[0] = _mm_blendv_epi8(descs[0], _mm_setzero_si128(), v_desc0_mask);
+
+	/* Shift to 1 or 0 bit per u32 lane, then to PKT_RX_FDIR_ID offset */
+	const uint32_t FDIR_ID_BIT_SHIFT = 13;
+	RTE_BUILD_BUG_ON(PKT_RX_FDIR_ID != (1 << FDIR_ID_BIT_SHIFT));
+	__m128i v_mask_one_bit = _mm_srli_epi32(v_fdir_id_mask, 31);
+	return _mm_slli_epi32(v_mask_one_bit, FDIR_ID_BIT_SHIFT);
+}
+#endif
+
 static inline void
-desc_to_olflags_v(struct i40e_rx_queue *rxq, __m128i descs[4],
-	struct rte_mbuf **rx_pkts)
+desc_to_olflags_v(struct i40e_rx_queue *rxq, volatile union i40e_rx_desc *rxdp,
+		  __m128i descs[4], struct rte_mbuf **rx_pkts)
 {
 	const __m128i mbuf_init = _mm_set_epi64x(0, rxq->mbuf_initializer);
 	__m128i rearm0, rearm1, rearm2, rearm3;
@@ -143,6 +265,7 @@ desc_to_olflags_v(struct i40e_rx_queue *rxq, __m128i descs[4],
 			PKT_RX_IP_CKSUM_BAD >> 1,
 			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1);
 
+	/* Unpack "status" from quadword 1, bits 0:32 */
 	vlan0 = _mm_unpackhi_epi32(descs[0], descs[1]);
 	vlan1 = _mm_unpackhi_epi32(descs[2], descs[3]);
 	vlan0 = _mm_unpacklo_epi64(vlan0, vlan1);
@@ -150,8 +273,8 @@ desc_to_olflags_v(struct i40e_rx_queue *rxq, __m128i descs[4],
 	vlan1 = _mm_and_si128(vlan0, rss_vlan_msk);
 	vlan0 = _mm_shuffle_epi8(vlan_flags, vlan1);
 
-	rss = _mm_srli_epi32(vlan1, 11);
-	rss = _mm_shuffle_epi8(rss_flags, rss);
+	const __m128i desc_fltstat = _mm_srli_epi32(vlan1, 11);
+	rss = _mm_shuffle_epi8(rss_flags, desc_fltstat);
 
 	l3_l4e = _mm_srli_epi32(vlan1, 22);
 	l3_l4e = _mm_shuffle_epi8(l3_l4e_flags, l3_l4e);
@@ -163,6 +286,19 @@ desc_to_olflags_v(struct i40e_rx_queue *rxq, __m128i descs[4],
 	vlan0 = _mm_or_si128(vlan0, rss);
 	vlan0 = _mm_or_si128(vlan0, l3_l4e);
 
+	/* Extract FDIR ID only if FDIR is enabled to avoid useless work */
+	if (rxq->fdir_enabled) {
+#ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
+		__m128i v_fdir_ol_flags = descs_to_fdir_32b(rxdp, rx_pkts);
+#else
+		(void)rxdp; /* rxdp not required for 16B desc mode */
+		__m128i v_fdir_ol_flags = descs_to_fdir_16b(desc_fltstat,
+							    descs, rx_pkts);
+#endif
+		/* OR in ol_flag bits after descriptor speicific extraction */
+		vlan0 = _mm_or_si128(vlan0, v_fdir_ol_flags);
+	}
+
 	/*
 	 * At this point, we have the 4 sets of flags in the low 16-bits
 	 * of each 32-bit value in vlan0.
@@ -368,16 +504,16 @@ _recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 		descs[3] = _mm_blend_epi16(descs[3], len3, 0x80);
 		descs[2] = _mm_blend_epi16(descs[2], len2, 0x80);
 
-		/* D.1 pkt 3,4 convert format from desc to pktmbuf */
-		pkt_mb4 = _mm_shuffle_epi8(descs[3], shuf_msk);
-		pkt_mb3 = _mm_shuffle_epi8(descs[2], shuf_msk);
-
 		/* C.1 4=>2 filter staterr info only */
 		sterr_tmp2 = _mm_unpackhi_epi32(descs[3], descs[2]);
 		/* C.1 4=>2 filter staterr info only */
 		sterr_tmp1 = _mm_unpackhi_epi32(descs[1], descs[0]);
 
-		desc_to_olflags_v(rxq, descs, &rx_pkts[pos]);
+		desc_to_olflags_v(rxq, rxdp, descs, &rx_pkts[pos]);
+
+		/* D.1 pkt 3,4 convert format from desc to pktmbuf */
+		pkt_mb4 = _mm_shuffle_epi8(descs[3], shuf_msk);
+		pkt_mb3 = _mm_shuffle_epi8(descs[2], shuf_msk);
 
 		/* D.2 pkt 3,4 set in_port/nb_seg and remove crc */
 		pkt_mb4 = _mm_add_epi16(pkt_mb4, crc_adjust);
-- 
2.17.1


^ permalink raw reply	[flat|nested] 11+ messages in thread

* [dpdk-dev] [PATCH v2 3/3] net/i40e: add flow director support to avx rx path
  2019-10-09 15:20 ` [dpdk-dev] [PATCH v2 0/3] net/i40e: add FDIR ID to vector rx Harry van Haaren
  2019-10-09 15:20   ` [dpdk-dev] [PATCH v2 1/3] net/i40e: cache fdir enable value in rx queue Harry van Haaren
  2019-10-09 15:20   ` [dpdk-dev] [PATCH v2 2/3] net/i40e: add flow mark capability to SSE vector routine Harry van Haaren
@ 2019-10-09 15:20   ` Harry van Haaren
  2019-10-10  0:02   ` [dpdk-dev] [PATCH v2 0/3] net/i40e: add FDIR ID to vector rx Zhang, Qi Z
  2019-10-21  6:04   ` Ye Xiaolong
  4 siblings, 0 replies; 11+ messages in thread
From: Harry van Haaren @ 2019-10-09 15:20 UTC (permalink / raw)
  To: dev; +Cc: qi.z.zhang, mesut.a.ergin, Harry van Haaren

This commit adds FDIR ID support to the AVX2 based recieve
path routine. Support for both 16B and 32B descriptors is
implemented.

Signed-off-by: Harry van Haaren <harry.van.haaren@intel.com>

---

v2:
- Fixup AVX2 RSS clearing to not pollute register
---
 drivers/net/i40e/i40e_rxtx_vec_avx2.c | 193 +++++++++++++++++++++++++-
 1 file changed, 189 insertions(+), 4 deletions(-)

diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx2.c b/drivers/net/i40e/i40e_rxtx_vec_avx2.c
index 6f3278960..53c0d7810 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_avx2.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_avx2.c
@@ -137,9 +137,90 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq)
 	I40E_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
 }
 
+#ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
+/* Handles 32B descriptor FDIR ID processing:
+ * rxdp: recieve descriptor ring, required to load 2nd 16B half of each desc
+ * rx_pkts: required to store metadata back to mbufs
+ * pkt_idx: offset into the burst, increments in vector widths
+ * desc_idx: required to select the correct shift at compile time
+ */
+static inline __m256i
+desc_fdir_processing_32b(volatile union i40e_rx_desc *rxdp,
+			 struct rte_mbuf **rx_pkts,
+			 const uint32_t pkt_idx,
+			 const uint32_t desc_idx)
+{
+	/* 32B desc path: load rxdp.wb.qword2 for EXT_STATUS and FLEXBH_STAT */
+	__m128i *rxdp_desc_0 = (void *)(&rxdp[desc_idx + 0].wb.qword2);
+	__m128i *rxdp_desc_1 = (void *)(&rxdp[desc_idx + 1].wb.qword2);
+	const __m128i desc_qw2_0 = _mm_load_si128(rxdp_desc_0);
+	const __m128i desc_qw2_1 = _mm_load_si128(rxdp_desc_1);
+
+	/* Mask for FLEXBH_STAT, and the FDIR_ID value to compare against. The
+	 * remaining data is set to all 1's to pass through data.
+	 */
+	const __m256i flexbh_mask = _mm256_set_epi32(-1, -1, -1, 3 << 4,
+						     -1, -1, -1, 3 << 4);
+	const __m256i flexbh_id   = _mm256_set_epi32(-1, -1, -1, 1 << 4,
+						     -1, -1, -1, 1 << 4);
+
+	/* Load descriptor, check for FLEXBH bits, generate a mask for both
+	 * packets in the register.
+	 */
+	__m256i desc_qw2_0_1 =
+		_mm256_inserti128_si256(_mm256_castsi128_si256(desc_qw2_0),
+					desc_qw2_1, 1);
+	__m256i desc_tmp_msk = _mm256_and_si256(flexbh_mask, desc_qw2_0_1);
+	__m256i fdir_mask = _mm256_cmpeq_epi32(flexbh_id, desc_tmp_msk);
+	__m256i fdir_data = _mm256_alignr_epi8(desc_qw2_0_1, desc_qw2_0_1, 12);
+	__m256i desc_fdir_data = _mm256_and_si256(fdir_mask, fdir_data);
+
+	/* Write data out to the mbuf. There is no store to this area of the
+	 * mbuf today, so we cannot combine it with another store.
+	 */
+	const uint32_t idx_0 = pkt_idx + desc_idx;
+	const uint32_t idx_1 = pkt_idx + desc_idx + 1;
+	rx_pkts[idx_0]->hash.fdir.hi = _mm256_extract_epi32(desc_fdir_data, 0);
+	rx_pkts[idx_1]->hash.fdir.hi = _mm256_extract_epi32(desc_fdir_data, 4);
+
+	/* Create mbuf flags as required for mbuf_flags layout
+	 *  (That's high lane [1,3,5,7, 0,2,4,6] as u32 lanes).
+	 * Approach:
+	 * - Mask away bits not required from the fdir_mask
+	 * - Leave the PKT_FDIR_ID bit (1 << 13)
+	 * - Position that bit correctly based on packet number
+	 * - OR in the resulting bit to mbuf_flags
+	 */
+	RTE_BUILD_BUG_ON(PKT_RX_FDIR_ID != (1 << 13));
+	__m256i mbuf_flag_mask = _mm256_set_epi32(0, 0, 0, 1 << 13,
+						  0, 0, 0, 1 << 13);
+	__m256i desc_flag_bit =  _mm256_and_si256(mbuf_flag_mask, fdir_mask);
+
+	/* For static-inline function, this will be stripped out
+	 * as the desc_idx is a hard-coded constant.
+	 */
+	switch (desc_idx) {
+	case 0:
+		return _mm256_alignr_epi8(desc_flag_bit, desc_flag_bit,  4);
+	case 2:
+		return _mm256_alignr_epi8(desc_flag_bit, desc_flag_bit,  8);
+	case 4:
+		return _mm256_alignr_epi8(desc_flag_bit, desc_flag_bit, 12);
+	case 6:
+		return desc_flag_bit;
+	default:
+		break;
+	}
+
+	/* NOT REACHED, see above switch returns */
+	return _mm256_setzero_si256();
+}
+#endif /* RTE_LIBRTE_I40E_16BYTE_RX_DESC */
+
 #define PKTLEN_SHIFT     10
 
-static inline uint16_t
+/* Force inline as some compilers will not inline by default. */
+static __rte_always_inline uint16_t
 _recv_raw_pkts_vec_avx2(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 		uint16_t nb_pkts, uint8_t *split_packet)
 {
@@ -419,8 +500,10 @@ _recv_raw_pkts_vec_avx2(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 		/* set vlan and rss flags */
 		const __m256i vlan_flags = _mm256_shuffle_epi8(
 				vlan_flags_shuf, flag_bits);
-		const __m256i rss_flags = _mm256_shuffle_epi8(
-				rss_flags_shuf, _mm256_srli_epi32(flag_bits, 11));
+		const __m256i rss_fdir_bits = _mm256_srli_epi32(flag_bits, 11);
+		const __m256i rss_flags = _mm256_shuffle_epi8(rss_flags_shuf,
+							      rss_fdir_bits);
+
 		/*
 		 * l3_l4_error flags, shuffle, then shift to correct adjustment
 		 * of flags in flags_shuf, and finally mask out extra bits
@@ -431,8 +514,110 @@ _recv_raw_pkts_vec_avx2(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 		l3_l4_flags = _mm256_and_si256(l3_l4_flags, cksum_mask);
 
 		/* merge flags */
-		const __m256i mbuf_flags = _mm256_or_si256(l3_l4_flags,
+		__m256i mbuf_flags = _mm256_or_si256(l3_l4_flags,
 				_mm256_or_si256(rss_flags, vlan_flags));
+
+		/* If the rxq has FDIR enabled, read and process the FDIR info
+		 * from the descriptor. This can cause more loads/stores, so is
+		 * not always performed. Branch over the code when not enabled.
+		 */
+		if (rxq->fdir_enabled) {
+#ifdef RTE_LIBRTE_I40E_16BYTE_RX_DESC
+			/* 16B descriptor code path:
+			 * RSS and FDIR ID use the same offset in the desc, so
+			 * only one can be present at a time. The code below
+			 * identifies an FDIR ID match, and zeros the RSS value
+			 * in the mbuf on FDIR match to keep mbuf data clean.
+			 */
+
+			/* Flags:
+			 * - Take flags, shift bits to null out
+			 * - CMPEQ with known FDIR ID, to get 0xFFFF or 0 mask
+			 * - Strip bits from mask, leaving 0 or 1 for FDIR ID
+			 * - Merge with mbuf_flags
+			 */
+			/* FLM = 1, FLTSTAT = 0b01, (FLM | FLTSTAT) == 3.
+			 * Shift left by 28 to avoid having to mask.
+			 */
+			const __m256i fdir = _mm256_slli_epi32(rss_fdir_bits, 28);
+			const __m256i fdir_id = _mm256_set1_epi32(3 << 28);
+
+			/* As above, the fdir_mask to packet mapping is this:
+			 * order (hi->lo): [1, 3, 5, 7, 0, 2, 4, 6]
+			 * Then OR FDIR flags to mbuf_flags on FDIR ID hit.
+			 */
+			RTE_BUILD_BUG_ON(PKT_RX_FDIR_ID != (1 << 13));
+			const __m256i pkt_fdir_bit = _mm256_set1_epi32(1 << 13);
+			const __m256i fdir_mask = _mm256_cmpeq_epi32(fdir, fdir_id);
+			__m256i fdir_bits = _mm256_and_si256(fdir_mask, pkt_fdir_bit);
+			mbuf_flags = _mm256_or_si256(mbuf_flags, fdir_bits);
+
+			/* Based on FDIR_MASK, clear the RSS or FDIR value.
+			 * The FDIR ID value is masked to zero if not a hit,
+			 * otherwise the mb0_1 register RSS field is zeroed.
+			 */
+			const __m256i fdir_zero_mask = _mm256_setzero_si256();
+			const uint32_t fdir_blend_mask = (1 << 3) | (1 << 7);
+			__m256i tmp0_1 = _mm256_blend_epi32(fdir_zero_mask,
+						fdir_mask, fdir_blend_mask);
+			__m256i fdir_mb0_1 = _mm256_and_si256(mb0_1, fdir_mask);
+			mb0_1 = _mm256_andnot_si256(tmp0_1, mb0_1);
+
+			/* Write to mbuf: no stores to combine with, so just a
+			 * scalar store to push data here.
+			 */
+			rx_pkts[i + 0]->hash.fdir.hi = _mm256_extract_epi32(fdir_mb0_1, 3);
+			rx_pkts[i + 1]->hash.fdir.hi = _mm256_extract_epi32(fdir_mb0_1, 7);
+
+			/* Same as above, only shift the fdir_mask to align
+			 * the packet FDIR mask with the FDIR_ID desc lane.
+			 */
+			__m256i tmp2_3 = _mm256_alignr_epi8(fdir_mask, fdir_mask, 12);
+			__m256i fdir_mb2_3 = _mm256_and_si256(mb2_3, tmp2_3);
+			tmp2_3 = _mm256_blend_epi32(fdir_zero_mask, tmp2_3,
+						    fdir_blend_mask);
+			mb2_3 = _mm256_andnot_si256(tmp2_3, mb2_3);
+			rx_pkts[i + 2]->hash.fdir.hi = _mm256_extract_epi32(fdir_mb2_3, 3);
+			rx_pkts[i + 3]->hash.fdir.hi = _mm256_extract_epi32(fdir_mb2_3, 7);
+
+			__m256i tmp4_5 = _mm256_alignr_epi8(fdir_mask, fdir_mask, 8);
+			__m256i fdir_mb4_5 = _mm256_and_si256(mb4_5, tmp4_5);
+			tmp4_5 = _mm256_blend_epi32(fdir_zero_mask, tmp4_5,
+						    fdir_blend_mask);
+			mb4_5 = _mm256_andnot_si256(tmp4_5, mb4_5);
+			rx_pkts[i + 4]->hash.fdir.hi = _mm256_extract_epi32(fdir_mb4_5, 3);
+			rx_pkts[i + 5]->hash.fdir.hi = _mm256_extract_epi32(fdir_mb4_5, 7);
+
+			__m256i tmp6_7 = _mm256_alignr_epi8(fdir_mask, fdir_mask, 4);
+			__m256i fdir_mb6_7 = _mm256_and_si256(mb6_7, tmp6_7);
+			tmp6_7 = _mm256_blend_epi32(fdir_zero_mask, tmp6_7,
+						    fdir_blend_mask);
+			mb6_7 = _mm256_andnot_si256(tmp6_7, mb6_7);
+			rx_pkts[i + 6]->hash.fdir.hi = _mm256_extract_epi32(fdir_mb6_7, 3);
+			rx_pkts[i + 7]->hash.fdir.hi = _mm256_extract_epi32(fdir_mb6_7, 7);
+
+			/* End of 16B descriptor handling */
+#else
+			/* 32B descriptor FDIR ID mark handling. Returns bits
+			 * to be OR-ed into the mbuf olflags.
+			 */
+			__m256i fdir_add_flags;
+			fdir_add_flags = desc_fdir_processing_32b(rxdp, rx_pkts, i, 0);
+			mbuf_flags = _mm256_or_si256(mbuf_flags, fdir_add_flags);
+
+			fdir_add_flags = desc_fdir_processing_32b(rxdp, rx_pkts, i, 2);
+			mbuf_flags = _mm256_or_si256(mbuf_flags, fdir_add_flags);
+
+			fdir_add_flags = desc_fdir_processing_32b(rxdp, rx_pkts, i, 4);
+			mbuf_flags = _mm256_or_si256(mbuf_flags, fdir_add_flags);
+
+			fdir_add_flags = desc_fdir_processing_32b(rxdp, rx_pkts, i, 6);
+			mbuf_flags = _mm256_or_si256(mbuf_flags, fdir_add_flags);
+			/* End 32B desc handling */
+#endif /* RTE_LIBRTE_I40E_16BYTE_RX_DESC */
+
+		} /* if() on FDIR enabled */
+
 		/*
 		 * At this point, we have the 8 sets of flags in the low 16-bits
 		 * of each 32-bit value in vlan0.
-- 
2.17.1


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [dpdk-dev] [PATCH v2 0/3] net/i40e: add FDIR ID to vector rx
  2019-10-09 15:20 ` [dpdk-dev] [PATCH v2 0/3] net/i40e: add FDIR ID to vector rx Harry van Haaren
                     ` (2 preceding siblings ...)
  2019-10-09 15:20   ` [dpdk-dev] [PATCH v2 3/3] net/i40e: add flow director support to avx rx path Harry van Haaren
@ 2019-10-10  0:02   ` Zhang, Qi Z
  2019-10-10 16:23     ` Ergin, Mesut A
  2019-10-21  6:04   ` Ye Xiaolong
  4 siblings, 1 reply; 11+ messages in thread
From: Zhang, Qi Z @ 2019-10-10  0:02 UTC (permalink / raw)
  To: Van Haaren, Harry, dev; +Cc: Ergin, Mesut A



> -----Original Message-----
> From: Van Haaren, Harry
> Sent: Wednesday, October 9, 2019 11:20 PM
> To: dev@dpdk.org
> Cc: Zhang, Qi Z <qi.z.zhang@intel.com>; Ergin, Mesut A
> <mesut.a.ergin@intel.com>; Van Haaren, Harry
> <harry.van.haaren@intel.com>
> Subject: [PATCH v2 0/3] net/i40e: add FDIR ID to vector rx
> 
> v2:
> - Improve RSS clearning in SSE 16B descriptor
> - Fix AVX 16B descriptor data handling
> 
> ---
> 
> Hey All,
> 
> This series adds FDIR ID support to the vector Rx routines.
> Support for both the SSE and AVX drivers is added, and both the 16 byte and
> 32 byte descriptors are implemented.
> 
> A v1/RFC for this functionality was sent in 19.08 timeframe, refer to the
> patch as archived on patchwork here:
> http://patches.dpdk.org/patch/53969/
> 
> Regards, -Harry
> 
> Harry van Haaren (3):
>   net/i40e: cache fdir enable value in rx queue
>   net/i40e: add flow mark capability to SSE vector routine
>   net/i40e: add flow director support to avx rx path
> 
>  drivers/net/i40e/i40e_ethdev.h        |   1 +
>  drivers/net/i40e/i40e_fdir.c          |  20 +++
>  drivers/net/i40e/i40e_flow.c          |   4 +
>  drivers/net/i40e/i40e_rxtx.c          |   6 -
>  drivers/net/i40e/i40e_rxtx.h          |   8 ++
>  drivers/net/i40e/i40e_rxtx_vec_avx2.c | 193
> +++++++++++++++++++++++++-  drivers/net/i40e/i40e_rxtx_vec_sse.c  |
> 154 ++++++++++++++++++--
>  7 files changed, 367 insertions(+), 19 deletions(-)
> 
> --
> 2.17.1

Thanks for enabling this!
Acked-by: Qi Zhang <qi.z.zhang@intel.com>

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [dpdk-dev] [PATCH v2 0/3] net/i40e: add FDIR ID to vector rx
  2019-10-10  0:02   ` [dpdk-dev] [PATCH v2 0/3] net/i40e: add FDIR ID to vector rx Zhang, Qi Z
@ 2019-10-10 16:23     ` Ergin, Mesut A
  0 siblings, 0 replies; 11+ messages in thread
From: Ergin, Mesut A @ 2019-10-10 16:23 UTC (permalink / raw)
  To: Zhang, Qi Z, Van Haaren, Harry, dev



> -----Original Message-----
> From: Zhang, Qi Z
> Sent: Wednesday, October 9, 2019 5:02 PM
> To: Van Haaren, Harry <harry.van.haaren@intel.com>; dev@dpdk.org
> Cc: Ergin, Mesut A <mesut.a.ergin@intel.com>
> Subject: RE: [PATCH v2 0/3] net/i40e: add FDIR ID to vector rx
> 
> 
> 
> > -----Original Message-----
> > From: Van Haaren, Harry
> > Sent: Wednesday, October 9, 2019 11:20 PM
> > To: dev@dpdk.org
> > Cc: Zhang, Qi Z <qi.z.zhang@intel.com>; Ergin, Mesut A
> > <mesut.a.ergin@intel.com>; Van Haaren, Harry
> > <harry.van.haaren@intel.com>
> > Subject: [PATCH v2 0/3] net/i40e: add FDIR ID to vector rx
> >
> > v2:
> > - Improve RSS clearning in SSE 16B descriptor
> > - Fix AVX 16B descriptor data handling
> >
> > ---
> >
> > Hey All,
> >
> > This series adds FDIR ID support to the vector Rx routines.
> > Support for both the SSE and AVX drivers is added, and both the 16 byte and
> > 32 byte descriptors are implemented.
> >
> > A v1/RFC for this functionality was sent in 19.08 timeframe, refer to the
> > patch as archived on patchwork here:
> > http://patches.dpdk.org/patch/53969/
> >
> > Regards, -Harry
> >
> > Harry van Haaren (3):
> >   net/i40e: cache fdir enable value in rx queue
> >   net/i40e: add flow mark capability to SSE vector routine
> >   net/i40e: add flow director support to avx rx path
> >
> >  drivers/net/i40e/i40e_ethdev.h        |   1 +
> >  drivers/net/i40e/i40e_fdir.c          |  20 +++
> >  drivers/net/i40e/i40e_flow.c          |   4 +
> >  drivers/net/i40e/i40e_rxtx.c          |   6 -
> >  drivers/net/i40e/i40e_rxtx.h          |   8 ++
> >  drivers/net/i40e/i40e_rxtx_vec_avx2.c | 193
> > +++++++++++++++++++++++++-  drivers/net/i40e/i40e_rxtx_vec_sse.c  |
> > 154 ++++++++++++++++++--
> >  7 files changed, 367 insertions(+), 19 deletions(-)
> >
> > --
> > 2.17.1
> 
> Thanks for enabling this!
> Acked-by: Qi Zhang <qi.z.zhang@intel.com>

This patchset tested out as expected without any regressions on my setup.

Tested-by: Mesut A Ergin <mesut.a.ergin@intel.com>

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [dpdk-dev] [PATCH v2 0/3] net/i40e: add FDIR ID to vector rx
  2019-10-09 15:20 ` [dpdk-dev] [PATCH v2 0/3] net/i40e: add FDIR ID to vector rx Harry van Haaren
                     ` (3 preceding siblings ...)
  2019-10-10  0:02   ` [dpdk-dev] [PATCH v2 0/3] net/i40e: add FDIR ID to vector rx Zhang, Qi Z
@ 2019-10-21  6:04   ` Ye Xiaolong
  4 siblings, 0 replies; 11+ messages in thread
From: Ye Xiaolong @ 2019-10-21  6:04 UTC (permalink / raw)
  To: Harry van Haaren; +Cc: dev, qi.z.zhang, mesut.a.ergin

On 10/09, Harry van Haaren wrote:
>v2:
>- Improve RSS clearning in SSE 16B descriptor
>- Fix AVX 16B descriptor data handling
>
>---
>
>Hey All,
>
>This series adds FDIR ID support to the vector Rx routines.
>Support for both the SSE and AVX drivers is added, and both
>the 16 byte and 32 byte descriptors are implemented.
>
>A v1/RFC for this functionality was sent in 19.08 timeframe,
>refer to the patch as archived on patchwork here:
>http://patches.dpdk.org/patch/53969/
>
>Regards, -Harry
>
>Harry van Haaren (3):
>  net/i40e: cache fdir enable value in rx queue
>  net/i40e: add flow mark capability to SSE vector routine
>  net/i40e: add flow director support to avx rx path
>
> drivers/net/i40e/i40e_ethdev.h        |   1 +
> drivers/net/i40e/i40e_fdir.c          |  20 +++
> drivers/net/i40e/i40e_flow.c          |   4 +
> drivers/net/i40e/i40e_rxtx.c          |   6 -
> drivers/net/i40e/i40e_rxtx.h          |   8 ++
> drivers/net/i40e/i40e_rxtx_vec_avx2.c | 193 +++++++++++++++++++++++++-
> drivers/net/i40e/i40e_rxtx_vec_sse.c  | 154 ++++++++++++++++++--
> 7 files changed, 367 insertions(+), 19 deletions(-)
>
>-- 
>2.17.1
>

Series applied to dpdk-next-net-intel.

Thanks,
Xiaolong

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2019-10-21  6:08 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-10-07  9:07 [dpdk-dev] [PATCH 0/3] net/i40e: add FDIR ID to vector rx Harry van Haaren
2019-10-07  9:07 ` [dpdk-dev] [PATCH 1/3] net/i40e: cache fdir enable value in rx queue Harry van Haaren
2019-10-07  9:07 ` [dpdk-dev] [PATCH 2/3] net/i40e: add flow mark capability to SSE vector routine Harry van Haaren
2019-10-07  9:07 ` [dpdk-dev] [PATCH 3/3] net/i40e: add flow director support to avx rx path Harry van Haaren
2019-10-09 15:20 ` [dpdk-dev] [PATCH v2 0/3] net/i40e: add FDIR ID to vector rx Harry van Haaren
2019-10-09 15:20   ` [dpdk-dev] [PATCH v2 1/3] net/i40e: cache fdir enable value in rx queue Harry van Haaren
2019-10-09 15:20   ` [dpdk-dev] [PATCH v2 2/3] net/i40e: add flow mark capability to SSE vector routine Harry van Haaren
2019-10-09 15:20   ` [dpdk-dev] [PATCH v2 3/3] net/i40e: add flow director support to avx rx path Harry van Haaren
2019-10-10  0:02   ` [dpdk-dev] [PATCH v2 0/3] net/i40e: add FDIR ID to vector rx Zhang, Qi Z
2019-10-10 16:23     ` Ergin, Mesut A
2019-10-21  6:04   ` Ye Xiaolong

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).