* [dpdk-dev] [PATCH 1/2] net/enic: move common Rx functions to a new header file
@ 2018-09-28 2:16 John Daley
2018-09-28 2:16 ` [dpdk-dev] [PATCH 2/2] net/enic: add AVX2 based vectorized Rx handler John Daley
2018-09-28 19:20 ` [dpdk-dev] [PATCH v2 1/2] net/enic: move common Rx functions to a new header file John Daley
0 siblings, 2 replies; 11+ messages in thread
From: John Daley @ 2018-09-28 2:16 UTC (permalink / raw)
To: ferruh.yigit; +Cc: dev, Hyong Youb Kim
From: Hyong Youb Kim <hyonkim@cisco.com>
Move a number of Rx functions to a new header file so that the
AVX2-based Rx handler can use them.
Signed-off-by: Hyong Youb Kim <hyonkim@cisco.com>
Reviewed-by: John Daley <johndale@cisco.com>
---
drivers/net/enic/enic_rxtx.c | 263 +---------------------------------
drivers/net/enic/enic_rxtx_common.h | 271 ++++++++++++++++++++++++++++++++++++
2 files changed, 272 insertions(+), 262 deletions(-)
create mode 100644 drivers/net/enic/enic_rxtx_common.h
diff --git a/drivers/net/enic/enic_rxtx.c b/drivers/net/enic/enic_rxtx.c
index 276a2e559..5189ee635 100644
--- a/drivers/net/enic/enic_rxtx.c
+++ b/drivers/net/enic/enic_rxtx.c
@@ -11,6 +11,7 @@
#include "enic_compat.h"
#include "rq_enet_desc.h"
#include "enic.h"
+#include "enic_rxtx_common.h"
#include <rte_ether.h>
#include <rte_ip.h>
#include <rte_tcp.h>
@@ -30,268 +31,6 @@
#define rte_packet_prefetch(p) do {} while (0)
#endif
-static inline uint16_t
-enic_cq_rx_desc_ciflags(struct cq_enet_rq_desc *crd)
-{
- return le16_to_cpu(crd->completed_index_flags) & ~CQ_DESC_COMP_NDX_MASK;
-}
-
-static inline uint16_t
-enic_cq_rx_desc_bwflags(struct cq_enet_rq_desc *crd)
-{
- return le16_to_cpu(crd->bytes_written_flags) &
- ~CQ_ENET_RQ_DESC_BYTES_WRITTEN_MASK;
-}
-
-static inline uint8_t
-enic_cq_rx_desc_packet_error(uint16_t bwflags)
-{
- return (bwflags & CQ_ENET_RQ_DESC_FLAGS_TRUNCATED) ==
- CQ_ENET_RQ_DESC_FLAGS_TRUNCATED;
-}
-
-static inline uint8_t
-enic_cq_rx_desc_eop(uint16_t ciflags)
-{
- return (ciflags & CQ_ENET_RQ_DESC_FLAGS_EOP)
- == CQ_ENET_RQ_DESC_FLAGS_EOP;
-}
-
-static inline uint8_t
-enic_cq_rx_desc_csum_not_calc(struct cq_enet_rq_desc *cqrd)
-{
- return (le16_to_cpu(cqrd->q_number_rss_type_flags) &
- CQ_ENET_RQ_DESC_FLAGS_CSUM_NOT_CALC) ==
- CQ_ENET_RQ_DESC_FLAGS_CSUM_NOT_CALC;
-}
-
-static inline uint8_t
-enic_cq_rx_desc_ipv4_csum_ok(struct cq_enet_rq_desc *cqrd)
-{
- return (cqrd->flags & CQ_ENET_RQ_DESC_FLAGS_IPV4_CSUM_OK) ==
- CQ_ENET_RQ_DESC_FLAGS_IPV4_CSUM_OK;
-}
-
-static inline uint8_t
-enic_cq_rx_desc_tcp_udp_csum_ok(struct cq_enet_rq_desc *cqrd)
-{
- return (cqrd->flags & CQ_ENET_RQ_DESC_FLAGS_TCP_UDP_CSUM_OK) ==
- CQ_ENET_RQ_DESC_FLAGS_TCP_UDP_CSUM_OK;
-}
-
-static inline uint8_t
-enic_cq_rx_desc_rss_type(struct cq_enet_rq_desc *cqrd)
-{
- return (uint8_t)((le16_to_cpu(cqrd->q_number_rss_type_flags) >>
- CQ_DESC_Q_NUM_BITS) & CQ_ENET_RQ_DESC_RSS_TYPE_MASK);
-}
-
-static inline uint32_t
-enic_cq_rx_desc_rss_hash(struct cq_enet_rq_desc *cqrd)
-{
- return le32_to_cpu(cqrd->rss_hash);
-}
-
-static inline uint16_t
-enic_cq_rx_desc_vlan(struct cq_enet_rq_desc *cqrd)
-{
- return le16_to_cpu(cqrd->vlan);
-}
-
-static inline uint16_t
-enic_cq_rx_desc_n_bytes(struct cq_desc *cqd)
-{
- struct cq_enet_rq_desc *cqrd = (struct cq_enet_rq_desc *)cqd;
- return le16_to_cpu(cqrd->bytes_written_flags) &
- CQ_ENET_RQ_DESC_BYTES_WRITTEN_MASK;
-}
-
-
-static inline uint8_t
-enic_cq_rx_check_err(struct cq_desc *cqd)
-{
- struct cq_enet_rq_desc *cqrd = (struct cq_enet_rq_desc *)cqd;
- uint16_t bwflags;
-
- bwflags = enic_cq_rx_desc_bwflags(cqrd);
- if (unlikely(enic_cq_rx_desc_packet_error(bwflags)))
- return 1;
- return 0;
-}
-
-/* Lookup table to translate RX CQ flags to mbuf flags. */
-static inline uint32_t
-enic_cq_rx_flags_to_pkt_type(struct cq_desc *cqd, uint8_t tnl)
-{
- struct cq_enet_rq_desc *cqrd = (struct cq_enet_rq_desc *)cqd;
- uint8_t cqrd_flags = cqrd->flags;
- /*
- * Odd-numbered entries are for tunnel packets. All packet type info
- * applies to the inner packet, and there is no info on the outer
- * packet. The outer flags in these entries exist only to avoid
- * changing enic_cq_rx_to_pkt_flags(). They are cleared from mbuf
- * afterwards.
- *
- * Also, as there is no tunnel type info (VXLAN, NVGRE, or GENEVE), set
- * RTE_PTYPE_TUNNEL_GRENAT..
- */
- static const uint32_t cq_type_table[128] __rte_cache_aligned = {
- [0x00] = RTE_PTYPE_UNKNOWN,
- [0x01] = RTE_PTYPE_UNKNOWN |
- RTE_PTYPE_TUNNEL_GRENAT |
- RTE_PTYPE_INNER_L2_ETHER,
- [0x20] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_NONFRAG,
- [0x21] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_NONFRAG |
- RTE_PTYPE_TUNNEL_GRENAT |
- RTE_PTYPE_INNER_L2_ETHER |
- RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L4_NONFRAG,
- [0x22] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_UDP,
- [0x23] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_UDP |
- RTE_PTYPE_TUNNEL_GRENAT |
- RTE_PTYPE_INNER_L2_ETHER |
- RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L4_UDP,
- [0x24] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_TCP,
- [0x25] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_TCP |
- RTE_PTYPE_TUNNEL_GRENAT |
- RTE_PTYPE_INNER_L2_ETHER |
- RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L4_TCP,
- [0x60] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG,
- [0x61] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG |
- RTE_PTYPE_TUNNEL_GRENAT |
- RTE_PTYPE_INNER_L2_ETHER |
- RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L4_FRAG,
- [0x62] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG,
- [0x63] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG |
- RTE_PTYPE_TUNNEL_GRENAT |
- RTE_PTYPE_INNER_L2_ETHER |
- RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L4_FRAG,
- [0x64] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG,
- [0x65] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG |
- RTE_PTYPE_TUNNEL_GRENAT |
- RTE_PTYPE_INNER_L2_ETHER |
- RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L4_FRAG,
- [0x10] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_NONFRAG,
- [0x11] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_NONFRAG |
- RTE_PTYPE_TUNNEL_GRENAT |
- RTE_PTYPE_INNER_L2_ETHER |
- RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L4_NONFRAG,
- [0x12] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_UDP,
- [0x13] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_UDP |
- RTE_PTYPE_TUNNEL_GRENAT |
- RTE_PTYPE_INNER_L2_ETHER |
- RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L4_UDP,
- [0x14] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_TCP,
- [0x15] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_TCP |
- RTE_PTYPE_TUNNEL_GRENAT |
- RTE_PTYPE_INNER_L2_ETHER |
- RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L4_TCP,
- [0x50] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG,
- [0x51] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG |
- RTE_PTYPE_TUNNEL_GRENAT |
- RTE_PTYPE_INNER_L2_ETHER |
- RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L4_FRAG,
- [0x52] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG,
- [0x53] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG |
- RTE_PTYPE_TUNNEL_GRENAT |
- RTE_PTYPE_INNER_L2_ETHER |
- RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L4_FRAG,
- [0x54] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG,
- [0x55] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG |
- RTE_PTYPE_TUNNEL_GRENAT |
- RTE_PTYPE_INNER_L2_ETHER |
- RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L4_FRAG,
- /* All others reserved */
- };
- cqrd_flags &= CQ_ENET_RQ_DESC_FLAGS_IPV4_FRAGMENT
- | CQ_ENET_RQ_DESC_FLAGS_IPV4 | CQ_ENET_RQ_DESC_FLAGS_IPV6
- | CQ_ENET_RQ_DESC_FLAGS_TCP | CQ_ENET_RQ_DESC_FLAGS_UDP;
- return cq_type_table[cqrd_flags + tnl];
-}
-
-static inline void
-enic_cq_rx_to_pkt_flags(struct cq_desc *cqd, struct rte_mbuf *mbuf)
-{
- struct cq_enet_rq_desc *cqrd = (struct cq_enet_rq_desc *)cqd;
- uint16_t bwflags, pkt_flags = 0, vlan_tci;
- bwflags = enic_cq_rx_desc_bwflags(cqrd);
- vlan_tci = enic_cq_rx_desc_vlan(cqrd);
-
- /* VLAN STRIPPED flag. The L2 packet type updated here also */
- if (bwflags & CQ_ENET_RQ_DESC_FLAGS_VLAN_STRIPPED) {
- pkt_flags |= PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED;
- mbuf->packet_type |= RTE_PTYPE_L2_ETHER;
- } else {
- if (vlan_tci != 0) {
- pkt_flags |= PKT_RX_VLAN;
- mbuf->packet_type |= RTE_PTYPE_L2_ETHER_VLAN;
- } else {
- mbuf->packet_type |= RTE_PTYPE_L2_ETHER;
- }
- }
- mbuf->vlan_tci = vlan_tci;
-
- if ((cqd->type_color & CQ_DESC_TYPE_MASK) == CQ_DESC_TYPE_CLASSIFIER) {
- struct cq_enet_rq_clsf_desc *clsf_cqd;
- uint16_t filter_id;
- clsf_cqd = (struct cq_enet_rq_clsf_desc *)cqd;
- filter_id = clsf_cqd->filter_id;
- if (filter_id) {
- pkt_flags |= PKT_RX_FDIR;
- if (filter_id != ENIC_MAGIC_FILTER_ID) {
- mbuf->hash.fdir.hi = clsf_cqd->filter_id;
- pkt_flags |= PKT_RX_FDIR_ID;
- }
- }
- } else if (enic_cq_rx_desc_rss_type(cqrd)) {
- /* RSS flag */
- pkt_flags |= PKT_RX_RSS_HASH;
- mbuf->hash.rss = enic_cq_rx_desc_rss_hash(cqrd);
- }
-
- /* checksum flags */
- if (mbuf->packet_type & (RTE_PTYPE_L3_IPV4 | RTE_PTYPE_L3_IPV6)) {
- if (!enic_cq_rx_desc_csum_not_calc(cqrd)) {
- uint32_t l4_flags;
- l4_flags = mbuf->packet_type & RTE_PTYPE_L4_MASK;
-
- /*
- * When overlay offload is enabled, the NIC may
- * set ipv4_csum_ok=1 if the inner packet is IPv6..
- * So, explicitly check for IPv4 before checking
- * ipv4_csum_ok.
- */
- if (mbuf->packet_type & RTE_PTYPE_L3_IPV4) {
- if (enic_cq_rx_desc_ipv4_csum_ok(cqrd))
- pkt_flags |= PKT_RX_IP_CKSUM_GOOD;
- else
- pkt_flags |= PKT_RX_IP_CKSUM_BAD;
- }
-
- if (l4_flags == RTE_PTYPE_L4_UDP ||
- l4_flags == RTE_PTYPE_L4_TCP) {
- if (enic_cq_rx_desc_tcp_udp_csum_ok(cqrd))
- pkt_flags |= PKT_RX_L4_CKSUM_GOOD;
- else
- pkt_flags |= PKT_RX_L4_CKSUM_BAD;
- }
- }
- }
-
- mbuf->ol_flags = pkt_flags;
-}
-
/* dummy receive function to replace actual function in
* order to do safe reconfiguration operations.
*/
diff --git a/drivers/net/enic/enic_rxtx_common.h b/drivers/net/enic/enic_rxtx_common.h
new file mode 100644
index 000000000..bfbb4909e
--- /dev/null
+++ b/drivers/net/enic/enic_rxtx_common.h
@@ -0,0 +1,271 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2008-2018 Cisco Systems, Inc. All rights reserved.
+ * Copyright 2007 Nuova Systems, Inc. All rights reserved.
+ */
+
+#ifndef _ENIC_RXTX_COMMON_H_
+#define _ENIC_RXTX_COMMON_H_
+
+static inline uint16_t
+enic_cq_rx_desc_ciflags(struct cq_enet_rq_desc *crd)
+{
+ return le16_to_cpu(crd->completed_index_flags) & ~CQ_DESC_COMP_NDX_MASK;
+}
+
+static inline uint16_t
+enic_cq_rx_desc_bwflags(struct cq_enet_rq_desc *crd)
+{
+ return le16_to_cpu(crd->bytes_written_flags) &
+ ~CQ_ENET_RQ_DESC_BYTES_WRITTEN_MASK;
+}
+
+static inline uint8_t
+enic_cq_rx_desc_packet_error(uint16_t bwflags)
+{
+ return (bwflags & CQ_ENET_RQ_DESC_FLAGS_TRUNCATED) ==
+ CQ_ENET_RQ_DESC_FLAGS_TRUNCATED;
+}
+
+static inline uint8_t
+enic_cq_rx_desc_eop(uint16_t ciflags)
+{
+ return (ciflags & CQ_ENET_RQ_DESC_FLAGS_EOP)
+ == CQ_ENET_RQ_DESC_FLAGS_EOP;
+}
+
+static inline uint8_t
+enic_cq_rx_desc_csum_not_calc(struct cq_enet_rq_desc *cqrd)
+{
+ return (le16_to_cpu(cqrd->q_number_rss_type_flags) &
+ CQ_ENET_RQ_DESC_FLAGS_CSUM_NOT_CALC) ==
+ CQ_ENET_RQ_DESC_FLAGS_CSUM_NOT_CALC;
+}
+
+static inline uint8_t
+enic_cq_rx_desc_ipv4_csum_ok(struct cq_enet_rq_desc *cqrd)
+{
+ return (cqrd->flags & CQ_ENET_RQ_DESC_FLAGS_IPV4_CSUM_OK) ==
+ CQ_ENET_RQ_DESC_FLAGS_IPV4_CSUM_OK;
+}
+
+static inline uint8_t
+enic_cq_rx_desc_tcp_udp_csum_ok(struct cq_enet_rq_desc *cqrd)
+{
+ return (cqrd->flags & CQ_ENET_RQ_DESC_FLAGS_TCP_UDP_CSUM_OK) ==
+ CQ_ENET_RQ_DESC_FLAGS_TCP_UDP_CSUM_OK;
+}
+
+static inline uint8_t
+enic_cq_rx_desc_rss_type(struct cq_enet_rq_desc *cqrd)
+{
+ return (uint8_t)((le16_to_cpu(cqrd->q_number_rss_type_flags) >>
+ CQ_DESC_Q_NUM_BITS) & CQ_ENET_RQ_DESC_RSS_TYPE_MASK);
+}
+
+static inline uint32_t
+enic_cq_rx_desc_rss_hash(struct cq_enet_rq_desc *cqrd)
+{
+ return le32_to_cpu(cqrd->rss_hash);
+}
+
+static inline uint16_t
+enic_cq_rx_desc_vlan(struct cq_enet_rq_desc *cqrd)
+{
+ return le16_to_cpu(cqrd->vlan);
+}
+
+static inline uint16_t
+enic_cq_rx_desc_n_bytes(struct cq_desc *cqd)
+{
+ struct cq_enet_rq_desc *cqrd = (struct cq_enet_rq_desc *)cqd;
+ return le16_to_cpu(cqrd->bytes_written_flags) &
+ CQ_ENET_RQ_DESC_BYTES_WRITTEN_MASK;
+}
+
+
+static inline uint8_t
+enic_cq_rx_check_err(struct cq_desc *cqd)
+{
+ struct cq_enet_rq_desc *cqrd = (struct cq_enet_rq_desc *)cqd;
+ uint16_t bwflags;
+
+ bwflags = enic_cq_rx_desc_bwflags(cqrd);
+ if (unlikely(enic_cq_rx_desc_packet_error(bwflags)))
+ return 1;
+ return 0;
+}
+
+/* Lookup table to translate RX CQ flags to mbuf flags. */
+static uint32_t
+enic_cq_rx_flags_to_pkt_type(struct cq_desc *cqd, uint8_t tnl)
+{
+ struct cq_enet_rq_desc *cqrd = (struct cq_enet_rq_desc *)cqd;
+ uint8_t cqrd_flags = cqrd->flags;
+ /*
+ * Odd-numbered entries are for tunnel packets. All packet type info
+ * applies to the inner packet, and there is no info on the outer
+ * packet. The outer flags in these entries exist only to avoid
+ * changing enic_cq_rx_to_pkt_flags(). They are cleared from mbuf
+ * afterwards.
+ *
+ * Also, as there is no tunnel type info (VXLAN, NVGRE, or GENEVE), set
+ * RTE_PTYPE_TUNNEL_GRENAT..
+ */
+ static const uint32_t cq_type_table[128] __rte_cache_aligned = {
+ [0x00] = RTE_PTYPE_UNKNOWN,
+ [0x01] = RTE_PTYPE_UNKNOWN |
+ RTE_PTYPE_TUNNEL_GRENAT |
+ RTE_PTYPE_INNER_L2_ETHER,
+ [0x20] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_NONFRAG,
+ [0x21] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_NONFRAG |
+ RTE_PTYPE_TUNNEL_GRENAT |
+ RTE_PTYPE_INNER_L2_ETHER |
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_NONFRAG,
+ [0x22] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_UDP,
+ [0x23] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_UDP |
+ RTE_PTYPE_TUNNEL_GRENAT |
+ RTE_PTYPE_INNER_L2_ETHER |
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_UDP,
+ [0x24] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_TCP,
+ [0x25] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_TCP |
+ RTE_PTYPE_TUNNEL_GRENAT |
+ RTE_PTYPE_INNER_L2_ETHER |
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_TCP,
+ [0x60] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG,
+ [0x61] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG |
+ RTE_PTYPE_TUNNEL_GRENAT |
+ RTE_PTYPE_INNER_L2_ETHER |
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_FRAG,
+ [0x62] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG,
+ [0x63] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG |
+ RTE_PTYPE_TUNNEL_GRENAT |
+ RTE_PTYPE_INNER_L2_ETHER |
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_FRAG,
+ [0x64] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG,
+ [0x65] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG |
+ RTE_PTYPE_TUNNEL_GRENAT |
+ RTE_PTYPE_INNER_L2_ETHER |
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_FRAG,
+ [0x10] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_NONFRAG,
+ [0x11] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_NONFRAG |
+ RTE_PTYPE_TUNNEL_GRENAT |
+ RTE_PTYPE_INNER_L2_ETHER |
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_NONFRAG,
+ [0x12] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_UDP,
+ [0x13] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_UDP |
+ RTE_PTYPE_TUNNEL_GRENAT |
+ RTE_PTYPE_INNER_L2_ETHER |
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_UDP,
+ [0x14] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_TCP,
+ [0x15] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_TCP |
+ RTE_PTYPE_TUNNEL_GRENAT |
+ RTE_PTYPE_INNER_L2_ETHER |
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_TCP,
+ [0x50] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG,
+ [0x51] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG |
+ RTE_PTYPE_TUNNEL_GRENAT |
+ RTE_PTYPE_INNER_L2_ETHER |
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_FRAG,
+ [0x52] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG,
+ [0x53] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG |
+ RTE_PTYPE_TUNNEL_GRENAT |
+ RTE_PTYPE_INNER_L2_ETHER |
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_FRAG,
+ [0x54] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG,
+ [0x55] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG |
+ RTE_PTYPE_TUNNEL_GRENAT |
+ RTE_PTYPE_INNER_L2_ETHER |
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_FRAG,
+ /* All others reserved */
+ };
+ cqrd_flags &= CQ_ENET_RQ_DESC_FLAGS_IPV4_FRAGMENT
+ | CQ_ENET_RQ_DESC_FLAGS_IPV4 | CQ_ENET_RQ_DESC_FLAGS_IPV6
+ | CQ_ENET_RQ_DESC_FLAGS_TCP | CQ_ENET_RQ_DESC_FLAGS_UDP;
+ return cq_type_table[cqrd_flags + tnl];
+}
+
+static void
+enic_cq_rx_to_pkt_flags(struct cq_desc *cqd, struct rte_mbuf *mbuf)
+{
+ struct cq_enet_rq_desc *cqrd = (struct cq_enet_rq_desc *)cqd;
+ uint16_t bwflags, pkt_flags = 0, vlan_tci;
+ bwflags = enic_cq_rx_desc_bwflags(cqrd);
+ vlan_tci = enic_cq_rx_desc_vlan(cqrd);
+
+ /* VLAN STRIPPED flag. The L2 packet type updated here also */
+ if (bwflags & CQ_ENET_RQ_DESC_FLAGS_VLAN_STRIPPED) {
+ pkt_flags |= PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED;
+ mbuf->packet_type |= RTE_PTYPE_L2_ETHER;
+ } else {
+ if (vlan_tci != 0) {
+ pkt_flags |= PKT_RX_VLAN;
+ mbuf->packet_type |= RTE_PTYPE_L2_ETHER_VLAN;
+ } else {
+ mbuf->packet_type |= RTE_PTYPE_L2_ETHER;
+ }
+ }
+ mbuf->vlan_tci = vlan_tci;
+
+ if ((cqd->type_color & CQ_DESC_TYPE_MASK) == CQ_DESC_TYPE_CLASSIFIER) {
+ struct cq_enet_rq_clsf_desc *clsf_cqd;
+ uint16_t filter_id;
+ clsf_cqd = (struct cq_enet_rq_clsf_desc *)cqd;
+ filter_id = clsf_cqd->filter_id;
+ if (filter_id) {
+ pkt_flags |= PKT_RX_FDIR;
+ if (filter_id != ENIC_MAGIC_FILTER_ID) {
+ mbuf->hash.fdir.hi = clsf_cqd->filter_id;
+ pkt_flags |= PKT_RX_FDIR_ID;
+ }
+ }
+ } else if (enic_cq_rx_desc_rss_type(cqrd)) {
+ /* RSS flag */
+ pkt_flags |= PKT_RX_RSS_HASH;
+ mbuf->hash.rss = enic_cq_rx_desc_rss_hash(cqrd);
+ }
+
+ /* checksum flags */
+ if (mbuf->packet_type & (RTE_PTYPE_L3_IPV4 | RTE_PTYPE_L3_IPV6)) {
+ if (!enic_cq_rx_desc_csum_not_calc(cqrd)) {
+ uint32_t l4_flags;
+ l4_flags = mbuf->packet_type & RTE_PTYPE_L4_MASK;
+
+ /*
+ * When overlay offload is enabled, the NIC may
+ * set ipv4_csum_ok=1 if the inner packet is IPv6..
+ * So, explicitly check for IPv4 before checking
+ * ipv4_csum_ok.
+ */
+ if (mbuf->packet_type & RTE_PTYPE_L3_IPV4) {
+ if (enic_cq_rx_desc_ipv4_csum_ok(cqrd))
+ pkt_flags |= PKT_RX_IP_CKSUM_GOOD;
+ else
+ pkt_flags |= PKT_RX_IP_CKSUM_BAD;
+ }
+
+ if (l4_flags == RTE_PTYPE_L4_UDP ||
+ l4_flags == RTE_PTYPE_L4_TCP) {
+ if (enic_cq_rx_desc_tcp_udp_csum_ok(cqrd))
+ pkt_flags |= PKT_RX_L4_CKSUM_GOOD;
+ else
+ pkt_flags |= PKT_RX_L4_CKSUM_BAD;
+ }
+ }
+ }
+
+ mbuf->ol_flags = pkt_flags;
+}
+
+#endif /* _ENIC_RXTX_COMMON_H_ */
--
2.16.2
* [dpdk-dev] [PATCH 2/2] net/enic: add AVX2 based vectorized Rx handler
2018-09-28 2:16 [dpdk-dev] [PATCH 1/2] net/enic: move common Rx functions to a new header file John Daley
@ 2018-09-28 2:16 ` John Daley
2018-09-28 19:20 ` [dpdk-dev] [PATCH v2 1/2] net/enic: move common Rx functions to a new header file John Daley
1 sibling, 0 replies; 11+ messages in thread
From: John Daley @ 2018-09-28 2:16 UTC (permalink / raw)
To: ferruh.yigit; +Cc: dev, Hyong Youb Kim
From: Hyong Youb Kim <hyonkim@cisco.com>
Add the vectorized version of the no-scatter Rx handler. It aims to
process 8 descriptors per loop using AVX2 SIMD instructions. The
handler lives in its own file, enic_rxtx_vec_avx2.c, and the Makefile
and meson.build are modified to compile it when the compiler supports
AVX2. Under ideal conditions, the vectorized handler reduces
cycles/packet by more than 30% compared with the no-scatter Rx
handler. Most implementation ideas come from i40e's AVX2-based
handler, so credit goes to its authors.
At this point, the new handler is meant for field trials and is not
selected by default, so add a new devarg, enable-avx2-rx, that lets
the user request it. When enable-avx2-rx=1, the driver considers
using the new handler.
Also update the guide doc to introduce the vectorized handler.
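For illustration only (the PCI address and the other testpmd arguments
below are placeholders, not part of this patch), the devarg is passed
per device on the EAL command line, for example:

    testpmd -l 1-2 -n 4 -w 0000:13:00.0,enable-avx2-rx=1 -- -i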
Signed-off-by: Hyong Youb Kim <hyonkim@cisco.com>
Reviewed-by: John Daley <johndale@cisco.com>
---
doc/guides/nics/enic.rst | 32 ++
drivers/net/enic/Makefile | 7 +
drivers/net/enic/enic.h | 3 +
drivers/net/enic/enic_ethdev.c | 27 +-
drivers/net/enic/enic_main.c | 34 +-
drivers/net/enic/enic_rxtx_vec_avx2.c | 832 ++++++++++++++++++++++++++++++++++
drivers/net/enic/meson.build | 5 +
7 files changed, 931 insertions(+), 9 deletions(-)
create mode 100644 drivers/net/enic/enic_rxtx_vec_avx2.c
diff --git a/doc/guides/nics/enic.rst b/doc/guides/nics/enic.rst
index 86941fdb2..b31f4eef9 100644
--- a/doc/guides/nics/enic.rst
+++ b/doc/guides/nics/enic.rst
@@ -351,6 +351,38 @@ suitable for others. Such applications may change the mode by setting
applications such as OVS-DPDK performance benchmarks that utilize
only the default VLAN and want to see only untagged packets.
+
+Vectorized Rx Handler
+---------------------
+
+ENIC PMD includes a version of the receive handler that is vectorized using
+AVX2 SIMD instructions. It is meant for bulk, throughput-oriented workloads
+where reducing cycles/packet in the PMD is a priority. To use the
+vectorized handler, take the following steps.
+
+- Use a recent version of gcc, icc, or clang and build 64-bit DPDK. If
+ the compiler is known to support AVX2, the DPDK build system
+ automatically compiles the vectorized handler. Otherwise, the
+ handler is not available.
+
+- Set the ``devargs`` parameter ``enable-avx2-rx=1`` to explicitly request that
+ the PMD consider the vectorized handler when selecting the receive handler.
+
+ As the current implementation is intended for field trials, the vectorized
+ handler is not considered by default (``enable-avx2-rx=0``).
+
+- Run on a UCS M4 or later server with CPUs that support AVX2.
+
+The PMD selects the vectorized handler when the handler is compiled into
+the driver, the user requests its use via ``enable-avx2-rx=1``, the CPU
+supports AVX2, and scatter Rx is not used. To verify that the
+vectorized handler is selected, enable debug logging
+(``--log-level=pmd,debug``) and check for the following message.
+
+.. code-block:: console
+
+ enic_use_vector_rx_handler use the non-scatter avx2 Rx handler
+
.. _enic_limitations:
Limitations
diff --git a/drivers/net/enic/Makefile b/drivers/net/enic/Makefile
index 7c6c29cc0..3ec6f9159 100644
--- a/drivers/net/enic/Makefile
+++ b/drivers/net/enic/Makefile
@@ -39,4 +39,11 @@ SRCS-$(CONFIG_RTE_LIBRTE_ENIC_PMD) += base/vnic_intr.c
SRCS-$(CONFIG_RTE_LIBRTE_ENIC_PMD) += base/vnic_rq.c
SRCS-$(CONFIG_RTE_LIBRTE_ENIC_PMD) += base/vnic_rss.c
+ifeq ($(findstring RTE_MACHINE_CPUFLAG_AVX2,$(CFLAGS)),RTE_MACHINE_CPUFLAG_AVX2)
+# The current implementation assumes 64-bit pointers
+ifeq ($(CONFIG_RTE_ARCH_X86_64),y)
+ SRCS-$(CONFIG_RTE_LIBRTE_ENIC_PMD) += enic_rxtx_vec_avx2.c
+endif
+endif
+
include $(RTE_SDK)/mk/rte.lib.mk
diff --git a/drivers/net/enic/enic.h b/drivers/net/enic/enic.h
index 775cd5d55..ff35f4396 100644
--- a/drivers/net/enic/enic.h
+++ b/drivers/net/enic/enic.h
@@ -106,6 +106,7 @@ struct enic {
struct vnic_dev_bar bar0;
struct vnic_dev *vdev;
+ uint64_t mbuf_initializer;
unsigned int port_id;
bool overlay_offload;
struct rte_eth_dev *rte_dev;
@@ -128,6 +129,7 @@ struct enic {
u8 filter_actions; /* HW supported actions */
bool vxlan;
bool disable_overlay; /* devargs disable_overlay=1 */
+ bool enable_avx2_rx; /* devargs enable-avx2-rx=1 */
bool nic_cfg_chk; /* NIC_CFG_CHK available */
bool udp_rss_weak; /* Bodega style UDP RSS */
uint8_t ig_vlan_rewrite_mode; /* devargs ig-vlan-rewrite */
@@ -329,6 +331,7 @@ uint16_t enic_prep_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
uint16_t nb_pkts);
int enic_set_mtu(struct enic *enic, uint16_t new_mtu);
int enic_link_update(struct enic *enic);
+bool enic_use_vector_rx_handler(struct enic *enic);
void enic_fdir_info(struct enic *enic);
void enic_fdir_info_get(struct enic *enic, struct rte_eth_fdir_info *stats);
void copy_fltr_v1(struct filter_v2 *fltr, struct rte_eth_fdir_input *input,
diff --git a/drivers/net/enic/enic_ethdev.c b/drivers/net/enic/enic_ethdev.c
index 65333c47a..4d450fe0c 100644
--- a/drivers/net/enic/enic_ethdev.c
+++ b/drivers/net/enic/enic_ethdev.c
@@ -37,6 +37,7 @@ static const struct rte_pci_id pci_id_enic_map[] = {
};
#define ENIC_DEVARG_DISABLE_OVERLAY "disable-overlay"
+#define ENIC_DEVARG_ENABLE_AVX2_RX "enable-avx2-rx"
#define ENIC_DEVARG_IG_VLAN_REWRITE "ig-vlan-rewrite"
RTE_INIT(enicpmd_init_log)
@@ -915,22 +916,27 @@ static const struct eth_dev_ops enicpmd_eth_dev_ops = {
.udp_tunnel_port_del = enicpmd_dev_udp_tunnel_port_del,
};
-static int enic_parse_disable_overlay(__rte_unused const char *key,
- const char *value,
- void *opaque)
+static int enic_parse_zero_one(const char *key,
+ const char *value,
+ void *opaque)
{
struct enic *enic;
+ bool b;
enic = (struct enic *)opaque;
if (strcmp(value, "0") == 0) {
- enic->disable_overlay = false;
+ b = false;
} else if (strcmp(value, "1") == 0) {
- enic->disable_overlay = true;
+ b = true;
} else {
- dev_err(enic, "Invalid value for " ENIC_DEVARG_DISABLE_OVERLAY
- ": expected=0|1 given=%s\n", value);
+ dev_err(enic, "Invalid value for %s"
+ ": expected=0|1 given=%s\n", key, value);
return -EINVAL;
}
+ if (strcmp(key, ENIC_DEVARG_DISABLE_OVERLAY) == 0)
+ enic->disable_overlay = b;
+ if (strcmp(key, ENIC_DEVARG_ENABLE_AVX2_RX) == 0)
+ enic->enable_avx2_rx = b;
return 0;
}
@@ -971,6 +977,7 @@ static int enic_check_devargs(struct rte_eth_dev *dev)
{
static const char *const valid_keys[] = {
ENIC_DEVARG_DISABLE_OVERLAY,
+ ENIC_DEVARG_ENABLE_AVX2_RX,
ENIC_DEVARG_IG_VLAN_REWRITE,
NULL};
struct enic *enic = pmd_priv(dev);
@@ -979,6 +986,7 @@ static int enic_check_devargs(struct rte_eth_dev *dev)
ENICPMD_FUNC_TRACE();
enic->disable_overlay = false;
+ enic->enable_avx2_rx = false;
enic->ig_vlan_rewrite_mode = IG_VLAN_REWRITE_MODE_PASS_THRU;
if (!dev->device->devargs)
return 0;
@@ -986,7 +994,9 @@ static int enic_check_devargs(struct rte_eth_dev *dev)
if (!kvlist)
return -EINVAL;
if (rte_kvargs_process(kvlist, ENIC_DEVARG_DISABLE_OVERLAY,
- enic_parse_disable_overlay, enic) < 0 ||
+ enic_parse_zero_one, enic) < 0 ||
+ rte_kvargs_process(kvlist, ENIC_DEVARG_ENABLE_AVX2_RX,
+ enic_parse_zero_one, enic) < 0 ||
rte_kvargs_process(kvlist, ENIC_DEVARG_IG_VLAN_REWRITE,
enic_parse_ig_vlan_rewrite, enic) < 0) {
rte_kvargs_free(kvlist);
@@ -1055,4 +1065,5 @@ RTE_PMD_REGISTER_PCI_TABLE(net_enic, pci_id_enic_map);
RTE_PMD_REGISTER_KMOD_DEP(net_enic, "* igb_uio | uio_pci_generic | vfio-pci");
RTE_PMD_REGISTER_PARAM_STRING(net_enic,
ENIC_DEVARG_DISABLE_OVERLAY "=0|1 "
+ ENIC_DEVARG_ENABLE_AVX2_RX "=0|1 "
ENIC_DEVARG_IG_VLAN_REWRITE "=trunk|untag|priority|pass");
diff --git a/drivers/net/enic/enic_main.c b/drivers/net/enic/enic_main.c
index ea6cddbd3..aed73ee1b 100644
--- a/drivers/net/enic/enic_main.c
+++ b/drivers/net/enic/enic_main.c
@@ -514,12 +514,29 @@ static void enic_prep_wq_for_simple_tx(struct enic *enic, uint16_t queue_idx)
}
}
+/*
+ * The 'strong' version is in enic_rxtx_vec_avx2.c. This weak version is
+ * used when that file is not compiled.
+ */
+bool __attribute__((weak))
+enic_use_vector_rx_handler(__rte_unused struct enic *enic)
+{
+ return false;
+}
+
static void pick_rx_handler(struct enic *enic)
{
struct rte_eth_dev *eth_dev;
- /* Use the non-scatter, simplified RX handler if possible. */
+ /*
+ * Preference order:
+ * 1. The vectorized handler if possible and requested.
+ * 2. The non-scatter, simplified handler if scatter Rx is not used.
+ * 3. The default handler as a fallback.
+ */
eth_dev = enic->rte_dev;
+ if (enic_use_vector_rx_handler(enic))
+ return;
if (enic->rq_count > 0 && enic->rq[0].data_queue_enable == 0) {
PMD_INIT_LOG(DEBUG, " use the non-scatter Rx handler");
eth_dev->rx_pkt_burst = &enic_noscatter_recv_pkts;
@@ -535,6 +552,21 @@ int enic_enable(struct enic *enic)
int err;
struct rte_eth_dev *eth_dev = enic->rte_dev;
uint64_t simple_tx_offloads;
+ uintptr_t p;
+ struct rte_mbuf mb_def = { .buf_addr = 0 };
+
+ /*
+ * mbuf_initializer contains const-after-init fields of
+ * receive mbufs (i.e. 64 bits of fields from rearm_data).
+ * It is currently used by the vectorized handler.
+ */
+ mb_def.nb_segs = 1;
+ mb_def.data_off = RTE_PKTMBUF_HEADROOM;
+ mb_def.port = enic->port_id;
+ rte_mbuf_refcnt_set(&mb_def, 1);
+ rte_compiler_barrier();
+ p = (uintptr_t)&mb_def.rearm_data;
+ enic->mbuf_initializer = *(uint64_t *)p;
eth_dev->data->dev_link.link_speed = vnic_dev_port_speed(enic->vdev);
eth_dev->data->dev_link.link_duplex = ETH_LINK_FULL_DUPLEX;
diff --git a/drivers/net/enic/enic_rxtx_vec_avx2.c b/drivers/net/enic/enic_rxtx_vec_avx2.c
new file mode 100644
index 000000000..4891cda1b
--- /dev/null
+++ b/drivers/net/enic/enic_rxtx_vec_avx2.c
@@ -0,0 +1,832 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2008-2018 Cisco Systems, Inc. All rights reserved.
+ * Copyright 2007 Nuova Systems, Inc. All rights reserved.
+ */
+
+#include <rte_mbuf.h>
+#include <rte_ethdev_driver.h>
+
+#include "enic_compat.h"
+#include "rq_enet_desc.h"
+#include "enic.h"
+#include "enic_rxtx_common.h"
+
+#include <x86intrin.h>
+
+static struct rte_mbuf *
+rx_one(struct cq_enet_rq_desc *cqd, struct rte_mbuf *mb, struct enic *enic)
+{
+ bool tnl;
+
+ *(uint64_t *)&mb->rearm_data = enic->mbuf_initializer;
+ mb->data_len = cqd->bytes_written_flags &
+ CQ_ENET_RQ_DESC_BYTES_WRITTEN_MASK;
+ mb->pkt_len = mb->data_len;
+ tnl = enic->overlay_offload && (cqd->completed_index_flags &
+ CQ_ENET_RQ_DESC_FLAGS_FCOE) != 0;
+ mb->packet_type =
+ enic_cq_rx_flags_to_pkt_type((struct cq_desc *)cqd, tnl);
+ enic_cq_rx_to_pkt_flags((struct cq_desc *)cqd, mb);
+ /* Wipe the outer types set by enic_cq_rx_flags_to_pkt_type() */
+ if (tnl) {
+ mb->packet_type &= ~(RTE_PTYPE_L3_MASK |
+ RTE_PTYPE_L4_MASK);
+ }
+ return mb;
+}
+
+static uint16_t
+enic_noscatter_vec_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
+ uint16_t nb_pkts)
+{
+ struct rte_mbuf **rx, **rxmb;
+ uint16_t cq_idx, nb_rx, max_rx;
+ struct cq_enet_rq_desc *cqd;
+ struct rq_enet_desc *rqd;
+ struct vnic_cq *cq;
+ struct vnic_rq *rq;
+ struct enic *enic;
+ uint8_t color;
+
+ rq = rx_queue;
+ enic = vnic_dev_priv(rq->vdev);
+ cq = &enic->cq[enic_cq_rq(enic, rq->index)];
+ cq_idx = cq->to_clean;
+
+ /*
+ * Fill up the reserve of free mbufs. Below, we restock the receive
+ * ring with these mbufs to avoid allocation failures.
+ */
+ if (rq->num_free_mbufs == 0) {
+ if (rte_mempool_get_bulk(rq->mp, (void **)rq->free_mbufs,
+ ENIC_RX_BURST_MAX))
+ return 0;
+ rq->num_free_mbufs = ENIC_RX_BURST_MAX;
+ }
+ /* Receive until the end of the ring, at most. */
+ max_rx = RTE_MIN(nb_pkts, rq->num_free_mbufs);
+ max_rx = RTE_MIN(max_rx, cq->ring.desc_count - cq_idx);
+
+ rxmb = rq->mbuf_ring + cq_idx;
+ color = cq->last_color;
+ cqd = (struct cq_enet_rq_desc *)(cq->ring.descs) + cq_idx;
+ rx = rx_pkts;
+ if (max_rx == 0 ||
+ (cqd->type_color & CQ_DESC_COLOR_MASK_NOSHIFT) == color)
+ return 0;
+
+ /* Step 1: Process one packet to do aligned 256-bit load below */
+ if (cq_idx & 0x1) {
+ if (unlikely(cqd->bytes_written_flags &
+ CQ_ENET_RQ_DESC_FLAGS_TRUNCATED)) {
+ rte_pktmbuf_free(*rxmb++);
+ rte_atomic64_inc(&enic->soft_stats.rx_packet_errors);
+ } else {
+ *rx++ = rx_one(cqd, *rxmb++, enic);
+ }
+ cqd++;
+ max_rx--;
+ }
+
+ const __m256i mask =
+ _mm256_set_epi8(/* Second descriptor */
+ 0xff, /* type_color */
+ (CQ_ENET_RQ_DESC_FLAGS_IPV4_FRAGMENT |
+ CQ_ENET_RQ_DESC_FLAGS_IPV4 |
+ CQ_ENET_RQ_DESC_FLAGS_IPV6 |
+ CQ_ENET_RQ_DESC_FLAGS_TCP |
+ CQ_ENET_RQ_DESC_FLAGS_UDP), /* flags */
+ 0, 0, /* checksum_fcoe */
+ 0xff, 0xff, /* vlan */
+ 0x3f, 0xff, /* bytes_written_flags */
+ 0xff, 0xff, 0xff, 0xff, /* rss_hash */
+ 0xff, 0xff, /* q_number_rss_type_flags */
+ 0, 0, /* completed_index_flags */
+ /* First descriptor */
+ 0xff, /* type_color */
+ (CQ_ENET_RQ_DESC_FLAGS_IPV4_FRAGMENT |
+ CQ_ENET_RQ_DESC_FLAGS_IPV4 |
+ CQ_ENET_RQ_DESC_FLAGS_IPV6 |
+ CQ_ENET_RQ_DESC_FLAGS_TCP |
+ CQ_ENET_RQ_DESC_FLAGS_UDP), /* flags */
+ 0, 0, /* checksum_fcoe */
+ 0xff, 0xff, /* vlan */
+ 0x3f, 0xff, /* bytes_written_flags */
+ 0xff, 0xff, 0xff, 0xff, /* rss_hash */
+ 0xff, 0xff, /* q_number_rss_type_flags */
+ 0, 0 /* completed_index_flags */
+ );
+ const __m256i shuffle_mask =
+ _mm256_set_epi8(/* Second descriptor */
+ 7, 6, 5, 4, /* rss = rss_hash */
+ 11, 10, /* vlan_tci = vlan */
+ 9, 8, /* data_len = bytes_written */
+ 0x80, 0x80, 9, 8, /* pkt_len = bytes_written */
+ 0x80, 0x80, 0x80, 0x80, /* packet_type = 0 */
+ /* First descriptor */
+ 7, 6, 5, 4, /* rss = rss_hash */
+ 11, 10, /* vlan_tci = vlan */
+ 9, 8, /* data_len = bytes_written */
+ 0x80, 0x80, 9, 8, /* pkt_len = bytes_written */
+ 0x80, 0x80, 0x80, 0x80 /* packet_type = 0 */
+ );
+ /* Used to collect 8 flags from 8 desc into one register */
+ const __m256i flags_shuffle_mask =
+ _mm256_set_epi8(/* Second descriptor */
+ 1, 3, 9, 14,
+ 1, 3, 9, 14,
+ 1, 3, 9, 14,
+ 1, 3, 9, 14,
+ /* First descriptor */
+ 1, 3, 9, 14,
+ 1, 3, 9, 14,
+ 1, 3, 9, 14,
+ /*
+ * Byte 3: upper byte of completed_index_flags
+ * bit 5 = fcoe (tunnel)
+ * Byte 2: upper byte of q_number_rss_type_flags
+ * bits 2,3,4,5 = rss type
+ * bit 6 = csum_not_calc
+ * Byte 1: upper byte of bytes_written_flags
+ * bit 6 = truncated
+ * bit 7 = vlan stripped
+ * Byte 0: flags
+ */
+ 1, 3, 9, 14
+ );
+ /* Used to collect 8 VLAN IDs from 8 desc into one register */
+ const __m256i vlan_shuffle_mask =
+ _mm256_set_epi8(/* Second descriptor */
+ 0x80, 0x80, 11, 10,
+ 0x80, 0x80, 11, 10,
+ 0x80, 0x80, 11, 10,
+ 0x80, 0x80, 11, 10,
+ /* First descriptor */
+ 0x80, 0x80, 11, 10,
+ 0x80, 0x80, 11, 10,
+ 0x80, 0x80, 11, 10,
+ 0x80, 0x80, 11, 10);
+ /* PKT_RX_RSS_HASH is 1<<1 so fits in 8-bit integer */
+ const __m256i rss_shuffle =
+ _mm256_set_epi8(/* second 128 bits */
+ PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
+ PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
+ PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
+ PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
+ PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
+ 0, /* rss_types = 0 */
+ /* first 128 bits */
+ PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
+ PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
+ PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
+ PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
+ PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
+ 0 /* rss_types = 0 */);
+ /*
+ * VLAN offload flags.
+ * shuffle index:
+ * vlan_stripped => bit 0
+ * vlan_id == 0 => bit 1
+ */
+ const __m256i vlan_shuffle =
+ _mm256_set_epi32(0, 0, 0, 0,
+ PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, 0,
+ PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, PKT_RX_VLAN);
+ /* Use the same shuffle index as vlan_shuffle */
+ const __m256i vlan_ptype_shuffle =
+ _mm256_set_epi32(0, 0, 0, 0,
+ RTE_PTYPE_L2_ETHER,
+ RTE_PTYPE_L2_ETHER,
+ RTE_PTYPE_L2_ETHER,
+ RTE_PTYPE_L2_ETHER_VLAN);
+ /*
+ * CKSUM flags. Shift right so they fit in 8-bit integers.
+ * shuffle index:
+ * ipv4_csum_ok => bit 3
+ * ip4 => bit 2
+ * tcp_or_udp => bit 1
+ * tcp_udp_csum_ok => bit 0
+ */
+ const __m256i csum_shuffle =
+ _mm256_set_epi8(/* second 128 bits */
+ /* 1111 ip4+ip4_ok+l4+l4_ok */
+ ((PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1),
+ /* 1110 ip4_ok+ip4+l4+!l4_ok */
+ ((PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD) >> 1),
+ (PKT_RX_IP_CKSUM_GOOD >> 1), /* 1101 ip4+ip4_ok */
+ (PKT_RX_IP_CKSUM_GOOD >> 1), /* 1100 ip4_ok+ip4 */
+ (PKT_RX_L4_CKSUM_GOOD >> 1), /* 1011 l4+l4_ok */
+ (PKT_RX_L4_CKSUM_BAD >> 1), /* 1010 l4+!l4_ok */
+ 0, /* 1001 */
+ 0, /* 1000 */
+ /* 0111 !ip4_ok+ip4+l4+l4_ok */
+ ((PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD) >> 1),
+ /* 0110 !ip4_ok+ip4+l4+!l4_ok */
+ ((PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD) >> 1),
+ (PKT_RX_IP_CKSUM_BAD >> 1), /* 0101 !ip4_ok+ip4 */
+ (PKT_RX_IP_CKSUM_BAD >> 1), /* 0100 !ip4_ok+ip4 */
+ (PKT_RX_L4_CKSUM_GOOD >> 1), /* 0011 l4+l4_ok */
+ (PKT_RX_L4_CKSUM_BAD >> 1), /* 0010 l4+!l4_ok */
+ 0, /* 0001 */
+ 0, /* 0000 */
+ /* first 128 bits */
+ ((PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1),
+ ((PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD) >> 1),
+ (PKT_RX_IP_CKSUM_GOOD >> 1),
+ (PKT_RX_IP_CKSUM_GOOD >> 1),
+ (PKT_RX_L4_CKSUM_GOOD >> 1),
+ (PKT_RX_L4_CKSUM_BAD >> 1),
+ 0, 0,
+ ((PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD) >> 1),
+ ((PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD) >> 1),
+ (PKT_RX_IP_CKSUM_BAD >> 1),
+ (PKT_RX_IP_CKSUM_BAD >> 1),
+ (PKT_RX_L4_CKSUM_GOOD >> 1),
+ (PKT_RX_L4_CKSUM_BAD >> 1),
+ 0, 0);
+ /*
+ * Non-fragment PTYPEs.
+ * Shuffle 4-bit index:
+ * ip6 => bit 0
+ * ip4 => bit 1
+ * udp => bit 2
+ * tcp => bit 3
+ * bit
+ * 3 2 1 0
+ * -------
+ * 0 0 0 0 unknown
+ * 0 0 0 1 ip6 | nonfrag
+ * 0 0 1 0 ip4 | nonfrag
+ * 0 0 1 1 unknown
+ * 0 1 0 0 unknown
+ * 0 1 0 1 ip6 | udp
+ * 0 1 1 0 ip4 | udp
+ * 0 1 1 1 unknown
+ * 1 0 0 0 unknown
+ * 1 0 0 1 ip6 | tcp
+ * 1 0 1 0 ip4 | tcp
+ * 1 0 1 1 unknown
+ * 1 1 0 0 unknown
+ * 1 1 0 1 unknown
+ * 1 1 1 0 unknown
+ * 1 1 1 1 unknown
+ *
+ * PTYPEs do not fit in 8 bits, so shift right by 4.
+ */
+ const __m256i nonfrag_ptype_shuffle =
+ _mm256_set_epi8(/* second 128 bits */
+ RTE_PTYPE_UNKNOWN,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ (RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_TCP) >> 4,
+ (RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_TCP) >> 4,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ (RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_UDP) >> 4,
+ (RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_UDP) >> 4,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ (RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_L4_NONFRAG) >> 4,
+ (RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_L4_NONFRAG) >> 4,
+ RTE_PTYPE_UNKNOWN,
+ /* first 128 bits */
+ RTE_PTYPE_UNKNOWN,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ (RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_TCP) >> 4,
+ (RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_TCP) >> 4,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ (RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_UDP) >> 4,
+ (RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_UDP) >> 4,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ (RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_L4_NONFRAG) >> 4,
+ (RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_L4_NONFRAG) >> 4,
+ RTE_PTYPE_UNKNOWN);
+ /* Fragment PTYPEs. Use the same shuffle index as above. */
+ const __m256i frag_ptype_shuffle =
+ _mm256_set_epi8(/* second 128 bits */
+ RTE_PTYPE_UNKNOWN,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ (RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_L4_FRAG) >> 4,
+ (RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_L4_FRAG) >> 4,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ (RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_L4_FRAG) >> 4,
+ (RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_L4_FRAG) >> 4,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ (RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_L4_FRAG) >> 4,
+ (RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_L4_FRAG) >> 4,
+ RTE_PTYPE_UNKNOWN,
+ /* first 128 bits */
+ RTE_PTYPE_UNKNOWN,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ (RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_L4_FRAG) >> 4,
+ (RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_L4_FRAG) >> 4,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ (RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_L4_FRAG) >> 4,
+ (RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_L4_FRAG) >> 4,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ (RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_L4_FRAG) >> 4,
+ (RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_L4_FRAG) >> 4,
+ RTE_PTYPE_UNKNOWN);
+ /*
+ * Tunnel PTYPEs. Use the same shuffle index as above.
+ * L4 types are not part of this table. They come from non-tunnel
+ * types above.
+ */
+ const __m256i tnl_l3_ptype_shuffle =
+ _mm256_set_epi8(/* second 128 bits */
+ RTE_PTYPE_UNKNOWN,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN >> 16,
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN >> 16,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN >> 16,
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN >> 16,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN >> 16,
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN >> 16,
+ RTE_PTYPE_UNKNOWN,
+ /* first 128 bits */
+ RTE_PTYPE_UNKNOWN,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN >> 16,
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN >> 16,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN >> 16,
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN >> 16,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN >> 16,
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN >> 16,
+ RTE_PTYPE_UNKNOWN);
+
+ const __m256i mbuf_init = _mm256_set_epi64x(0, enic->mbuf_initializer,
+ 0, enic->mbuf_initializer);
+
+ /*
+ * --- cq desc fields --- offset
+ * completed_index_flags - 0 use: fcoe
+ * q_number_rss_type_flags - 2 use: rss types, csum_not_calc
+ * rss_hash - 4 ==> mbuf.hash.rss
+ * bytes_written_flags - 8 ==> mbuf.pkt_len,data_len
+ * use: truncated, vlan_stripped
+ * vlan - 10 ==> mbuf.vlan_tci
+ * checksum_fcoe - 12 (unused)
+ * flags - 14 use: all bits
+ * type_color - 15 (unused)
+ *
+ * --- mbuf fields --- offset
+ * rearm_data ---- 16
+ * data_off - 0 (mbuf_init) -+
+ * refcnt - 2 (mbuf_init) |
+ * nb_segs - 4 (mbuf_init) | 16B 128b
+ * port - 6 (mbuf_init) |
+ * ol_flag - 8 (from cqd) -+
+ * rx_descriptor_fields1 ---- 32
+ * packet_type - 0 (from cqd) -+
+ * pkt_len - 4 (from cqd) |
+ * data_len - 8 (from cqd) | 16B 128b
+ * vlan_tci - 10 (from cqd) |
+ * rss - 12 (from cqd) -+
+ */
+
+ __m256i overlay_enabled =
+ _mm256_set1_epi32((uint32_t)enic->overlay_offload);
+
+ /* Step 2: Process 8 packets per loop using SIMD */
+ while (max_rx > 7 && (((cqd + 7)->type_color &
+ CQ_DESC_COLOR_MASK_NOSHIFT) != color)) {
+ /* Load 8 16B CQ descriptors */
+ __m256i cqd01 = _mm256_load_si256((void *)cqd);
+ __m256i cqd23 = _mm256_load_si256((void *)(cqd + 2));
+ __m256i cqd45 = _mm256_load_si256((void *)(cqd + 4));
+ __m256i cqd67 = _mm256_load_si256((void *)(cqd + 6));
+ /* Copy 8 mbuf pointers to rx_pkts */
+ _mm256_storeu_si256((void *)rx,
+ _mm256_loadu_si256((void *)rxmb));
+ _mm256_storeu_si256((void *)(rx + 4),
+ _mm256_loadu_si256((void *)(rxmb + 4)));
+
+ /*
+ * Collect 8 flags (each 32 bits) into one register.
+ * 4 shuffles, 3 blends, 1 permute for 8 desc: 1 inst/desc
+ */
+ __m256i flags01 =
+ _mm256_shuffle_epi8(cqd01, flags_shuffle_mask);
+ /*
+ * Shuffle above produces 8 x 32-bit flags for 8 descriptors
+ * in this order: 0, 0, 0, 0, 1, 1, 1, 1
+ * The duplicates in each 128-bit lane simplify blending
+ * below.
+ */
+ __m256i flags23 =
+ _mm256_shuffle_epi8(cqd23, flags_shuffle_mask);
+ __m256i flags45 =
+ _mm256_shuffle_epi8(cqd45, flags_shuffle_mask);
+ __m256i flags67 =
+ _mm256_shuffle_epi8(cqd67, flags_shuffle_mask);
+ /* 1st blend produces flags for desc: 0, 2, 0, 0, 1, 3, 1, 1 */
+ __m256i flags0_3 = _mm256_blend_epi32(flags01, flags23, 0x22);
+ /* 2nd blend produces flags for desc: 4, 4, 4, 6, 5, 5, 5, 7 */
+ __m256i flags4_7 = _mm256_blend_epi32(flags45, flags67, 0x88);
+ /* 3rd blend produces flags for desc: 0, 2, 4, 6, 1, 3, 5, 7 */
+ __m256i flags0_7 = _mm256_blend_epi32(flags0_3, flags4_7, 0xcc);
+ /*
+ * Swap to reorder flags in this order: 1, 3, 5, 7, 0, 2, 4, 6
+ * This order simplifies blend operations way below that
+ * produce 'rearm' data for each mbuf.
+ */
+ flags0_7 = _mm256_permute4x64_epi64(flags0_7,
+ (1 << 6) + (0 << 4) + (3 << 2) + 2);
+
+ /*
+ * Check truncated bits and bail out early on.
+ * 6 avx inst, 1 or, 1 if-then-else for 8 desc: 1 inst/desc
+ */
+ __m256i trunc =
+ _mm256_srli_epi32(_mm256_slli_epi32(flags0_7, 17), 31);
+ trunc = _mm256_add_epi64(trunc, _mm256_permute4x64_epi64(trunc,
+ (1 << 6) + (0 << 4) + (3 << 2) + 2));
+ /* 0:63 contains 1+3+0+2 and 64:127 contains 5+7+4+6 */
+ if (_mm256_extract_epi64(trunc, 0) ||
+ _mm256_extract_epi64(trunc, 1))
+ break;
+
+ /*
+ * Compute PKT_RX_RSS_HASH.
+ * Use 2 shifts and 1 shuffle for 8 desc: 0.375 inst/desc
+ * RSS types in byte 0, 4, 8, 12, 16, 20, 24, 28
+ * Everything else is zero.
+ */
+ __m256i rss_types =
+ _mm256_srli_epi32(_mm256_slli_epi32(flags0_7, 10), 28);
+ /*
+ * RSS flags (PKT_RX_RSS_HASH) are in
+ * byte 0, 4, 8, 12, 16, 20, 24, 28
+ * Everything else is zero.
+ */
+ __m256i rss_flags = _mm256_shuffle_epi8(rss_shuffle, rss_types);
+
+ /*
+ * Compute CKSUM flags. First build the index and then
+ * use it to shuffle csum_shuffle.
+ * 20 instructions including const loads: 2.5 inst/desc
+ */
+ /*
+ * csum_not_calc (bit 22)
+ * csum_not_calc (0) => 0xffffffff
+ * csum_not_calc (1) => 0x0
+ */
+ const __m256i zero4 = _mm256_setzero_si256();
+ const __m256i mask22 = _mm256_set1_epi32(0x400000);
+ __m256i csum_not_calc = _mm256_cmpeq_epi32(zero4,
+ _mm256_and_si256(flags0_7, mask22));
+ /*
+ * (tcp|udp) && !fragment => bit 1
+ * tcp = bit 2, udp = bit 1, frag = bit 6
+ */
+ const __m256i mask1 = _mm256_set1_epi32(0x2);
+ __m256i tcp_udp =
+ _mm256_andnot_si256(_mm256_srli_epi32(flags0_7, 5),
+ _mm256_or_si256(flags0_7,
+ _mm256_srli_epi32(flags0_7, 1)));
+ tcp_udp = _mm256_and_si256(tcp_udp, mask1);
+ /* ipv4 (bit 5) => bit 2 */
+ const __m256i mask2 = _mm256_set1_epi32(0x4);
+ __m256i ipv4 = _mm256_and_si256(mask2,
+ _mm256_srli_epi32(flags0_7, 3));
+ /*
+ * ipv4_csum_ok (bit 3) => bit 3
+ * tcp_udp_csum_ok (bit 0) => bit 0
+ * 0x9
+ */
+ const __m256i mask0_3 = _mm256_set1_epi32(0x9);
+ __m256i csum_idx = _mm256_and_si256(flags0_7, mask0_3);
+ csum_idx = _mm256_and_si256(csum_not_calc,
+ _mm256_or_si256(_mm256_or_si256(csum_idx, ipv4),
+ tcp_udp));
+ __m256i csum_flags =
+ _mm256_shuffle_epi8(csum_shuffle, csum_idx);
+ /* Shift left to restore CKSUM flags. See csum_shuffle. */
+ csum_flags = _mm256_slli_epi32(csum_flags, 1);
+ /* Combine csum flags and offload flags: 0.125 inst/desc */
+ rss_flags = _mm256_or_si256(rss_flags, csum_flags);
+
+ /*
+ * Collect 8 VLAN IDs and compute vlan_id != 0 on each.
+ * 4 shuffles, 3 blends, 1 permute, 1 cmp, 1 sub for 8 desc:
+ * 1.25 inst/desc
+ */
+ __m256i vlan01 = _mm256_shuffle_epi8(cqd01, vlan_shuffle_mask);
+ __m256i vlan23 = _mm256_shuffle_epi8(cqd23, vlan_shuffle_mask);
+ __m256i vlan45 = _mm256_shuffle_epi8(cqd45, vlan_shuffle_mask);
+ __m256i vlan67 = _mm256_shuffle_epi8(cqd67, vlan_shuffle_mask);
+ __m256i vlan0_3 = _mm256_blend_epi32(vlan01, vlan23, 0x22);
+ __m256i vlan4_7 = _mm256_blend_epi32(vlan45, vlan67, 0x88);
+ /* desc: 0, 2, 4, 6, 1, 3, 5, 7 */
+ __m256i vlan0_7 = _mm256_blend_epi32(vlan0_3, vlan4_7, 0xcc);
+ /* desc: 1, 3, 5, 7, 0, 2, 4, 6 */
+ vlan0_7 = _mm256_permute4x64_epi64(vlan0_7,
+ (1 << 6) + (0 << 4) + (3 << 2) + 2);
+ /*
+ * Compare 0 == vlan_id produces 0xffffffff (-1) if
+ * vlan 0 and 0 if vlan non-0. Then subtracting the
+ * result from 0 produces 0 - (-1) = 1 for vlan 0, and
+ * 0 - 0 = 0 for vlan non-0.
+ */
+ vlan0_7 = _mm256_cmpeq_epi32(zero4, vlan0_7);
+ /* vlan_id != 0 => 0, vlan_id == 0 => 1 */
+ vlan0_7 = _mm256_sub_epi32(zero4, vlan0_7);
+
+ /*
+ * Compute PKT_RX_VLAN and PKT_RX_VLAN_STRIPPED.
+ * Use 3 shifts, 1 or, 1 shuffle for 8 desc: 0.625 inst/desc
+ * VLAN offload flags in byte 0, 4, 8, 12, 16, 20, 24, 28
+ * Everything else is zero.
+ */
+ __m256i vlan_idx =
+ _mm256_or_si256(/* vlan_stripped => bit 0 */
+ _mm256_srli_epi32(_mm256_slli_epi32(flags0_7,
+ 16), 31),
+ /* (vlan_id == 0) => bit 1 */
+ _mm256_slli_epi32(vlan0_7, 1));
+ /*
+ * The index captures 4 cases.
+ * stripped, id = 0 ==> 11b = 3
+ * stripped, id != 0 ==> 01b = 1
+ * not strip, id == 0 ==> 10b = 2
+ * not strip, id != 0 ==> 00b = 0
+ */
+ __m256i vlan_flags = _mm256_permutevar8x32_epi32(vlan_shuffle,
+ vlan_idx);
+ /* Combine vlan and offload flags: 0.125 inst/desc */
+ rss_flags = _mm256_or_si256(rss_flags, vlan_flags);
+
+ /*
+ * Compute non-tunnel PTYPEs.
+ * 17 inst / 8 desc = 2.125 inst/desc
+ */
+ /* ETHER and ETHER_VLAN */
+ __m256i vlan_ptype =
+ _mm256_permutevar8x32_epi32(vlan_ptype_shuffle,
+ vlan_idx);
+ /* Build the ptype index from flags */
+ tcp_udp = _mm256_slli_epi32(flags0_7, 29);
+ tcp_udp = _mm256_slli_epi32(_mm256_srli_epi32(tcp_udp, 30), 2);
+ __m256i ip4_ip6 =
+ _mm256_srli_epi32(_mm256_slli_epi32(flags0_7, 26), 30);
+ __m256i ptype_idx = _mm256_or_si256(tcp_udp, ip4_ip6);
+ __m256i frag_bit =
+ _mm256_srli_epi32(_mm256_slli_epi32(flags0_7, 25), 31);
+ __m256i nonfrag_ptype =
+ _mm256_shuffle_epi8(nonfrag_ptype_shuffle, ptype_idx);
+ __m256i frag_ptype =
+ _mm256_shuffle_epi8(frag_ptype_shuffle, ptype_idx);
+ /*
+ * Zero out the unwanted types and combine the remaining bits.
+ * The effect is the same as selecting non-frag or frag types
+ * depending on the frag bit.
+ */
+ nonfrag_ptype = _mm256_and_si256(nonfrag_ptype,
+ _mm256_cmpeq_epi32(zero4, frag_bit));
+ frag_ptype = _mm256_and_si256(frag_ptype,
+ _mm256_cmpgt_epi32(frag_bit, zero4));
+ __m256i ptype = _mm256_or_si256(nonfrag_ptype, frag_ptype);
+ ptype = _mm256_slli_epi32(ptype, 4);
+ /*
+ * Compute tunnel PTYPEs.
+ * 15 inst / 8 desc = 1.875 inst/desc
+ */
+ __m256i tnl_l3_ptype =
+ _mm256_shuffle_epi8(tnl_l3_ptype_shuffle, ptype_idx);
+ tnl_l3_ptype = _mm256_slli_epi32(tnl_l3_ptype, 16);
+ /*
+ * Shift non-tunnel L4 types to make them tunnel types.
+ * RTE_PTYPE_L4_TCP << 16 == RTE_PTYPE_INNER_L4_TCP
+ */
+ __m256i tnl_l4_ptype =
+ _mm256_slli_epi32(_mm256_and_si256(ptype,
+ _mm256_set1_epi32(RTE_PTYPE_L4_MASK)), 16);
+ __m256i tnl_ptype =
+ _mm256_or_si256(tnl_l3_ptype, tnl_l4_ptype);
+ tnl_ptype = _mm256_or_si256(tnl_ptype,
+ _mm256_set1_epi32(RTE_PTYPE_TUNNEL_GRENAT |
+ RTE_PTYPE_INNER_L2_ETHER));
+ /*
+ * Select non-tunnel or tunnel types by zeroing out the
+ * unwanted ones.
+ */
+ __m256i tnl_flags = _mm256_and_si256(overlay_enabled,
+ _mm256_srli_epi32(_mm256_slli_epi32(flags0_7, 2), 31));
+ tnl_ptype = _mm256_and_si256(tnl_ptype,
+ _mm256_sub_epi32(zero4, tnl_flags));
+ ptype = _mm256_and_si256(ptype,
+ _mm256_cmpeq_epi32(zero4, tnl_flags));
+ /*
+ * Combine types and swap to have ptypes in the same order
+ * as desc.
+ * desc: 0 2 4 6 1 3 5 7
+ * 3 inst / 8 desc = 0.375 inst/desc
+ */
+ ptype = _mm256_or_si256(ptype, tnl_ptype);
+ ptype = _mm256_or_si256(ptype, vlan_ptype);
+ ptype = _mm256_permute4x64_epi64(ptype,
+ (1 << 6) + (0 << 4) + (3 << 2) + 2);
+
+ /*
+ * Mask packet length.
+ * Use 4 ands: 0.5 instructions/desc
+ */
+ cqd01 = _mm256_and_si256(cqd01, mask);
+ cqd23 = _mm256_and_si256(cqd23, mask);
+ cqd45 = _mm256_and_si256(cqd45, mask);
+ cqd67 = _mm256_and_si256(cqd67, mask);
+ /*
+ * Shuffle. Two 16B sets of the mbuf fields.
+ * packet_type, pkt_len, data_len, vlan_tci, rss
+ */
+ __m256i rearm01 = _mm256_shuffle_epi8(cqd01, shuffle_mask);
+ __m256i rearm23 = _mm256_shuffle_epi8(cqd23, shuffle_mask);
+ __m256i rearm45 = _mm256_shuffle_epi8(cqd45, shuffle_mask);
+ __m256i rearm67 = _mm256_shuffle_epi8(cqd67, shuffle_mask);
+
+ /*
+ * Blend in ptypes
+ * 4 blends and 3 shuffles for 8 desc: 0.875 inst/desc
+ */
+ rearm01 = _mm256_blend_epi32(rearm01, ptype, 0x11);
+ rearm23 = _mm256_blend_epi32(rearm23,
+ _mm256_shuffle_epi32(ptype, 1), 0x11);
+ rearm45 = _mm256_blend_epi32(rearm45,
+ _mm256_shuffle_epi32(ptype, 2), 0x11);
+ rearm67 = _mm256_blend_epi32(rearm67,
+ _mm256_shuffle_epi32(ptype, 3), 0x11);
+
+ /*
+ * Move rss_flags into ol_flags in mbuf_init.
+ * Use 1 shift and 1 blend for each desc: 2 inst/desc
+ */
+ __m256i mbuf_init4_5 = _mm256_blend_epi32(mbuf_init,
+ rss_flags, 0x44);
+ __m256i mbuf_init2_3 = _mm256_blend_epi32(mbuf_init,
+ _mm256_slli_si256(rss_flags, 4), 0x44);
+ __m256i mbuf_init0_1 = _mm256_blend_epi32(mbuf_init,
+ _mm256_slli_si256(rss_flags, 8), 0x44);
+ __m256i mbuf_init6_7 = _mm256_blend_epi32(mbuf_init,
+ _mm256_srli_si256(rss_flags, 4), 0x44);
+
+ /*
+ * Build rearm, one per desc.
+ * 8 blends and 4 permutes: 1.5 inst/desc
+ */
+ __m256i rearm0 = _mm256_blend_epi32(rearm01,
+ mbuf_init0_1, 0xf0);
+ __m256i rearm1 = _mm256_blend_epi32(mbuf_init0_1,
+ rearm01, 0xf0);
+ __m256i rearm2 = _mm256_blend_epi32(rearm23,
+ mbuf_init2_3, 0xf0);
+ __m256i rearm3 = _mm256_blend_epi32(mbuf_init2_3,
+ rearm23, 0xf0);
+ /* Swap upper and lower 64 bits */
+ rearm0 = _mm256_permute4x64_epi64(rearm0,
+ (1 << 6) + (0 << 4) + (3 << 2) + 2);
+ rearm2 = _mm256_permute4x64_epi64(rearm2,
+ (1 << 6) + (0 << 4) + (3 << 2) + 2);
+ /* Second set of 4 descriptors */
+ __m256i rearm4 = _mm256_blend_epi32(rearm45,
+ mbuf_init4_5, 0xf0);
+ __m256i rearm5 = _mm256_blend_epi32(mbuf_init4_5,
+ rearm45, 0xf0);
+ __m256i rearm6 = _mm256_blend_epi32(rearm67,
+ mbuf_init6_7, 0xf0);
+ __m256i rearm7 = _mm256_blend_epi32(mbuf_init6_7,
+ rearm67, 0xf0);
+ rearm4 = _mm256_permute4x64_epi64(rearm4,
+ (1 << 6) + (0 << 4) + (3 << 2) + 2);
+ rearm6 = _mm256_permute4x64_epi64(rearm6,
+ (1 << 6) + (0 << 4) + (3 << 2) + 2);
+
+ /*
+ * Write out 32B of mbuf fields.
+ * data_off - off 0 (mbuf_init)
+ * refcnt - 2 (mbuf_init)
+ * nb_segs - 4 (mbuf_init)
+ * port - 6 (mbuf_init)
+ * ol_flag - 8 (from cqd)
+ * packet_type - 16 (from cqd)
+ * pkt_len - 20 (from cqd)
+ * data_len - 24 (from cqd)
+ * vlan_tci - 26 (from cqd)
+ * rss - 28 (from cqd)
+ */
+ _mm256_storeu_si256((__m256i *)&rxmb[0]->rearm_data, rearm0);
+ _mm256_storeu_si256((__m256i *)&rxmb[1]->rearm_data, rearm1);
+ _mm256_storeu_si256((__m256i *)&rxmb[2]->rearm_data, rearm2);
+ _mm256_storeu_si256((__m256i *)&rxmb[3]->rearm_data, rearm3);
+ _mm256_storeu_si256((__m256i *)&rxmb[4]->rearm_data, rearm4);
+ _mm256_storeu_si256((__m256i *)&rxmb[5]->rearm_data, rearm5);
+ _mm256_storeu_si256((__m256i *)&rxmb[6]->rearm_data, rearm6);
+ _mm256_storeu_si256((__m256i *)&rxmb[7]->rearm_data, rearm7);
+
+ max_rx -= 8;
+ cqd += 8;
+ rx += 8;
+ rxmb += 8;
+ }
+
+ /*
+ * Step 3: Slow path to handle a small (<8) number of packets and
+ * occasional truncated packets.
+ */
+ while (max_rx && ((cqd->type_color &
+ CQ_DESC_COLOR_MASK_NOSHIFT) != color)) {
+ if (unlikely(cqd->bytes_written_flags &
+ CQ_ENET_RQ_DESC_FLAGS_TRUNCATED)) {
+ rte_pktmbuf_free(*rxmb++);
+ rte_atomic64_inc(&enic->soft_stats.rx_packet_errors);
+ } else {
+ *rx++ = rx_one(cqd, *rxmb++, enic);
+ }
+ cqd++;
+ max_rx--;
+ }
+
+ /* Number of descriptors visited */
+ nb_rx = cqd - (struct cq_enet_rq_desc *)(cq->ring.descs) - cq_idx;
+ if (nb_rx == 0)
+ return 0;
+ rqd = ((struct rq_enet_desc *)rq->ring.descs) + cq_idx;
+ rxmb = rq->mbuf_ring + cq_idx;
+ cq_idx += nb_rx;
+ rq->rx_nb_hold += nb_rx;
+ if (unlikely(cq_idx == cq->ring.desc_count)) {
+ cq_idx = 0;
+ cq->last_color ^= CQ_DESC_COLOR_MASK_NOSHIFT;
+ }
+ cq->to_clean = cq_idx;
+
+ /* Step 4: Restock RQ with new mbufs */
+ memcpy(rxmb, rq->free_mbufs + ENIC_RX_BURST_MAX - rq->num_free_mbufs,
+ sizeof(struct rte_mbuf *) * nb_rx);
+ rq->num_free_mbufs -= nb_rx;
+ while (nb_rx) {
+ rqd->address = (*rxmb)->buf_iova + RTE_PKTMBUF_HEADROOM;
+ nb_rx--;
+ rqd++;
+ rxmb++;
+ }
+ if (rq->rx_nb_hold > rq->rx_free_thresh) {
+ rq->posted_index = enic_ring_add(rq->ring.desc_count,
+ rq->posted_index,
+ rq->rx_nb_hold);
+ rq->rx_nb_hold = 0;
+ rte_wmb();
+ iowrite32_relaxed(rq->posted_index,
+ &rq->ctrl->posted_index);
+ }
+
+ return rx - rx_pkts;
+}
+
+bool
+enic_use_vector_rx_handler(struct enic *enic)
+{
+ struct rte_eth_dev *eth_dev;
+ struct rte_fdir_conf *fconf;
+
+ eth_dev = enic->rte_dev;
+ /* The user needs to request the avx2 handler */
+ if (!enic->enable_avx2_rx)
+ return false;
+ /* Do not support scatter Rx */
+ if (!(enic->rq_count > 0 && enic->rq[0].data_queue_enable == 0))
+ return false;
+ /* Do not support fdir/flow */
+ fconf = ð_dev->data->dev_conf.fdir_conf;
+ if (fconf->mode != RTE_FDIR_MODE_NONE)
+ return false;
+ if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2)) {
+ PMD_INIT_LOG(DEBUG, " use the non-scatter avx2"
+ " Rx handler");
+ eth_dev->rx_pkt_burst = &enic_noscatter_vec_recv_pkts;
+ return true;
+ }
+ return false;
+}
diff --git a/drivers/net/enic/meson.build b/drivers/net/enic/meson.build
index bfd4e2373..5565649c4 100644
--- a/drivers/net/enic/meson.build
+++ b/drivers/net/enic/meson.build
@@ -17,3 +17,8 @@ sources = files(
)
deps += ['hash']
includes += include_directories('base')
+
+# The current implementation assumes 64-bit pointers
+if dpdk_conf.has('RTE_MACHINE_CPUFLAG_AVX2') and cc.sizeof('void *') == 8
+ sources += files('enic_rxtx_vec_avx2.c')
+endif
--
2.16.2
* [dpdk-dev] [PATCH v2 1/2] net/enic: move common Rx functions to a new header file
2018-09-28 2:16 [dpdk-dev] [PATCH 1/2] net/enic: move common Rx functions to a new header file John Daley
2018-09-28 2:16 ` [dpdk-dev] [PATCH 2/2] net/enic: add AVX2 based vectorized Rx handler John Daley
@ 2018-09-28 19:20 ` John Daley
2018-09-28 19:20 ` [dpdk-dev] [PATCH v2 2/2] net/enic: add AVX2 based vectorized Rx handler John Daley
2018-09-28 19:25 ` [dpdk-dev] [PATCH v2 1/2] net/enic: move common Rx functions to a new header file John Daley
1 sibling, 2 replies; 11+ messages in thread
From: John Daley @ 2018-09-28 19:20 UTC (permalink / raw)
To: ferruh.yigit; +Cc: dev, Hyong Youb Kim
From: Hyong Youb Kim <hyonkim@cisco.com>
Move a number of Rx functions to the header file so that the avx2
based Rx handler can use them.
Signed-off-by: Hyong Youb Kim <hyonkim@cisco.com>
---
drivers/net/enic/enic_rxtx.c | 263 +---------------------------------
drivers/net/enic/enic_rxtx_common.h | 271 ++++++++++++++++++++++++++++++++++++
2 files changed, 272 insertions(+), 262 deletions(-)
create mode 100644 drivers/net/enic/enic_rxtx_common.h
diff --git a/drivers/net/enic/enic_rxtx.c b/drivers/net/enic/enic_rxtx.c
index 276a2e559..5189ee635 100644
--- a/drivers/net/enic/enic_rxtx.c
+++ b/drivers/net/enic/enic_rxtx.c
@@ -11,6 +11,7 @@
#include "enic_compat.h"
#include "rq_enet_desc.h"
#include "enic.h"
+#include "enic_rxtx_common.h"
#include <rte_ether.h>
#include <rte_ip.h>
#include <rte_tcp.h>
@@ -30,268 +31,6 @@
#define rte_packet_prefetch(p) do {} while (0)
#endif
-static inline uint16_t
-enic_cq_rx_desc_ciflags(struct cq_enet_rq_desc *crd)
-{
- return le16_to_cpu(crd->completed_index_flags) & ~CQ_DESC_COMP_NDX_MASK;
-}
-
-static inline uint16_t
-enic_cq_rx_desc_bwflags(struct cq_enet_rq_desc *crd)
-{
- return le16_to_cpu(crd->bytes_written_flags) &
- ~CQ_ENET_RQ_DESC_BYTES_WRITTEN_MASK;
-}
-
-static inline uint8_t
-enic_cq_rx_desc_packet_error(uint16_t bwflags)
-{
- return (bwflags & CQ_ENET_RQ_DESC_FLAGS_TRUNCATED) ==
- CQ_ENET_RQ_DESC_FLAGS_TRUNCATED;
-}
-
-static inline uint8_t
-enic_cq_rx_desc_eop(uint16_t ciflags)
-{
- return (ciflags & CQ_ENET_RQ_DESC_FLAGS_EOP)
- == CQ_ENET_RQ_DESC_FLAGS_EOP;
-}
-
-static inline uint8_t
-enic_cq_rx_desc_csum_not_calc(struct cq_enet_rq_desc *cqrd)
-{
- return (le16_to_cpu(cqrd->q_number_rss_type_flags) &
- CQ_ENET_RQ_DESC_FLAGS_CSUM_NOT_CALC) ==
- CQ_ENET_RQ_DESC_FLAGS_CSUM_NOT_CALC;
-}
-
-static inline uint8_t
-enic_cq_rx_desc_ipv4_csum_ok(struct cq_enet_rq_desc *cqrd)
-{
- return (cqrd->flags & CQ_ENET_RQ_DESC_FLAGS_IPV4_CSUM_OK) ==
- CQ_ENET_RQ_DESC_FLAGS_IPV4_CSUM_OK;
-}
-
-static inline uint8_t
-enic_cq_rx_desc_tcp_udp_csum_ok(struct cq_enet_rq_desc *cqrd)
-{
- return (cqrd->flags & CQ_ENET_RQ_DESC_FLAGS_TCP_UDP_CSUM_OK) ==
- CQ_ENET_RQ_DESC_FLAGS_TCP_UDP_CSUM_OK;
-}
-
-static inline uint8_t
-enic_cq_rx_desc_rss_type(struct cq_enet_rq_desc *cqrd)
-{
- return (uint8_t)((le16_to_cpu(cqrd->q_number_rss_type_flags) >>
- CQ_DESC_Q_NUM_BITS) & CQ_ENET_RQ_DESC_RSS_TYPE_MASK);
-}
-
-static inline uint32_t
-enic_cq_rx_desc_rss_hash(struct cq_enet_rq_desc *cqrd)
-{
- return le32_to_cpu(cqrd->rss_hash);
-}
-
-static inline uint16_t
-enic_cq_rx_desc_vlan(struct cq_enet_rq_desc *cqrd)
-{
- return le16_to_cpu(cqrd->vlan);
-}
-
-static inline uint16_t
-enic_cq_rx_desc_n_bytes(struct cq_desc *cqd)
-{
- struct cq_enet_rq_desc *cqrd = (struct cq_enet_rq_desc *)cqd;
- return le16_to_cpu(cqrd->bytes_written_flags) &
- CQ_ENET_RQ_DESC_BYTES_WRITTEN_MASK;
-}
-
-
-static inline uint8_t
-enic_cq_rx_check_err(struct cq_desc *cqd)
-{
- struct cq_enet_rq_desc *cqrd = (struct cq_enet_rq_desc *)cqd;
- uint16_t bwflags;
-
- bwflags = enic_cq_rx_desc_bwflags(cqrd);
- if (unlikely(enic_cq_rx_desc_packet_error(bwflags)))
- return 1;
- return 0;
-}
-
-/* Lookup table to translate RX CQ flags to mbuf flags. */
-static inline uint32_t
-enic_cq_rx_flags_to_pkt_type(struct cq_desc *cqd, uint8_t tnl)
-{
- struct cq_enet_rq_desc *cqrd = (struct cq_enet_rq_desc *)cqd;
- uint8_t cqrd_flags = cqrd->flags;
- /*
- * Odd-numbered entries are for tunnel packets. All packet type info
- * applies to the inner packet, and there is no info on the outer
- * packet. The outer flags in these entries exist only to avoid
- * changing enic_cq_rx_to_pkt_flags(). They are cleared from mbuf
- * afterwards.
- *
- * Also, as there is no tunnel type info (VXLAN, NVGRE, or GENEVE), set
- * RTE_PTYPE_TUNNEL_GRENAT..
- */
- static const uint32_t cq_type_table[128] __rte_cache_aligned = {
- [0x00] = RTE_PTYPE_UNKNOWN,
- [0x01] = RTE_PTYPE_UNKNOWN |
- RTE_PTYPE_TUNNEL_GRENAT |
- RTE_PTYPE_INNER_L2_ETHER,
- [0x20] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_NONFRAG,
- [0x21] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_NONFRAG |
- RTE_PTYPE_TUNNEL_GRENAT |
- RTE_PTYPE_INNER_L2_ETHER |
- RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L4_NONFRAG,
- [0x22] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_UDP,
- [0x23] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_UDP |
- RTE_PTYPE_TUNNEL_GRENAT |
- RTE_PTYPE_INNER_L2_ETHER |
- RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L4_UDP,
- [0x24] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_TCP,
- [0x25] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_TCP |
- RTE_PTYPE_TUNNEL_GRENAT |
- RTE_PTYPE_INNER_L2_ETHER |
- RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L4_TCP,
- [0x60] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG,
- [0x61] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG |
- RTE_PTYPE_TUNNEL_GRENAT |
- RTE_PTYPE_INNER_L2_ETHER |
- RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L4_FRAG,
- [0x62] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG,
- [0x63] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG |
- RTE_PTYPE_TUNNEL_GRENAT |
- RTE_PTYPE_INNER_L2_ETHER |
- RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L4_FRAG,
- [0x64] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG,
- [0x65] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG |
- RTE_PTYPE_TUNNEL_GRENAT |
- RTE_PTYPE_INNER_L2_ETHER |
- RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L4_FRAG,
- [0x10] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_NONFRAG,
- [0x11] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_NONFRAG |
- RTE_PTYPE_TUNNEL_GRENAT |
- RTE_PTYPE_INNER_L2_ETHER |
- RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L4_NONFRAG,
- [0x12] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_UDP,
- [0x13] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_UDP |
- RTE_PTYPE_TUNNEL_GRENAT |
- RTE_PTYPE_INNER_L2_ETHER |
- RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L4_UDP,
- [0x14] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_TCP,
- [0x15] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_TCP |
- RTE_PTYPE_TUNNEL_GRENAT |
- RTE_PTYPE_INNER_L2_ETHER |
- RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L4_TCP,
- [0x50] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG,
- [0x51] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG |
- RTE_PTYPE_TUNNEL_GRENAT |
- RTE_PTYPE_INNER_L2_ETHER |
- RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L4_FRAG,
- [0x52] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG,
- [0x53] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG |
- RTE_PTYPE_TUNNEL_GRENAT |
- RTE_PTYPE_INNER_L2_ETHER |
- RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L4_FRAG,
- [0x54] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG,
- [0x55] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG |
- RTE_PTYPE_TUNNEL_GRENAT |
- RTE_PTYPE_INNER_L2_ETHER |
- RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L4_FRAG,
- /* All others reserved */
- };
- cqrd_flags &= CQ_ENET_RQ_DESC_FLAGS_IPV4_FRAGMENT
- | CQ_ENET_RQ_DESC_FLAGS_IPV4 | CQ_ENET_RQ_DESC_FLAGS_IPV6
- | CQ_ENET_RQ_DESC_FLAGS_TCP | CQ_ENET_RQ_DESC_FLAGS_UDP;
- return cq_type_table[cqrd_flags + tnl];
-}
-
-static inline void
-enic_cq_rx_to_pkt_flags(struct cq_desc *cqd, struct rte_mbuf *mbuf)
-{
- struct cq_enet_rq_desc *cqrd = (struct cq_enet_rq_desc *)cqd;
- uint16_t bwflags, pkt_flags = 0, vlan_tci;
- bwflags = enic_cq_rx_desc_bwflags(cqrd);
- vlan_tci = enic_cq_rx_desc_vlan(cqrd);
-
- /* VLAN STRIPPED flag. The L2 packet type updated here also */
- if (bwflags & CQ_ENET_RQ_DESC_FLAGS_VLAN_STRIPPED) {
- pkt_flags |= PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED;
- mbuf->packet_type |= RTE_PTYPE_L2_ETHER;
- } else {
- if (vlan_tci != 0) {
- pkt_flags |= PKT_RX_VLAN;
- mbuf->packet_type |= RTE_PTYPE_L2_ETHER_VLAN;
- } else {
- mbuf->packet_type |= RTE_PTYPE_L2_ETHER;
- }
- }
- mbuf->vlan_tci = vlan_tci;
-
- if ((cqd->type_color & CQ_DESC_TYPE_MASK) == CQ_DESC_TYPE_CLASSIFIER) {
- struct cq_enet_rq_clsf_desc *clsf_cqd;
- uint16_t filter_id;
- clsf_cqd = (struct cq_enet_rq_clsf_desc *)cqd;
- filter_id = clsf_cqd->filter_id;
- if (filter_id) {
- pkt_flags |= PKT_RX_FDIR;
- if (filter_id != ENIC_MAGIC_FILTER_ID) {
- mbuf->hash.fdir.hi = clsf_cqd->filter_id;
- pkt_flags |= PKT_RX_FDIR_ID;
- }
- }
- } else if (enic_cq_rx_desc_rss_type(cqrd)) {
- /* RSS flag */
- pkt_flags |= PKT_RX_RSS_HASH;
- mbuf->hash.rss = enic_cq_rx_desc_rss_hash(cqrd);
- }
-
- /* checksum flags */
- if (mbuf->packet_type & (RTE_PTYPE_L3_IPV4 | RTE_PTYPE_L3_IPV6)) {
- if (!enic_cq_rx_desc_csum_not_calc(cqrd)) {
- uint32_t l4_flags;
- l4_flags = mbuf->packet_type & RTE_PTYPE_L4_MASK;
-
- /*
- * When overlay offload is enabled, the NIC may
- * set ipv4_csum_ok=1 if the inner packet is IPv6..
- * So, explicitly check for IPv4 before checking
- * ipv4_csum_ok.
- */
- if (mbuf->packet_type & RTE_PTYPE_L3_IPV4) {
- if (enic_cq_rx_desc_ipv4_csum_ok(cqrd))
- pkt_flags |= PKT_RX_IP_CKSUM_GOOD;
- else
- pkt_flags |= PKT_RX_IP_CKSUM_BAD;
- }
-
- if (l4_flags == RTE_PTYPE_L4_UDP ||
- l4_flags == RTE_PTYPE_L4_TCP) {
- if (enic_cq_rx_desc_tcp_udp_csum_ok(cqrd))
- pkt_flags |= PKT_RX_L4_CKSUM_GOOD;
- else
- pkt_flags |= PKT_RX_L4_CKSUM_BAD;
- }
- }
- }
-
- mbuf->ol_flags = pkt_flags;
-}
-
/* dummy receive function to replace actual function in
* order to do safe reconfiguration operations.
*/
diff --git a/drivers/net/enic/enic_rxtx_common.h b/drivers/net/enic/enic_rxtx_common.h
new file mode 100644
index 000000000..bfbb4909e
--- /dev/null
+++ b/drivers/net/enic/enic_rxtx_common.h
@@ -0,0 +1,271 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2008-2018 Cisco Systems, Inc. All rights reserved.
+ * Copyright 2007 Nuova Systems, Inc. All rights reserved.
+ */
+
+#ifndef _ENIC_RXTX_COMMON_H_
+#define _ENIC_RXTX_COMMON_H_
+
+static inline uint16_t
+enic_cq_rx_desc_ciflags(struct cq_enet_rq_desc *crd)
+{
+ return le16_to_cpu(crd->completed_index_flags) & ~CQ_DESC_COMP_NDX_MASK;
+}
+
+static inline uint16_t
+enic_cq_rx_desc_bwflags(struct cq_enet_rq_desc *crd)
+{
+ return le16_to_cpu(crd->bytes_written_flags) &
+ ~CQ_ENET_RQ_DESC_BYTES_WRITTEN_MASK;
+}
+
+static inline uint8_t
+enic_cq_rx_desc_packet_error(uint16_t bwflags)
+{
+ return (bwflags & CQ_ENET_RQ_DESC_FLAGS_TRUNCATED) ==
+ CQ_ENET_RQ_DESC_FLAGS_TRUNCATED;
+}
+
+static inline uint8_t
+enic_cq_rx_desc_eop(uint16_t ciflags)
+{
+ return (ciflags & CQ_ENET_RQ_DESC_FLAGS_EOP)
+ == CQ_ENET_RQ_DESC_FLAGS_EOP;
+}
+
+static inline uint8_t
+enic_cq_rx_desc_csum_not_calc(struct cq_enet_rq_desc *cqrd)
+{
+ return (le16_to_cpu(cqrd->q_number_rss_type_flags) &
+ CQ_ENET_RQ_DESC_FLAGS_CSUM_NOT_CALC) ==
+ CQ_ENET_RQ_DESC_FLAGS_CSUM_NOT_CALC;
+}
+
+static inline uint8_t
+enic_cq_rx_desc_ipv4_csum_ok(struct cq_enet_rq_desc *cqrd)
+{
+ return (cqrd->flags & CQ_ENET_RQ_DESC_FLAGS_IPV4_CSUM_OK) ==
+ CQ_ENET_RQ_DESC_FLAGS_IPV4_CSUM_OK;
+}
+
+static inline uint8_t
+enic_cq_rx_desc_tcp_udp_csum_ok(struct cq_enet_rq_desc *cqrd)
+{
+ return (cqrd->flags & CQ_ENET_RQ_DESC_FLAGS_TCP_UDP_CSUM_OK) ==
+ CQ_ENET_RQ_DESC_FLAGS_TCP_UDP_CSUM_OK;
+}
+
+static inline uint8_t
+enic_cq_rx_desc_rss_type(struct cq_enet_rq_desc *cqrd)
+{
+ return (uint8_t)((le16_to_cpu(cqrd->q_number_rss_type_flags) >>
+ CQ_DESC_Q_NUM_BITS) & CQ_ENET_RQ_DESC_RSS_TYPE_MASK);
+}
+
+static inline uint32_t
+enic_cq_rx_desc_rss_hash(struct cq_enet_rq_desc *cqrd)
+{
+ return le32_to_cpu(cqrd->rss_hash);
+}
+
+static inline uint16_t
+enic_cq_rx_desc_vlan(struct cq_enet_rq_desc *cqrd)
+{
+ return le16_to_cpu(cqrd->vlan);
+}
+
+static inline uint16_t
+enic_cq_rx_desc_n_bytes(struct cq_desc *cqd)
+{
+ struct cq_enet_rq_desc *cqrd = (struct cq_enet_rq_desc *)cqd;
+ return le16_to_cpu(cqrd->bytes_written_flags) &
+ CQ_ENET_RQ_DESC_BYTES_WRITTEN_MASK;
+}
+
+
+static inline uint8_t
+enic_cq_rx_check_err(struct cq_desc *cqd)
+{
+ struct cq_enet_rq_desc *cqrd = (struct cq_enet_rq_desc *)cqd;
+ uint16_t bwflags;
+
+ bwflags = enic_cq_rx_desc_bwflags(cqrd);
+ if (unlikely(enic_cq_rx_desc_packet_error(bwflags)))
+ return 1;
+ return 0;
+}
+
+/* Lookup table to translate RX CQ flags to mbuf flags. */
+static uint32_t
+enic_cq_rx_flags_to_pkt_type(struct cq_desc *cqd, uint8_t tnl)
+{
+ struct cq_enet_rq_desc *cqrd = (struct cq_enet_rq_desc *)cqd;
+ uint8_t cqrd_flags = cqrd->flags;
+ /*
+ * Odd-numbered entries are for tunnel packets. All packet type info
+ * applies to the inner packet, and there is no info on the outer
+ * packet. The outer flags in these entries exist only to avoid
+ * changing enic_cq_rx_to_pkt_flags(). They are cleared from mbuf
+ * afterwards.
+ *
+ * Also, as there is no tunnel type info (VXLAN, NVGRE, or GENEVE), set
+ * RTE_PTYPE_TUNNEL_GRENAT.
+ */
+ static const uint32_t cq_type_table[128] __rte_cache_aligned = {
+ [0x00] = RTE_PTYPE_UNKNOWN,
+ [0x01] = RTE_PTYPE_UNKNOWN |
+ RTE_PTYPE_TUNNEL_GRENAT |
+ RTE_PTYPE_INNER_L2_ETHER,
+ [0x20] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_NONFRAG,
+ [0x21] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_NONFRAG |
+ RTE_PTYPE_TUNNEL_GRENAT |
+ RTE_PTYPE_INNER_L2_ETHER |
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_NONFRAG,
+ [0x22] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_UDP,
+ [0x23] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_UDP |
+ RTE_PTYPE_TUNNEL_GRENAT |
+ RTE_PTYPE_INNER_L2_ETHER |
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_UDP,
+ [0x24] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_TCP,
+ [0x25] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_TCP |
+ RTE_PTYPE_TUNNEL_GRENAT |
+ RTE_PTYPE_INNER_L2_ETHER |
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_TCP,
+ [0x60] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG,
+ [0x61] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG |
+ RTE_PTYPE_TUNNEL_GRENAT |
+ RTE_PTYPE_INNER_L2_ETHER |
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_FRAG,
+ [0x62] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG,
+ [0x63] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG |
+ RTE_PTYPE_TUNNEL_GRENAT |
+ RTE_PTYPE_INNER_L2_ETHER |
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_FRAG,
+ [0x64] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG,
+ [0x65] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG |
+ RTE_PTYPE_TUNNEL_GRENAT |
+ RTE_PTYPE_INNER_L2_ETHER |
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_FRAG,
+ [0x10] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_NONFRAG,
+ [0x11] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_NONFRAG |
+ RTE_PTYPE_TUNNEL_GRENAT |
+ RTE_PTYPE_INNER_L2_ETHER |
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_NONFRAG,
+ [0x12] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_UDP,
+ [0x13] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_UDP |
+ RTE_PTYPE_TUNNEL_GRENAT |
+ RTE_PTYPE_INNER_L2_ETHER |
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_UDP,
+ [0x14] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_TCP,
+ [0x15] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_TCP |
+ RTE_PTYPE_TUNNEL_GRENAT |
+ RTE_PTYPE_INNER_L2_ETHER |
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_TCP,
+ [0x50] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG,
+ [0x51] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG |
+ RTE_PTYPE_TUNNEL_GRENAT |
+ RTE_PTYPE_INNER_L2_ETHER |
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_FRAG,
+ [0x52] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG,
+ [0x53] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG |
+ RTE_PTYPE_TUNNEL_GRENAT |
+ RTE_PTYPE_INNER_L2_ETHER |
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_FRAG,
+ [0x54] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG,
+ [0x55] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG |
+ RTE_PTYPE_TUNNEL_GRENAT |
+ RTE_PTYPE_INNER_L2_ETHER |
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_FRAG,
+ /* All others reserved */
+ };
+ cqrd_flags &= CQ_ENET_RQ_DESC_FLAGS_IPV4_FRAGMENT
+ | CQ_ENET_RQ_DESC_FLAGS_IPV4 | CQ_ENET_RQ_DESC_FLAGS_IPV6
+ | CQ_ENET_RQ_DESC_FLAGS_TCP | CQ_ENET_RQ_DESC_FLAGS_UDP;
+ return cq_type_table[cqrd_flags + tnl];
+}
+
+static void
+enic_cq_rx_to_pkt_flags(struct cq_desc *cqd, struct rte_mbuf *mbuf)
+{
+ struct cq_enet_rq_desc *cqrd = (struct cq_enet_rq_desc *)cqd;
+ uint16_t bwflags, pkt_flags = 0, vlan_tci;
+ bwflags = enic_cq_rx_desc_bwflags(cqrd);
+ vlan_tci = enic_cq_rx_desc_vlan(cqrd);
+
+ /* VLAN STRIPPED flag. The L2 packet type updated here also */
+ if (bwflags & CQ_ENET_RQ_DESC_FLAGS_VLAN_STRIPPED) {
+ pkt_flags |= PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED;
+ mbuf->packet_type |= RTE_PTYPE_L2_ETHER;
+ } else {
+ if (vlan_tci != 0) {
+ pkt_flags |= PKT_RX_VLAN;
+ mbuf->packet_type |= RTE_PTYPE_L2_ETHER_VLAN;
+ } else {
+ mbuf->packet_type |= RTE_PTYPE_L2_ETHER;
+ }
+ }
+ mbuf->vlan_tci = vlan_tci;
+
+ if ((cqd->type_color & CQ_DESC_TYPE_MASK) == CQ_DESC_TYPE_CLASSIFIER) {
+ struct cq_enet_rq_clsf_desc *clsf_cqd;
+ uint16_t filter_id;
+ clsf_cqd = (struct cq_enet_rq_clsf_desc *)cqd;
+ filter_id = clsf_cqd->filter_id;
+ if (filter_id) {
+ pkt_flags |= PKT_RX_FDIR;
+ if (filter_id != ENIC_MAGIC_FILTER_ID) {
+ mbuf->hash.fdir.hi = clsf_cqd->filter_id;
+ pkt_flags |= PKT_RX_FDIR_ID;
+ }
+ }
+ } else if (enic_cq_rx_desc_rss_type(cqrd)) {
+ /* RSS flag */
+ pkt_flags |= PKT_RX_RSS_HASH;
+ mbuf->hash.rss = enic_cq_rx_desc_rss_hash(cqrd);
+ }
+
+ /* checksum flags */
+ if (mbuf->packet_type & (RTE_PTYPE_L3_IPV4 | RTE_PTYPE_L3_IPV6)) {
+ if (!enic_cq_rx_desc_csum_not_calc(cqrd)) {
+ uint32_t l4_flags;
+ l4_flags = mbuf->packet_type & RTE_PTYPE_L4_MASK;
+
+ /*
+ * When overlay offload is enabled, the NIC may
+ * set ipv4_csum_ok=1 if the inner packet is IPv6.
+ * So, explicitly check for IPv4 before checking
+ * ipv4_csum_ok.
+ */
+ if (mbuf->packet_type & RTE_PTYPE_L3_IPV4) {
+ if (enic_cq_rx_desc_ipv4_csum_ok(cqrd))
+ pkt_flags |= PKT_RX_IP_CKSUM_GOOD;
+ else
+ pkt_flags |= PKT_RX_IP_CKSUM_BAD;
+ }
+
+ if (l4_flags == RTE_PTYPE_L4_UDP ||
+ l4_flags == RTE_PTYPE_L4_TCP) {
+ if (enic_cq_rx_desc_tcp_udp_csum_ok(cqrd))
+ pkt_flags |= PKT_RX_L4_CKSUM_GOOD;
+ else
+ pkt_flags |= PKT_RX_L4_CKSUM_BAD;
+ }
+ }
+ }
+
+ mbuf->ol_flags = pkt_flags;
+}
+
+#endif /* _ENIC_RXTX_COMMON_H_ */
--
2.16.2
^ permalink raw reply [flat|nested] 11+ messages in thread
* [dpdk-dev] [PATCH v2 2/2] net/enic: add AVX2 based vectorized Rx handler
2018-09-28 19:20 ` [dpdk-dev] [PATCH v2 1/2] net/enic: move common Rx functions to a new header file John Daley
@ 2018-09-28 19:20 ` John Daley
2018-09-28 19:25 ` [dpdk-dev] [PATCH v2 1/2] net/enic: move common Rx functions to a new header file John Daley
1 sibling, 0 replies; 11+ messages in thread
From: John Daley @ 2018-09-28 19:20 UTC (permalink / raw)
To: ferruh.yigit; +Cc: dev, Hyong Youb Kim
From: Hyong Youb Kim <hyonkim@cisco.com>
Add the vectorized version of the no-scatter Rx handler. It aims to
process 8 descriptors per loop using AVX2 SIMD instructions. This
handler is in its own file enic_rxtx_vec_avx2.c, and makefile and
meson.build are modified to compile it when the compiler supports
AVX2. Under ideal conditions, the vectorized handler reduces
cycles/packet by more than 30%, when compared against the no-scatter
Rx handler. Most implementation ideas come from i40e's AVX2 based
handler, so credit goes to its authors.
At this point, the new handler is meant for field trials, and is not
selected by default. So add a new devarg enable-avx2-rx to allow the
user to request the use of the new handler. When enable-avx2-rx=1, the
driver will consider using the new handler.
Also update the guide doc and introduce the vectorized handler.
Signed-off-by: Hyong Youb Kim <hyonkim@cisco.com>
---
v2: remove bool type from structure (found by checkpatch)
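For reference, a minimal sketch of how the new handler can be requested at
run time, e.g. from testpmd (the core list and PCI address below are only
placeholders), together with the debug log level mentioned in the guide:

    testpmd -l 1-2 -w 0000:12:00.0,enable-avx2-rx=1 --log-level=pmd,debug -- -i

When the handler is selected, the "use the non-scatter avx2 Rx handler"
message appears during initialization.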
doc/guides/nics/enic.rst | 32 ++
drivers/net/enic/Makefile | 7 +
drivers/net/enic/enic.h | 3 +
drivers/net/enic/enic_ethdev.c | 27 +-
drivers/net/enic/enic_main.c | 34 +-
drivers/net/enic/enic_rxtx_vec_avx2.c | 832 ++++++++++++++++++++++++++++++++++
drivers/net/enic/meson.build | 5 +
7 files changed, 931 insertions(+), 9 deletions(-)
create mode 100644 drivers/net/enic/enic_rxtx_vec_avx2.c
diff --git a/doc/guides/nics/enic.rst b/doc/guides/nics/enic.rst
index 86941fdb2..b31f4eef9 100644
--- a/doc/guides/nics/enic.rst
+++ b/doc/guides/nics/enic.rst
@@ -351,6 +351,38 @@ suitable for others. Such applications may change the mode by setting
applications such as OVS-DPDK performance benchmarks that utilize
only the default VLAN and want to see only untagged packets.
+
+Vectorized Rx Handler
+---------------------
+
+ENIC PMD includes a version of the receive handler that is vectorized using
+AVX2 SIMD instructions. It is meant for bulk, throughput-oriented workloads
+where reducing cycles/packet in PMD is a priority. In order to use the
+vectorized handler, take the following steps.
+
+- Use a recent version of gcc, icc, or clang and build 64-bit DPDK. If
+ the compiler is known to support AVX2, the DPDK build system
+ automatically compiles the vectorized handler. Otherwise, the
+ handler is not available.
+
+- Set ``devargs`` parameter ``enable-avx2-rx=1`` to explicitly request that
+ PMD consider the vectorized handler when selecting the receive handler.
+
+ As the current implementation is intended for field trials, by default, the
+ vectorized handler is not considered (``enable-avx2-rx=0``).
+
+- Run on a UCS M4 or later server with CPUs that support AVX2.
+
+PMD selects the vectorized handler when the handler is compiled into
+the driver, the user requests its use via ``enable-avx2-rx=1``, the CPU
+supports AVX2, and scatter Rx is not used. To verify that the
+vectorized handler is selected, enable debug logging
+(``--log-level=pmd,debug``) and check the following message.
+
+.. code-block:: console
+
+ enic_use_vector_rx_handler use the non-scatter avx2 Rx handler
+
.. _enic_limitations:
Limitations
diff --git a/drivers/net/enic/Makefile b/drivers/net/enic/Makefile
index 7c6c29cc0..3ec6f9159 100644
--- a/drivers/net/enic/Makefile
+++ b/drivers/net/enic/Makefile
@@ -39,4 +39,11 @@ SRCS-$(CONFIG_RTE_LIBRTE_ENIC_PMD) += base/vnic_intr.c
SRCS-$(CONFIG_RTE_LIBRTE_ENIC_PMD) += base/vnic_rq.c
SRCS-$(CONFIG_RTE_LIBRTE_ENIC_PMD) += base/vnic_rss.c
+ifeq ($(findstring RTE_MACHINE_CPUFLAG_AVX2,$(CFLAGS)),RTE_MACHINE_CPUFLAG_AVX2)
+# The current implementation assumes 64-bit pointers
+ifeq ($(CONFIG_RTE_ARCH_X86_64),y)
+ SRCS-$(CONFIG_RTE_LIBRTE_ENIC_PMD) += enic_rxtx_vec_avx2.c
+endif
+endif
+
include $(RTE_SDK)/mk/rte.lib.mk
diff --git a/drivers/net/enic/enic.h b/drivers/net/enic/enic.h
index 775cd5d55..665f5668a 100644
--- a/drivers/net/enic/enic.h
+++ b/drivers/net/enic/enic.h
@@ -106,6 +106,7 @@ struct enic {
struct vnic_dev_bar bar0;
struct vnic_dev *vdev;
+ uint64_t mbuf_initializer;
unsigned int port_id;
bool overlay_offload;
struct rte_eth_dev *rte_dev;
@@ -128,6 +129,7 @@ struct enic {
u8 filter_actions; /* HW supported actions */
bool vxlan;
bool disable_overlay; /* devargs disable_overlay=1 */
+ uint8_t enable_avx2_rx; /* devargs enable-avx2-rx=1 */
bool nic_cfg_chk; /* NIC_CFG_CHK available */
bool udp_rss_weak; /* Bodega style UDP RSS */
uint8_t ig_vlan_rewrite_mode; /* devargs ig-vlan-rewrite */
@@ -329,6 +331,7 @@ uint16_t enic_prep_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
uint16_t nb_pkts);
int enic_set_mtu(struct enic *enic, uint16_t new_mtu);
int enic_link_update(struct enic *enic);
+bool enic_use_vector_rx_handler(struct enic *enic);
void enic_fdir_info(struct enic *enic);
void enic_fdir_info_get(struct enic *enic, struct rte_eth_fdir_info *stats);
void copy_fltr_v1(struct filter_v2 *fltr, struct rte_eth_fdir_input *input,
diff --git a/drivers/net/enic/enic_ethdev.c b/drivers/net/enic/enic_ethdev.c
index 65333c47a..4d450fe0c 100644
--- a/drivers/net/enic/enic_ethdev.c
+++ b/drivers/net/enic/enic_ethdev.c
@@ -37,6 +37,7 @@ static const struct rte_pci_id pci_id_enic_map[] = {
};
#define ENIC_DEVARG_DISABLE_OVERLAY "disable-overlay"
+#define ENIC_DEVARG_ENABLE_AVX2_RX "enable-avx2-rx"
#define ENIC_DEVARG_IG_VLAN_REWRITE "ig-vlan-rewrite"
RTE_INIT(enicpmd_init_log)
@@ -915,22 +916,27 @@ static const struct eth_dev_ops enicpmd_eth_dev_ops = {
.udp_tunnel_port_del = enicpmd_dev_udp_tunnel_port_del,
};
-static int enic_parse_disable_overlay(__rte_unused const char *key,
- const char *value,
- void *opaque)
+static int enic_parse_zero_one(const char *key,
+ const char *value,
+ void *opaque)
{
struct enic *enic;
+ bool b;
enic = (struct enic *)opaque;
if (strcmp(value, "0") == 0) {
- enic->disable_overlay = false;
+ b = false;
} else if (strcmp(value, "1") == 0) {
- enic->disable_overlay = true;
+ b = true;
} else {
- dev_err(enic, "Invalid value for " ENIC_DEVARG_DISABLE_OVERLAY
- ": expected=0|1 given=%s\n", value);
+ dev_err(enic, "Invalid value for %s"
+ ": expected=0|1 given=%s\n", key, value);
return -EINVAL;
}
+ if (strcmp(key, ENIC_DEVARG_DISABLE_OVERLAY) == 0)
+ enic->disable_overlay = b;
+ if (strcmp(key, ENIC_DEVARG_ENABLE_AVX2_RX) == 0)
+ enic->enable_avx2_rx = b;
return 0;
}
@@ -971,6 +977,7 @@ static int enic_check_devargs(struct rte_eth_dev *dev)
{
static const char *const valid_keys[] = {
ENIC_DEVARG_DISABLE_OVERLAY,
+ ENIC_DEVARG_ENABLE_AVX2_RX,
ENIC_DEVARG_IG_VLAN_REWRITE,
NULL};
struct enic *enic = pmd_priv(dev);
@@ -979,6 +986,7 @@ static int enic_check_devargs(struct rte_eth_dev *dev)
ENICPMD_FUNC_TRACE();
enic->disable_overlay = false;
+ enic->enable_avx2_rx = false;
enic->ig_vlan_rewrite_mode = IG_VLAN_REWRITE_MODE_PASS_THRU;
if (!dev->device->devargs)
return 0;
@@ -986,7 +994,9 @@ static int enic_check_devargs(struct rte_eth_dev *dev)
if (!kvlist)
return -EINVAL;
if (rte_kvargs_process(kvlist, ENIC_DEVARG_DISABLE_OVERLAY,
- enic_parse_disable_overlay, enic) < 0 ||
+ enic_parse_zero_one, enic) < 0 ||
+ rte_kvargs_process(kvlist, ENIC_DEVARG_ENABLE_AVX2_RX,
+ enic_parse_zero_one, enic) < 0 ||
rte_kvargs_process(kvlist, ENIC_DEVARG_IG_VLAN_REWRITE,
enic_parse_ig_vlan_rewrite, enic) < 0) {
rte_kvargs_free(kvlist);
@@ -1055,4 +1065,5 @@ RTE_PMD_REGISTER_PCI_TABLE(net_enic, pci_id_enic_map);
RTE_PMD_REGISTER_KMOD_DEP(net_enic, "* igb_uio | uio_pci_generic | vfio-pci");
RTE_PMD_REGISTER_PARAM_STRING(net_enic,
ENIC_DEVARG_DISABLE_OVERLAY "=0|1 "
+ ENIC_DEVARG_ENABLE_AVX2_RX "=0|1 "
ENIC_DEVARG_IG_VLAN_REWRITE "=trunk|untag|priority|pass");
diff --git a/drivers/net/enic/enic_main.c b/drivers/net/enic/enic_main.c
index ea6cddbd3..aed73ee1b 100644
--- a/drivers/net/enic/enic_main.c
+++ b/drivers/net/enic/enic_main.c
@@ -514,12 +514,29 @@ static void enic_prep_wq_for_simple_tx(struct enic *enic, uint16_t queue_idx)
}
}
+/*
+ * The 'strong' version is in enic_rxtx_vec_avx2.c. This weak version is
+ * used when that file is not compiled.
+ */
+bool __attribute__((weak))
+enic_use_vector_rx_handler(__rte_unused struct enic *enic)
+{
+ return false;
+}
+
static void pick_rx_handler(struct enic *enic)
{
struct rte_eth_dev *eth_dev;
- /* Use the non-scatter, simplified RX handler if possible. */
+ /*
+ * Preference order:
+ * 1. The vectorized handler if possible and requested.
+ * 2. The non-scatter, simplified handler if scatter Rx is not used.
+ * 3. The default handler as a fallback.
+ */
eth_dev = enic->rte_dev;
+ if (enic_use_vector_rx_handler(enic))
+ return;
if (enic->rq_count > 0 && enic->rq[0].data_queue_enable == 0) {
PMD_INIT_LOG(DEBUG, " use the non-scatter Rx handler");
eth_dev->rx_pkt_burst = &enic_noscatter_recv_pkts;
@@ -535,6 +552,21 @@ int enic_enable(struct enic *enic)
int err;
struct rte_eth_dev *eth_dev = enic->rte_dev;
uint64_t simple_tx_offloads;
+ uintptr_t p;
+ struct rte_mbuf mb_def = { .buf_addr = 0 };
+
+ /*
+ * mbuf_initializer contains const-after-init fields of
+ * receive mbufs (i.e. 64 bits of fields from rearm_data).
+ * It is currently used by the vectorized handler.
+ */
+ mb_def.nb_segs = 1;
+ mb_def.data_off = RTE_PKTMBUF_HEADROOM;
+ mb_def.port = enic->port_id;
+ rte_mbuf_refcnt_set(&mb_def, 1);
+ rte_compiler_barrier();
+ p = (uintptr_t)&mb_def.rearm_data;
+ enic->mbuf_initializer = *(uint64_t *)p;
eth_dev->data->dev_link.link_speed = vnic_dev_port_speed(enic->vdev);
eth_dev->data->dev_link.link_duplex = ETH_LINK_FULL_DUPLEX;
diff --git a/drivers/net/enic/enic_rxtx_vec_avx2.c b/drivers/net/enic/enic_rxtx_vec_avx2.c
new file mode 100644
index 000000000..4891cda1b
--- /dev/null
+++ b/drivers/net/enic/enic_rxtx_vec_avx2.c
@@ -0,0 +1,832 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2008-2018 Cisco Systems, Inc. All rights reserved.
+ * Copyright 2007 Nuova Systems, Inc. All rights reserved.
+ */
+
+#include <rte_mbuf.h>
+#include <rte_ethdev_driver.h>
+
+#include "enic_compat.h"
+#include "rq_enet_desc.h"
+#include "enic.h"
+#include "enic_rxtx_common.h"
+
+#include <x86intrin.h>
+
+static struct rte_mbuf *
+rx_one(struct cq_enet_rq_desc *cqd, struct rte_mbuf *mb, struct enic *enic)
+{
+ bool tnl;
+
+ *(uint64_t *)&mb->rearm_data = enic->mbuf_initializer;
+ mb->data_len = cqd->bytes_written_flags &
+ CQ_ENET_RQ_DESC_BYTES_WRITTEN_MASK;
+ mb->pkt_len = mb->data_len;
+ tnl = enic->overlay_offload && (cqd->completed_index_flags &
+ CQ_ENET_RQ_DESC_FLAGS_FCOE) != 0;
+ mb->packet_type =
+ enic_cq_rx_flags_to_pkt_type((struct cq_desc *)cqd, tnl);
+ enic_cq_rx_to_pkt_flags((struct cq_desc *)cqd, mb);
+ /* Wipe the outer types set by enic_cq_rx_flags_to_pkt_type() */
+ if (tnl) {
+ mb->packet_type &= ~(RTE_PTYPE_L3_MASK |
+ RTE_PTYPE_L4_MASK);
+ }
+ return mb;
+}
+
+static uint16_t
+enic_noscatter_vec_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
+ uint16_t nb_pkts)
+{
+ struct rte_mbuf **rx, **rxmb;
+ uint16_t cq_idx, nb_rx, max_rx;
+ struct cq_enet_rq_desc *cqd;
+ struct rq_enet_desc *rqd;
+ struct vnic_cq *cq;
+ struct vnic_rq *rq;
+ struct enic *enic;
+ uint8_t color;
+
+ rq = rx_queue;
+ enic = vnic_dev_priv(rq->vdev);
+ cq = &enic->cq[enic_cq_rq(enic, rq->index)];
+ cq_idx = cq->to_clean;
+
+ /*
+ * Fill up the reserve of free mbufs. Below, we restock the receive
+ * ring with these mbufs to avoid allocation failures.
+ */
+ if (rq->num_free_mbufs == 0) {
+ if (rte_mempool_get_bulk(rq->mp, (void **)rq->free_mbufs,
+ ENIC_RX_BURST_MAX))
+ return 0;
+ rq->num_free_mbufs = ENIC_RX_BURST_MAX;
+ }
+ /* Receive until the end of the ring, at most. */
+ max_rx = RTE_MIN(nb_pkts, rq->num_free_mbufs);
+ max_rx = RTE_MIN(max_rx, cq->ring.desc_count - cq_idx);
+
+ rxmb = rq->mbuf_ring + cq_idx;
+ color = cq->last_color;
+ cqd = (struct cq_enet_rq_desc *)(cq->ring.descs) + cq_idx;
+ rx = rx_pkts;
+ if (max_rx == 0 ||
+ (cqd->type_color & CQ_DESC_COLOR_MASK_NOSHIFT) == color)
+ return 0;
+
+ /* Step 1: Process one packet to do aligned 256-bit load below */
+ if (cq_idx & 0x1) {
+ if (unlikely(cqd->bytes_written_flags &
+ CQ_ENET_RQ_DESC_FLAGS_TRUNCATED)) {
+ rte_pktmbuf_free(*rxmb++);
+ rte_atomic64_inc(&enic->soft_stats.rx_packet_errors);
+ } else {
+ *rx++ = rx_one(cqd, *rxmb++, enic);
+ }
+ cqd++;
+ max_rx--;
+ }
+
+ const __m256i mask =
+ _mm256_set_epi8(/* Second descriptor */
+ 0xff, /* type_color */
+ (CQ_ENET_RQ_DESC_FLAGS_IPV4_FRAGMENT |
+ CQ_ENET_RQ_DESC_FLAGS_IPV4 |
+ CQ_ENET_RQ_DESC_FLAGS_IPV6 |
+ CQ_ENET_RQ_DESC_FLAGS_TCP |
+ CQ_ENET_RQ_DESC_FLAGS_UDP), /* flags */
+ 0, 0, /* checksum_fcoe */
+ 0xff, 0xff, /* vlan */
+ 0x3f, 0xff, /* bytes_written_flags */
+ 0xff, 0xff, 0xff, 0xff, /* rss_hash */
+ 0xff, 0xff, /* q_number_rss_type_flags */
+ 0, 0, /* completed_index_flags */
+ /* First descriptor */
+ 0xff, /* type_color */
+ (CQ_ENET_RQ_DESC_FLAGS_IPV4_FRAGMENT |
+ CQ_ENET_RQ_DESC_FLAGS_IPV4 |
+ CQ_ENET_RQ_DESC_FLAGS_IPV6 |
+ CQ_ENET_RQ_DESC_FLAGS_TCP |
+ CQ_ENET_RQ_DESC_FLAGS_UDP), /* flags */
+ 0, 0, /* checksum_fcoe */
+ 0xff, 0xff, /* vlan */
+ 0x3f, 0xff, /* bytes_written_flags */
+ 0xff, 0xff, 0xff, 0xff, /* rss_hash */
+ 0xff, 0xff, /* q_number_rss_type_flags */
+ 0, 0 /* completed_index_flags */
+ );
+ const __m256i shuffle_mask =
+ _mm256_set_epi8(/* Second descriptor */
+ 7, 6, 5, 4, /* rss = rss_hash */
+ 11, 10, /* vlan_tci = vlan */
+ 9, 8, /* data_len = bytes_written */
+ 0x80, 0x80, 9, 8, /* pkt_len = bytes_written */
+ 0x80, 0x80, 0x80, 0x80, /* packet_type = 0 */
+ /* First descriptor */
+ 7, 6, 5, 4, /* rss = rss_hash */
+ 11, 10, /* vlan_tci = vlan */
+ 9, 8, /* data_len = bytes_written */
+ 0x80, 0x80, 9, 8, /* pkt_len = bytes_written */
+ 0x80, 0x80, 0x80, 0x80 /* packet_type = 0 */
+ );
+ /* Used to collect 8 flags from 8 desc into one register */
+ const __m256i flags_shuffle_mask =
+ _mm256_set_epi8(/* Second descriptor */
+ 1, 3, 9, 14,
+ 1, 3, 9, 14,
+ 1, 3, 9, 14,
+ 1, 3, 9, 14,
+ /* First descriptor */
+ 1, 3, 9, 14,
+ 1, 3, 9, 14,
+ 1, 3, 9, 14,
+ /*
+ * Byte 3: upper byte of completed_index_flags
+ * bit 5 = fcoe (tunnel)
+ * Byte 2: upper byte of q_number_rss_type_flags
+ * bits 2,3,4,5 = rss type
+ * bit 6 = csum_not_calc
+ * Byte 1: upper byte of bytes_written_flags
+ * bit 6 = truncated
+ * bit 7 = vlan stripped
+ * Byte 0: flags
+ */
+ 1, 3, 9, 14
+ );
+ /* Used to collect 8 VLAN IDs from 8 desc into one register */
+ const __m256i vlan_shuffle_mask =
+ _mm256_set_epi8(/* Second descriptor */
+ 0x80, 0x80, 11, 10,
+ 0x80, 0x80, 11, 10,
+ 0x80, 0x80, 11, 10,
+ 0x80, 0x80, 11, 10,
+ /* First descriptor */
+ 0x80, 0x80, 11, 10,
+ 0x80, 0x80, 11, 10,
+ 0x80, 0x80, 11, 10,
+ 0x80, 0x80, 11, 10);
+ /* PKT_RX_RSS_HASH is 1<<1 so fits in 8-bit integer */
+ const __m256i rss_shuffle =
+ _mm256_set_epi8(/* second 128 bits */
+ PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
+ PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
+ PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
+ PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
+ PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
+ 0, /* rss_types = 0 */
+ /* first 128 bits */
+ PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
+ PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
+ PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
+ PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
+ PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
+ 0 /* rss_types = 0 */);
+ /*
+ * VLAN offload flags.
+ * shuffle index:
+ * vlan_stripped => bit 0
+ * vlan_id == 0 => bit 1
+ */
+ const __m256i vlan_shuffle =
+ _mm256_set_epi32(0, 0, 0, 0,
+ PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, 0,
+ PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, PKT_RX_VLAN);
+ /* Use the same shuffle index as vlan_shuffle */
+ const __m256i vlan_ptype_shuffle =
+ _mm256_set_epi32(0, 0, 0, 0,
+ RTE_PTYPE_L2_ETHER,
+ RTE_PTYPE_L2_ETHER,
+ RTE_PTYPE_L2_ETHER,
+ RTE_PTYPE_L2_ETHER_VLAN);
+ /*
+ * CKSUM flags. Shift right so they fit in 8-bit integers.
+ * shuffle index:
+ * ipv4_csum_ok => bit 3
+ * ip4 => bit 2
+ * tcp_or_udp => bit 1
+ * tcp_udp_csum_ok => bit 0
+ */
+ const __m256i csum_shuffle =
+ _mm256_set_epi8(/* second 128 bits */
+ /* 1111 ip4+ip4_ok+l4+l4_ok */
+ ((PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1),
+ /* 1110 ip4_ok+ip4+l4+!l4_ok */
+ ((PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD) >> 1),
+ (PKT_RX_IP_CKSUM_GOOD >> 1), /* 1101 ip4+ip4_ok */
+ (PKT_RX_IP_CKSUM_GOOD >> 1), /* 1100 ip4_ok+ip4 */
+ (PKT_RX_L4_CKSUM_GOOD >> 1), /* 1011 l4+l4_ok */
+ (PKT_RX_L4_CKSUM_BAD >> 1), /* 1010 l4+!l4_ok */
+ 0, /* 1001 */
+ 0, /* 1000 */
+ /* 0111 !ip4_ok+ip4+l4+l4_ok */
+ ((PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD) >> 1),
+ /* 0110 !ip4_ok+ip4+l4+!l4_ok */
+ ((PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD) >> 1),
+ (PKT_RX_IP_CKSUM_BAD >> 1), /* 0101 !ip4_ok+ip4 */
+ (PKT_RX_IP_CKSUM_BAD >> 1), /* 0100 !ip4_ok+ip4 */
+ (PKT_RX_L4_CKSUM_GOOD >> 1), /* 0011 l4+l4_ok */
+ (PKT_RX_L4_CKSUM_BAD >> 1), /* 0010 l4+!l4_ok */
+ 0, /* 0001 */
+ 0, /* 0000 */
+ /* first 128 bits */
+ ((PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1),
+ ((PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD) >> 1),
+ (PKT_RX_IP_CKSUM_GOOD >> 1),
+ (PKT_RX_IP_CKSUM_GOOD >> 1),
+ (PKT_RX_L4_CKSUM_GOOD >> 1),
+ (PKT_RX_L4_CKSUM_BAD >> 1),
+ 0, 0,
+ ((PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD) >> 1),
+ ((PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD) >> 1),
+ (PKT_RX_IP_CKSUM_BAD >> 1),
+ (PKT_RX_IP_CKSUM_BAD >> 1),
+ (PKT_RX_L4_CKSUM_GOOD >> 1),
+ (PKT_RX_L4_CKSUM_BAD >> 1),
+ 0, 0);
+ /*
+ * Non-fragment PTYPEs.
+ * Shuffle 4-bit index:
+ * ip6 => bit 0
+ * ip4 => bit 1
+ * udp => bit 2
+ * tcp => bit 3
+ * bit
+ * 3 2 1 0
+ * -------
+ * 0 0 0 0 unknown
+ * 0 0 0 1 ip6 | nonfrag
+ * 0 0 1 0 ip4 | nonfrag
+ * 0 0 1 1 unknown
+ * 0 1 0 0 unknown
+ * 0 1 0 1 ip6 | udp
+ * 0 1 1 0 ip4 | udp
+ * 0 1 1 1 unknown
+ * 1 0 0 0 unknown
+ * 1 0 0 1 ip6 | tcp
+ * 1 0 1 0 ip4 | tcp
+ * 1 0 1 1 unknown
+ * 1 1 0 0 unknown
+ * 1 1 0 1 unknown
+ * 1 1 1 0 unknown
+ * 1 1 1 1 unknown
+ *
+ * PTYPEs do not fit in 8 bits, so shift right by 4.
+ */
+ const __m256i nonfrag_ptype_shuffle =
+ _mm256_set_epi8(/* second 128 bits */
+ RTE_PTYPE_UNKNOWN,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ (RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_TCP) >> 4,
+ (RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_TCP) >> 4,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ (RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_UDP) >> 4,
+ (RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_UDP) >> 4,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ (RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_L4_NONFRAG) >> 4,
+ (RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_L4_NONFRAG) >> 4,
+ RTE_PTYPE_UNKNOWN,
+ /* first 128 bits */
+ RTE_PTYPE_UNKNOWN,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ (RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_TCP) >> 4,
+ (RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_TCP) >> 4,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ (RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_UDP) >> 4,
+ (RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_UDP) >> 4,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ (RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_L4_NONFRAG) >> 4,
+ (RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_L4_NONFRAG) >> 4,
+ RTE_PTYPE_UNKNOWN);
+ /* Fragment PTYPEs. Use the same shuffle index as above. */
+ const __m256i frag_ptype_shuffle =
+ _mm256_set_epi8(/* second 128 bits */
+ RTE_PTYPE_UNKNOWN,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ (RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_L4_FRAG) >> 4,
+ (RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_L4_FRAG) >> 4,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ (RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_L4_FRAG) >> 4,
+ (RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_L4_FRAG) >> 4,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ (RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_L4_FRAG) >> 4,
+ (RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_L4_FRAG) >> 4,
+ RTE_PTYPE_UNKNOWN,
+ /* first 128 bits */
+ RTE_PTYPE_UNKNOWN,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ (RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_L4_FRAG) >> 4,
+ (RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_L4_FRAG) >> 4,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ (RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_L4_FRAG) >> 4,
+ (RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_L4_FRAG) >> 4,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ (RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_L4_FRAG) >> 4,
+ (RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_L4_FRAG) >> 4,
+ RTE_PTYPE_UNKNOWN);
+ /*
+ * Tunnel PTYPEs. Use the same shuffle index as above.
+ * L4 types are not part of this table. They come from non-tunnel
+ * types above.
+ */
+ const __m256i tnl_l3_ptype_shuffle =
+ _mm256_set_epi8(/* second 128 bits */
+ RTE_PTYPE_UNKNOWN,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN >> 16,
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN >> 16,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN >> 16,
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN >> 16,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN >> 16,
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN >> 16,
+ RTE_PTYPE_UNKNOWN,
+ /* first 128 bits */
+ RTE_PTYPE_UNKNOWN,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN >> 16,
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN >> 16,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN >> 16,
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN >> 16,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN >> 16,
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN >> 16,
+ RTE_PTYPE_UNKNOWN);
+
+ const __m256i mbuf_init = _mm256_set_epi64x(0, enic->mbuf_initializer,
+ 0, enic->mbuf_initializer);
+
+ /*
+ * --- cq desc fields --- offset
+ * completed_index_flags - 0 use: fcoe
+ * q_number_rss_type_flags - 2 use: rss types, csum_not_calc
+ * rss_hash - 4 ==> mbuf.hash.rss
+ * bytes_written_flags - 8 ==> mbuf.pkt_len,data_len
+ * use: truncated, vlan_stripped
+ * vlan - 10 ==> mbuf.vlan_tci
+ * checksum_fcoe - 12 (unused)
+ * flags - 14 use: all bits
+ * type_color - 15 (unused)
+ *
+ * --- mbuf fields --- offset
+ * rearm_data ---- 16
+ * data_off - 0 (mbuf_init) -+
+ * refcnt - 2 (mbuf_init) |
+ * nb_segs - 4 (mbuf_init) | 16B 128b
+ * port - 6 (mbuf_init) |
+ * ol_flag - 8 (from cqd) -+
+ * rx_descriptor_fields1 ---- 32
+ * packet_type - 0 (from cqd) -+
+ * pkt_len - 4 (from cqd) |
+ * data_len - 8 (from cqd) | 16B 128b
+ * vlan_tci - 10 (from cqd) |
+ * rss - 12 (from cqd) -+
+ */
+
+ __m256i overlay_enabled =
+ _mm256_set1_epi32((uint32_t)enic->overlay_offload);
+
+ /* Step 2: Process 8 packets per loop using SIMD */
+ while (max_rx > 7 && (((cqd + 7)->type_color &
+ CQ_DESC_COLOR_MASK_NOSHIFT) != color)) {
+ /* Load 8 16B CQ descriptors */
+ __m256i cqd01 = _mm256_load_si256((void *)cqd);
+ __m256i cqd23 = _mm256_load_si256((void *)(cqd + 2));
+ __m256i cqd45 = _mm256_load_si256((void *)(cqd + 4));
+ __m256i cqd67 = _mm256_load_si256((void *)(cqd + 6));
+ /* Copy 8 mbuf pointers to rx_pkts */
+ _mm256_storeu_si256((void *)rx,
+ _mm256_loadu_si256((void *)rxmb));
+ _mm256_storeu_si256((void *)(rx + 4),
+ _mm256_loadu_si256((void *)(rxmb + 4)));
+
+ /*
+ * Collect 8 flags (each 32 bits) into one register.
+ * 4 shuffles, 3 blends, 1 permute for 8 desc: 1 inst/desc
+ */
+ __m256i flags01 =
+ _mm256_shuffle_epi8(cqd01, flags_shuffle_mask);
+ /*
+ * Shuffle above produces 8 x 32-bit flags for 8 descriptors
+ * in this order: 0, 0, 0, 0, 1, 1, 1, 1
+ * The duplicates in each 128-bit lane simplify blending
+ * below.
+ */
+ __m256i flags23 =
+ _mm256_shuffle_epi8(cqd23, flags_shuffle_mask);
+ __m256i flags45 =
+ _mm256_shuffle_epi8(cqd45, flags_shuffle_mask);
+ __m256i flags67 =
+ _mm256_shuffle_epi8(cqd67, flags_shuffle_mask);
+ /* 1st blend produces flags for desc: 0, 2, 0, 0, 1, 3, 1, 1 */
+ __m256i flags0_3 = _mm256_blend_epi32(flags01, flags23, 0x22);
+ /* 2nd blend produces flags for desc: 4, 4, 4, 6, 5, 5, 5, 7 */
+ __m256i flags4_7 = _mm256_blend_epi32(flags45, flags67, 0x88);
+ /* 3rd blend produces flags for desc: 0, 2, 4, 6, 1, 3, 5, 7 */
+ __m256i flags0_7 = _mm256_blend_epi32(flags0_3, flags4_7, 0xcc);
+ /*
+ * Swap to reorder flags in this order: 1, 3, 5, 7, 0, 2, 4, 6
+ * This order simplifies the blend operations further below that
+ * produce 'rearm' data for each mbuf.
+ */
+ flags0_7 = _mm256_permute4x64_epi64(flags0_7,
+ (1 << 6) + (0 << 4) + (3 << 2) + 2);
+
+ /*
+ * Check the truncated bits and bail out early.
+ * 6 avx inst, 1 or, 1 if-then-else for 8 desc: 1 inst/desc
+ */
+ __m256i trunc =
+ _mm256_srli_epi32(_mm256_slli_epi32(flags0_7, 17), 31);
+ trunc = _mm256_add_epi64(trunc, _mm256_permute4x64_epi64(trunc,
+ (1 << 6) + (0 << 4) + (3 << 2) + 2));
+ /* 0:63 contains 1+3+0+2 and 64:127 contains 5+7+4+6 */
+ if (_mm256_extract_epi64(trunc, 0) ||
+ _mm256_extract_epi64(trunc, 1))
+ break;
+
+ /*
+ * Compute PKT_RX_RSS_HASH.
+ * Use 2 shifts and 1 shuffle for 8 desc: 0.375 inst/desc
+ * RSS types in byte 0, 4, 8, 12, 16, 20, 24, 28
+ * Everything else is zero.
+ */
+ __m256i rss_types =
+ _mm256_srli_epi32(_mm256_slli_epi32(flags0_7, 10), 28);
+ /*
+ * RSS flags (PKT_RX_RSS_HASH) are in
+ * byte 0, 4, 8, 12, 16, 20, 24, 28
+ * Everything else is zero.
+ */
+ __m256i rss_flags = _mm256_shuffle_epi8(rss_shuffle, rss_types);
+
+ /*
+ * Compute CKSUM flags. First build the index and then
+ * use it to shuffle csum_shuffle.
+ * 20 instructions including const loads: 2.5 inst/desc
+ */
+ /*
+ * csum_not_calc (bit 22)
+ * csum_not_calc (0) => 0xffffffff
+ * csum_not_calc (1) => 0x0
+ */
+ const __m256i zero4 = _mm256_setzero_si256();
+ const __m256i mask22 = _mm256_set1_epi32(0x400000);
+ __m256i csum_not_calc = _mm256_cmpeq_epi32(zero4,
+ _mm256_and_si256(flags0_7, mask22));
+ /*
+ * (tcp|udp) && !fragment => bit 1
+ * tcp = bit 2, udp = bit 1, frag = bit 6
+ */
+ const __m256i mask1 = _mm256_set1_epi32(0x2);
+ __m256i tcp_udp =
+ _mm256_andnot_si256(_mm256_srli_epi32(flags0_7, 5),
+ _mm256_or_si256(flags0_7,
+ _mm256_srli_epi32(flags0_7, 1)));
+ tcp_udp = _mm256_and_si256(tcp_udp, mask1);
+ /* ipv4 (bit 5) => bit 2 */
+ const __m256i mask2 = _mm256_set1_epi32(0x4);
+ __m256i ipv4 = _mm256_and_si256(mask2,
+ _mm256_srli_epi32(flags0_7, 3));
+ /*
+ * ipv4_csum_ok (bit 3) => bit 3
+ * tcp_udp_csum_ok (bit 0) => bit 0
+ * 0x9
+ */
+ const __m256i mask0_3 = _mm256_set1_epi32(0x9);
+ __m256i csum_idx = _mm256_and_si256(flags0_7, mask0_3);
+ csum_idx = _mm256_and_si256(csum_not_calc,
+ _mm256_or_si256(_mm256_or_si256(csum_idx, ipv4),
+ tcp_udp));
+ __m256i csum_flags =
+ _mm256_shuffle_epi8(csum_shuffle, csum_idx);
+ /* Shift left to restore CKSUM flags. See csum_shuffle. */
+ csum_flags = _mm256_slli_epi32(csum_flags, 1);
+ /* Combine csum flags and offload flags: 0.125 inst/desc */
+ rss_flags = _mm256_or_si256(rss_flags, csum_flags);
+
+ /*
+ * Collect 8 VLAN IDs and compute vlan_id != 0 on each.
+ * 4 shuffles, 3 blends, 1 permute, 1 cmp, 1 sub for 8 desc:
+ * 1.25 inst/desc
+ */
+ __m256i vlan01 = _mm256_shuffle_epi8(cqd01, vlan_shuffle_mask);
+ __m256i vlan23 = _mm256_shuffle_epi8(cqd23, vlan_shuffle_mask);
+ __m256i vlan45 = _mm256_shuffle_epi8(cqd45, vlan_shuffle_mask);
+ __m256i vlan67 = _mm256_shuffle_epi8(cqd67, vlan_shuffle_mask);
+ __m256i vlan0_3 = _mm256_blend_epi32(vlan01, vlan23, 0x22);
+ __m256i vlan4_7 = _mm256_blend_epi32(vlan45, vlan67, 0x88);
+ /* desc: 0, 2, 4, 6, 1, 3, 5, 7 */
+ __m256i vlan0_7 = _mm256_blend_epi32(vlan0_3, vlan4_7, 0xcc);
+ /* desc: 1, 3, 5, 7, 0, 2, 4, 6 */
+ vlan0_7 = _mm256_permute4x64_epi64(vlan0_7,
+ (1 << 6) + (0 << 4) + (3 << 2) + 2);
+ /*
+ * Compare 0 == vlan_id produces 0xffffffff (-1) if
+ * vlan 0 and 0 if vlan non-0. Then subtracting the
+ * result from 0 produces 0 - (-1) = 1 for vlan 0, and
+ * 0 - 0 = 0 for vlan non-0.
+ */
+ vlan0_7 = _mm256_cmpeq_epi32(zero4, vlan0_7);
+ /* vlan_id != 0 => 0, vlan_id == 0 => 1 */
+ vlan0_7 = _mm256_sub_epi32(zero4, vlan0_7);
+
+ /*
+ * Compute PKT_RX_VLAN and PKT_RX_VLAN_STRIPPED.
+ * Use 3 shifts, 1 or, 1 shuffle for 8 desc: 0.625 inst/desc
+ * VLAN offload flags in byte 0, 4, 8, 12, 16, 20, 24, 28
+ * Everything else is zero.
+ */
+ __m256i vlan_idx =
+ _mm256_or_si256(/* vlan_stripped => bit 0 */
+ _mm256_srli_epi32(_mm256_slli_epi32(flags0_7,
+ 16), 31),
+ /* (vlan_id == 0) => bit 1 */
+ _mm256_slli_epi32(vlan0_7, 1));
+ /*
+ * The index captures 4 cases.
+ * stripped, id = 0 ==> 11b = 3
+ * stripped, id != 0 ==> 01b = 1
+ * not strip, id == 0 ==> 10b = 2
+ * not strip, id != 0 ==> 00b = 0
+ */
+ __m256i vlan_flags = _mm256_permutevar8x32_epi32(vlan_shuffle,
+ vlan_idx);
+ /* Combine vlan and offload flags: 0.125 inst/desc */
+ rss_flags = _mm256_or_si256(rss_flags, vlan_flags);
+
+ /*
+ * Compute non-tunnel PTYPEs.
+ * 17 inst / 8 desc = 2.125 inst/desc
+ */
+ /* ETHER and ETHER_VLAN */
+ __m256i vlan_ptype =
+ _mm256_permutevar8x32_epi32(vlan_ptype_shuffle,
+ vlan_idx);
+ /* Build the ptype index from flags */
+ tcp_udp = _mm256_slli_epi32(flags0_7, 29);
+ tcp_udp = _mm256_slli_epi32(_mm256_srli_epi32(tcp_udp, 30), 2);
+ __m256i ip4_ip6 =
+ _mm256_srli_epi32(_mm256_slli_epi32(flags0_7, 26), 30);
+ __m256i ptype_idx = _mm256_or_si256(tcp_udp, ip4_ip6);
+ __m256i frag_bit =
+ _mm256_srli_epi32(_mm256_slli_epi32(flags0_7, 25), 31);
+ __m256i nonfrag_ptype =
+ _mm256_shuffle_epi8(nonfrag_ptype_shuffle, ptype_idx);
+ __m256i frag_ptype =
+ _mm256_shuffle_epi8(frag_ptype_shuffle, ptype_idx);
+ /*
+ * Zero out the unwanted types and combine the remaining bits.
+ * The effect is the same as selecting non-frag or frag types
+ * depending on the frag bit.
+ */
+ nonfrag_ptype = _mm256_and_si256(nonfrag_ptype,
+ _mm256_cmpeq_epi32(zero4, frag_bit));
+ frag_ptype = _mm256_and_si256(frag_ptype,
+ _mm256_cmpgt_epi32(frag_bit, zero4));
+ __m256i ptype = _mm256_or_si256(nonfrag_ptype, frag_ptype);
+ ptype = _mm256_slli_epi32(ptype, 4);
+ /*
+ * Compute tunnel PTYPEs.
+ * 15 inst / 8 desc = 1.875 inst/desc
+ */
+ __m256i tnl_l3_ptype =
+ _mm256_shuffle_epi8(tnl_l3_ptype_shuffle, ptype_idx);
+ tnl_l3_ptype = _mm256_slli_epi32(tnl_l3_ptype, 16);
+ /*
+ * Shift non-tunnel L4 types to make them tunnel types.
+ * RTE_PTYPE_L4_TCP << 16 == RTE_PTYPE_INNER_L4_TCP
+ */
+ __m256i tnl_l4_ptype =
+ _mm256_slli_epi32(_mm256_and_si256(ptype,
+ _mm256_set1_epi32(RTE_PTYPE_L4_MASK)), 16);
+ __m256i tnl_ptype =
+ _mm256_or_si256(tnl_l3_ptype, tnl_l4_ptype);
+ tnl_ptype = _mm256_or_si256(tnl_ptype,
+ _mm256_set1_epi32(RTE_PTYPE_TUNNEL_GRENAT |
+ RTE_PTYPE_INNER_L2_ETHER));
+ /*
+ * Select non-tunnel or tunnel types by zeroing out the
+ * unwanted ones.
+ */
+ __m256i tnl_flags = _mm256_and_si256(overlay_enabled,
+ _mm256_srli_epi32(_mm256_slli_epi32(flags0_7, 2), 31));
+ tnl_ptype = _mm256_and_si256(tnl_ptype,
+ _mm256_sub_epi32(zero4, tnl_flags));
+ ptype = _mm256_and_si256(ptype,
+ _mm256_cmpeq_epi32(zero4, tnl_flags));
+ /*
+ * Combine types and swap to have ptypes in the same order
+ * as desc.
+ * desc: 0 2 4 6 1 3 5 7
+ * 3 inst / 8 desc = 0.375 inst/desc
+ */
+ ptype = _mm256_or_si256(ptype, tnl_ptype);
+ ptype = _mm256_or_si256(ptype, vlan_ptype);
+ ptype = _mm256_permute4x64_epi64(ptype,
+ (1 << 6) + (0 << 4) + (3 << 2) + 2);
+
+ /*
+ * Mask packet length.
+ * Use 4 ands: 0.5 instructions/desc
+ */
+ cqd01 = _mm256_and_si256(cqd01, mask);
+ cqd23 = _mm256_and_si256(cqd23, mask);
+ cqd45 = _mm256_and_si256(cqd45, mask);
+ cqd67 = _mm256_and_si256(cqd67, mask);
+ /*
+ * Shuffle. Two 16B sets of the mbuf fields.
+ * packet_type, pkt_len, data_len, vlan_tci, rss
+ */
+ __m256i rearm01 = _mm256_shuffle_epi8(cqd01, shuffle_mask);
+ __m256i rearm23 = _mm256_shuffle_epi8(cqd23, shuffle_mask);
+ __m256i rearm45 = _mm256_shuffle_epi8(cqd45, shuffle_mask);
+ __m256i rearm67 = _mm256_shuffle_epi8(cqd67, shuffle_mask);
+
+ /*
+ * Blend in ptypes
+ * 4 blends and 3 shuffles for 8 desc: 0.875 inst/desc
+ */
+ rearm01 = _mm256_blend_epi32(rearm01, ptype, 0x11);
+ rearm23 = _mm256_blend_epi32(rearm23,
+ _mm256_shuffle_epi32(ptype, 1), 0x11);
+ rearm45 = _mm256_blend_epi32(rearm45,
+ _mm256_shuffle_epi32(ptype, 2), 0x11);
+ rearm67 = _mm256_blend_epi32(rearm67,
+ _mm256_shuffle_epi32(ptype, 3), 0x11);
+
+ /*
+ * Move rss_flags into ol_flags in mbuf_init.
+ * Use 1 shift and 1 blend for each desc: 2 inst/desc
+ */
+ __m256i mbuf_init4_5 = _mm256_blend_epi32(mbuf_init,
+ rss_flags, 0x44);
+ __m256i mbuf_init2_3 = _mm256_blend_epi32(mbuf_init,
+ _mm256_slli_si256(rss_flags, 4), 0x44);
+ __m256i mbuf_init0_1 = _mm256_blend_epi32(mbuf_init,
+ _mm256_slli_si256(rss_flags, 8), 0x44);
+ __m256i mbuf_init6_7 = _mm256_blend_epi32(mbuf_init,
+ _mm256_srli_si256(rss_flags, 4), 0x44);
+
+ /*
+ * Build rearm, one per desc.
+ * 8 blends and 4 permutes: 1.5 inst/desc
+ */
+ __m256i rearm0 = _mm256_blend_epi32(rearm01,
+ mbuf_init0_1, 0xf0);
+ __m256i rearm1 = _mm256_blend_epi32(mbuf_init0_1,
+ rearm01, 0xf0);
+ __m256i rearm2 = _mm256_blend_epi32(rearm23,
+ mbuf_init2_3, 0xf0);
+ __m256i rearm3 = _mm256_blend_epi32(mbuf_init2_3,
+ rearm23, 0xf0);
+ /* Swap upper and lower 64 bits */
+ rearm0 = _mm256_permute4x64_epi64(rearm0,
+ (1 << 6) + (0 << 4) + (3 << 2) + 2);
+ rearm2 = _mm256_permute4x64_epi64(rearm2,
+ (1 << 6) + (0 << 4) + (3 << 2) + 2);
+ /* Second set of 4 descriptors */
+ __m256i rearm4 = _mm256_blend_epi32(rearm45,
+ mbuf_init4_5, 0xf0);
+ __m256i rearm5 = _mm256_blend_epi32(mbuf_init4_5,
+ rearm45, 0xf0);
+ __m256i rearm6 = _mm256_blend_epi32(rearm67,
+ mbuf_init6_7, 0xf0);
+ __m256i rearm7 = _mm256_blend_epi32(mbuf_init6_7,
+ rearm67, 0xf0);
+ rearm4 = _mm256_permute4x64_epi64(rearm4,
+ (1 << 6) + (0 << 4) + (3 << 2) + 2);
+ rearm6 = _mm256_permute4x64_epi64(rearm6,
+ (1 << 6) + (0 << 4) + (3 << 2) + 2);
+
+ /*
+ * Write out 32B of mbuf fields.
+ * data_off - off 0 (mbuf_init)
+ * refcnt - 2 (mbuf_init)
+ * nb_segs - 4 (mbuf_init)
+ * port - 6 (mbuf_init)
+ * ol_flag - 8 (from cqd)
+ * packet_type - 16 (from cqd)
+ * pkt_len - 20 (from cqd)
+ * data_len - 24 (from cqd)
+ * vlan_tci - 26 (from cqd)
+ * rss - 28 (from cqd)
+ */
+ _mm256_storeu_si256((__m256i *)&rxmb[0]->rearm_data, rearm0);
+ _mm256_storeu_si256((__m256i *)&rxmb[1]->rearm_data, rearm1);
+ _mm256_storeu_si256((__m256i *)&rxmb[2]->rearm_data, rearm2);
+ _mm256_storeu_si256((__m256i *)&rxmb[3]->rearm_data, rearm3);
+ _mm256_storeu_si256((__m256i *)&rxmb[4]->rearm_data, rearm4);
+ _mm256_storeu_si256((__m256i *)&rxmb[5]->rearm_data, rearm5);
+ _mm256_storeu_si256((__m256i *)&rxmb[6]->rearm_data, rearm6);
+ _mm256_storeu_si256((__m256i *)&rxmb[7]->rearm_data, rearm7);
+
+ max_rx -= 8;
+ cqd += 8;
+ rx += 8;
+ rxmb += 8;
+ }
+
+ /*
+ * Step 3: Slow path to handle a small (<8) number of packets and
+ * occasional truncated packets.
+ */
+ while (max_rx && ((cqd->type_color &
+ CQ_DESC_COLOR_MASK_NOSHIFT) != color)) {
+ if (unlikely(cqd->bytes_written_flags &
+ CQ_ENET_RQ_DESC_FLAGS_TRUNCATED)) {
+ rte_pktmbuf_free(*rxmb++);
+ rte_atomic64_inc(&enic->soft_stats.rx_packet_errors);
+ } else {
+ *rx++ = rx_one(cqd, *rxmb++, enic);
+ }
+ cqd++;
+ max_rx--;
+ }
+
+ /* Number of descriptors visited */
+ nb_rx = cqd - (struct cq_enet_rq_desc *)(cq->ring.descs) - cq_idx;
+ if (nb_rx == 0)
+ return 0;
+ rqd = ((struct rq_enet_desc *)rq->ring.descs) + cq_idx;
+ rxmb = rq->mbuf_ring + cq_idx;
+ cq_idx += nb_rx;
+ rq->rx_nb_hold += nb_rx;
+ if (unlikely(cq_idx == cq->ring.desc_count)) {
+ cq_idx = 0;
+ cq->last_color ^= CQ_DESC_COLOR_MASK_NOSHIFT;
+ }
+ cq->to_clean = cq_idx;
+
+ /* Step 4: Restock RQ with new mbufs */
+ memcpy(rxmb, rq->free_mbufs + ENIC_RX_BURST_MAX - rq->num_free_mbufs,
+ sizeof(struct rte_mbuf *) * nb_rx);
+ rq->num_free_mbufs -= nb_rx;
+ while (nb_rx) {
+ rqd->address = (*rxmb)->buf_iova + RTE_PKTMBUF_HEADROOM;
+ nb_rx--;
+ rqd++;
+ rxmb++;
+ }
+ if (rq->rx_nb_hold > rq->rx_free_thresh) {
+ rq->posted_index = enic_ring_add(rq->ring.desc_count,
+ rq->posted_index,
+ rq->rx_nb_hold);
+ rq->rx_nb_hold = 0;
+ rte_wmb();
+ iowrite32_relaxed(rq->posted_index,
+ &rq->ctrl->posted_index);
+ }
+
+ return rx - rx_pkts;
+}
+
+bool
+enic_use_vector_rx_handler(struct enic *enic)
+{
+ struct rte_eth_dev *eth_dev;
+ struct rte_fdir_conf *fconf;
+
+ eth_dev = enic->rte_dev;
+ /* User needs to request the avx2 handler */
+ if (!enic->enable_avx2_rx)
+ return false;
+ /* Do not support scatter Rx */
+ if (!(enic->rq_count > 0 && enic->rq[0].data_queue_enable == 0))
+ return false;
+ /* Do not support fdir/flow */
+ fconf = &eth_dev->data->dev_conf.fdir_conf;
+ if (fconf->mode != RTE_FDIR_MODE_NONE)
+ return false;
+ if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2)) {
+ PMD_INIT_LOG(DEBUG, " use the non-scatter avx2"
+ " Rx handler");
+ eth_dev->rx_pkt_burst = &enic_noscatter_vec_recv_pkts;
+ return true;
+ }
+ return false;
+}
diff --git a/drivers/net/enic/meson.build b/drivers/net/enic/meson.build
index bfd4e2373..5565649c4 100644
--- a/drivers/net/enic/meson.build
+++ b/drivers/net/enic/meson.build
@@ -17,3 +17,8 @@ sources = files(
)
deps += ['hash']
includes += include_directories('base')
+
+# The current implementation assumes 64-bit pointers
+if dpdk_conf.has('RTE_MACHINE_CPUFLAG_AVX2') and cc.sizeof('void *') == 8
+ sources += files('enic_rxtx_vec_avx2.c')
+endif
--
2.16.2
^ permalink raw reply [flat|nested] 11+ messages in thread
* [dpdk-dev] [PATCH v2 1/2] net/enic: move common Rx functions to a new header file
2018-09-28 19:20 ` [dpdk-dev] [PATCH v2 1/2] net/enic: move common Rx functions to a new header file John Daley
2018-09-28 19:20 ` [dpdk-dev] [PATCH v2 2/2] net/enic: add AVX2 based vectorized Rx handler John Daley
@ 2018-09-28 19:25 ` John Daley
2018-09-28 19:25 ` [dpdk-dev] [PATCH v2 2/2] net/enic: add AVX2 based vectorized Rx handler John Daley
2018-10-03 20:09 ` [dpdk-dev] [PATCH v4 1/2] net/enic: move common Rx functions to a new header file John Daley
1 sibling, 2 replies; 11+ messages in thread
From: John Daley @ 2018-09-28 19:25 UTC (permalink / raw)
To: ferruh.yigit; +Cc: dev, Hyong Youb Kim
From: Hyong Youb Kim <hyonkim@cisco.com>
Move a number of Rx functions to the header file so that the avx2
based Rx handler can use them.
Signed-off-by: Hyong Youb Kim <hyonkim@cisco.com>
Reviewed-by: John Daley <johndale@cisco.com>
---
v3: re-add Reviewed-by
drivers/net/enic/enic_rxtx.c | 263 +---------------------------------
drivers/net/enic/enic_rxtx_common.h | 271 ++++++++++++++++++++++++++++++++++++
2 files changed, 272 insertions(+), 262 deletions(-)
create mode 100644 drivers/net/enic/enic_rxtx_common.h
diff --git a/drivers/net/enic/enic_rxtx.c b/drivers/net/enic/enic_rxtx.c
index 276a2e559..5189ee635 100644
--- a/drivers/net/enic/enic_rxtx.c
+++ b/drivers/net/enic/enic_rxtx.c
@@ -11,6 +11,7 @@
#include "enic_compat.h"
#include "rq_enet_desc.h"
#include "enic.h"
+#include "enic_rxtx_common.h"
#include <rte_ether.h>
#include <rte_ip.h>
#include <rte_tcp.h>
@@ -30,268 +31,6 @@
#define rte_packet_prefetch(p) do {} while (0)
#endif
-static inline uint16_t
-enic_cq_rx_desc_ciflags(struct cq_enet_rq_desc *crd)
-{
- return le16_to_cpu(crd->completed_index_flags) & ~CQ_DESC_COMP_NDX_MASK;
-}
-
-static inline uint16_t
-enic_cq_rx_desc_bwflags(struct cq_enet_rq_desc *crd)
-{
- return le16_to_cpu(crd->bytes_written_flags) &
- ~CQ_ENET_RQ_DESC_BYTES_WRITTEN_MASK;
-}
-
-static inline uint8_t
-enic_cq_rx_desc_packet_error(uint16_t bwflags)
-{
- return (bwflags & CQ_ENET_RQ_DESC_FLAGS_TRUNCATED) ==
- CQ_ENET_RQ_DESC_FLAGS_TRUNCATED;
-}
-
-static inline uint8_t
-enic_cq_rx_desc_eop(uint16_t ciflags)
-{
- return (ciflags & CQ_ENET_RQ_DESC_FLAGS_EOP)
- == CQ_ENET_RQ_DESC_FLAGS_EOP;
-}
-
-static inline uint8_t
-enic_cq_rx_desc_csum_not_calc(struct cq_enet_rq_desc *cqrd)
-{
- return (le16_to_cpu(cqrd->q_number_rss_type_flags) &
- CQ_ENET_RQ_DESC_FLAGS_CSUM_NOT_CALC) ==
- CQ_ENET_RQ_DESC_FLAGS_CSUM_NOT_CALC;
-}
-
-static inline uint8_t
-enic_cq_rx_desc_ipv4_csum_ok(struct cq_enet_rq_desc *cqrd)
-{
- return (cqrd->flags & CQ_ENET_RQ_DESC_FLAGS_IPV4_CSUM_OK) ==
- CQ_ENET_RQ_DESC_FLAGS_IPV4_CSUM_OK;
-}
-
-static inline uint8_t
-enic_cq_rx_desc_tcp_udp_csum_ok(struct cq_enet_rq_desc *cqrd)
-{
- return (cqrd->flags & CQ_ENET_RQ_DESC_FLAGS_TCP_UDP_CSUM_OK) ==
- CQ_ENET_RQ_DESC_FLAGS_TCP_UDP_CSUM_OK;
-}
-
-static inline uint8_t
-enic_cq_rx_desc_rss_type(struct cq_enet_rq_desc *cqrd)
-{
- return (uint8_t)((le16_to_cpu(cqrd->q_number_rss_type_flags) >>
- CQ_DESC_Q_NUM_BITS) & CQ_ENET_RQ_DESC_RSS_TYPE_MASK);
-}
-
-static inline uint32_t
-enic_cq_rx_desc_rss_hash(struct cq_enet_rq_desc *cqrd)
-{
- return le32_to_cpu(cqrd->rss_hash);
-}
-
-static inline uint16_t
-enic_cq_rx_desc_vlan(struct cq_enet_rq_desc *cqrd)
-{
- return le16_to_cpu(cqrd->vlan);
-}
-
-static inline uint16_t
-enic_cq_rx_desc_n_bytes(struct cq_desc *cqd)
-{
- struct cq_enet_rq_desc *cqrd = (struct cq_enet_rq_desc *)cqd;
- return le16_to_cpu(cqrd->bytes_written_flags) &
- CQ_ENET_RQ_DESC_BYTES_WRITTEN_MASK;
-}
-
-
-static inline uint8_t
-enic_cq_rx_check_err(struct cq_desc *cqd)
-{
- struct cq_enet_rq_desc *cqrd = (struct cq_enet_rq_desc *)cqd;
- uint16_t bwflags;
-
- bwflags = enic_cq_rx_desc_bwflags(cqrd);
- if (unlikely(enic_cq_rx_desc_packet_error(bwflags)))
- return 1;
- return 0;
-}
-
-/* Lookup table to translate RX CQ flags to mbuf flags. */
-static inline uint32_t
-enic_cq_rx_flags_to_pkt_type(struct cq_desc *cqd, uint8_t tnl)
-{
- struct cq_enet_rq_desc *cqrd = (struct cq_enet_rq_desc *)cqd;
- uint8_t cqrd_flags = cqrd->flags;
- /*
- * Odd-numbered entries are for tunnel packets. All packet type info
- * applies to the inner packet, and there is no info on the outer
- * packet. The outer flags in these entries exist only to avoid
- * changing enic_cq_rx_to_pkt_flags(). They are cleared from mbuf
- * afterwards.
- *
- * Also, as there is no tunnel type info (VXLAN, NVGRE, or GENEVE), set
- * RTE_PTYPE_TUNNEL_GRENAT..
- */
- static const uint32_t cq_type_table[128] __rte_cache_aligned = {
- [0x00] = RTE_PTYPE_UNKNOWN,
- [0x01] = RTE_PTYPE_UNKNOWN |
- RTE_PTYPE_TUNNEL_GRENAT |
- RTE_PTYPE_INNER_L2_ETHER,
- [0x20] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_NONFRAG,
- [0x21] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_NONFRAG |
- RTE_PTYPE_TUNNEL_GRENAT |
- RTE_PTYPE_INNER_L2_ETHER |
- RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L4_NONFRAG,
- [0x22] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_UDP,
- [0x23] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_UDP |
- RTE_PTYPE_TUNNEL_GRENAT |
- RTE_PTYPE_INNER_L2_ETHER |
- RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L4_UDP,
- [0x24] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_TCP,
- [0x25] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_TCP |
- RTE_PTYPE_TUNNEL_GRENAT |
- RTE_PTYPE_INNER_L2_ETHER |
- RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L4_TCP,
- [0x60] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG,
- [0x61] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG |
- RTE_PTYPE_TUNNEL_GRENAT |
- RTE_PTYPE_INNER_L2_ETHER |
- RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L4_FRAG,
- [0x62] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG,
- [0x63] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG |
- RTE_PTYPE_TUNNEL_GRENAT |
- RTE_PTYPE_INNER_L2_ETHER |
- RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L4_FRAG,
- [0x64] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG,
- [0x65] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG |
- RTE_PTYPE_TUNNEL_GRENAT |
- RTE_PTYPE_INNER_L2_ETHER |
- RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L4_FRAG,
- [0x10] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_NONFRAG,
- [0x11] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_NONFRAG |
- RTE_PTYPE_TUNNEL_GRENAT |
- RTE_PTYPE_INNER_L2_ETHER |
- RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L4_NONFRAG,
- [0x12] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_UDP,
- [0x13] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_UDP |
- RTE_PTYPE_TUNNEL_GRENAT |
- RTE_PTYPE_INNER_L2_ETHER |
- RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L4_UDP,
- [0x14] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_TCP,
- [0x15] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_TCP |
- RTE_PTYPE_TUNNEL_GRENAT |
- RTE_PTYPE_INNER_L2_ETHER |
- RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L4_TCP,
- [0x50] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG,
- [0x51] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG |
- RTE_PTYPE_TUNNEL_GRENAT |
- RTE_PTYPE_INNER_L2_ETHER |
- RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L4_FRAG,
- [0x52] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG,
- [0x53] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG |
- RTE_PTYPE_TUNNEL_GRENAT |
- RTE_PTYPE_INNER_L2_ETHER |
- RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L4_FRAG,
- [0x54] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG,
- [0x55] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG |
- RTE_PTYPE_TUNNEL_GRENAT |
- RTE_PTYPE_INNER_L2_ETHER |
- RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L4_FRAG,
- /* All others reserved */
- };
- cqrd_flags &= CQ_ENET_RQ_DESC_FLAGS_IPV4_FRAGMENT
- | CQ_ENET_RQ_DESC_FLAGS_IPV4 | CQ_ENET_RQ_DESC_FLAGS_IPV6
- | CQ_ENET_RQ_DESC_FLAGS_TCP | CQ_ENET_RQ_DESC_FLAGS_UDP;
- return cq_type_table[cqrd_flags + tnl];
-}
-
-static inline void
-enic_cq_rx_to_pkt_flags(struct cq_desc *cqd, struct rte_mbuf *mbuf)
-{
- struct cq_enet_rq_desc *cqrd = (struct cq_enet_rq_desc *)cqd;
- uint16_t bwflags, pkt_flags = 0, vlan_tci;
- bwflags = enic_cq_rx_desc_bwflags(cqrd);
- vlan_tci = enic_cq_rx_desc_vlan(cqrd);
-
- /* VLAN STRIPPED flag. The L2 packet type updated here also */
- if (bwflags & CQ_ENET_RQ_DESC_FLAGS_VLAN_STRIPPED) {
- pkt_flags |= PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED;
- mbuf->packet_type |= RTE_PTYPE_L2_ETHER;
- } else {
- if (vlan_tci != 0) {
- pkt_flags |= PKT_RX_VLAN;
- mbuf->packet_type |= RTE_PTYPE_L2_ETHER_VLAN;
- } else {
- mbuf->packet_type |= RTE_PTYPE_L2_ETHER;
- }
- }
- mbuf->vlan_tci = vlan_tci;
-
- if ((cqd->type_color & CQ_DESC_TYPE_MASK) == CQ_DESC_TYPE_CLASSIFIER) {
- struct cq_enet_rq_clsf_desc *clsf_cqd;
- uint16_t filter_id;
- clsf_cqd = (struct cq_enet_rq_clsf_desc *)cqd;
- filter_id = clsf_cqd->filter_id;
- if (filter_id) {
- pkt_flags |= PKT_RX_FDIR;
- if (filter_id != ENIC_MAGIC_FILTER_ID) {
- mbuf->hash.fdir.hi = clsf_cqd->filter_id;
- pkt_flags |= PKT_RX_FDIR_ID;
- }
- }
- } else if (enic_cq_rx_desc_rss_type(cqrd)) {
- /* RSS flag */
- pkt_flags |= PKT_RX_RSS_HASH;
- mbuf->hash.rss = enic_cq_rx_desc_rss_hash(cqrd);
- }
-
- /* checksum flags */
- if (mbuf->packet_type & (RTE_PTYPE_L3_IPV4 | RTE_PTYPE_L3_IPV6)) {
- if (!enic_cq_rx_desc_csum_not_calc(cqrd)) {
- uint32_t l4_flags;
- l4_flags = mbuf->packet_type & RTE_PTYPE_L4_MASK;
-
- /*
- * When overlay offload is enabled, the NIC may
- * set ipv4_csum_ok=1 if the inner packet is IPv6..
- * So, explicitly check for IPv4 before checking
- * ipv4_csum_ok.
- */
- if (mbuf->packet_type & RTE_PTYPE_L3_IPV4) {
- if (enic_cq_rx_desc_ipv4_csum_ok(cqrd))
- pkt_flags |= PKT_RX_IP_CKSUM_GOOD;
- else
- pkt_flags |= PKT_RX_IP_CKSUM_BAD;
- }
-
- if (l4_flags == RTE_PTYPE_L4_UDP ||
- l4_flags == RTE_PTYPE_L4_TCP) {
- if (enic_cq_rx_desc_tcp_udp_csum_ok(cqrd))
- pkt_flags |= PKT_RX_L4_CKSUM_GOOD;
- else
- pkt_flags |= PKT_RX_L4_CKSUM_BAD;
- }
- }
- }
-
- mbuf->ol_flags = pkt_flags;
-}
-
/* dummy receive function to replace actual function in
* order to do safe reconfiguration operations.
*/
diff --git a/drivers/net/enic/enic_rxtx_common.h b/drivers/net/enic/enic_rxtx_common.h
new file mode 100644
index 000000000..bfbb4909e
--- /dev/null
+++ b/drivers/net/enic/enic_rxtx_common.h
@@ -0,0 +1,271 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2008-2018 Cisco Systems, Inc. All rights reserved.
+ * Copyright 2007 Nuova Systems, Inc. All rights reserved.
+ */
+
+#ifndef _ENIC_RXTX_COMMON_H_
+#define _ENIC_RXTX_COMMON_H_
+
+static inline uint16_t
+enic_cq_rx_desc_ciflags(struct cq_enet_rq_desc *crd)
+{
+ return le16_to_cpu(crd->completed_index_flags) & ~CQ_DESC_COMP_NDX_MASK;
+}
+
+static inline uint16_t
+enic_cq_rx_desc_bwflags(struct cq_enet_rq_desc *crd)
+{
+ return le16_to_cpu(crd->bytes_written_flags) &
+ ~CQ_ENET_RQ_DESC_BYTES_WRITTEN_MASK;
+}
+
+static inline uint8_t
+enic_cq_rx_desc_packet_error(uint16_t bwflags)
+{
+ return (bwflags & CQ_ENET_RQ_DESC_FLAGS_TRUNCATED) ==
+ CQ_ENET_RQ_DESC_FLAGS_TRUNCATED;
+}
+
+static inline uint8_t
+enic_cq_rx_desc_eop(uint16_t ciflags)
+{
+ return (ciflags & CQ_ENET_RQ_DESC_FLAGS_EOP)
+ == CQ_ENET_RQ_DESC_FLAGS_EOP;
+}
+
+static inline uint8_t
+enic_cq_rx_desc_csum_not_calc(struct cq_enet_rq_desc *cqrd)
+{
+ return (le16_to_cpu(cqrd->q_number_rss_type_flags) &
+ CQ_ENET_RQ_DESC_FLAGS_CSUM_NOT_CALC) ==
+ CQ_ENET_RQ_DESC_FLAGS_CSUM_NOT_CALC;
+}
+
+static inline uint8_t
+enic_cq_rx_desc_ipv4_csum_ok(struct cq_enet_rq_desc *cqrd)
+{
+ return (cqrd->flags & CQ_ENET_RQ_DESC_FLAGS_IPV4_CSUM_OK) ==
+ CQ_ENET_RQ_DESC_FLAGS_IPV4_CSUM_OK;
+}
+
+static inline uint8_t
+enic_cq_rx_desc_tcp_udp_csum_ok(struct cq_enet_rq_desc *cqrd)
+{
+ return (cqrd->flags & CQ_ENET_RQ_DESC_FLAGS_TCP_UDP_CSUM_OK) ==
+ CQ_ENET_RQ_DESC_FLAGS_TCP_UDP_CSUM_OK;
+}
+
+static inline uint8_t
+enic_cq_rx_desc_rss_type(struct cq_enet_rq_desc *cqrd)
+{
+ return (uint8_t)((le16_to_cpu(cqrd->q_number_rss_type_flags) >>
+ CQ_DESC_Q_NUM_BITS) & CQ_ENET_RQ_DESC_RSS_TYPE_MASK);
+}
+
+static inline uint32_t
+enic_cq_rx_desc_rss_hash(struct cq_enet_rq_desc *cqrd)
+{
+ return le32_to_cpu(cqrd->rss_hash);
+}
+
+static inline uint16_t
+enic_cq_rx_desc_vlan(struct cq_enet_rq_desc *cqrd)
+{
+ return le16_to_cpu(cqrd->vlan);
+}
+
+static inline uint16_t
+enic_cq_rx_desc_n_bytes(struct cq_desc *cqd)
+{
+ struct cq_enet_rq_desc *cqrd = (struct cq_enet_rq_desc *)cqd;
+ return le16_to_cpu(cqrd->bytes_written_flags) &
+ CQ_ENET_RQ_DESC_BYTES_WRITTEN_MASK;
+}
+
+
+static inline uint8_t
+enic_cq_rx_check_err(struct cq_desc *cqd)
+{
+ struct cq_enet_rq_desc *cqrd = (struct cq_enet_rq_desc *)cqd;
+ uint16_t bwflags;
+
+ bwflags = enic_cq_rx_desc_bwflags(cqrd);
+ if (unlikely(enic_cq_rx_desc_packet_error(bwflags)))
+ return 1;
+ return 0;
+}
+
+/* Lookup table to translate RX CQ flags to mbuf flags. */
+static uint32_t
+enic_cq_rx_flags_to_pkt_type(struct cq_desc *cqd, uint8_t tnl)
+{
+ struct cq_enet_rq_desc *cqrd = (struct cq_enet_rq_desc *)cqd;
+ uint8_t cqrd_flags = cqrd->flags;
+ /*
+ * Odd-numbered entries are for tunnel packets. All packet type info
+ * applies to the inner packet, and there is no info on the outer
+ * packet. The outer flags in these entries exist only to avoid
+ * changing enic_cq_rx_to_pkt_flags(). They are cleared from mbuf
+ * afterwards.
+ *
+ * Also, as there is no tunnel type info (VXLAN, NVGRE, or GENEVE), set
+ * RTE_PTYPE_TUNNEL_GRENAT..
+ */
+ static const uint32_t cq_type_table[128] __rte_cache_aligned = {
+ [0x00] = RTE_PTYPE_UNKNOWN,
+ [0x01] = RTE_PTYPE_UNKNOWN |
+ RTE_PTYPE_TUNNEL_GRENAT |
+ RTE_PTYPE_INNER_L2_ETHER,
+ [0x20] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_NONFRAG,
+ [0x21] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_NONFRAG |
+ RTE_PTYPE_TUNNEL_GRENAT |
+ RTE_PTYPE_INNER_L2_ETHER |
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_NONFRAG,
+ [0x22] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_UDP,
+ [0x23] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_UDP |
+ RTE_PTYPE_TUNNEL_GRENAT |
+ RTE_PTYPE_INNER_L2_ETHER |
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_UDP,
+ [0x24] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_TCP,
+ [0x25] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_TCP |
+ RTE_PTYPE_TUNNEL_GRENAT |
+ RTE_PTYPE_INNER_L2_ETHER |
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_TCP,
+ [0x60] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG,
+ [0x61] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG |
+ RTE_PTYPE_TUNNEL_GRENAT |
+ RTE_PTYPE_INNER_L2_ETHER |
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_FRAG,
+ [0x62] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG,
+ [0x63] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG |
+ RTE_PTYPE_TUNNEL_GRENAT |
+ RTE_PTYPE_INNER_L2_ETHER |
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_FRAG,
+ [0x64] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG,
+ [0x65] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG |
+ RTE_PTYPE_TUNNEL_GRENAT |
+ RTE_PTYPE_INNER_L2_ETHER |
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_FRAG,
+ [0x10] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_NONFRAG,
+ [0x11] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_NONFRAG |
+ RTE_PTYPE_TUNNEL_GRENAT |
+ RTE_PTYPE_INNER_L2_ETHER |
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_NONFRAG,
+ [0x12] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_UDP,
+ [0x13] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_UDP |
+ RTE_PTYPE_TUNNEL_GRENAT |
+ RTE_PTYPE_INNER_L2_ETHER |
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_UDP,
+ [0x14] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_TCP,
+ [0x15] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_TCP |
+ RTE_PTYPE_TUNNEL_GRENAT |
+ RTE_PTYPE_INNER_L2_ETHER |
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_TCP,
+ [0x50] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG,
+ [0x51] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG |
+ RTE_PTYPE_TUNNEL_GRENAT |
+ RTE_PTYPE_INNER_L2_ETHER |
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_FRAG,
+ [0x52] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG,
+ [0x53] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG |
+ RTE_PTYPE_TUNNEL_GRENAT |
+ RTE_PTYPE_INNER_L2_ETHER |
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_FRAG,
+ [0x54] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG,
+ [0x55] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG |
+ RTE_PTYPE_TUNNEL_GRENAT |
+ RTE_PTYPE_INNER_L2_ETHER |
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_FRAG,
+ /* All others reserved */
+ };
+ cqrd_flags &= CQ_ENET_RQ_DESC_FLAGS_IPV4_FRAGMENT
+ | CQ_ENET_RQ_DESC_FLAGS_IPV4 | CQ_ENET_RQ_DESC_FLAGS_IPV6
+ | CQ_ENET_RQ_DESC_FLAGS_TCP | CQ_ENET_RQ_DESC_FLAGS_UDP;
+ return cq_type_table[cqrd_flags + tnl];
+}
+
+static void
+enic_cq_rx_to_pkt_flags(struct cq_desc *cqd, struct rte_mbuf *mbuf)
+{
+ struct cq_enet_rq_desc *cqrd = (struct cq_enet_rq_desc *)cqd;
+ uint16_t bwflags, pkt_flags = 0, vlan_tci;
+ bwflags = enic_cq_rx_desc_bwflags(cqrd);
+ vlan_tci = enic_cq_rx_desc_vlan(cqrd);
+
+ /* VLAN STRIPPED flag. The L2 packet type updated here also */
+ if (bwflags & CQ_ENET_RQ_DESC_FLAGS_VLAN_STRIPPED) {
+ pkt_flags |= PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED;
+ mbuf->packet_type |= RTE_PTYPE_L2_ETHER;
+ } else {
+ if (vlan_tci != 0) {
+ pkt_flags |= PKT_RX_VLAN;
+ mbuf->packet_type |= RTE_PTYPE_L2_ETHER_VLAN;
+ } else {
+ mbuf->packet_type |= RTE_PTYPE_L2_ETHER;
+ }
+ }
+ mbuf->vlan_tci = vlan_tci;
+
+ if ((cqd->type_color & CQ_DESC_TYPE_MASK) == CQ_DESC_TYPE_CLASSIFIER) {
+ struct cq_enet_rq_clsf_desc *clsf_cqd;
+ uint16_t filter_id;
+ clsf_cqd = (struct cq_enet_rq_clsf_desc *)cqd;
+ filter_id = clsf_cqd->filter_id;
+ if (filter_id) {
+ pkt_flags |= PKT_RX_FDIR;
+ if (filter_id != ENIC_MAGIC_FILTER_ID) {
+ mbuf->hash.fdir.hi = clsf_cqd->filter_id;
+ pkt_flags |= PKT_RX_FDIR_ID;
+ }
+ }
+ } else if (enic_cq_rx_desc_rss_type(cqrd)) {
+ /* RSS flag */
+ pkt_flags |= PKT_RX_RSS_HASH;
+ mbuf->hash.rss = enic_cq_rx_desc_rss_hash(cqrd);
+ }
+
+ /* checksum flags */
+ if (mbuf->packet_type & (RTE_PTYPE_L3_IPV4 | RTE_PTYPE_L3_IPV6)) {
+ if (!enic_cq_rx_desc_csum_not_calc(cqrd)) {
+ uint32_t l4_flags;
+ l4_flags = mbuf->packet_type & RTE_PTYPE_L4_MASK;
+
+ /*
+ * When overlay offload is enabled, the NIC may
+ * set ipv4_csum_ok=1 if the inner packet is IPv6..
+ * So, explicitly check for IPv4 before checking
+ * ipv4_csum_ok.
+ */
+ if (mbuf->packet_type & RTE_PTYPE_L3_IPV4) {
+ if (enic_cq_rx_desc_ipv4_csum_ok(cqrd))
+ pkt_flags |= PKT_RX_IP_CKSUM_GOOD;
+ else
+ pkt_flags |= PKT_RX_IP_CKSUM_BAD;
+ }
+
+ if (l4_flags == RTE_PTYPE_L4_UDP ||
+ l4_flags == RTE_PTYPE_L4_TCP) {
+ if (enic_cq_rx_desc_tcp_udp_csum_ok(cqrd))
+ pkt_flags |= PKT_RX_L4_CKSUM_GOOD;
+ else
+ pkt_flags |= PKT_RX_L4_CKSUM_BAD;
+ }
+ }
+ }
+
+ mbuf->ol_flags = pkt_flags;
+}
+
+#endif /* _ENIC_RXTX_COMMON_H_ */
--
2.16.2
^ permalink raw reply [flat|nested] 11+ messages in thread
* [dpdk-dev] [PATCH v2 2/2] net/enic: add AVX2 based vectorized Rx handler
2018-09-28 19:25 ` [dpdk-dev] [PATCH v2 1/2] net/enic: move common Rx functions to a new header file John Daley
@ 2018-09-28 19:25 ` John Daley
2018-10-02 16:08 ` Ferruh Yigit
2018-10-03 20:09 ` [dpdk-dev] [PATCH v4 1/2] net/enic: move common Rx functions to a new header file John Daley
1 sibling, 1 reply; 11+ messages in thread
From: John Daley @ 2018-09-28 19:25 UTC (permalink / raw)
To: ferruh.yigit; +Cc: dev, Hyong Youb Kim
From: Hyong Youb Kim <hyonkim@cisco.com>
Add the vectorized version of the no-scatter Rx handler. It aims to
process 8 descriptors per loop using AVX2 SIMD instructions. This
handler is in its own file enic_rxtx_vec_avx2.c, and makefile and
meson.build are modified to compile it when the compiler supports
AVX2. Under ideal conditions, the vectorized handler reduces
cycles/packet by more than 30%, when compared against the no-scatter
Rx handler. Most implementation ideas come from i40e's AVX2 based
handler, so credit goes to its authors.
At this point, the new handler is meant for field trials, and is not
selected by default. So add a new devarg enable-avx2-rx to allow the
user to request the use of the new handler. When enable-avx2-rx=1, the
driver will consider using the new handler.
Also update the guide doc and introduce the vectorized handler.
Signed-off-by: Hyong Youb Kim <hyonkim@cisco.com>
Reviewed-by: John Daley <johndale@cisco.com>
---
v2: remove bool type from structure (found by checkpatch)
v3: re-add Reviewed-by
doc/guides/nics/enic.rst | 32 ++
drivers/net/enic/Makefile | 7 +
drivers/net/enic/enic.h | 3 +
drivers/net/enic/enic_ethdev.c | 27 +-
drivers/net/enic/enic_main.c | 34 +-
drivers/net/enic/enic_rxtx_vec_avx2.c | 832 ++++++++++++++++++++++++++++++++++
drivers/net/enic/meson.build | 5 +
7 files changed, 931 insertions(+), 9 deletions(-)
create mode 100644 drivers/net/enic/enic_rxtx_vec_avx2.c
diff --git a/doc/guides/nics/enic.rst b/doc/guides/nics/enic.rst
index 86941fdb2..b31f4eef9 100644
--- a/doc/guides/nics/enic.rst
+++ b/doc/guides/nics/enic.rst
@@ -351,6 +351,38 @@ suitable for others. Such applications may change the mode by setting
applications such as OVS-DPDK performance benchmarks that utilize
only the default VLAN and want to see only untagged packets.
+
+Vectorized Rx Handler
+---------------------
+
+ENIC PMD includes a version of the receive handler that is vectorized using
+AVX2 SIMD instructions. It is meant for bulk, throughput oriented workloads
+where reducing cycles/packet in PMD is a priority. In order to use the
+vectorized handler, take the following steps.
+
+- Use a recent version of gcc, icc, or clang and build 64-bit DPDK. If
+ the compiler is known to support AVX2, DPDK build system
+ automatically compiles the vectorized handler. Otherwise, the
+ handler is not available.
+
+- Set ``devargs`` parameter ``enable-avx2-rx=1`` to explicitly request that
+ PMD consider the vectorized handler when selecting the receive handler.
+
+ As the current implementation is intended for field trials, by default, the
+ vectorized handler is not considered (``enable-avx2-rx=0``).
+
+- Run on a UCS M4 or later server with CPUs that support AVX2.
+
+PMD selects the vectorized handler when the handler is compiled into
+the driver, the user requests its use via ``enable-avx2-rx=1``, CPU
+supports AVX2, and scatter Rx is not used. To verify that the
+vectorized handler is selected, enable debug logging
+(``--log-level=pmd,debug``) and check the following message.
+
+.. code-block:: console
+
+ enic_use_vector_rx_handler use the non-scatter avx2 Rx handler
+
.. _enic_limitations:
Limitations
diff --git a/drivers/net/enic/Makefile b/drivers/net/enic/Makefile
index 7c6c29cc0..3ec6f9159 100644
--- a/drivers/net/enic/Makefile
+++ b/drivers/net/enic/Makefile
@@ -39,4 +39,11 @@ SRCS-$(CONFIG_RTE_LIBRTE_ENIC_PMD) += base/vnic_intr.c
SRCS-$(CONFIG_RTE_LIBRTE_ENIC_PMD) += base/vnic_rq.c
SRCS-$(CONFIG_RTE_LIBRTE_ENIC_PMD) += base/vnic_rss.c
+ifeq ($(findstring RTE_MACHINE_CPUFLAG_AVX2,$(CFLAGS)),RTE_MACHINE_CPUFLAG_AVX2)
+# The current implementation assumes 64-bit pointers
+ifeq ($(CONFIG_RTE_ARCH_X86_64),y)
+ SRCS-$(CONFIG_RTE_LIBRTE_ENIC_PMD) += enic_rxtx_vec_avx2.c
+endif
+endif
+
include $(RTE_SDK)/mk/rte.lib.mk
diff --git a/drivers/net/enic/enic.h b/drivers/net/enic/enic.h
index 775cd5d55..665f5668a 100644
--- a/drivers/net/enic/enic.h
+++ b/drivers/net/enic/enic.h
@@ -106,6 +106,7 @@ struct enic {
struct vnic_dev_bar bar0;
struct vnic_dev *vdev;
+ uint64_t mbuf_initializer;
unsigned int port_id;
bool overlay_offload;
struct rte_eth_dev *rte_dev;
@@ -128,6 +129,7 @@ struct enic {
u8 filter_actions; /* HW supported actions */
bool vxlan;
bool disable_overlay; /* devargs disable_overlay=1 */
+ uint8_t enable_avx2_rx; /* devargs enable-avx2-rx=1 */
bool nic_cfg_chk; /* NIC_CFG_CHK available */
bool udp_rss_weak; /* Bodega style UDP RSS */
uint8_t ig_vlan_rewrite_mode; /* devargs ig-vlan-rewrite */
@@ -329,6 +331,7 @@ uint16_t enic_prep_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
uint16_t nb_pkts);
int enic_set_mtu(struct enic *enic, uint16_t new_mtu);
int enic_link_update(struct enic *enic);
+bool enic_use_vector_rx_handler(struct enic *enic);
void enic_fdir_info(struct enic *enic);
void enic_fdir_info_get(struct enic *enic, struct rte_eth_fdir_info *stats);
void copy_fltr_v1(struct filter_v2 *fltr, struct rte_eth_fdir_input *input,
diff --git a/drivers/net/enic/enic_ethdev.c b/drivers/net/enic/enic_ethdev.c
index 65333c47a..4d450fe0c 100644
--- a/drivers/net/enic/enic_ethdev.c
+++ b/drivers/net/enic/enic_ethdev.c
@@ -37,6 +37,7 @@ static const struct rte_pci_id pci_id_enic_map[] = {
};
#define ENIC_DEVARG_DISABLE_OVERLAY "disable-overlay"
+#define ENIC_DEVARG_ENABLE_AVX2_RX "enable-avx2-rx"
#define ENIC_DEVARG_IG_VLAN_REWRITE "ig-vlan-rewrite"
RTE_INIT(enicpmd_init_log)
@@ -915,22 +916,27 @@ static const struct eth_dev_ops enicpmd_eth_dev_ops = {
.udp_tunnel_port_del = enicpmd_dev_udp_tunnel_port_del,
};
-static int enic_parse_disable_overlay(__rte_unused const char *key,
- const char *value,
- void *opaque)
+static int enic_parse_zero_one(const char *key,
+ const char *value,
+ void *opaque)
{
struct enic *enic;
+ bool b;
enic = (struct enic *)opaque;
if (strcmp(value, "0") == 0) {
- enic->disable_overlay = false;
+ b = false;
} else if (strcmp(value, "1") == 0) {
- enic->disable_overlay = true;
+ b = true;
} else {
- dev_err(enic, "Invalid value for " ENIC_DEVARG_DISABLE_OVERLAY
- ": expected=0|1 given=%s\n", value);
+ dev_err(enic, "Invalid value for %s"
+ ": expected=0|1 given=%s\n", key, value);
return -EINVAL;
}
+ if (strcmp(key, ENIC_DEVARG_DISABLE_OVERLAY) == 0)
+ enic->disable_overlay = b;
+ if (strcmp(key, ENIC_DEVARG_ENABLE_AVX2_RX) == 0)
+ enic->enable_avx2_rx = b;
return 0;
}
@@ -971,6 +977,7 @@ static int enic_check_devargs(struct rte_eth_dev *dev)
{
static const char *const valid_keys[] = {
ENIC_DEVARG_DISABLE_OVERLAY,
+ ENIC_DEVARG_ENABLE_AVX2_RX,
ENIC_DEVARG_IG_VLAN_REWRITE,
NULL};
struct enic *enic = pmd_priv(dev);
@@ -979,6 +986,7 @@ static int enic_check_devargs(struct rte_eth_dev *dev)
ENICPMD_FUNC_TRACE();
enic->disable_overlay = false;
+ enic->enable_avx2_rx = false;
enic->ig_vlan_rewrite_mode = IG_VLAN_REWRITE_MODE_PASS_THRU;
if (!dev->device->devargs)
return 0;
@@ -986,7 +994,9 @@ static int enic_check_devargs(struct rte_eth_dev *dev)
if (!kvlist)
return -EINVAL;
if (rte_kvargs_process(kvlist, ENIC_DEVARG_DISABLE_OVERLAY,
- enic_parse_disable_overlay, enic) < 0 ||
+ enic_parse_zero_one, enic) < 0 ||
+ rte_kvargs_process(kvlist, ENIC_DEVARG_ENABLE_AVX2_RX,
+ enic_parse_zero_one, enic) < 0 ||
rte_kvargs_process(kvlist, ENIC_DEVARG_IG_VLAN_REWRITE,
enic_parse_ig_vlan_rewrite, enic) < 0) {
rte_kvargs_free(kvlist);
@@ -1055,4 +1065,5 @@ RTE_PMD_REGISTER_PCI_TABLE(net_enic, pci_id_enic_map);
RTE_PMD_REGISTER_KMOD_DEP(net_enic, "* igb_uio | uio_pci_generic | vfio-pci");
RTE_PMD_REGISTER_PARAM_STRING(net_enic,
ENIC_DEVARG_DISABLE_OVERLAY "=0|1 "
+ ENIC_DEVARG_ENABLE_AVX2_RX "=0|1 "
ENIC_DEVARG_IG_VLAN_REWRITE "=trunk|untag|priority|pass");
diff --git a/drivers/net/enic/enic_main.c b/drivers/net/enic/enic_main.c
index ea6cddbd3..aed73ee1b 100644
--- a/drivers/net/enic/enic_main.c
+++ b/drivers/net/enic/enic_main.c
@@ -514,12 +514,29 @@ static void enic_prep_wq_for_simple_tx(struct enic *enic, uint16_t queue_idx)
}
}
+/*
+ * The 'strong' version is in enic_rxtx_vec_avx2.c. This weak version is
+ * used when that file is not compiled.
+ */
+bool __attribute__((weak))
+enic_use_vector_rx_handler(__rte_unused struct enic *enic)
+{
+ return false;
+}
+
static void pick_rx_handler(struct enic *enic)
{
struct rte_eth_dev *eth_dev;
- /* Use the non-scatter, simplified RX handler if possible. */
+ /*
+ * Preference order:
+ * 1. The vectorized handler if possible and requested.
+ * 2. The non-scatter, simplified handler if scatter Rx is not used.
+ * 3. The default handler as a fallback.
+ */
eth_dev = enic->rte_dev;
+ if (enic_use_vector_rx_handler(enic))
+ return;
if (enic->rq_count > 0 && enic->rq[0].data_queue_enable == 0) {
PMD_INIT_LOG(DEBUG, " use the non-scatter Rx handler");
eth_dev->rx_pkt_burst = &enic_noscatter_recv_pkts;
@@ -535,6 +552,21 @@ int enic_enable(struct enic *enic)
int err;
struct rte_eth_dev *eth_dev = enic->rte_dev;
uint64_t simple_tx_offloads;
+ uintptr_t p;
+ struct rte_mbuf mb_def = { .buf_addr = 0 };
+
+ /*
+ * mbuf_initializer contains const-after-init fields of
+ * receive mbufs (i.e. 64 bits of fields from rearm_data).
+ * It is currently used by the vectorized handler.
+ */
+ mb_def.nb_segs = 1;
+ mb_def.data_off = RTE_PKTMBUF_HEADROOM;
+ mb_def.port = enic->port_id;
+ rte_mbuf_refcnt_set(&mb_def, 1);
+ rte_compiler_barrier();
+ p = (uintptr_t)&mb_def.rearm_data;
+ enic->mbuf_initializer = *(uint64_t *)p;
eth_dev->data->dev_link.link_speed = vnic_dev_port_speed(enic->vdev);
eth_dev->data->dev_link.link_duplex = ETH_LINK_FULL_DUPLEX;
diff --git a/drivers/net/enic/enic_rxtx_vec_avx2.c b/drivers/net/enic/enic_rxtx_vec_avx2.c
new file mode 100644
index 000000000..4891cda1b
--- /dev/null
+++ b/drivers/net/enic/enic_rxtx_vec_avx2.c
@@ -0,0 +1,832 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2008-2018 Cisco Systems, Inc. All rights reserved.
+ * Copyright 2007 Nuova Systems, Inc. All rights reserved.
+ */
+
+#include <rte_mbuf.h>
+#include <rte_ethdev_driver.h>
+
+#include "enic_compat.h"
+#include "rq_enet_desc.h"
+#include "enic.h"
+#include "enic_rxtx_common.h"
+
+#include <x86intrin.h>
+
+static struct rte_mbuf *
+rx_one(struct cq_enet_rq_desc *cqd, struct rte_mbuf *mb, struct enic *enic)
+{
+ bool tnl;
+
+ *(uint64_t *)&mb->rearm_data = enic->mbuf_initializer;
+ mb->data_len = cqd->bytes_written_flags &
+ CQ_ENET_RQ_DESC_BYTES_WRITTEN_MASK;
+ mb->pkt_len = mb->data_len;
+ tnl = enic->overlay_offload && (cqd->completed_index_flags &
+ CQ_ENET_RQ_DESC_FLAGS_FCOE) != 0;
+ mb->packet_type =
+ enic_cq_rx_flags_to_pkt_type((struct cq_desc *)cqd, tnl);
+ enic_cq_rx_to_pkt_flags((struct cq_desc *)cqd, mb);
+ /* Wipe the outer types set by enic_cq_rx_flags_to_pkt_type() */
+ if (tnl) {
+ mb->packet_type &= ~(RTE_PTYPE_L3_MASK |
+ RTE_PTYPE_L4_MASK);
+ }
+ return mb;
+}
+
+static uint16_t
+enic_noscatter_vec_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
+ uint16_t nb_pkts)
+{
+ struct rte_mbuf **rx, **rxmb;
+ uint16_t cq_idx, nb_rx, max_rx;
+ struct cq_enet_rq_desc *cqd;
+ struct rq_enet_desc *rqd;
+ struct vnic_cq *cq;
+ struct vnic_rq *rq;
+ struct enic *enic;
+ uint8_t color;
+
+ rq = rx_queue;
+ enic = vnic_dev_priv(rq->vdev);
+ cq = &enic->cq[enic_cq_rq(enic, rq->index)];
+ cq_idx = cq->to_clean;
+
+ /*
+ * Fill up the reserve of free mbufs. Below, we restock the receive
+ * ring with these mbufs to avoid allocation failures.
+ */
+ if (rq->num_free_mbufs == 0) {
+ if (rte_mempool_get_bulk(rq->mp, (void **)rq->free_mbufs,
+ ENIC_RX_BURST_MAX))
+ return 0;
+ rq->num_free_mbufs = ENIC_RX_BURST_MAX;
+ }
+ /* Receive until the end of the ring, at most. */
+ max_rx = RTE_MIN(nb_pkts, rq->num_free_mbufs);
+ max_rx = RTE_MIN(max_rx, cq->ring.desc_count - cq_idx);
+
+ rxmb = rq->mbuf_ring + cq_idx;
+ color = cq->last_color;
+ cqd = (struct cq_enet_rq_desc *)(cq->ring.descs) + cq_idx;
+ rx = rx_pkts;
+ if (max_rx == 0 ||
+ (cqd->type_color & CQ_DESC_COLOR_MASK_NOSHIFT) == color)
+ return 0;
+
+ /* Step 1: Process one packet to do aligned 256-bit load below */
+ if (cq_idx & 0x1) {
+ if (unlikely(cqd->bytes_written_flags &
+ CQ_ENET_RQ_DESC_FLAGS_TRUNCATED)) {
+ rte_pktmbuf_free(*rxmb++);
+ rte_atomic64_inc(&enic->soft_stats.rx_packet_errors);
+ } else {
+ *rx++ = rx_one(cqd, *rxmb++, enic);
+ }
+ cqd++;
+ max_rx--;
+ }
+
+ const __m256i mask =
+ _mm256_set_epi8(/* Second descriptor */
+ 0xff, /* type_color */
+ (CQ_ENET_RQ_DESC_FLAGS_IPV4_FRAGMENT |
+ CQ_ENET_RQ_DESC_FLAGS_IPV4 |
+ CQ_ENET_RQ_DESC_FLAGS_IPV6 |
+ CQ_ENET_RQ_DESC_FLAGS_TCP |
+ CQ_ENET_RQ_DESC_FLAGS_UDP), /* flags */
+ 0, 0, /* checksum_fcoe */
+ 0xff, 0xff, /* vlan */
+ 0x3f, 0xff, /* bytes_written_flags */
+ 0xff, 0xff, 0xff, 0xff, /* rss_hash */
+ 0xff, 0xff, /* q_number_rss_type_flags */
+ 0, 0, /* completed_index_flags */
+ /* First descriptor */
+ 0xff, /* type_color */
+ (CQ_ENET_RQ_DESC_FLAGS_IPV4_FRAGMENT |
+ CQ_ENET_RQ_DESC_FLAGS_IPV4 |
+ CQ_ENET_RQ_DESC_FLAGS_IPV6 |
+ CQ_ENET_RQ_DESC_FLAGS_TCP |
+ CQ_ENET_RQ_DESC_FLAGS_UDP), /* flags */
+ 0, 0, /* checksum_fcoe */
+ 0xff, 0xff, /* vlan */
+ 0x3f, 0xff, /* bytes_written_flags */
+ 0xff, 0xff, 0xff, 0xff, /* rss_hash */
+ 0xff, 0xff, /* q_number_rss_type_flags */
+ 0, 0 /* completed_index_flags */
+ );
+ const __m256i shuffle_mask =
+ _mm256_set_epi8(/* Second descriptor */
+ 7, 6, 5, 4, /* rss = rss_hash */
+ 11, 10, /* vlan_tci = vlan */
+ 9, 8, /* data_len = bytes_written */
+ 0x80, 0x80, 9, 8, /* pkt_len = bytes_written */
+ 0x80, 0x80, 0x80, 0x80, /* packet_type = 0 */
+ /* First descriptor */
+ 7, 6, 5, 4, /* rss = rss_hash */
+ 11, 10, /* vlan_tci = vlan */
+ 9, 8, /* data_len = bytes_written */
+ 0x80, 0x80, 9, 8, /* pkt_len = bytes_written */
+ 0x80, 0x80, 0x80, 0x80 /* packet_type = 0 */
+ );
+ /* Used to collect 8 flags from 8 desc into one register */
+ const __m256i flags_shuffle_mask =
+ _mm256_set_epi8(/* Second descriptor */
+ 1, 3, 9, 14,
+ 1, 3, 9, 14,
+ 1, 3, 9, 14,
+ 1, 3, 9, 14,
+ /* First descriptor */
+ 1, 3, 9, 14,
+ 1, 3, 9, 14,
+ 1, 3, 9, 14,
+ /*
+ * Byte 3: upper byte of completed_index_flags
+ * bit 5 = fcoe (tunnel)
+ * Byte 2: upper byte of q_number_rss_type_flags
+ * bits 2,3,4,5 = rss type
+ * bit 6 = csum_not_calc
+ * Byte 1: upper byte of bytes_written_flags
+ * bit 6 = truncated
+ * bit 7 = vlan stripped
+ * Byte 0: flags
+ */
+ 1, 3, 9, 14
+ );
+ /* Used to collect 8 VLAN IDs from 8 desc into one register */
+ const __m256i vlan_shuffle_mask =
+ _mm256_set_epi8(/* Second descriptor */
+ 0x80, 0x80, 11, 10,
+ 0x80, 0x80, 11, 10,
+ 0x80, 0x80, 11, 10,
+ 0x80, 0x80, 11, 10,
+ /* First descriptor */
+ 0x80, 0x80, 11, 10,
+ 0x80, 0x80, 11, 10,
+ 0x80, 0x80, 11, 10,
+ 0x80, 0x80, 11, 10);
+ /* PKT_RX_RSS_HASH is 1<<1 so fits in 8-bit integer */
+ const __m256i rss_shuffle =
+ _mm256_set_epi8(/* second 128 bits */
+ PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
+ PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
+ PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
+ PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
+ PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
+ 0, /* rss_types = 0 */
+ /* first 128 bits */
+ PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
+ PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
+ PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
+ PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
+ PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
+ 0 /* rss_types = 0 */);
+ /*
+ * VLAN offload flags.
+ * shuffle index:
+ * vlan_stripped => bit 0
+ * vlan_id == 0 => bit 1
+ */
+ const __m256i vlan_shuffle =
+ _mm256_set_epi32(0, 0, 0, 0,
+ PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, 0,
+ PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, PKT_RX_VLAN);
+ /* Use the same shuffle index as vlan_shuffle */
+ const __m256i vlan_ptype_shuffle =
+ _mm256_set_epi32(0, 0, 0, 0,
+ RTE_PTYPE_L2_ETHER,
+ RTE_PTYPE_L2_ETHER,
+ RTE_PTYPE_L2_ETHER,
+ RTE_PTYPE_L2_ETHER_VLAN);
+ /*
+ * CKSUM flags. Shift right so they fit in 8-bit integers.
+ * shuffle index:
+ * ipv4_csum_ok => bit 3
+ * ip4 => bit 2
+ * tcp_or_udp => bit 1
+ * tcp_udp_csum_ok => bit 0
+ */
+ const __m256i csum_shuffle =
+ _mm256_set_epi8(/* second 128 bits */
+ /* 1111 ip4+ip4_ok+l4+l4_ok */
+ ((PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1),
+ /* 1110 ip4_ok+ip4+l4+!l4_ok */
+ ((PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD) >> 1),
+ (PKT_RX_IP_CKSUM_GOOD >> 1), /* 1101 ip4+ip4_ok */
+ (PKT_RX_IP_CKSUM_GOOD >> 1), /* 1100 ip4_ok+ip4 */
+ (PKT_RX_L4_CKSUM_GOOD >> 1), /* 1011 l4+l4_ok */
+ (PKT_RX_L4_CKSUM_BAD >> 1), /* 1010 l4+!l4_ok */
+ 0, /* 1001 */
+ 0, /* 1000 */
+ /* 0111 !ip4_ok+ip4+l4+l4_ok */
+ ((PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD) >> 1),
+ /* 0110 !ip4_ok+ip4+l4+!l4_ok */
+ ((PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD) >> 1),
+ (PKT_RX_IP_CKSUM_BAD >> 1), /* 0101 !ip4_ok+ip4 */
+ (PKT_RX_IP_CKSUM_BAD >> 1), /* 0100 !ip4_ok+ip4 */
+ (PKT_RX_L4_CKSUM_GOOD >> 1), /* 0011 l4+l4_ok */
+ (PKT_RX_L4_CKSUM_BAD >> 1), /* 0010 l4+!l4_ok */
+ 0, /* 0001 */
+ 0, /* 0000 */
+ /* first 128 bits */
+ ((PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1),
+ ((PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD) >> 1),
+ (PKT_RX_IP_CKSUM_GOOD >> 1),
+ (PKT_RX_IP_CKSUM_GOOD >> 1),
+ (PKT_RX_L4_CKSUM_GOOD >> 1),
+ (PKT_RX_L4_CKSUM_BAD >> 1),
+ 0, 0,
+ ((PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD) >> 1),
+ ((PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD) >> 1),
+ (PKT_RX_IP_CKSUM_BAD >> 1),
+ (PKT_RX_IP_CKSUM_BAD >> 1),
+ (PKT_RX_L4_CKSUM_GOOD >> 1),
+ (PKT_RX_L4_CKSUM_BAD >> 1),
+ 0, 0);
+ /*
+ * Non-fragment PTYPEs.
+ * Shuffle 4-bit index:
+ * ip6 => bit 0
+ * ip4 => bit 1
+ * udp => bit 2
+ * tcp => bit 3
+ * bit
+ * 3 2 1 0
+ * -------
+ * 0 0 0 0 unknown
+ * 0 0 0 1 ip6 | nonfrag
+ * 0 0 1 0 ip4 | nonfrag
+ * 0 0 1 1 unknown
+ * 0 1 0 0 unknown
+ * 0 1 0 1 ip6 | udp
+ * 0 1 1 0 ip4 | udp
+ * 0 1 1 1 unknown
+ * 1 0 0 0 unknown
+ * 1 0 0 1 ip6 | tcp
+ * 1 0 1 0 ip4 | tcp
+ * 1 0 1 1 unknown
+ * 1 1 0 0 unknown
+ * 1 1 0 1 unknown
+ * 1 1 1 0 unknown
+ * 1 1 1 1 unknown
+ *
+ * PTYPEs do not fit in 8 bits, so shift right by 4.
+ */
+ const __m256i nonfrag_ptype_shuffle =
+ _mm256_set_epi8(/* second 128 bits */
+ RTE_PTYPE_UNKNOWN,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ (RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_TCP) >> 4,
+ (RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_TCP) >> 4,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ (RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_UDP) >> 4,
+ (RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_UDP) >> 4,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ (RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_L4_NONFRAG) >> 4,
+ (RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_L4_NONFRAG) >> 4,
+ RTE_PTYPE_UNKNOWN,
+ /* first 128 bits */
+ RTE_PTYPE_UNKNOWN,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ (RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_TCP) >> 4,
+ (RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_TCP) >> 4,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ (RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_UDP) >> 4,
+ (RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_UDP) >> 4,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ (RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_L4_NONFRAG) >> 4,
+ (RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_L4_NONFRAG) >> 4,
+ RTE_PTYPE_UNKNOWN);
+ /* Fragment PTYPEs. Use the same shuffle index as above. */
+ const __m256i frag_ptype_shuffle =
+ _mm256_set_epi8(/* second 128 bits */
+ RTE_PTYPE_UNKNOWN,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ (RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_L4_FRAG) >> 4,
+ (RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_L4_FRAG) >> 4,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ (RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_L4_FRAG) >> 4,
+ (RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_L4_FRAG) >> 4,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ (RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_L4_FRAG) >> 4,
+ (RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_L4_FRAG) >> 4,
+ RTE_PTYPE_UNKNOWN,
+ /* first 128 bits */
+ RTE_PTYPE_UNKNOWN,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ (RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_L4_FRAG) >> 4,
+ (RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_L4_FRAG) >> 4,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ (RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_L4_FRAG) >> 4,
+ (RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_L4_FRAG) >> 4,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ (RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_L4_FRAG) >> 4,
+ (RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_L4_FRAG) >> 4,
+ RTE_PTYPE_UNKNOWN);
+ /*
+ * Tunnel PTYPEs. Use the same shuffle index as above.
+ * L4 types are not part of this table. They come from non-tunnel
+ * types above.
+ */
+ const __m256i tnl_l3_ptype_shuffle =
+ _mm256_set_epi8(/* second 128 bits */
+ RTE_PTYPE_UNKNOWN,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN >> 16,
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN >> 16,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN >> 16,
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN >> 16,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN >> 16,
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN >> 16,
+ RTE_PTYPE_UNKNOWN,
+ /* first 128 bits */
+ RTE_PTYPE_UNKNOWN,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN >> 16,
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN >> 16,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN >> 16,
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN >> 16,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN >> 16,
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN >> 16,
+ RTE_PTYPE_UNKNOWN);
+
+ const __m256i mbuf_init = _mm256_set_epi64x(0, enic->mbuf_initializer,
+ 0, enic->mbuf_initializer);
+
+ /*
+ * --- cq desc fields --- offset
+ * completed_index_flags - 0 use: fcoe
+ * q_number_rss_type_flags - 2 use: rss types, csum_not_calc
+ * rss_hash - 4 ==> mbuf.hash.rss
+ * bytes_written_flags - 8 ==> mbuf.pkt_len,data_len
+ * use: truncated, vlan_stripped
+ * vlan - 10 ==> mbuf.vlan_tci
+ * checksum_fcoe - 12 (unused)
+ * flags - 14 use: all bits
+ * type_color - 15 (unused)
+ *
+ * --- mbuf fields --- offset
+ * rearm_data ---- 16
+ * data_off - 0 (mbuf_init) -+
+ * refcnt - 2 (mbuf_init) |
+ * nb_segs - 4 (mbuf_init) | 16B 128b
+ * port - 6 (mbuf_init) |
+ * ol_flag - 8 (from cqd) -+
+ * rx_descriptor_fields1 ---- 32
+ * packet_type - 0 (from cqd) -+
+ * pkt_len - 4 (from cqd) |
+ * data_len - 8 (from cqd) | 16B 128b
+ * vlan_tci - 10 (from cqd) |
+ * rss - 12 (from cqd) -+
+ */
+
+ __m256i overlay_enabled =
+ _mm256_set1_epi32((uint32_t)enic->overlay_offload);
+
+ /* Step 2: Process 8 packets per loop using SIMD */
+ while (max_rx > 7 && (((cqd + 7)->type_color &
+ CQ_DESC_COLOR_MASK_NOSHIFT) != color)) {
+ /* Load 8 16B CQ descriptors */
+ __m256i cqd01 = _mm256_load_si256((void *)cqd);
+ __m256i cqd23 = _mm256_load_si256((void *)(cqd + 2));
+ __m256i cqd45 = _mm256_load_si256((void *)(cqd + 4));
+ __m256i cqd67 = _mm256_load_si256((void *)(cqd + 6));
+ /* Copy 8 mbuf pointers to rx_pkts */
+ _mm256_storeu_si256((void *)rx,
+ _mm256_loadu_si256((void *)rxmb));
+ _mm256_storeu_si256((void *)(rx + 4),
+ _mm256_loadu_si256((void *)(rxmb + 4)));
+
+ /*
+ * Collect 8 flags (each 32 bits) into one register.
+ * 4 shuffles, 3 blends, 1 permute for 8 desc: 1 inst/desc
+ */
+ __m256i flags01 =
+ _mm256_shuffle_epi8(cqd01, flags_shuffle_mask);
+ /*
+ * Shuffle above produces 8 x 32-bit flags for 8 descriptors
+ * in this order: 0, 0, 0, 0, 1, 1, 1, 1
+ * The duplicates in each 128-bit lane simplify blending
+ * below.
+ */
+ __m256i flags23 =
+ _mm256_shuffle_epi8(cqd23, flags_shuffle_mask);
+ __m256i flags45 =
+ _mm256_shuffle_epi8(cqd45, flags_shuffle_mask);
+ __m256i flags67 =
+ _mm256_shuffle_epi8(cqd67, flags_shuffle_mask);
+ /* 1st blend produces flags for desc: 0, 2, 0, 0, 1, 3, 1, 1 */
+ __m256i flags0_3 = _mm256_blend_epi32(flags01, flags23, 0x22);
+ /* 2nd blend produces flags for desc: 4, 4, 4, 6, 5, 5, 5, 7 */
+ __m256i flags4_7 = _mm256_blend_epi32(flags45, flags67, 0x88);
+ /* 3rd blend produces flags for desc: 0, 2, 4, 6, 1, 3, 5, 7 */
+ __m256i flags0_7 = _mm256_blend_epi32(flags0_3, flags4_7, 0xcc);
+ /*
+ * Swap to reorder flags in this order: 1, 3, 5, 7, 0, 2, 4, 6
+ * This order simplifies blend operations way below that
+ * produce 'rearm' data for each mbuf.
+ */
+ flags0_7 = _mm256_permute4x64_epi64(flags0_7,
+ (1 << 6) + (0 << 4) + (3 << 2) + 2);
+
+ /*
+ * Check truncated bits and bail out early on.
+ * 6 avx inst, 1 or, 1 if-then-else for 8 desc: 1 inst/desc
+ */
+ __m256i trunc =
+ _mm256_srli_epi32(_mm256_slli_epi32(flags0_7, 17), 31);
+ trunc = _mm256_add_epi64(trunc, _mm256_permute4x64_epi64(trunc,
+ (1 << 6) + (0 << 4) + (3 << 2) + 2));
+ /* 0:63 contains 1+3+0+2 and 64:127 contains 5+7+4+6 */
+ if (_mm256_extract_epi64(trunc, 0) ||
+ _mm256_extract_epi64(trunc, 1))
+ break;
+
+ /*
+ * Compute PKT_RX_RSS_HASH.
+ * Use 2 shifts and 1 shuffle for 8 desc: 0.375 inst/desc
+ * RSS types in byte 0, 4, 8, 12, 16, 20, 24, 28
+ * Everything else is zero.
+ */
+ __m256i rss_types =
+ _mm256_srli_epi32(_mm256_slli_epi32(flags0_7, 10), 28);
+ /*
+ * RSS flags (PKT_RX_RSS_HASH) are in
+ * byte 0, 4, 8, 12, 16, 20, 24, 28
+ * Everything else is zero.
+ */
+ __m256i rss_flags = _mm256_shuffle_epi8(rss_shuffle, rss_types);
+
+ /*
+ * Compute CKSUM flags. First build the index and then
+ * use it to shuffle csum_shuffle.
+ * 20 instructions including const loads: 2.5 inst/desc
+ */
+ /*
+ * csum_not_calc (bit 22)
+ * csum_not_calc (0) => 0xffffffff
+ * csum_not_calc (1) => 0x0
+ */
+ const __m256i zero4 = _mm256_setzero_si256();
+ const __m256i mask22 = _mm256_set1_epi32(0x400000);
+ __m256i csum_not_calc = _mm256_cmpeq_epi32(zero4,
+ _mm256_and_si256(flags0_7, mask22));
+ /*
+ * (tcp|udp) && !fragment => bit 1
+ * tcp = bit 2, udp = bit 1, frag = bit 6
+ */
+ const __m256i mask1 = _mm256_set1_epi32(0x2);
+ __m256i tcp_udp =
+ _mm256_andnot_si256(_mm256_srli_epi32(flags0_7, 5),
+ _mm256_or_si256(flags0_7,
+ _mm256_srli_epi32(flags0_7, 1)));
+ tcp_udp = _mm256_and_si256(tcp_udp, mask1);
+ /* ipv4 (bit 5) => bit 2 */
+ const __m256i mask2 = _mm256_set1_epi32(0x4);
+ __m256i ipv4 = _mm256_and_si256(mask2,
+ _mm256_srli_epi32(flags0_7, 3));
+ /*
+ * ipv4_csum_ok (bit 3) => bit 3
+ * tcp_udp_csum_ok (bit 0) => bit 0
+ * 0x9
+ */
+ const __m256i mask0_3 = _mm256_set1_epi32(0x9);
+ __m256i csum_idx = _mm256_and_si256(flags0_7, mask0_3);
+ csum_idx = _mm256_and_si256(csum_not_calc,
+ _mm256_or_si256(_mm256_or_si256(csum_idx, ipv4),
+ tcp_udp));
+ __m256i csum_flags =
+ _mm256_shuffle_epi8(csum_shuffle, csum_idx);
+ /* Shift left to restore CKSUM flags. See csum_shuffle. */
+ csum_flags = _mm256_slli_epi32(csum_flags, 1);
+ /* Combine csum flags and offload flags: 0.125 inst/desc */
+ rss_flags = _mm256_or_si256(rss_flags, csum_flags);
+
+ /*
+ * Collect 8 VLAN IDs and compute vlan_id != 0 on each.
+ * 4 shuffles, 3 blends, 1 permute, 1 cmp, 1 sub for 8 desc:
+ * 1.25 inst/desc
+ */
+ __m256i vlan01 = _mm256_shuffle_epi8(cqd01, vlan_shuffle_mask);
+ __m256i vlan23 = _mm256_shuffle_epi8(cqd23, vlan_shuffle_mask);
+ __m256i vlan45 = _mm256_shuffle_epi8(cqd45, vlan_shuffle_mask);
+ __m256i vlan67 = _mm256_shuffle_epi8(cqd67, vlan_shuffle_mask);
+ __m256i vlan0_3 = _mm256_blend_epi32(vlan01, vlan23, 0x22);
+ __m256i vlan4_7 = _mm256_blend_epi32(vlan45, vlan67, 0x88);
+ /* desc: 0, 2, 4, 6, 1, 3, 5, 7 */
+ __m256i vlan0_7 = _mm256_blend_epi32(vlan0_3, vlan4_7, 0xcc);
+ /* desc: 1, 3, 5, 7, 0, 2, 4, 6 */
+ vlan0_7 = _mm256_permute4x64_epi64(vlan0_7,
+ (1 << 6) + (0 << 4) + (3 << 2) + 2);
+ /*
+ * Compare 0 == vlan_id produces 0xffffffff (-1) if
+ * vlan 0 and 0 if vlan non-0. Then subtracting the
+ * result from 0 produces 0 - (-1) = 1 for vlan 0, and
+ * 0 - 0 = 0 for vlan non-0.
+ */
+ vlan0_7 = _mm256_cmpeq_epi32(zero4, vlan0_7);
+ /* vlan_id != 0 => 0, vlan_id == 0 => 1 */
+ vlan0_7 = _mm256_sub_epi32(zero4, vlan0_7);
+
+ /*
+ * Compute PKT_RX_VLAN and PKT_RX_VLAN_STRIPPED.
+ * Use 3 shifts, 1 or, 1 shuffle for 8 desc: 0.625 inst/desc
+ * VLAN offload flags in byte 0, 4, 8, 12, 16, 20, 24, 28
+ * Everything else is zero.
+ */
+ __m256i vlan_idx =
+ _mm256_or_si256(/* vlan_stripped => bit 0 */
+ _mm256_srli_epi32(_mm256_slli_epi32(flags0_7,
+ 16), 31),
+ /* (vlan_id == 0) => bit 1 */
+ _mm256_slli_epi32(vlan0_7, 1));
+ /*
+ * The index captures 4 cases.
+ * stripped, id = 0 ==> 11b = 3
+ * stripped, id != 0 ==> 01b = 1
+ * not strip, id == 0 ==> 10b = 2
+ * not strip, id != 0 ==> 00b = 0
+ */
+ __m256i vlan_flags = _mm256_permutevar8x32_epi32(vlan_shuffle,
+ vlan_idx);
+ /* Combine vlan and offload flags: 0.125 inst/desc */
+ rss_flags = _mm256_or_si256(rss_flags, vlan_flags);
+
+ /*
+ * Compute non-tunnel PTYPEs.
+ * 17 inst / 8 desc = 2.125 inst/desc
+ */
+ /* ETHER and ETHER_VLAN */
+ __m256i vlan_ptype =
+ _mm256_permutevar8x32_epi32(vlan_ptype_shuffle,
+ vlan_idx);
+ /* Build the ptype index from flags */
+ tcp_udp = _mm256_slli_epi32(flags0_7, 29);
+ tcp_udp = _mm256_slli_epi32(_mm256_srli_epi32(tcp_udp, 30), 2);
+ __m256i ip4_ip6 =
+ _mm256_srli_epi32(_mm256_slli_epi32(flags0_7, 26), 30);
+ __m256i ptype_idx = _mm256_or_si256(tcp_udp, ip4_ip6);
+ __m256i frag_bit =
+ _mm256_srli_epi32(_mm256_slli_epi32(flags0_7, 25), 31);
+ __m256i nonfrag_ptype =
+ _mm256_shuffle_epi8(nonfrag_ptype_shuffle, ptype_idx);
+ __m256i frag_ptype =
+ _mm256_shuffle_epi8(frag_ptype_shuffle, ptype_idx);
+ /*
+ * Zero out the unwanted types and combine the remaining bits.
+ * The effect is same as selecting non-frag or frag types
+ * depending on the frag bit.
+ */
+ nonfrag_ptype = _mm256_and_si256(nonfrag_ptype,
+ _mm256_cmpeq_epi32(zero4, frag_bit));
+ frag_ptype = _mm256_and_si256(frag_ptype,
+ _mm256_cmpgt_epi32(frag_bit, zero4));
+ __m256i ptype = _mm256_or_si256(nonfrag_ptype, frag_ptype);
+ ptype = _mm256_slli_epi32(ptype, 4);
+ /*
+ * Compute tunnel PTYPEs.
+ * 15 inst / 8 desc = 1.875 inst/desc
+ */
+ __m256i tnl_l3_ptype =
+ _mm256_shuffle_epi8(tnl_l3_ptype_shuffle, ptype_idx);
+ tnl_l3_ptype = _mm256_slli_epi32(tnl_l3_ptype, 16);
+ /*
+ * Shift non-tunnel L4 types to make them tunnel types.
+ * RTE_PTYPE_L4_TCP << 16 == RTE_PTYPE_INNER_L4_TCP
+ */
+ __m256i tnl_l4_ptype =
+ _mm256_slli_epi32(_mm256_and_si256(ptype,
+ _mm256_set1_epi32(RTE_PTYPE_L4_MASK)), 16);
+ __m256i tnl_ptype =
+ _mm256_or_si256(tnl_l3_ptype, tnl_l4_ptype);
+ tnl_ptype = _mm256_or_si256(tnl_ptype,
+ _mm256_set1_epi32(RTE_PTYPE_TUNNEL_GRENAT |
+ RTE_PTYPE_INNER_L2_ETHER));
+ /*
+ * Select non-tunnel or tunnel types by zeroing out the
+ * unwanted ones.
+ */
+ __m256i tnl_flags = _mm256_and_si256(overlay_enabled,
+ _mm256_srli_epi32(_mm256_slli_epi32(flags0_7, 2), 31));
+ tnl_ptype = _mm256_and_si256(tnl_ptype,
+ _mm256_sub_epi32(zero4, tnl_flags));
+ ptype = _mm256_and_si256(ptype,
+ _mm256_cmpeq_epi32(zero4, tnl_flags));
+ /*
+ * Combine types and swap to have ptypes in the same order
+ * as desc.
+ * desc: 0 2 4 6 1 3 5 7
+ * 3 inst / 8 desc = 0.375 inst/desc
+ */
+ ptype = _mm256_or_si256(ptype, tnl_ptype);
+ ptype = _mm256_or_si256(ptype, vlan_ptype);
+ ptype = _mm256_permute4x64_epi64(ptype,
+ (1 << 6) + (0 << 4) + (3 << 2) + 2);
+
+ /*
+ * Mask packet length.
+ * Use 4 ands: 0.5 instructions/desc
+ */
+ cqd01 = _mm256_and_si256(cqd01, mask);
+ cqd23 = _mm256_and_si256(cqd23, mask);
+ cqd45 = _mm256_and_si256(cqd45, mask);
+ cqd67 = _mm256_and_si256(cqd67, mask);
+ /*
+ * Shuffle. Two 16B sets of the mbuf fields.
+ * packet_type, pkt_len, data_len, vlan_tci, rss
+ */
+ __m256i rearm01 = _mm256_shuffle_epi8(cqd01, shuffle_mask);
+ __m256i rearm23 = _mm256_shuffle_epi8(cqd23, shuffle_mask);
+ __m256i rearm45 = _mm256_shuffle_epi8(cqd45, shuffle_mask);
+ __m256i rearm67 = _mm256_shuffle_epi8(cqd67, shuffle_mask);
+
+ /*
+ * Blend in ptypes
+ * 4 blends and 3 shuffles for 8 desc: 0.875 inst/desc
+ */
+ rearm01 = _mm256_blend_epi32(rearm01, ptype, 0x11);
+ rearm23 = _mm256_blend_epi32(rearm23,
+ _mm256_shuffle_epi32(ptype, 1), 0x11);
+ rearm45 = _mm256_blend_epi32(rearm45,
+ _mm256_shuffle_epi32(ptype, 2), 0x11);
+ rearm67 = _mm256_blend_epi32(rearm67,
+ _mm256_shuffle_epi32(ptype, 3), 0x11);
+
+ /*
+ * Move rss_flags into ol_flags in mbuf_init.
+ * Use 1 shift and 1 blend for each desc: 2 inst/desc
+ */
+ __m256i mbuf_init4_5 = _mm256_blend_epi32(mbuf_init,
+ rss_flags, 0x44);
+ __m256i mbuf_init2_3 = _mm256_blend_epi32(mbuf_init,
+ _mm256_slli_si256(rss_flags, 4), 0x44);
+ __m256i mbuf_init0_1 = _mm256_blend_epi32(mbuf_init,
+ _mm256_slli_si256(rss_flags, 8), 0x44);
+ __m256i mbuf_init6_7 = _mm256_blend_epi32(mbuf_init,
+ _mm256_srli_si256(rss_flags, 4), 0x44);
+
+ /*
+ * Build rearm, one per desc.
+ * 8 blends and 4 permutes: 1.5 inst/desc
+ */
+ __m256i rearm0 = _mm256_blend_epi32(rearm01,
+ mbuf_init0_1, 0xf0);
+ __m256i rearm1 = _mm256_blend_epi32(mbuf_init0_1,
+ rearm01, 0xf0);
+ __m256i rearm2 = _mm256_blend_epi32(rearm23,
+ mbuf_init2_3, 0xf0);
+ __m256i rearm3 = _mm256_blend_epi32(mbuf_init2_3,
+ rearm23, 0xf0);
+ /* Swap upper and lower 64 bits */
+ rearm0 = _mm256_permute4x64_epi64(rearm0,
+ (1 << 6) + (0 << 4) + (3 << 2) + 2);
+ rearm2 = _mm256_permute4x64_epi64(rearm2,
+ (1 << 6) + (0 << 4) + (3 << 2) + 2);
+ /* Second set of 4 descriptors */
+ __m256i rearm4 = _mm256_blend_epi32(rearm45,
+ mbuf_init4_5, 0xf0);
+ __m256i rearm5 = _mm256_blend_epi32(mbuf_init4_5,
+ rearm45, 0xf0);
+ __m256i rearm6 = _mm256_blend_epi32(rearm67,
+ mbuf_init6_7, 0xf0);
+ __m256i rearm7 = _mm256_blend_epi32(mbuf_init6_7,
+ rearm67, 0xf0);
+ rearm4 = _mm256_permute4x64_epi64(rearm4,
+ (1 << 6) + (0 << 4) + (3 << 2) + 2);
+ rearm6 = _mm256_permute4x64_epi64(rearm6,
+ (1 << 6) + (0 << 4) + (3 << 2) + 2);
+
+ /*
+ * Write out 32B of mbuf fields.
+ * data_off - off 0 (mbuf_init)
+ * refcnt - 2 (mbuf_init)
+ * nb_segs - 4 (mbuf_init)
+ * port - 6 (mbuf_init)
+ * ol_flag - 8 (from cqd)
+ * packet_type - 16 (from cqd)
+ * pkt_len - 20 (from cqd)
+ * data_len - 24 (from cqd)
+ * vlan_tci - 26 (from cqd)
+ * rss - 28 (from cqd)
+ */
+ _mm256_storeu_si256((__m256i *)&rxmb[0]->rearm_data, rearm0);
+ _mm256_storeu_si256((__m256i *)&rxmb[1]->rearm_data, rearm1);
+ _mm256_storeu_si256((__m256i *)&rxmb[2]->rearm_data, rearm2);
+ _mm256_storeu_si256((__m256i *)&rxmb[3]->rearm_data, rearm3);
+ _mm256_storeu_si256((__m256i *)&rxmb[4]->rearm_data, rearm4);
+ _mm256_storeu_si256((__m256i *)&rxmb[5]->rearm_data, rearm5);
+ _mm256_storeu_si256((__m256i *)&rxmb[6]->rearm_data, rearm6);
+ _mm256_storeu_si256((__m256i *)&rxmb[7]->rearm_data, rearm7);
+
+ max_rx -= 8;
+ cqd += 8;
+ rx += 8;
+ rxmb += 8;
+ }
+
+ /*
+ * Step 3: Slow path to handle a small (<8) number of packets and
+ * occasional truncated packets.
+ */
+ while (max_rx && ((cqd->type_color &
+ CQ_DESC_COLOR_MASK_NOSHIFT) != color)) {
+ if (unlikely(cqd->bytes_written_flags &
+ CQ_ENET_RQ_DESC_FLAGS_TRUNCATED)) {
+ rte_pktmbuf_free(*rxmb++);
+ rte_atomic64_inc(&enic->soft_stats.rx_packet_errors);
+ } else {
+ *rx++ = rx_one(cqd, *rxmb++, enic);
+ }
+ cqd++;
+ max_rx--;
+ }
+
+ /* Number of descriptors visited */
+ nb_rx = cqd - (struct cq_enet_rq_desc *)(cq->ring.descs) - cq_idx;
+ if (nb_rx == 0)
+ return 0;
+ rqd = ((struct rq_enet_desc *)rq->ring.descs) + cq_idx;
+ rxmb = rq->mbuf_ring + cq_idx;
+ cq_idx += nb_rx;
+ rq->rx_nb_hold += nb_rx;
+ if (unlikely(cq_idx == cq->ring.desc_count)) {
+ cq_idx = 0;
+ cq->last_color ^= CQ_DESC_COLOR_MASK_NOSHIFT;
+ }
+ cq->to_clean = cq_idx;
+
+ /* Step 4: Restock RQ with new mbufs */
+ memcpy(rxmb, rq->free_mbufs + ENIC_RX_BURST_MAX - rq->num_free_mbufs,
+ sizeof(struct rte_mbuf *) * nb_rx);
+ rq->num_free_mbufs -= nb_rx;
+ while (nb_rx) {
+ rqd->address = (*rxmb)->buf_iova + RTE_PKTMBUF_HEADROOM;
+ nb_rx--;
+ rqd++;
+ rxmb++;
+ }
+ if (rq->rx_nb_hold > rq->rx_free_thresh) {
+ rq->posted_index = enic_ring_add(rq->ring.desc_count,
+ rq->posted_index,
+ rq->rx_nb_hold);
+ rq->rx_nb_hold = 0;
+ rte_wmb();
+ iowrite32_relaxed(rq->posted_index,
+ &rq->ctrl->posted_index);
+ }
+
+ return rx - rx_pkts;
+}
+
+bool
+enic_use_vector_rx_handler(struct enic *enic)
+{
+ struct rte_eth_dev *eth_dev;
+ struct rte_fdir_conf *fconf;
+
+ eth_dev = enic->rte_dev;
+ /* User needs to request the avx2 handler */
+ if (!enic->enable_avx2_rx)
+ return false;
+ /* Do not support scatter Rx */
+ if (!(enic->rq_count > 0 && enic->rq[0].data_queue_enable == 0))
+ return false;
+ /* Do not support fdir/flow */
+ fconf = &eth_dev->data->dev_conf.fdir_conf;
+ if (fconf->mode != RTE_FDIR_MODE_NONE)
+ return false;
+ if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2)) {
+ PMD_INIT_LOG(DEBUG, " use the non-scatter avx2"
+ " Rx handler");
+ eth_dev->rx_pkt_burst = &enic_noscatter_vec_recv_pkts;
+ return true;
+ }
+ return false;
+}
diff --git a/drivers/net/enic/meson.build b/drivers/net/enic/meson.build
index bfd4e2373..5565649c4 100644
--- a/drivers/net/enic/meson.build
+++ b/drivers/net/enic/meson.build
@@ -17,3 +17,8 @@ sources = files(
)
deps += ['hash']
includes += include_directories('base')
+
+# The current implementation assumes 64-bit pointers
+if dpdk_conf.has('RTE_MACHINE_CPUFLAG_AVX2') and cc.sizeof('void *') == 8
+ sources += files('enic_rxtx_vec_avx2.c')
+endif
--
2.16.2
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [dpdk-dev] [PATCH v2 2/2] net/enic: add AVX2 based vectorized Rx handler
2018-09-28 19:25 ` [dpdk-dev] [PATCH v2 2/2] net/enic: add AVX2 based vectorized Rx handler John Daley
@ 2018-10-02 16:08 ` Ferruh Yigit
2018-10-03 13:00 ` Hyong Youb Kim
0 siblings, 1 reply; 11+ messages in thread
From: Ferruh Yigit @ 2018-10-02 16:08 UTC (permalink / raw)
To: John Daley; +Cc: dev, Hyong Youb Kim
On 9/28/2018 8:25 PM, John Daley wrote:
> From: Hyong Youb Kim <hyonkim@cisco.com>
>
> Add the vectorized version of the no-scatter Rx handler. It aims to
> process 8 descriptors per loop using AVX2 SIMD instructions. This
> handler is in its own file enic_rxtx_vec_avx2.c, and makefile and
> meson.build are modified to compile it when the compiler supports
> AVX2. Under ideal conditions, the vectorized handler reduces
> cycles/packet by more than 30%, when compared against the no-scatter
> Rx handler. Most implementation ideas come from i40e's AVX2 based
> handler, so credit goes to its authors.
>
> At this point, the new handler is meant for field trials, and is not
> selected by default. So add a new devarg enable-avx2-rx to allow the
> user to request the use of the new handler. When enable-avx2-rx=1, the
> driver will consider using the new handler.
>
> Also update the guide doc and introduce the vectorized handler.
>
> Signed-off-by: Hyong Youb Kim <hyonkim@cisco.com>
> Reviewed-by: John Daley <johndale@cisco.com>
<...>
> +Vectorized Rx Handler
> +---------------------
> +
> +ENIC PMD includes a version of the receive handler that is vectorized using
> +AVX2 SIMD instructions. It is meant for bulk, throughput oriented workloads
> +where reducing cycles/packet in PMD is a priority. In order to use the
> +vectorized handler, take the following steps.
> +
> +- Use a recent version of gcc, icc, or clang and build 64-bit DPDK. If
> + the compiler is known to support AVX2, DPDK build system
> + automatically compiles the vectorized handler. Otherwise, the
> + handler is not available.
> +
> +- Set ``devargs`` parameter ``enable-avx2-rx=1`` to explicitly request that
> + PMD consider the vectorized handler when selecting the receive handler.
It would be good to show a usage sample, as done for disable-overlay=1, e.g.
-w 12:00.0,enable-avx2-rx=1
> +
> + As the current implementation is intended for field trials, by default, the
> + vectorized handler is not considered (``enable-avx2-rx=0``).
> +
> +- Run on a UCS M4 or later server with CPUs that support AVX2.
> +
> +PMD selects the vectorized handler when the handler is compiled into
> +the driver, the user requests its use via ``enable-avx2-rx=1``, CPU
> +supports AVX2, and scatter Rx is not used. To verify that the
The code doesn't check whether the user requested DEV_RX_OFFLOAD_SCATTER; if so,
perhaps it shouldn't enable the vector path.
> +vectorized handler is selected, enable debug logging
> +(``--log-level=pmd,debug``) and check the following message.
> +
> +.. code-block:: console
> +
> + enic_use_vector_rx_handler use the non-scatter avx2 Rx handler
> +
> .. _enic_limitations:
>
> Limitations
> diff --git a/drivers/net/enic/Makefile b/drivers/net/enic/Makefile
> index 7c6c29cc0..3ec6f9159 100644
> --- a/drivers/net/enic/Makefile
> +++ b/drivers/net/enic/Makefile
> @@ -39,4 +39,11 @@ SRCS-$(CONFIG_RTE_LIBRTE_ENIC_PMD) += base/vnic_intr.c
> SRCS-$(CONFIG_RTE_LIBRTE_ENIC_PMD) += base/vnic_rq.c
> SRCS-$(CONFIG_RTE_LIBRTE_ENIC_PMD) += base/vnic_rss.c
>
> +ifeq ($(findstring RTE_MACHINE_CPUFLAG_AVX2,$(CFLAGS)),RTE_MACHINE_CPUFLAG_AVX2)
> +# The current implementation assumes 64-bit pointers
> +ifeq ($(CONFIG_RTE_ARCH_X86_64),y)
> + SRCS-$(CONFIG_RTE_LIBRTE_ENIC_PMD) += enic_rxtx_vec_avx2.c
> +endif
> +endif
This is not exactly enough.
CFLAGS has MACHINE_CPUFLAG based on the parameters passed to the compiler. This flag
is what the compiler supports.
For example, if DPDK is built for the "default" architecture, AVX2 won't be supported and
your vector path won't be built at all. Please check the i40e Makefile.
> +
> include $(RTE_SDK)/mk/rte.lib.mk
> diff --git a/drivers/net/enic/enic.h b/drivers/net/enic/enic.h
> index 775cd5d55..665f5668a 100644
> --- a/drivers/net/enic/enic.h
> +++ b/drivers/net/enic/enic.h
> @@ -106,6 +106,7 @@ struct enic {
> struct vnic_dev_bar bar0;
> struct vnic_dev *vdev;
>
> + uint64_t mbuf_initializer;
A comment can be good.
<...>
> @@ -535,6 +552,21 @@ int enic_enable(struct enic *enic)
> int err;
> struct rte_eth_dev *eth_dev = enic->rte_dev;
> uint64_t simple_tx_offloads;
> + uintptr_t p;
> + struct rte_mbuf mb_def = { .buf_addr = 0 };
> +
> + /*
> + * mbuf_initializer contains const-after-init fields of
> + * receive mbufs (i.e. 64 bits of fields from rearm_data).
> + * It is currently used by the vectorized handler.
> + */
> + mb_def.nb_segs = 1;
> + mb_def.data_off = RTE_PKTMBUF_HEADROOM;
> + mb_def.port = enic->port_id;
> + rte_mbuf_refcnt_set(&mb_def, 1);
> + rte_compiler_barrier();
> + p = (uintptr_t)&mb_def.rearm_data;
> + enic->mbuf_initializer = *(uint64_t *)p;
What do you think about wrapping this block with an "enic->enable_avx2_rx" check, mostly
to show the usage intention?
<...>
> + if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2)) {
> + PMD_INIT_LOG(DEBUG, " use the non-scatter avx2"
> + " Rx handler");
No need to split the log line.
* Re: [dpdk-dev] [PATCH v2 2/2] net/enic: add AVX2 based vectorized Rx handler
2018-10-02 16:08 ` Ferruh Yigit
@ 2018-10-03 13:00 ` Hyong Youb Kim
0 siblings, 0 replies; 11+ messages in thread
From: Hyong Youb Kim @ 2018-10-03 13:00 UTC (permalink / raw)
To: Ferruh Yigit; +Cc: John Daley, dev
On Tue, Oct 02, 2018 at 05:08:21PM +0100, Ferruh Yigit wrote:
> On 9/28/2018 8:25 PM, John Daley wrote:
> > From: Hyong Youb Kim <hyonkim@cisco.com>
> >
> > Add the vectorized version of the no-scatter Rx handler. It aims to
> > process 8 descriptors per loop using AVX2 SIMD instructions. This
> > handler is in its own file enic_rxtx_vec_avx2.c, and makefile and
> > meson.build are modified to compile it when the compiler supports
> > AVX2. Under ideal conditions, the vectorized handler reduces
> > cycles/packet by more than 30%, when compared against the no-scatter
> > Rx handler. Most implementation ideas come from i40e's AVX2 based
> > handler, so credit goes to its authors.
> >
> > At this point, the new handler is meant for field trials, and is not
> > selected by default. So add a new devarg enable-avx2-rx to allow the
> > user to request the use of the new handler. When enable-avx2-rx=1, the
> > driver will consider using the new handler.
> >
> > Also update the guide doc and introduce the vectorized handler.
> >
> > Signed-off-by: Hyong Youb Kim <hyonkim@cisco.com>
> > Reviewed-by: John Daley <johndale@cisco.com>
>
> <...>
>
[...]
> > +- Set ``devargs`` parameter ``enable-avx2-rx=1`` to explicitly request that
> > + PMD consider the vectorized handler when selecting the receive handler.
>
> It would be good to show a usage sample, as done for disable-overlay=1, e.g.
> -w 12:00.0,enable-avx2-rx=1
>
Fixed.
> > +
> > + As the current implementation is intended for field trials, by default, the
> > + vectorized handler is not considered (``enable-avx2-rx=0``).
> > +
> > +- Run on a UCS M4 or later server with CPUs that support AVX2.
> > +
> > +PMD selects the vectorized handler when the handler is compiled into
> > +the driver, the user requests its use via ``enable-avx2-rx=1``, CPU
> > +supports AVX2, and scatter Rx is not used. To verify that the
>
> The code doesn't check whether the user requested DEV_RX_OFFLOAD_SCATTER; if so,
> perhaps it shouldn't enable the vector path.
>
The code below actually checks if scatter is used and selects the avx2
handler only if scatter is not used.
bool
enic_use_vector_rx_handler(struct enic *enic)
{
[...]
/* Do not support scatter Rx */
if (!(enic->rq_count > 0 && enic->rq[0].data_queue_enable == 0))
return false;
For enic, data_queue_enable == 0 implies that scatter Rx is not
used. It covers two cases. First, scatter Rx offload
(DEV_RX_OFFLOAD_SCATTER) is not requested. Second, scatter offload is
requested (flag is set), but the scatter feature on the NIC is not
needed and hence not enabled, because the maximum receive packet
(max_rx_pkt_len) fits in one mbuf.
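As a minimal sketch of the same decision (the names below are illustrative
only, not the driver's actual fields):

#include <stdbool.h>
#include <stdint.h>

/*
 * Illustrative sketch, not code from the patch: the data queue (and thus
 * the NIC scatter feature) is needed only when the application requests
 * scatter Rx AND the largest receive packet does not fit in one mbuf.
 * Otherwise data_queue_enable stays 0 and the avx2 handler remains
 * eligible.
 */
static bool
rq_data_queue_needed(bool scatter_requested, uint32_t max_rx_pkt_len,
		     uint16_t mbuf_data_room)
{
	/* Case 1: DEV_RX_OFFLOAD_SCATTER is not requested */
	if (!scatter_requested)
		return false;
	/* Case 2: requested, but every packet fits in a single mbuf */
	return max_rx_pkt_len > mbuf_data_room;
}

Checking rq[0].data_queue_enable == 0 therefore covers both cases with a
single test.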
> > --- a/drivers/net/enic/Makefile
> > +++ b/drivers/net/enic/Makefile
> > @@ -39,4 +39,11 @@ SRCS-$(CONFIG_RTE_LIBRTE_ENIC_PMD) += base/vnic_intr.c
> > SRCS-$(CONFIG_RTE_LIBRTE_ENIC_PMD) += base/vnic_rq.c
> > SRCS-$(CONFIG_RTE_LIBRTE_ENIC_PMD) += base/vnic_rss.c
> >
> > +ifeq ($(findstring RTE_MACHINE_CPUFLAG_AVX2,$(CFLAGS)),RTE_MACHINE_CPUFLAG_AVX2)
> > +# The current implementation assumes 64-bit pointers
> > +ifeq ($(CONFIG_RTE_ARCH_X86_64),y)
> > + SRCS-$(CONFIG_RTE_LIBRTE_ENIC_PMD) += enic_rxtx_vec_avx2.c
> > +endif
> > +endif
>
> This is not exactly enough.
>
> CFLAGS has MACHINE_CPUFLAG based on the parameters passed to the compiler. This flag
> is what the compiler supports.
>
> For example, if DPDK is built for the "default" architecture, AVX2 won't be supported and
> your vector path won't be built at all. Please check the i40e Makefile.
>
Ah, thanks for the explanation. Fixed.
> > +
> > include $(RTE_SDK)/mk/rte.lib.mk
> > diff --git a/drivers/net/enic/enic.h b/drivers/net/enic/enic.h
> > index 775cd5d55..665f5668a 100644
> > --- a/drivers/net/enic/enic.h
> > +++ b/drivers/net/enic/enic.h
> > @@ -106,6 +106,7 @@ struct enic {
> > struct vnic_dev_bar bar0;
> > struct vnic_dev *vdev;
> >
> > + uint64_t mbuf_initializer;
>
> A comment can be good.
>
Added a comment.
> > + /*
> > + * mbuf_initializer contains const-after-init fields of
> > + * receive mbufs (i.e. 64 bits of fields from rearm_data).
> > + * It is currently used by the vectorized handler.
> > + */
> > + mb_def.nb_segs = 1;
> > + mb_def.data_off = RTE_PKTMBUF_HEADROOM;
> > + mb_def.port = enic->port_id;
> > + rte_mbuf_refcnt_set(&mb_def, 1);
> > + rte_compiler_barrier();
> > + p = (uintptr_t)&mb_def.rearm_data;
> > + enic->mbuf_initializer = *(uint64_t *)p;
>
> What do you think about wrapping this block with an "enic->enable_avx2_rx" check, mostly
> to show the usage intention?
>
Makes sense. Moved the block inside if.
> > + if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2)) {
> > + PMD_INIT_LOG(DEBUG, " use the non-scatter avx2"
> > + " Rx handler");
>
> No need to split the log line.
Fixed.
Thanks for the review. John will review the new patch and submit v4.
-Hyong
* [dpdk-dev] [PATCH v4 1/2] net/enic: move common Rx functions to a new header file
2018-09-28 19:25 ` [dpdk-dev] [PATCH v2 1/2] net/enic: move common Rx functions to a new header file John Daley
2018-09-28 19:25 ` [dpdk-dev] [PATCH v2 2/2] net/enic: add AVX2 based vectorized Rx handler John Daley
@ 2018-10-03 20:09 ` John Daley
2018-10-03 20:09 ` [dpdk-dev] [PATCH v4 2/2] net/enic: add AVX2 based vectorized Rx handler John Daley
2018-10-04 16:15 ` [dpdk-dev] [PATCH v4 1/2] net/enic: move common Rx functions to a new header file Ferruh Yigit
1 sibling, 2 replies; 11+ messages in thread
From: John Daley @ 2018-10-03 20:09 UTC (permalink / raw)
To: ferruh.yigit; +Cc: dev, Hyong Youb Kim
From: Hyong Youb Kim <hyonkim@cisco.com>
Move a number of Rx functions to the header file so that the avx2
based Rx handler can use them.
Signed-off-by: Hyong Youb Kim <hyonkim@cisco.com>
Reviewed-by: John Daley <johndale@cisco.com>
---
v2: remove bool type from structure (found by checkpatch)
v3: re-add Reviewed-by
v4: Address Ferruh's comments regarding doc, comment, and log message.
Fix makefile and meson.build to compile the avx2 handler when 'machine'
does not support avx2 but the compiler does
drivers/net/enic/enic_rxtx.c | 263 +---------------------------------
drivers/net/enic/enic_rxtx_common.h | 271 ++++++++++++++++++++++++++++++++++++
2 files changed, 272 insertions(+), 262 deletions(-)
create mode 100644 drivers/net/enic/enic_rxtx_common.h
diff --git a/drivers/net/enic/enic_rxtx.c b/drivers/net/enic/enic_rxtx.c
index 276a2e559..5189ee635 100644
--- a/drivers/net/enic/enic_rxtx.c
+++ b/drivers/net/enic/enic_rxtx.c
@@ -11,6 +11,7 @@
#include "enic_compat.h"
#include "rq_enet_desc.h"
#include "enic.h"
+#include "enic_rxtx_common.h"
#include <rte_ether.h>
#include <rte_ip.h>
#include <rte_tcp.h>
@@ -30,268 +31,6 @@
#define rte_packet_prefetch(p) do {} while (0)
#endif
-static inline uint16_t
-enic_cq_rx_desc_ciflags(struct cq_enet_rq_desc *crd)
-{
- return le16_to_cpu(crd->completed_index_flags) & ~CQ_DESC_COMP_NDX_MASK;
-}
-
-static inline uint16_t
-enic_cq_rx_desc_bwflags(struct cq_enet_rq_desc *crd)
-{
- return le16_to_cpu(crd->bytes_written_flags) &
- ~CQ_ENET_RQ_DESC_BYTES_WRITTEN_MASK;
-}
-
-static inline uint8_t
-enic_cq_rx_desc_packet_error(uint16_t bwflags)
-{
- return (bwflags & CQ_ENET_RQ_DESC_FLAGS_TRUNCATED) ==
- CQ_ENET_RQ_DESC_FLAGS_TRUNCATED;
-}
-
-static inline uint8_t
-enic_cq_rx_desc_eop(uint16_t ciflags)
-{
- return (ciflags & CQ_ENET_RQ_DESC_FLAGS_EOP)
- == CQ_ENET_RQ_DESC_FLAGS_EOP;
-}
-
-static inline uint8_t
-enic_cq_rx_desc_csum_not_calc(struct cq_enet_rq_desc *cqrd)
-{
- return (le16_to_cpu(cqrd->q_number_rss_type_flags) &
- CQ_ENET_RQ_DESC_FLAGS_CSUM_NOT_CALC) ==
- CQ_ENET_RQ_DESC_FLAGS_CSUM_NOT_CALC;
-}
-
-static inline uint8_t
-enic_cq_rx_desc_ipv4_csum_ok(struct cq_enet_rq_desc *cqrd)
-{
- return (cqrd->flags & CQ_ENET_RQ_DESC_FLAGS_IPV4_CSUM_OK) ==
- CQ_ENET_RQ_DESC_FLAGS_IPV4_CSUM_OK;
-}
-
-static inline uint8_t
-enic_cq_rx_desc_tcp_udp_csum_ok(struct cq_enet_rq_desc *cqrd)
-{
- return (cqrd->flags & CQ_ENET_RQ_DESC_FLAGS_TCP_UDP_CSUM_OK) ==
- CQ_ENET_RQ_DESC_FLAGS_TCP_UDP_CSUM_OK;
-}
-
-static inline uint8_t
-enic_cq_rx_desc_rss_type(struct cq_enet_rq_desc *cqrd)
-{
- return (uint8_t)((le16_to_cpu(cqrd->q_number_rss_type_flags) >>
- CQ_DESC_Q_NUM_BITS) & CQ_ENET_RQ_DESC_RSS_TYPE_MASK);
-}
-
-static inline uint32_t
-enic_cq_rx_desc_rss_hash(struct cq_enet_rq_desc *cqrd)
-{
- return le32_to_cpu(cqrd->rss_hash);
-}
-
-static inline uint16_t
-enic_cq_rx_desc_vlan(struct cq_enet_rq_desc *cqrd)
-{
- return le16_to_cpu(cqrd->vlan);
-}
-
-static inline uint16_t
-enic_cq_rx_desc_n_bytes(struct cq_desc *cqd)
-{
- struct cq_enet_rq_desc *cqrd = (struct cq_enet_rq_desc *)cqd;
- return le16_to_cpu(cqrd->bytes_written_flags) &
- CQ_ENET_RQ_DESC_BYTES_WRITTEN_MASK;
-}
-
-
-static inline uint8_t
-enic_cq_rx_check_err(struct cq_desc *cqd)
-{
- struct cq_enet_rq_desc *cqrd = (struct cq_enet_rq_desc *)cqd;
- uint16_t bwflags;
-
- bwflags = enic_cq_rx_desc_bwflags(cqrd);
- if (unlikely(enic_cq_rx_desc_packet_error(bwflags)))
- return 1;
- return 0;
-}
-
-/* Lookup table to translate RX CQ flags to mbuf flags. */
-static inline uint32_t
-enic_cq_rx_flags_to_pkt_type(struct cq_desc *cqd, uint8_t tnl)
-{
- struct cq_enet_rq_desc *cqrd = (struct cq_enet_rq_desc *)cqd;
- uint8_t cqrd_flags = cqrd->flags;
- /*
- * Odd-numbered entries are for tunnel packets. All packet type info
- * applies to the inner packet, and there is no info on the outer
- * packet. The outer flags in these entries exist only to avoid
- * changing enic_cq_rx_to_pkt_flags(). They are cleared from mbuf
- * afterwards.
- *
- * Also, as there is no tunnel type info (VXLAN, NVGRE, or GENEVE), set
- * RTE_PTYPE_TUNNEL_GRENAT..
- */
- static const uint32_t cq_type_table[128] __rte_cache_aligned = {
- [0x00] = RTE_PTYPE_UNKNOWN,
- [0x01] = RTE_PTYPE_UNKNOWN |
- RTE_PTYPE_TUNNEL_GRENAT |
- RTE_PTYPE_INNER_L2_ETHER,
- [0x20] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_NONFRAG,
- [0x21] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_NONFRAG |
- RTE_PTYPE_TUNNEL_GRENAT |
- RTE_PTYPE_INNER_L2_ETHER |
- RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L4_NONFRAG,
- [0x22] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_UDP,
- [0x23] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_UDP |
- RTE_PTYPE_TUNNEL_GRENAT |
- RTE_PTYPE_INNER_L2_ETHER |
- RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L4_UDP,
- [0x24] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_TCP,
- [0x25] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_TCP |
- RTE_PTYPE_TUNNEL_GRENAT |
- RTE_PTYPE_INNER_L2_ETHER |
- RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L4_TCP,
- [0x60] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG,
- [0x61] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG |
- RTE_PTYPE_TUNNEL_GRENAT |
- RTE_PTYPE_INNER_L2_ETHER |
- RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L4_FRAG,
- [0x62] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG,
- [0x63] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG |
- RTE_PTYPE_TUNNEL_GRENAT |
- RTE_PTYPE_INNER_L2_ETHER |
- RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L4_FRAG,
- [0x64] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG,
- [0x65] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG |
- RTE_PTYPE_TUNNEL_GRENAT |
- RTE_PTYPE_INNER_L2_ETHER |
- RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L4_FRAG,
- [0x10] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_NONFRAG,
- [0x11] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_NONFRAG |
- RTE_PTYPE_TUNNEL_GRENAT |
- RTE_PTYPE_INNER_L2_ETHER |
- RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L4_NONFRAG,
- [0x12] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_UDP,
- [0x13] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_UDP |
- RTE_PTYPE_TUNNEL_GRENAT |
- RTE_PTYPE_INNER_L2_ETHER |
- RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L4_UDP,
- [0x14] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_TCP,
- [0x15] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_TCP |
- RTE_PTYPE_TUNNEL_GRENAT |
- RTE_PTYPE_INNER_L2_ETHER |
- RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L4_TCP,
- [0x50] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG,
- [0x51] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG |
- RTE_PTYPE_TUNNEL_GRENAT |
- RTE_PTYPE_INNER_L2_ETHER |
- RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L4_FRAG,
- [0x52] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG,
- [0x53] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG |
- RTE_PTYPE_TUNNEL_GRENAT |
- RTE_PTYPE_INNER_L2_ETHER |
- RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L4_FRAG,
- [0x54] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG,
- [0x55] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG |
- RTE_PTYPE_TUNNEL_GRENAT |
- RTE_PTYPE_INNER_L2_ETHER |
- RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L4_FRAG,
- /* All others reserved */
- };
- cqrd_flags &= CQ_ENET_RQ_DESC_FLAGS_IPV4_FRAGMENT
- | CQ_ENET_RQ_DESC_FLAGS_IPV4 | CQ_ENET_RQ_DESC_FLAGS_IPV6
- | CQ_ENET_RQ_DESC_FLAGS_TCP | CQ_ENET_RQ_DESC_FLAGS_UDP;
- return cq_type_table[cqrd_flags + tnl];
-}
-
-static inline void
-enic_cq_rx_to_pkt_flags(struct cq_desc *cqd, struct rte_mbuf *mbuf)
-{
- struct cq_enet_rq_desc *cqrd = (struct cq_enet_rq_desc *)cqd;
- uint16_t bwflags, pkt_flags = 0, vlan_tci;
- bwflags = enic_cq_rx_desc_bwflags(cqrd);
- vlan_tci = enic_cq_rx_desc_vlan(cqrd);
-
- /* VLAN STRIPPED flag. The L2 packet type updated here also */
- if (bwflags & CQ_ENET_RQ_DESC_FLAGS_VLAN_STRIPPED) {
- pkt_flags |= PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED;
- mbuf->packet_type |= RTE_PTYPE_L2_ETHER;
- } else {
- if (vlan_tci != 0) {
- pkt_flags |= PKT_RX_VLAN;
- mbuf->packet_type |= RTE_PTYPE_L2_ETHER_VLAN;
- } else {
- mbuf->packet_type |= RTE_PTYPE_L2_ETHER;
- }
- }
- mbuf->vlan_tci = vlan_tci;
-
- if ((cqd->type_color & CQ_DESC_TYPE_MASK) == CQ_DESC_TYPE_CLASSIFIER) {
- struct cq_enet_rq_clsf_desc *clsf_cqd;
- uint16_t filter_id;
- clsf_cqd = (struct cq_enet_rq_clsf_desc *)cqd;
- filter_id = clsf_cqd->filter_id;
- if (filter_id) {
- pkt_flags |= PKT_RX_FDIR;
- if (filter_id != ENIC_MAGIC_FILTER_ID) {
- mbuf->hash.fdir.hi = clsf_cqd->filter_id;
- pkt_flags |= PKT_RX_FDIR_ID;
- }
- }
- } else if (enic_cq_rx_desc_rss_type(cqrd)) {
- /* RSS flag */
- pkt_flags |= PKT_RX_RSS_HASH;
- mbuf->hash.rss = enic_cq_rx_desc_rss_hash(cqrd);
- }
-
- /* checksum flags */
- if (mbuf->packet_type & (RTE_PTYPE_L3_IPV4 | RTE_PTYPE_L3_IPV6)) {
- if (!enic_cq_rx_desc_csum_not_calc(cqrd)) {
- uint32_t l4_flags;
- l4_flags = mbuf->packet_type & RTE_PTYPE_L4_MASK;
-
- /*
- * When overlay offload is enabled, the NIC may
- * set ipv4_csum_ok=1 if the inner packet is IPv6..
- * So, explicitly check for IPv4 before checking
- * ipv4_csum_ok.
- */
- if (mbuf->packet_type & RTE_PTYPE_L3_IPV4) {
- if (enic_cq_rx_desc_ipv4_csum_ok(cqrd))
- pkt_flags |= PKT_RX_IP_CKSUM_GOOD;
- else
- pkt_flags |= PKT_RX_IP_CKSUM_BAD;
- }
-
- if (l4_flags == RTE_PTYPE_L4_UDP ||
- l4_flags == RTE_PTYPE_L4_TCP) {
- if (enic_cq_rx_desc_tcp_udp_csum_ok(cqrd))
- pkt_flags |= PKT_RX_L4_CKSUM_GOOD;
- else
- pkt_flags |= PKT_RX_L4_CKSUM_BAD;
- }
- }
- }
-
- mbuf->ol_flags = pkt_flags;
-}
-
/* dummy receive function to replace actual function in
* order to do safe reconfiguration operations.
*/
diff --git a/drivers/net/enic/enic_rxtx_common.h b/drivers/net/enic/enic_rxtx_common.h
new file mode 100644
index 000000000..bfbb4909e
--- /dev/null
+++ b/drivers/net/enic/enic_rxtx_common.h
@@ -0,0 +1,271 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2008-2018 Cisco Systems, Inc. All rights reserved.
+ * Copyright 2007 Nuova Systems, Inc. All rights reserved.
+ */
+
+#ifndef _ENIC_RXTX_COMMON_H_
+#define _ENIC_RXTX_COMMON_H_
+
+static inline uint16_t
+enic_cq_rx_desc_ciflags(struct cq_enet_rq_desc *crd)
+{
+ return le16_to_cpu(crd->completed_index_flags) & ~CQ_DESC_COMP_NDX_MASK;
+}
+
+static inline uint16_t
+enic_cq_rx_desc_bwflags(struct cq_enet_rq_desc *crd)
+{
+ return le16_to_cpu(crd->bytes_written_flags) &
+ ~CQ_ENET_RQ_DESC_BYTES_WRITTEN_MASK;
+}
+
+static inline uint8_t
+enic_cq_rx_desc_packet_error(uint16_t bwflags)
+{
+ return (bwflags & CQ_ENET_RQ_DESC_FLAGS_TRUNCATED) ==
+ CQ_ENET_RQ_DESC_FLAGS_TRUNCATED;
+}
+
+static inline uint8_t
+enic_cq_rx_desc_eop(uint16_t ciflags)
+{
+ return (ciflags & CQ_ENET_RQ_DESC_FLAGS_EOP)
+ == CQ_ENET_RQ_DESC_FLAGS_EOP;
+}
+
+static inline uint8_t
+enic_cq_rx_desc_csum_not_calc(struct cq_enet_rq_desc *cqrd)
+{
+ return (le16_to_cpu(cqrd->q_number_rss_type_flags) &
+ CQ_ENET_RQ_DESC_FLAGS_CSUM_NOT_CALC) ==
+ CQ_ENET_RQ_DESC_FLAGS_CSUM_NOT_CALC;
+}
+
+static inline uint8_t
+enic_cq_rx_desc_ipv4_csum_ok(struct cq_enet_rq_desc *cqrd)
+{
+ return (cqrd->flags & CQ_ENET_RQ_DESC_FLAGS_IPV4_CSUM_OK) ==
+ CQ_ENET_RQ_DESC_FLAGS_IPV4_CSUM_OK;
+}
+
+static inline uint8_t
+enic_cq_rx_desc_tcp_udp_csum_ok(struct cq_enet_rq_desc *cqrd)
+{
+ return (cqrd->flags & CQ_ENET_RQ_DESC_FLAGS_TCP_UDP_CSUM_OK) ==
+ CQ_ENET_RQ_DESC_FLAGS_TCP_UDP_CSUM_OK;
+}
+
+static inline uint8_t
+enic_cq_rx_desc_rss_type(struct cq_enet_rq_desc *cqrd)
+{
+ return (uint8_t)((le16_to_cpu(cqrd->q_number_rss_type_flags) >>
+ CQ_DESC_Q_NUM_BITS) & CQ_ENET_RQ_DESC_RSS_TYPE_MASK);
+}
+
+static inline uint32_t
+enic_cq_rx_desc_rss_hash(struct cq_enet_rq_desc *cqrd)
+{
+ return le32_to_cpu(cqrd->rss_hash);
+}
+
+static inline uint16_t
+enic_cq_rx_desc_vlan(struct cq_enet_rq_desc *cqrd)
+{
+ return le16_to_cpu(cqrd->vlan);
+}
+
+static inline uint16_t
+enic_cq_rx_desc_n_bytes(struct cq_desc *cqd)
+{
+ struct cq_enet_rq_desc *cqrd = (struct cq_enet_rq_desc *)cqd;
+ return le16_to_cpu(cqrd->bytes_written_flags) &
+ CQ_ENET_RQ_DESC_BYTES_WRITTEN_MASK;
+}
+
+
+static inline uint8_t
+enic_cq_rx_check_err(struct cq_desc *cqd)
+{
+ struct cq_enet_rq_desc *cqrd = (struct cq_enet_rq_desc *)cqd;
+ uint16_t bwflags;
+
+ bwflags = enic_cq_rx_desc_bwflags(cqrd);
+ if (unlikely(enic_cq_rx_desc_packet_error(bwflags)))
+ return 1;
+ return 0;
+}
+
+/* Lookup table to translate RX CQ flags to mbuf flags. */
+static uint32_t
+enic_cq_rx_flags_to_pkt_type(struct cq_desc *cqd, uint8_t tnl)
+{
+ struct cq_enet_rq_desc *cqrd = (struct cq_enet_rq_desc *)cqd;
+ uint8_t cqrd_flags = cqrd->flags;
+ /*
+ * Odd-numbered entries are for tunnel packets. All packet type info
+ * applies to the inner packet, and there is no info on the outer
+ * packet. The outer flags in these entries exist only to avoid
+ * changing enic_cq_rx_to_pkt_flags(). They are cleared from mbuf
+ * afterwards.
+ *
+ * Also, as there is no tunnel type info (VXLAN, NVGRE, or GENEVE), set
+ * RTE_PTYPE_TUNNEL_GRENAT..
+ */
+ static const uint32_t cq_type_table[128] __rte_cache_aligned = {
+ [0x00] = RTE_PTYPE_UNKNOWN,
+ [0x01] = RTE_PTYPE_UNKNOWN |
+ RTE_PTYPE_TUNNEL_GRENAT |
+ RTE_PTYPE_INNER_L2_ETHER,
+ [0x20] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_NONFRAG,
+ [0x21] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_NONFRAG |
+ RTE_PTYPE_TUNNEL_GRENAT |
+ RTE_PTYPE_INNER_L2_ETHER |
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_NONFRAG,
+ [0x22] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_UDP,
+ [0x23] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_UDP |
+ RTE_PTYPE_TUNNEL_GRENAT |
+ RTE_PTYPE_INNER_L2_ETHER |
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_UDP,
+ [0x24] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_TCP,
+ [0x25] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_TCP |
+ RTE_PTYPE_TUNNEL_GRENAT |
+ RTE_PTYPE_INNER_L2_ETHER |
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_TCP,
+ [0x60] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG,
+ [0x61] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG |
+ RTE_PTYPE_TUNNEL_GRENAT |
+ RTE_PTYPE_INNER_L2_ETHER |
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_FRAG,
+ [0x62] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG,
+ [0x63] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG |
+ RTE_PTYPE_TUNNEL_GRENAT |
+ RTE_PTYPE_INNER_L2_ETHER |
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_FRAG,
+ [0x64] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG,
+ [0x65] = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG |
+ RTE_PTYPE_TUNNEL_GRENAT |
+ RTE_PTYPE_INNER_L2_ETHER |
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_FRAG,
+ [0x10] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_NONFRAG,
+ [0x11] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_NONFRAG |
+ RTE_PTYPE_TUNNEL_GRENAT |
+ RTE_PTYPE_INNER_L2_ETHER |
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_NONFRAG,
+ [0x12] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_UDP,
+ [0x13] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_UDP |
+ RTE_PTYPE_TUNNEL_GRENAT |
+ RTE_PTYPE_INNER_L2_ETHER |
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_UDP,
+ [0x14] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_TCP,
+ [0x15] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_TCP |
+ RTE_PTYPE_TUNNEL_GRENAT |
+ RTE_PTYPE_INNER_L2_ETHER |
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_TCP,
+ [0x50] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG,
+ [0x51] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG |
+ RTE_PTYPE_TUNNEL_GRENAT |
+ RTE_PTYPE_INNER_L2_ETHER |
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_FRAG,
+ [0x52] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG,
+ [0x53] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG |
+ RTE_PTYPE_TUNNEL_GRENAT |
+ RTE_PTYPE_INNER_L2_ETHER |
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_FRAG,
+ [0x54] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG,
+ [0x55] = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG |
+ RTE_PTYPE_TUNNEL_GRENAT |
+ RTE_PTYPE_INNER_L2_ETHER |
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_FRAG,
+ /* All others reserved */
+ };
+ cqrd_flags &= CQ_ENET_RQ_DESC_FLAGS_IPV4_FRAGMENT
+ | CQ_ENET_RQ_DESC_FLAGS_IPV4 | CQ_ENET_RQ_DESC_FLAGS_IPV6
+ | CQ_ENET_RQ_DESC_FLAGS_TCP | CQ_ENET_RQ_DESC_FLAGS_UDP;
+ return cq_type_table[cqrd_flags + tnl];
+}
+
+static void
+enic_cq_rx_to_pkt_flags(struct cq_desc *cqd, struct rte_mbuf *mbuf)
+{
+ struct cq_enet_rq_desc *cqrd = (struct cq_enet_rq_desc *)cqd;
+ uint16_t bwflags, pkt_flags = 0, vlan_tci;
+ bwflags = enic_cq_rx_desc_bwflags(cqrd);
+ vlan_tci = enic_cq_rx_desc_vlan(cqrd);
+
+ /* VLAN STRIPPED flag. The L2 packet type updated here also */
+ if (bwflags & CQ_ENET_RQ_DESC_FLAGS_VLAN_STRIPPED) {
+ pkt_flags |= PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED;
+ mbuf->packet_type |= RTE_PTYPE_L2_ETHER;
+ } else {
+ if (vlan_tci != 0) {
+ pkt_flags |= PKT_RX_VLAN;
+ mbuf->packet_type |= RTE_PTYPE_L2_ETHER_VLAN;
+ } else {
+ mbuf->packet_type |= RTE_PTYPE_L2_ETHER;
+ }
+ }
+ mbuf->vlan_tci = vlan_tci;
+
+ if ((cqd->type_color & CQ_DESC_TYPE_MASK) == CQ_DESC_TYPE_CLASSIFIER) {
+ struct cq_enet_rq_clsf_desc *clsf_cqd;
+ uint16_t filter_id;
+ clsf_cqd = (struct cq_enet_rq_clsf_desc *)cqd;
+ filter_id = clsf_cqd->filter_id;
+ if (filter_id) {
+ pkt_flags |= PKT_RX_FDIR;
+ if (filter_id != ENIC_MAGIC_FILTER_ID) {
+ mbuf->hash.fdir.hi = clsf_cqd->filter_id;
+ pkt_flags |= PKT_RX_FDIR_ID;
+ }
+ }
+ } else if (enic_cq_rx_desc_rss_type(cqrd)) {
+ /* RSS flag */
+ pkt_flags |= PKT_RX_RSS_HASH;
+ mbuf->hash.rss = enic_cq_rx_desc_rss_hash(cqrd);
+ }
+
+ /* checksum flags */
+ if (mbuf->packet_type & (RTE_PTYPE_L3_IPV4 | RTE_PTYPE_L3_IPV6)) {
+ if (!enic_cq_rx_desc_csum_not_calc(cqrd)) {
+ uint32_t l4_flags;
+ l4_flags = mbuf->packet_type & RTE_PTYPE_L4_MASK;
+
+ /*
+ * When overlay offload is enabled, the NIC may
+ * set ipv4_csum_ok=1 if the inner packet is IPv6..
+ * So, explicitly check for IPv4 before checking
+ * ipv4_csum_ok.
+ */
+ if (mbuf->packet_type & RTE_PTYPE_L3_IPV4) {
+ if (enic_cq_rx_desc_ipv4_csum_ok(cqrd))
+ pkt_flags |= PKT_RX_IP_CKSUM_GOOD;
+ else
+ pkt_flags |= PKT_RX_IP_CKSUM_BAD;
+ }
+
+ if (l4_flags == RTE_PTYPE_L4_UDP ||
+ l4_flags == RTE_PTYPE_L4_TCP) {
+ if (enic_cq_rx_desc_tcp_udp_csum_ok(cqrd))
+ pkt_flags |= PKT_RX_L4_CKSUM_GOOD;
+ else
+ pkt_flags |= PKT_RX_L4_CKSUM_BAD;
+ }
+ }
+ }
+
+ mbuf->ol_flags = pkt_flags;
+}
+
+#endif /* _ENIC_RXTX_COMMON_H_ */
--
2.16.2
* [dpdk-dev] [PATCH v4 2/2] net/enic: add AVX2 based vectorized Rx handler
2018-10-03 20:09 ` [dpdk-dev] [PATCH v4 1/2] net/enic: move common Rx functions to a new header file John Daley
@ 2018-10-03 20:09 ` John Daley
2018-10-04 16:15 ` [dpdk-dev] [PATCH v4 1/2] net/enic: move common Rx functions to a new header file Ferruh Yigit
1 sibling, 0 replies; 11+ messages in thread
From: John Daley @ 2018-10-03 20:09 UTC (permalink / raw)
To: ferruh.yigit; +Cc: dev, Hyong Youb Kim
From: Hyong Youb Kim <hyonkim@cisco.com>
Add the vectorized version of the no-scatter Rx handler. It aims to
process 8 descriptors per loop using AVX2 SIMD instructions. This
handler is in its own file enic_rxtx_vec_avx2.c, and makefile and
meson.build are modified to compile it when the compiler supports
AVX2. Under ideal conditions, the vectorized handler reduces
cycles/packet by more than 30%, when compared against the no-scatter
Rx handler. Most implementation ideas come from i40e's AVX2 based
handler, so credit goes to its authors.
At this point, the new handler is meant for field trials, and is not
selected by default. So add a new devarg enable-avx2-rx to allow the
user to request the use of the new handler. When enable-avx2-rx=1, the
driver will consider using the new handler.
Also update the guide doc and introduce the vectorized handler.
Signed-off-by: Hyong Youb Kim <hyonkim@cisco.com>
Reviewed-by: John Daley <johndale@cisco.com>
---
v2: remove bool type from structure (found by checkpatch)
v3: re-add Reviewed-by
v4: Address Ferruh's comments regarding doc, comment, and log message.
Fix makefile and meson.build to compile the avx2 handler when 'machine'
does not support avx2 but the compiler does
doc/guides/nics/enic.rst | 35 ++
drivers/net/enic/Makefile | 28 ++
drivers/net/enic/enic.h | 7 +
drivers/net/enic/enic_ethdev.c | 27 +-
drivers/net/enic/enic_main.c | 37 +-
drivers/net/enic/enic_rxtx_vec_avx2.c | 831 ++++++++++++++++++++++++++++++++++
drivers/net/enic/meson.build | 16 +
7 files changed, 972 insertions(+), 9 deletions(-)
create mode 100644 drivers/net/enic/enic_rxtx_vec_avx2.c
diff --git a/doc/guides/nics/enic.rst b/doc/guides/nics/enic.rst
index 1f2951ea9..623b26247 100644
--- a/doc/guides/nics/enic.rst
+++ b/doc/guides/nics/enic.rst
@@ -351,6 +351,41 @@ suitable for others. Such applications may change the mode by setting
applications such as OVS-DPDK performance benchmarks that utilize
only the default VLAN and want to see only untagged packets.
+
+Vectorized Rx Handler
+---------------------
+
+ENIC PMD includes a version of the receive handler that is vectorized using
+AVX2 SIMD instructions. It is meant for bulk, throughput oriented workloads
+where reducing cycles/packet in PMD is a priority. In order to use the
+vectorized handler, take the following steps.
+
+- Use a recent version of gcc, icc, or clang and build 64-bit DPDK. If
+ the compiler is known to support AVX2, DPDK build system
+ automatically compiles the vectorized handler. Otherwise, the
+ handler is not available.
+
+- Set ``devargs`` parameter ``enable-avx2-rx=1`` to explicitly request that
+ PMD consider the vectorized handler when selecting the receive handler.
+ For example::
+
+ -w 12:00.0,enable-avx2-rx=1
+
+ As the current implementation is intended for field trials, by default, the
+ vectorized handler is not considered (``enable-avx2-rx=0``).
+
+- Run on a UCS M4 or later server with CPUs that support AVX2.
+
+PMD selects the vectorized handler when the handler is compiled into
+the driver, the user requests its use via ``enable-avx2-rx=1``, CPU
+supports AVX2, and scatter Rx is not used. To verify that the
+vectorized handler is selected, enable debug logging
+(``--log-level=pmd,debug``) and check the following message.
+
+.. code-block:: console
+
+ enic_use_vector_rx_handler use the non-scatter avx2 Rx handler
+
.. _enic_limitations:
Limitations
diff --git a/drivers/net/enic/Makefile b/drivers/net/enic/Makefile
index 7c6c29cc0..e39e47631 100644
--- a/drivers/net/enic/Makefile
+++ b/drivers/net/enic/Makefile
@@ -39,4 +39,32 @@ SRCS-$(CONFIG_RTE_LIBRTE_ENIC_PMD) += base/vnic_intr.c
SRCS-$(CONFIG_RTE_LIBRTE_ENIC_PMD) += base/vnic_rq.c
SRCS-$(CONFIG_RTE_LIBRTE_ENIC_PMD) += base/vnic_rss.c
+# The current implementation assumes 64-bit pointers
+CC_AVX2_SUPPORT=0
+ifeq ($(CONFIG_RTE_ARCH_X86_64),y)
+# Figure out if the compiler supports avx2. The extra check using
+# -march=core-avx2 is necessary to support users who build for the
+# 'default' machine (corei7 which has no avx2) and run the binary on
+# newer CPUs that have avx2.
+# This part is verbatim from i40e makefile.
+ifeq ($(findstring RTE_MACHINE_CPUFLAG_AVX2,$(CFLAGS)),RTE_MACHINE_CPUFLAG_AVX2)
+ CC_AVX2_SUPPORT=1
+else
+ CC_AVX2_SUPPORT=\
+ $(shell $(CC) -march=core-avx2 -dM -E - </dev/null 2>&1 | \
+ grep -q AVX2 && echo 1)
+ ifeq ($(CC_AVX2_SUPPORT), 1)
+ ifeq ($(CONFIG_RTE_TOOLCHAIN_ICC),y)
+ CFLAGS_enic_rxtx_vec_avx2.o += -march=core-avx2
+ else
+ CFLAGS_enic_rxtx_vec_avx2.o += -mavx2
+ endif
+ endif
+endif
+endif
+
+ifeq ($(CC_AVX2_SUPPORT), 1)
+ SRCS-$(CONFIG_RTE_LIBRTE_ENIC_PMD) += enic_rxtx_vec_avx2.c
+endif
+
include $(RTE_SDK)/mk/rte.lib.mk
diff --git a/drivers/net/enic/enic.h b/drivers/net/enic/enic.h
index 775cd5d55..e5f4d3b26 100644
--- a/drivers/net/enic/enic.h
+++ b/drivers/net/enic/enic.h
@@ -106,6 +106,11 @@ struct enic {
struct vnic_dev_bar bar0;
struct vnic_dev *vdev;
+ /*
+ * mbuf_initializer contains 64 bits of mbuf rearm_data, used by
+ * the avx2 handler at this time.
+ */
+ uint64_t mbuf_initializer;
unsigned int port_id;
bool overlay_offload;
struct rte_eth_dev *rte_dev;
@@ -128,6 +133,7 @@ struct enic {
u8 filter_actions; /* HW supported actions */
bool vxlan;
bool disable_overlay; /* devargs disable_overlay=1 */
+ uint8_t enable_avx2_rx; /* devargs enable-avx2-rx=1 */
bool nic_cfg_chk; /* NIC_CFG_CHK available */
bool udp_rss_weak; /* Bodega style UDP RSS */
uint8_t ig_vlan_rewrite_mode; /* devargs ig-vlan-rewrite */
@@ -329,6 +335,7 @@ uint16_t enic_prep_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
uint16_t nb_pkts);
int enic_set_mtu(struct enic *enic, uint16_t new_mtu);
int enic_link_update(struct enic *enic);
+bool enic_use_vector_rx_handler(struct enic *enic);
void enic_fdir_info(struct enic *enic);
void enic_fdir_info_get(struct enic *enic, struct rte_eth_fdir_info *stats);
void copy_fltr_v1(struct filter_v2 *fltr, struct rte_eth_fdir_input *input,
diff --git a/drivers/net/enic/enic_ethdev.c b/drivers/net/enic/enic_ethdev.c
index 65333c47a..4d450fe0c 100644
--- a/drivers/net/enic/enic_ethdev.c
+++ b/drivers/net/enic/enic_ethdev.c
@@ -37,6 +37,7 @@ static const struct rte_pci_id pci_id_enic_map[] = {
};
#define ENIC_DEVARG_DISABLE_OVERLAY "disable-overlay"
+#define ENIC_DEVARG_ENABLE_AVX2_RX "enable-avx2-rx"
#define ENIC_DEVARG_IG_VLAN_REWRITE "ig-vlan-rewrite"
RTE_INIT(enicpmd_init_log)
@@ -915,22 +916,27 @@ static const struct eth_dev_ops enicpmd_eth_dev_ops = {
.udp_tunnel_port_del = enicpmd_dev_udp_tunnel_port_del,
};
-static int enic_parse_disable_overlay(__rte_unused const char *key,
- const char *value,
- void *opaque)
+static int enic_parse_zero_one(const char *key,
+ const char *value,
+ void *opaque)
{
struct enic *enic;
+ bool b;
enic = (struct enic *)opaque;
if (strcmp(value, "0") == 0) {
- enic->disable_overlay = false;
+ b = false;
} else if (strcmp(value, "1") == 0) {
- enic->disable_overlay = true;
+ b = true;
} else {
- dev_err(enic, "Invalid value for " ENIC_DEVARG_DISABLE_OVERLAY
- ": expected=0|1 given=%s\n", value);
+ dev_err(enic, "Invalid value for %s"
+ ": expected=0|1 given=%s\n", key, value);
return -EINVAL;
}
+ if (strcmp(key, ENIC_DEVARG_DISABLE_OVERLAY) == 0)
+ enic->disable_overlay = b;
+ if (strcmp(key, ENIC_DEVARG_ENABLE_AVX2_RX) == 0)
+ enic->enable_avx2_rx = b;
return 0;
}
@@ -971,6 +977,7 @@ static int enic_check_devargs(struct rte_eth_dev *dev)
{
static const char *const valid_keys[] = {
ENIC_DEVARG_DISABLE_OVERLAY,
+ ENIC_DEVARG_ENABLE_AVX2_RX,
ENIC_DEVARG_IG_VLAN_REWRITE,
NULL};
struct enic *enic = pmd_priv(dev);
@@ -979,6 +986,7 @@ static int enic_check_devargs(struct rte_eth_dev *dev)
ENICPMD_FUNC_TRACE();
enic->disable_overlay = false;
+ enic->enable_avx2_rx = false;
enic->ig_vlan_rewrite_mode = IG_VLAN_REWRITE_MODE_PASS_THRU;
if (!dev->device->devargs)
return 0;
@@ -986,7 +994,9 @@ static int enic_check_devargs(struct rte_eth_dev *dev)
if (!kvlist)
return -EINVAL;
if (rte_kvargs_process(kvlist, ENIC_DEVARG_DISABLE_OVERLAY,
- enic_parse_disable_overlay, enic) < 0 ||
+ enic_parse_zero_one, enic) < 0 ||
+ rte_kvargs_process(kvlist, ENIC_DEVARG_ENABLE_AVX2_RX,
+ enic_parse_zero_one, enic) < 0 ||
rte_kvargs_process(kvlist, ENIC_DEVARG_IG_VLAN_REWRITE,
enic_parse_ig_vlan_rewrite, enic) < 0) {
rte_kvargs_free(kvlist);
@@ -1055,4 +1065,5 @@ RTE_PMD_REGISTER_PCI_TABLE(net_enic, pci_id_enic_map);
RTE_PMD_REGISTER_KMOD_DEP(net_enic, "* igb_uio | uio_pci_generic | vfio-pci");
RTE_PMD_REGISTER_PARAM_STRING(net_enic,
ENIC_DEVARG_DISABLE_OVERLAY "=0|1 "
+ ENIC_DEVARG_ENABLE_AVX2_RX "=0|1 "
ENIC_DEVARG_IG_VLAN_REWRITE "=trunk|untag|priority|pass");
diff --git a/drivers/net/enic/enic_main.c b/drivers/net/enic/enic_main.c
index ea6cddbd3..3eced2ce2 100644
--- a/drivers/net/enic/enic_main.c
+++ b/drivers/net/enic/enic_main.c
@@ -514,12 +514,29 @@ static void enic_prep_wq_for_simple_tx(struct enic *enic, uint16_t queue_idx)
}
}
+/*
+ * The 'strong' version is in enic_rxtx_vec_avx2.c. This weak version is
+ * used when that file is not compiled.
+ */
+bool __attribute__((weak))
+enic_use_vector_rx_handler(__rte_unused struct enic *enic)
+{
+ return false;
+}
+
static void pick_rx_handler(struct enic *enic)
{
struct rte_eth_dev *eth_dev;
- /* Use the non-scatter, simplified RX handler if possible. */
+ /*
+ * Preference order:
+ * 1. The vectorized handler if possible and requested.
+ * 2. The non-scatter, simplified handler if scatter Rx is not used.
+ * 3. The default handler as a fallback.
+ */
eth_dev = enic->rte_dev;
+ if (enic_use_vector_rx_handler(enic))
+ return;
if (enic->rq_count > 0 && enic->rq[0].data_queue_enable == 0) {
PMD_INIT_LOG(DEBUG, " use the non-scatter Rx handler");
eth_dev->rx_pkt_burst = &enic_noscatter_recv_pkts;
@@ -535,6 +552,24 @@ int enic_enable(struct enic *enic)
int err;
struct rte_eth_dev *eth_dev = enic->rte_dev;
uint64_t simple_tx_offloads;
+ uintptr_t p;
+
+ if (enic->enable_avx2_rx) {
+ struct rte_mbuf mb_def = { .buf_addr = 0 };
+
+ /*
+ * mbuf_initializer contains const-after-init fields of
+ * receive mbufs (i.e. 64 bits of fields from rearm_data).
+ * It is currently used by the vectorized handler.
+ */
+ mb_def.nb_segs = 1;
+ mb_def.data_off = RTE_PKTMBUF_HEADROOM;
+ mb_def.port = enic->port_id;
+ rte_mbuf_refcnt_set(&mb_def, 1);
+ rte_compiler_barrier();
+ p = (uintptr_t)&mb_def.rearm_data;
+ enic->mbuf_initializer = *(uint64_t *)p;
+ }
eth_dev->data->dev_link.link_speed = vnic_dev_port_speed(enic->vdev);
eth_dev->data->dev_link.link_duplex = ETH_LINK_FULL_DUPLEX;
diff --git a/drivers/net/enic/enic_rxtx_vec_avx2.c b/drivers/net/enic/enic_rxtx_vec_avx2.c
new file mode 100644
index 000000000..d21854901
--- /dev/null
+++ b/drivers/net/enic/enic_rxtx_vec_avx2.c
@@ -0,0 +1,831 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2008-2018 Cisco Systems, Inc. All rights reserved.
+ * Copyright 2007 Nuova Systems, Inc. All rights reserved.
+ */
+
+#include <rte_mbuf.h>
+#include <rte_ethdev_driver.h>
+
+#include "enic_compat.h"
+#include "rq_enet_desc.h"
+#include "enic.h"
+#include "enic_rxtx_common.h"
+
+#include <x86intrin.h>
+
+static struct rte_mbuf *
+rx_one(struct cq_enet_rq_desc *cqd, struct rte_mbuf *mb, struct enic *enic)
+{
+ bool tnl;
+
+ *(uint64_t *)&mb->rearm_data = enic->mbuf_initializer;
+ mb->data_len = cqd->bytes_written_flags &
+ CQ_ENET_RQ_DESC_BYTES_WRITTEN_MASK;
+ mb->pkt_len = mb->data_len;
+ tnl = enic->overlay_offload && (cqd->completed_index_flags &
+ CQ_ENET_RQ_DESC_FLAGS_FCOE) != 0;
+ mb->packet_type =
+ enic_cq_rx_flags_to_pkt_type((struct cq_desc *)cqd, tnl);
+ enic_cq_rx_to_pkt_flags((struct cq_desc *)cqd, mb);
+ /* Wipe the outer types set by enic_cq_rx_flags_to_pkt_type() */
+ if (tnl) {
+ mb->packet_type &= ~(RTE_PTYPE_L3_MASK |
+ RTE_PTYPE_L4_MASK);
+ }
+ return mb;
+}
+
+static uint16_t
+enic_noscatter_vec_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
+ uint16_t nb_pkts)
+{
+ struct rte_mbuf **rx, **rxmb;
+ uint16_t cq_idx, nb_rx, max_rx;
+ struct cq_enet_rq_desc *cqd;
+ struct rq_enet_desc *rqd;
+ struct vnic_cq *cq;
+ struct vnic_rq *rq;
+ struct enic *enic;
+ uint8_t color;
+
+ rq = rx_queue;
+ enic = vnic_dev_priv(rq->vdev);
+ cq = &enic->cq[enic_cq_rq(enic, rq->index)];
+ cq_idx = cq->to_clean;
+
+ /*
+ * Fill up the reserve of free mbufs. Below, we restock the receive
+ * ring with these mbufs to avoid allocation failures.
+ */
+ if (rq->num_free_mbufs == 0) {
+ if (rte_mempool_get_bulk(rq->mp, (void **)rq->free_mbufs,
+ ENIC_RX_BURST_MAX))
+ return 0;
+ rq->num_free_mbufs = ENIC_RX_BURST_MAX;
+ }
+ /* Receive until the end of the ring, at most. */
+ max_rx = RTE_MIN(nb_pkts, rq->num_free_mbufs);
+ max_rx = RTE_MIN(max_rx, cq->ring.desc_count - cq_idx);
+
+ rxmb = rq->mbuf_ring + cq_idx;
+ color = cq->last_color;
+ cqd = (struct cq_enet_rq_desc *)(cq->ring.descs) + cq_idx;
+ rx = rx_pkts;
+ if (max_rx == 0 ||
+ (cqd->type_color & CQ_DESC_COLOR_MASK_NOSHIFT) == color)
+ return 0;
+
+ /* Step 1: Process one packet to do aligned 256-bit load below */
+ if (cq_idx & 0x1) {
+ if (unlikely(cqd->bytes_written_flags &
+ CQ_ENET_RQ_DESC_FLAGS_TRUNCATED)) {
+ rte_pktmbuf_free(*rxmb++);
+ rte_atomic64_inc(&enic->soft_stats.rx_packet_errors);
+ } else {
+ *rx++ = rx_one(cqd, *rxmb++, enic);
+ }
+ cqd++;
+ max_rx--;
+ }
+
+ const __m256i mask =
+ _mm256_set_epi8(/* Second descriptor */
+ 0xff, /* type_color */
+ (CQ_ENET_RQ_DESC_FLAGS_IPV4_FRAGMENT |
+ CQ_ENET_RQ_DESC_FLAGS_IPV4 |
+ CQ_ENET_RQ_DESC_FLAGS_IPV6 |
+ CQ_ENET_RQ_DESC_FLAGS_TCP |
+ CQ_ENET_RQ_DESC_FLAGS_UDP), /* flags */
+ 0, 0, /* checksum_fcoe */
+ 0xff, 0xff, /* vlan */
+ 0x3f, 0xff, /* bytes_written_flags */
+ 0xff, 0xff, 0xff, 0xff, /* rss_hash */
+ 0xff, 0xff, /* q_number_rss_type_flags */
+ 0, 0, /* completed_index_flags */
+ /* First descriptor */
+ 0xff, /* type_color */
+ (CQ_ENET_RQ_DESC_FLAGS_IPV4_FRAGMENT |
+ CQ_ENET_RQ_DESC_FLAGS_IPV4 |
+ CQ_ENET_RQ_DESC_FLAGS_IPV6 |
+ CQ_ENET_RQ_DESC_FLAGS_TCP |
+ CQ_ENET_RQ_DESC_FLAGS_UDP), /* flags */
+ 0, 0, /* checksum_fcoe */
+ 0xff, 0xff, /* vlan */
+ 0x3f, 0xff, /* bytes_written_flags */
+ 0xff, 0xff, 0xff, 0xff, /* rss_hash */
+ 0xff, 0xff, /* q_number_rss_type_flags */
+ 0, 0 /* completed_index_flags */
+ );
+ const __m256i shuffle_mask =
+ _mm256_set_epi8(/* Second descriptor */
+ 7, 6, 5, 4, /* rss = rss_hash */
+ 11, 10, /* vlan_tci = vlan */
+ 9, 8, /* data_len = bytes_written */
+ 0x80, 0x80, 9, 8, /* pkt_len = bytes_written */
+ 0x80, 0x80, 0x80, 0x80, /* packet_type = 0 */
+ /* First descriptor */
+ 7, 6, 5, 4, /* rss = rss_hash */
+ 11, 10, /* vlan_tci = vlan */
+ 9, 8, /* data_len = bytes_written */
+ 0x80, 0x80, 9, 8, /* pkt_len = bytes_written */
+ 0x80, 0x80, 0x80, 0x80 /* packet_type = 0 */
+ );
+ /* Used to collect 8 flags from 8 desc into one register */
+ const __m256i flags_shuffle_mask =
+ _mm256_set_epi8(/* Second descriptor */
+ 1, 3, 9, 14,
+ 1, 3, 9, 14,
+ 1, 3, 9, 14,
+ 1, 3, 9, 14,
+ /* First descriptor */
+ 1, 3, 9, 14,
+ 1, 3, 9, 14,
+ 1, 3, 9, 14,
+ /*
+ * Byte 3: upper byte of completed_index_flags
+ * bit 5 = fcoe (tunnel)
+ * Byte 2: upper byte of q_number_rss_type_flags
+ * bits 2,3,4,5 = rss type
+ * bit 6 = csum_not_calc
+ * Byte 1: upper byte of bytes_written_flags
+ * bit 6 = truncated
+ * bit 7 = vlan stripped
+ * Byte 0: flags
+ */
+ 1, 3, 9, 14
+ );
+ /* Used to collect 8 VLAN IDs from 8 desc into one register */
+ const __m256i vlan_shuffle_mask =
+ _mm256_set_epi8(/* Second descriptor */
+ 0x80, 0x80, 11, 10,
+ 0x80, 0x80, 11, 10,
+ 0x80, 0x80, 11, 10,
+ 0x80, 0x80, 11, 10,
+ /* First descriptor */
+ 0x80, 0x80, 11, 10,
+ 0x80, 0x80, 11, 10,
+ 0x80, 0x80, 11, 10,
+ 0x80, 0x80, 11, 10);
+ /* PKT_RX_RSS_HASH is 1<<1 so fits in 8-bit integer */
+ const __m256i rss_shuffle =
+ _mm256_set_epi8(/* second 128 bits */
+ PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
+ PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
+ PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
+ PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
+ PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
+ 0, /* rss_types = 0 */
+ /* first 128 bits */
+ PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
+ PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
+ PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
+ PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
+ PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
+ 0 /* rss_types = 0 */);
+ /*
+ * VLAN offload flags.
+ * shuffle index:
+ * vlan_stripped => bit 0
+ * vlan_id == 0 => bit 1
+ */
+ const __m256i vlan_shuffle =
+ _mm256_set_epi32(0, 0, 0, 0,
+ PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, 0,
+ PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, PKT_RX_VLAN);
+ /* Use the same shuffle index as vlan_shuffle */
+ const __m256i vlan_ptype_shuffle =
+ _mm256_set_epi32(0, 0, 0, 0,
+ RTE_PTYPE_L2_ETHER,
+ RTE_PTYPE_L2_ETHER,
+ RTE_PTYPE_L2_ETHER,
+ RTE_PTYPE_L2_ETHER_VLAN);
+ /*
+ * CKSUM flags. Shift right so they fit in 8-bit integers.
+ * shuffle index:
+ * ipv4_csum_ok => bit 3
+ * ip4 => bit 2
+ * tcp_or_udp => bit 1
+ * tcp_udp_csum_ok => bit 0
+ */
+ const __m256i csum_shuffle =
+ _mm256_set_epi8(/* second 128 bits */
+ /* 1111 ip4+ip4_ok+l4+l4_ok */
+ ((PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1),
+ /* 1110 ip4_ok+ip4+l4+!l4_ok */
+ ((PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD) >> 1),
+ (PKT_RX_IP_CKSUM_GOOD >> 1), /* 1101 ip4+ip4_ok */
+ (PKT_RX_IP_CKSUM_GOOD >> 1), /* 1100 ip4_ok+ip4 */
+ (PKT_RX_L4_CKSUM_GOOD >> 1), /* 1011 l4+l4_ok */
+ (PKT_RX_L4_CKSUM_BAD >> 1), /* 1010 l4+!l4_ok */
+ 0, /* 1001 */
+ 0, /* 1000 */
+ /* 0111 !ip4_ok+ip4+l4+l4_ok */
+ ((PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD) >> 1),
+ /* 0110 !ip4_ok+ip4+l4+!l4_ok */
+ ((PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD) >> 1),
+ (PKT_RX_IP_CKSUM_BAD >> 1), /* 0101 !ip4_ok+ip4 */
+ (PKT_RX_IP_CKSUM_BAD >> 1), /* 0100 !ip4_ok+ip4 */
+ (PKT_RX_L4_CKSUM_GOOD >> 1), /* 0011 l4+l4_ok */
+ (PKT_RX_L4_CKSUM_BAD >> 1), /* 0010 l4+!l4_ok */
+ 0, /* 0001 */
+ 0, /* 0000 */
+ /* first 128 bits */
+ ((PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1),
+ ((PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD) >> 1),
+ (PKT_RX_IP_CKSUM_GOOD >> 1),
+ (PKT_RX_IP_CKSUM_GOOD >> 1),
+ (PKT_RX_L4_CKSUM_GOOD >> 1),
+ (PKT_RX_L4_CKSUM_BAD >> 1),
+ 0, 0,
+ ((PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD) >> 1),
+ ((PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD) >> 1),
+ (PKT_RX_IP_CKSUM_BAD >> 1),
+ (PKT_RX_IP_CKSUM_BAD >> 1),
+ (PKT_RX_L4_CKSUM_GOOD >> 1),
+ (PKT_RX_L4_CKSUM_BAD >> 1),
+ 0, 0);
+ /*
+ * Non-fragment PTYPEs.
+ * Shuffle 4-bit index:
+ * ip6 => bit 0
+ * ip4 => bit 1
+ * udp => bit 2
+ * tcp => bit 3
+ * bit
+ * 3 2 1 0
+ * -------
+ * 0 0 0 0 unknown
+ * 0 0 0 1 ip6 | nonfrag
+ * 0 0 1 0 ip4 | nonfrag
+ * 0 0 1 1 unknown
+ * 0 1 0 0 unknown
+ * 0 1 0 1 ip6 | udp
+ * 0 1 1 0 ip4 | udp
+ * 0 1 1 1 unknown
+ * 1 0 0 0 unknown
+ * 1 0 0 1 ip6 | tcp
+ * 1 0 1 0 ip4 | tcp
+ * 1 0 1 1 unknown
+ * 1 1 0 0 unknown
+ * 1 1 0 1 unknown
+ * 1 1 1 0 unknown
+ * 1 1 1 1 unknown
+ *
+ * PTYPEs do not fit in 8 bits, so shift right 4..
+ */
+ const __m256i nonfrag_ptype_shuffle =
+ _mm256_set_epi8(/* second 128 bits */
+ RTE_PTYPE_UNKNOWN,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ (RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_TCP) >> 4,
+ (RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_TCP) >> 4,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ (RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_UDP) >> 4,
+ (RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_UDP) >> 4,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ (RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_L4_NONFRAG) >> 4,
+ (RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_L4_NONFRAG) >> 4,
+ RTE_PTYPE_UNKNOWN,
+ /* first 128 bits */
+ RTE_PTYPE_UNKNOWN,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ (RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_TCP) >> 4,
+ (RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_TCP) >> 4,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ (RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_UDP) >> 4,
+ (RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_UDP) >> 4,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ (RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_L4_NONFRAG) >> 4,
+ (RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_L4_NONFRAG) >> 4,
+ RTE_PTYPE_UNKNOWN);
+ /* Fragment PTYPEs. Use the same shuffle index as above. */
+ const __m256i frag_ptype_shuffle =
+ _mm256_set_epi8(/* second 128 bits */
+ RTE_PTYPE_UNKNOWN,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ (RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_L4_FRAG) >> 4,
+ (RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_L4_FRAG) >> 4,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ (RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_L4_FRAG) >> 4,
+ (RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_L4_FRAG) >> 4,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ (RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_L4_FRAG) >> 4,
+ (RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_L4_FRAG) >> 4,
+ RTE_PTYPE_UNKNOWN,
+ /* first 128 bits */
+ RTE_PTYPE_UNKNOWN,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ (RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_L4_FRAG) >> 4,
+ (RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_L4_FRAG) >> 4,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ (RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_L4_FRAG) >> 4,
+ (RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_L4_FRAG) >> 4,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ (RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_L4_FRAG) >> 4,
+ (RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_L4_FRAG) >> 4,
+ RTE_PTYPE_UNKNOWN);
+ /*
+ * Tunnel PTYPEs. Use the same shuffle index as above.
+ * L4 types are not part of this table. They come from non-tunnel
+ * types above.
+ */
+ const __m256i tnl_l3_ptype_shuffle =
+ _mm256_set_epi8(/* second 128 bits */
+ RTE_PTYPE_UNKNOWN,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN >> 16,
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN >> 16,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN >> 16,
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN >> 16,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN >> 16,
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN >> 16,
+ RTE_PTYPE_UNKNOWN,
+ /* first 128 bits */
+ RTE_PTYPE_UNKNOWN,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN >> 16,
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN >> 16,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN >> 16,
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN >> 16,
+ RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN >> 16,
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN >> 16,
+ RTE_PTYPE_UNKNOWN);
+
+ const __m256i mbuf_init = _mm256_set_epi64x(0, enic->mbuf_initializer,
+ 0, enic->mbuf_initializer);
+
+ /*
+ * --- cq desc fields --- offset
+ * completed_index_flags - 0 use: fcoe
+ * q_number_rss_type_flags - 2 use: rss types, csum_not_calc
+ * rss_hash - 4 ==> mbuf.hash.rss
+ * bytes_written_flags - 8 ==> mbuf.pkt_len,data_len
+ * use: truncated, vlan_stripped
+ * vlan - 10 ==> mbuf.vlan_tci
+ * checksum_fcoe - 12 (unused)
+ * flags - 14 use: all bits
+ * type_color - 15 (unused)
+ *
+ * --- mbuf fields --- offset
+ * rearm_data ---- 16
+ * data_off - 0 (mbuf_init) -+
+ * refcnt - 2 (mbuf_init) |
+ * nb_segs - 4 (mbuf_init) | 16B 128b
+ * port - 6 (mbuf_init) |
+ * ol_flag - 8 (from cqd) -+
+ * rx_descriptor_fields1 ---- 32
+ * packet_type - 0 (from cqd) -+
+ * pkt_len - 4 (from cqd) |
+ * data_len - 8 (from cqd) | 16B 128b
+ * vlan_tci - 10 (from cqd) |
+ * rss - 12 (from cqd) -+
+ */
+
+ __m256i overlay_enabled =
+ _mm256_set1_epi32((uint32_t)enic->overlay_offload);
+
+ /* Step 2: Process 8 packets per loop using SIMD */
+ while (max_rx > 7 && (((cqd + 7)->type_color &
+ CQ_DESC_COLOR_MASK_NOSHIFT) != color)) {
+ /* Load 8 16B CQ descriptors */
+ __m256i cqd01 = _mm256_load_si256((void *)cqd);
+ __m256i cqd23 = _mm256_load_si256((void *)(cqd + 2));
+ __m256i cqd45 = _mm256_load_si256((void *)(cqd + 4));
+ __m256i cqd67 = _mm256_load_si256((void *)(cqd + 6));
+ /* Copy 8 mbuf pointers to rx_pkts */
+ _mm256_storeu_si256((void *)rx,
+ _mm256_loadu_si256((void *)rxmb));
+ _mm256_storeu_si256((void *)(rx + 4),
+ _mm256_loadu_si256((void *)(rxmb + 4)));
+
+ /*
+ * Collect 8 flags (each 32 bits) into one register.
+ * 4 shuffles, 3 blends, 1 permute for 8 desc: 1 inst/desc
+ */
+ __m256i flags01 =
+ _mm256_shuffle_epi8(cqd01, flags_shuffle_mask);
+ /*
+ * Shuffle above produces 8 x 32-bit flags for 8 descriptors
+ * in this order: 0, 0, 0, 0, 1, 1, 1, 1
+ * The duplicates in each 128-bit lane simplify blending
+ * below.
+ */
+ __m256i flags23 =
+ _mm256_shuffle_epi8(cqd23, flags_shuffle_mask);
+ __m256i flags45 =
+ _mm256_shuffle_epi8(cqd45, flags_shuffle_mask);
+ __m256i flags67 =
+ _mm256_shuffle_epi8(cqd67, flags_shuffle_mask);
+ /* 1st blend produces flags for desc: 0, 2, 0, 0, 1, 3, 1, 1 */
+ __m256i flags0_3 = _mm256_blend_epi32(flags01, flags23, 0x22);
+ /* 2nd blend produces flags for desc: 4, 4, 4, 6, 5, 5, 5, 7 */
+ __m256i flags4_7 = _mm256_blend_epi32(flags45, flags67, 0x88);
+ /* 3rd blend produces flags for desc: 0, 2, 4, 6, 1, 3, 5, 7 */
+ __m256i flags0_7 = _mm256_blend_epi32(flags0_3, flags4_7, 0xcc);
+ /*
+ * Swap to reorder flags in this order: 1, 3, 5, 7, 0, 2, 4, 6
+ * This order simplifies blend operations way below that
+ * produce 'rearm' data for each mbuf.
+ */
+ flags0_7 = _mm256_permute4x64_epi64(flags0_7,
+ (1 << 6) + (0 << 4) + (3 << 2) + 2);
+
+ /*
+ * Check truncated bits and bail out early on.
+ * 6 avx inst, 1 or, 1 if-then-else for 8 desc: 1 inst/desc
+ */
+ __m256i trunc =
+ _mm256_srli_epi32(_mm256_slli_epi32(flags0_7, 17), 31);
+ trunc = _mm256_add_epi64(trunc, _mm256_permute4x64_epi64(trunc,
+ (1 << 6) + (0 << 4) + (3 << 2) + 2));
+ /* 0:63 contains 1+3+0+2 and 64:127 contains 5+7+4+6 */
+ if (_mm256_extract_epi64(trunc, 0) ||
+ _mm256_extract_epi64(trunc, 1))
+ break;
+
+ /*
+ * Compute PKT_RX_RSS_HASH.
+ * Use 2 shifts and 1 shuffle for 8 desc: 0.375 inst/desc
+ * RSS types in byte 0, 4, 8, 12, 16, 20, 24, 28
+ * Everything else is zero.
+ */
+ __m256i rss_types =
+ _mm256_srli_epi32(_mm256_slli_epi32(flags0_7, 10), 28);
+ /*
+ * RSS flags (PKT_RX_RSS_HASH) are in
+ * byte 0, 4, 8, 12, 16, 20, 24, 28
+ * Everything else is zero.
+ */
+ __m256i rss_flags = _mm256_shuffle_epi8(rss_shuffle, rss_types);
+
+ /*
+ * Compute CKSUM flags. First build the index and then
+ * use it to shuffle csum_shuffle.
+ * 20 instructions including const loads: 2.5 inst/desc
+ */
+ /*
+ * csum_not_calc (bit 22)
+ * csum_not_calc (0) => 0xffffffff
+ * csum_not_calc (1) => 0x0
+ */
+ const __m256i zero4 = _mm256_setzero_si256();
+ const __m256i mask22 = _mm256_set1_epi32(0x400000);
+ __m256i csum_not_calc = _mm256_cmpeq_epi32(zero4,
+ _mm256_and_si256(flags0_7, mask22));
+ /*
+ * (tcp|udp) && !fragment => bit 1
+ * tcp = bit 2, udp = bit 1, frag = bit 6
+ */
+ const __m256i mask1 = _mm256_set1_epi32(0x2);
+ __m256i tcp_udp =
+ _mm256_andnot_si256(_mm256_srli_epi32(flags0_7, 5),
+ _mm256_or_si256(flags0_7,
+ _mm256_srli_epi32(flags0_7, 1)));
+ tcp_udp = _mm256_and_si256(tcp_udp, mask1);
+ /* ipv4 (bit 5) => bit 2 */
+ const __m256i mask2 = _mm256_set1_epi32(0x4);
+ __m256i ipv4 = _mm256_and_si256(mask2,
+ _mm256_srli_epi32(flags0_7, 3));
+ /*
+ * ipv4_csum_ok (bit 3) => bit 3
+ * tcp_udp_csum_ok (bit 0) => bit 0
+ * 0x9
+ */
+ const __m256i mask0_3 = _mm256_set1_epi32(0x9);
+ __m256i csum_idx = _mm256_and_si256(flags0_7, mask0_3);
+ csum_idx = _mm256_and_si256(csum_not_calc,
+ _mm256_or_si256(_mm256_or_si256(csum_idx, ipv4),
+ tcp_udp));
+ __m256i csum_flags =
+ _mm256_shuffle_epi8(csum_shuffle, csum_idx);
+ /* Shift left to restore CKSUM flags. See csum_shuffle. */
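+ /* (csum_shuffle entries are stored right-shifted by 1; see its definition) */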
+ csum_flags = _mm256_slli_epi32(csum_flags, 1);
+ /* Combine csum flags and offload flags: 0.125 inst/desc */
+ rss_flags = _mm256_or_si256(rss_flags, csum_flags);
+
+ /*
+ * Collect 8 VLAN IDs and compute vlan_id != 0 on each.
+ * 4 shuffles, 3 blends, 1 permute, 1 cmp, 1 sub for 8 desc:
+ * 1.25 inst/desc
+ */
+ __m256i vlan01 = _mm256_shuffle_epi8(cqd01, vlan_shuffle_mask);
+ __m256i vlan23 = _mm256_shuffle_epi8(cqd23, vlan_shuffle_mask);
+ __m256i vlan45 = _mm256_shuffle_epi8(cqd45, vlan_shuffle_mask);
+ __m256i vlan67 = _mm256_shuffle_epi8(cqd67, vlan_shuffle_mask);
+ __m256i vlan0_3 = _mm256_blend_epi32(vlan01, vlan23, 0x22);
+ __m256i vlan4_7 = _mm256_blend_epi32(vlan45, vlan67, 0x88);
+ /* desc: 0, 2, 4, 6, 1, 3, 5, 7 */
+ __m256i vlan0_7 = _mm256_blend_epi32(vlan0_3, vlan4_7, 0xcc);
+ /* desc: 1, 3, 5, 7, 0, 2, 4, 6 */
+ vlan0_7 = _mm256_permute4x64_epi64(vlan0_7,
+ (1 << 6) + (0 << 4) + (3 << 2) + 2);
+ /*
+ * Compare 0 == vlan_id produces 0xffffffff (-1) if
+ * vlan 0 and 0 if vlan non-0. Then subtracting the
+ * result from 0 produces 0 - (-1) = 1 for vlan 0, and
+ * 0 - 0 = 0 for vlan non-0.
+ */
+ vlan0_7 = _mm256_cmpeq_epi32(zero4, vlan0_7);
+ /* vlan_id != 0 => 0, vlan_id == 0 => 1 */
+ vlan0_7 = _mm256_sub_epi32(zero4, vlan0_7);
+
+ /*
+ * Compute PKT_RX_VLAN and PKT_RX_VLAN_STRIPPED.
+ * Use 3 shifts, 1 or, 1 shuffle for 8 desc: 0.625 inst/desc
+ * VLAN offload flags in byte 0, 4, 8, 12, 16, 20, 24, 28
+ * Everything else is zero.
+ */
+ __m256i vlan_idx =
+ _mm256_or_si256(/* vlan_stripped => bit 0 */
+ _mm256_srli_epi32(_mm256_slli_epi32(flags0_7,
+ 16), 31),
+ /* (vlan_id == 0) => bit 1 */
+ _mm256_slli_epi32(vlan0_7, 1));
+ /*
+ * The index captures 4 cases.
+ * stripped, id == 0 ==> 11b = 3
+ * stripped, id != 0 ==> 01b = 1
+ * not stripped, id == 0 ==> 10b = 2
+ * not stripped, id != 0 ==> 00b = 0
+ */
+ __m256i vlan_flags = _mm256_permutevar8x32_epi32(vlan_shuffle,
+ vlan_idx);
+ /* Combine vlan and offload flags: 0.125 inst/desc */
+ rss_flags = _mm256_or_si256(rss_flags, vlan_flags);
+
+ /*
+ * Compute non-tunnel PTYPEs.
+ * 17 inst / 8 desc = 2.125 inst/desc
+ */
+ /* ETHER and ETHER_VLAN */
+ __m256i vlan_ptype =
+ _mm256_permutevar8x32_epi32(vlan_ptype_shuffle,
+ vlan_idx);
+ /* Build the ptype index from flags */
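+ /* ptype_idx: bit 3 = tcp, bit 2 = udp, bits 1:0 = flags bits 5:4 */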
+ tcp_udp = _mm256_slli_epi32(flags0_7, 29);
+ tcp_udp = _mm256_slli_epi32(_mm256_srli_epi32(tcp_udp, 30), 2);
+ __m256i ip4_ip6 =
+ _mm256_srli_epi32(_mm256_slli_epi32(flags0_7, 26), 30);
+ __m256i ptype_idx = _mm256_or_si256(tcp_udp, ip4_ip6);
+ __m256i frag_bit =
+ _mm256_srli_epi32(_mm256_slli_epi32(flags0_7, 25), 31);
+ __m256i nonfrag_ptype =
+ _mm256_shuffle_epi8(nonfrag_ptype_shuffle, ptype_idx);
+ __m256i frag_ptype =
+ _mm256_shuffle_epi8(frag_ptype_shuffle, ptype_idx);
+ /*
+ * Zero out the unwanted types and combine the remaining bits.
+ * The effect is the same as selecting non-frag or frag types
+ * depending on the frag bit.
+ */
+ nonfrag_ptype = _mm256_and_si256(nonfrag_ptype,
+ _mm256_cmpeq_epi32(zero4, frag_bit));
+ frag_ptype = _mm256_and_si256(frag_ptype,
+ _mm256_cmpgt_epi32(frag_bit, zero4));
+ __m256i ptype = _mm256_or_si256(nonfrag_ptype, frag_ptype);
+ ptype = _mm256_slli_epi32(ptype, 4);
+ /*
+ * Compute tunnel PTYPEs.
+ * 15 inst / 8 desc = 1.875 inst/desc
+ */
+ __m256i tnl_l3_ptype =
+ _mm256_shuffle_epi8(tnl_l3_ptype_shuffle, ptype_idx);
+ tnl_l3_ptype = _mm256_slli_epi32(tnl_l3_ptype, 16);
+ /*
+ * Shift non-tunnel L4 types to make them tunnel types.
+ * RTE_PTYPE_L4_TCP << 16 == RTE_PTYPE_INNER_L4_TCP
+ */
+ __m256i tnl_l4_ptype =
+ _mm256_slli_epi32(_mm256_and_si256(ptype,
+ _mm256_set1_epi32(RTE_PTYPE_L4_MASK)), 16);
+ __m256i tnl_ptype =
+ _mm256_or_si256(tnl_l3_ptype, tnl_l4_ptype);
+ tnl_ptype = _mm256_or_si256(tnl_ptype,
+ _mm256_set1_epi32(RTE_PTYPE_TUNNEL_GRENAT |
+ RTE_PTYPE_INNER_L2_ETHER));
+ /*
+ * Select non-tunnel or tunnel types by zeroing out the
+ * unwanted ones.
+ */
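+ /* tnl_flags: bit 29 of the assembled flags, ANDed with overlay_enabled */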
+ __m256i tnl_flags = _mm256_and_si256(overlay_enabled,
+ _mm256_srli_epi32(_mm256_slli_epi32(flags0_7, 2), 31));
+ tnl_ptype = _mm256_and_si256(tnl_ptype,
+ _mm256_sub_epi32(zero4, tnl_flags));
+ ptype = _mm256_and_si256(ptype,
+ _mm256_cmpeq_epi32(zero4, tnl_flags));
+ /*
+ * Combine types and swap to have ptypes in the same order
+ * as desc.
+ * desc: 0 2 4 6 1 3 5 7
+ * 3 inst / 8 desc = 0.375 inst/desc
+ */
+ ptype = _mm256_or_si256(ptype, tnl_ptype);
+ ptype = _mm256_or_si256(ptype, vlan_ptype);
+ ptype = _mm256_permute4x64_epi64(ptype,
+ (1 << 6) + (0 << 4) + (3 << 2) + 2);
+
+ /*
+ * Mask packet length.
+ * Use 4 ands: 0.5 instructions/desc
+ */
+ cqd01 = _mm256_and_si256(cqd01, mask);
+ cqd23 = _mm256_and_si256(cqd23, mask);
+ cqd45 = _mm256_and_si256(cqd45, mask);
+ cqd67 = _mm256_and_si256(cqd67, mask);
+ /*
+ * Shuffle. Two 16B sets of the mbuf fields.
+ * packet_type, pkt_len, data_len, vlan_tci, rss
+ */
+ __m256i rearm01 = _mm256_shuffle_epi8(cqd01, shuffle_mask);
+ __m256i rearm23 = _mm256_shuffle_epi8(cqd23, shuffle_mask);
+ __m256i rearm45 = _mm256_shuffle_epi8(cqd45, shuffle_mask);
+ __m256i rearm67 = _mm256_shuffle_epi8(cqd67, shuffle_mask);
+
+ /*
+ * Blend in ptypes
+ * 4 blends and 3 shuffles for 8 desc: 0.875 inst/desc
+ */
+ rearm01 = _mm256_blend_epi32(rearm01, ptype, 0x11);
+ rearm23 = _mm256_blend_epi32(rearm23,
+ _mm256_shuffle_epi32(ptype, 1), 0x11);
+ rearm45 = _mm256_blend_epi32(rearm45,
+ _mm256_shuffle_epi32(ptype, 2), 0x11);
+ rearm67 = _mm256_blend_epi32(rearm67,
+ _mm256_shuffle_epi32(ptype, 3), 0x11);
+
+ /*
+ * Move rss_flags into ol_flags in mbuf_init.
+ * 3 shifts and 4 blends for 8 desc: 0.875 inst/desc
+ */
+ __m256i mbuf_init4_5 = _mm256_blend_epi32(mbuf_init,
+ rss_flags, 0x44);
+ __m256i mbuf_init2_3 = _mm256_blend_epi32(mbuf_init,
+ _mm256_slli_si256(rss_flags, 4), 0x44);
+ __m256i mbuf_init0_1 = _mm256_blend_epi32(mbuf_init,
+ _mm256_slli_si256(rss_flags, 8), 0x44);
+ __m256i mbuf_init6_7 = _mm256_blend_epi32(mbuf_init,
+ _mm256_srli_si256(rss_flags, 4), 0x44);
+
+ /*
+ * Build rearm, one per desc.
+ * 8 blends and 4 permutes: 1.5 inst/desc
+ */
+ __m256i rearm0 = _mm256_blend_epi32(rearm01,
+ mbuf_init0_1, 0xf0);
+ __m256i rearm1 = _mm256_blend_epi32(mbuf_init0_1,
+ rearm01, 0xf0);
+ __m256i rearm2 = _mm256_blend_epi32(rearm23,
+ mbuf_init2_3, 0xf0);
+ __m256i rearm3 = _mm256_blend_epi32(mbuf_init2_3,
+ rearm23, 0xf0);
+ /* Swap the upper and lower 128 bits */
+ rearm0 = _mm256_permute4x64_epi64(rearm0,
+ (1 << 6) + (0 << 4) + (3 << 2) + 2);
+ rearm2 = _mm256_permute4x64_epi64(rearm2,
+ (1 << 6) + (0 << 4) + (3 << 2) + 2);
+ /* Second set of 4 descriptors */
+ __m256i rearm4 = _mm256_blend_epi32(rearm45,
+ mbuf_init4_5, 0xf0);
+ __m256i rearm5 = _mm256_blend_epi32(mbuf_init4_5,
+ rearm45, 0xf0);
+ __m256i rearm6 = _mm256_blend_epi32(rearm67,
+ mbuf_init6_7, 0xf0);
+ __m256i rearm7 = _mm256_blend_epi32(mbuf_init6_7,
+ rearm67, 0xf0);
+ rearm4 = _mm256_permute4x64_epi64(rearm4,
+ (1 << 6) + (0 << 4) + (3 << 2) + 2);
+ rearm6 = _mm256_permute4x64_epi64(rearm6,
+ (1 << 6) + (0 << 4) + (3 << 2) + 2);
+
+ /*
+ * Write out 32B of mbuf fields.
+ * data_off - off 0 (mbuf_init)
+ * refcnt - 2 (mbuf_init)
+ * nb_segs - 4 (mbuf_init)
+ * port - 6 (mbuf_init)
+ * ol_flags - 8 (from cqd)
+ * packet_type - 16 (from cqd)
+ * pkt_len - 20 (from cqd)
+ * data_len - 24 (from cqd)
+ * vlan_tci - 26 (from cqd)
+ * rss - 28 (from cqd)
+ */
+ _mm256_storeu_si256((__m256i *)&rxmb[0]->rearm_data, rearm0);
+ _mm256_storeu_si256((__m256i *)&rxmb[1]->rearm_data, rearm1);
+ _mm256_storeu_si256((__m256i *)&rxmb[2]->rearm_data, rearm2);
+ _mm256_storeu_si256((__m256i *)&rxmb[3]->rearm_data, rearm3);
+ _mm256_storeu_si256((__m256i *)&rxmb[4]->rearm_data, rearm4);
+ _mm256_storeu_si256((__m256i *)&rxmb[5]->rearm_data, rearm5);
+ _mm256_storeu_si256((__m256i *)&rxmb[6]->rearm_data, rearm6);
+ _mm256_storeu_si256((__m256i *)&rxmb[7]->rearm_data, rearm7);
+
+ max_rx -= 8;
+ cqd += 8;
+ rx += 8;
+ rxmb += 8;
+ }
+
+ /*
+ * Step 3: Slow path to handle a small (<8) number of packets and
+ * occasional truncated packets.
+ */
+ while (max_rx && ((cqd->type_color &
+ CQ_DESC_COLOR_MASK_NOSHIFT) != color)) {
+ if (unlikely(cqd->bytes_written_flags &
+ CQ_ENET_RQ_DESC_FLAGS_TRUNCATED)) {
+ rte_pktmbuf_free(*rxmb++);
+ rte_atomic64_inc(&enic->soft_stats.rx_packet_errors);
+ } else {
+ *rx++ = rx_one(cqd, *rxmb++, enic);
+ }
+ cqd++;
+ max_rx--;
+ }
+
+ /* Number of descriptors visited */
+ nb_rx = cqd - (struct cq_enet_rq_desc *)(cq->ring.descs) - cq_idx;
+ if (nb_rx == 0)
+ return 0;
+ rqd = ((struct rq_enet_desc *)rq->ring.descs) + cq_idx;
+ rxmb = rq->mbuf_ring + cq_idx;
+ cq_idx += nb_rx;
+ rq->rx_nb_hold += nb_rx;
+ if (unlikely(cq_idx == cq->ring.desc_count)) {
+ cq_idx = 0;
+ cq->last_color ^= CQ_DESC_COLOR_MASK_NOSHIFT;
+ }
+ cq->to_clean = cq_idx;
+
+ /* Step 4: Restock RQ with new mbufs */
+ memcpy(rxmb, rq->free_mbufs + ENIC_RX_BURST_MAX - rq->num_free_mbufs,
+ sizeof(struct rte_mbuf *) * nb_rx);
+ rq->num_free_mbufs -= nb_rx;
+ while (nb_rx) {
+ rqd->address = (*rxmb)->buf_iova + RTE_PKTMBUF_HEADROOM;
+ nb_rx--;
+ rqd++;
+ rxmb++;
+ }
+ if (rq->rx_nb_hold > rq->rx_free_thresh) {
+ rq->posted_index = enic_ring_add(rq->ring.desc_count,
+ rq->posted_index,
+ rq->rx_nb_hold);
+ rq->rx_nb_hold = 0;
+ rte_wmb();
+ iowrite32_relaxed(rq->posted_index,
+ &rq->ctrl->posted_index);
+ }
+
+ return rx - rx_pkts;
+}
+
+bool
+enic_use_vector_rx_handler(struct enic *enic)
+{
+ struct rte_eth_dev *eth_dev;
+ struct rte_fdir_conf *fconf;
+
+ eth_dev = enic->rte_dev;
+ /* The user needs to request the avx2 handler */
+ if (!enic->enable_avx2_rx)
+ return false;
+ /* Do not support scatter Rx */
+ if (!(enic->rq_count > 0 && enic->rq[0].data_queue_enable == 0))
+ return false;
+ /* Do not support fdir/flow */
+ fconf = &eth_dev->data->dev_conf.fdir_conf;
+ if (fconf->mode != RTE_FDIR_MODE_NONE)
+ return false;
+ if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2)) {
+ PMD_INIT_LOG(DEBUG, " use the non-scatter avx2 Rx handler");
+ eth_dev->rx_pkt_burst = &enic_noscatter_vec_recv_pkts;
+ return true;
+ }
+ return false;
+}
diff --git a/drivers/net/enic/meson.build b/drivers/net/enic/meson.build
index bfd4e2373..064487118 100644
--- a/drivers/net/enic/meson.build
+++ b/drivers/net/enic/meson.build
@@ -17,3 +17,19 @@ sources = files(
)
deps += ['hash']
includes += include_directories('base')
+
+# The current implementation assumes 64-bit pointers
+if dpdk_conf.has('RTE_MACHINE_CPUFLAG_AVX2') and cc.sizeof('void *') == 8
+ sources += files('enic_rxtx_vec_avx2.c')
+# Build the avx2 handler if the compiler supports it, even though 'machine'
+# does not. This is to support users who build for the min supported machine
+# and need to run the binary on newer CPUs too.
+# This part is from i40e meson.build
+elif cc.has_argument('-mavx2') and cc.sizeof('void *') == 8
+ enic_avx2_lib = static_library('enic_avx2_lib',
+ 'enic_rxtx_vec_avx2.c',
+ dependencies: [static_rte_ethdev, static_rte_bus_pci],
+ include_directories: includes,
+ c_args: [cflags, '-mavx2'])
+ objs += enic_avx2_lib.extract_objects('enic_rxtx_vec_avx2.c')
+endif
--
2.16.2
* Re: [dpdk-dev] [PATCH v4 1/2] net/enic: move common Rx functions to a new header file
2018-10-03 20:09 ` [dpdk-dev] [PATCH v4 1/2] net/enic: move common Rx functions to a new header file John Daley
2018-10-03 20:09 ` [dpdk-dev] [PATCH v4 2/2] net/enic: add AVX2 based vectorized Rx handler John Daley
@ 2018-10-04 16:15 ` Ferruh Yigit
1 sibling, 0 replies; 11+ messages in thread
From: Ferruh Yigit @ 2018-10-04 16:15 UTC (permalink / raw)
To: John Daley; +Cc: dev, Hyong Youb Kim
On 10/3/2018 9:09 PM, John Daley wrote:
> From: Hyong Youb Kim <hyonkim@cisco.com>
>
> Move a number of Rx functions to the header file so that the avx2
> based Rx handler can use them.
>
> Signed-off-by: Hyong Youb Kim <hyonkim@cisco.com>
> Reviewed-by: John Daley <johndale@cisco.com>
Series applied to dpdk-next-net/master, thanks.