From: Konstantin Ananyev <konstantin.ananyev@intel.com>
To: dev@dpdk.org
Subject: [dpdk-dev] [PATCH v2 06/17] librte_acl: introduce DFA nodes compression (group64) for identical entries.
Date: Mon, 12 Jan 2015 19:16:10 +0000
Message-ID: <1421090181-17150-7-git-send-email-konstantin.ananyev@intel.com>
In-Reply-To: <1421090181-17150-1-git-send-email-konstantin.ananyev@intel.com>

Introduce division of the whole set of 256 child transition entries
into 4 sub-groups (64 entries per group).
Two or more groups within the same node that have identical children
can then share one set of transition entries.
That allows us to compact some DFA nodes and save space in the RT
(runtime) table without any negative performance impact.
From what I've seen, the average space saving is ~20%.
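
To illustrate the idea outside the library, below is a minimal
standalone sketch of the deduplication step. It mirrors the logic of
acl_dfa_count_gr64() introduced by this patch; the GR64_* constants
and the count_gr64()/main() names are local to the example and not
part of the library API.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define GR64_SIZE 64                    /* transition entries per sub-group */
#define DFA_SIZE  256                   /* full DFA node: 4 groups of 64 */
#define GR64_NUM  (DFA_SIZE / GR64_SIZE)

/*
 * For each 64-entry group, look for an earlier group with identical
 * contents; a duplicate reuses that group's slot, a unique group gets
 * the next free slot. Returns the number of distinct groups (the
 * node's "fanout"): 4 means no compression, 1 means all four groups
 * collapse into a single copy.
 */
static uint32_t
count_gr64(const uint64_t dfa[DFA_SIZE], uint8_t gr64[GR64_NUM])
{
	uint32_t i, j, k;

	k = 0;
	for (i = 0; i != GR64_NUM; i++) {
		for (j = 0; j != i; j++) {
			if (memcmp(dfa + i * GR64_SIZE, dfa + j * GR64_SIZE,
					GR64_SIZE * sizeof(dfa[0])) == 0)
				break;
		}
		gr64[i] = (j != i) ? gr64[j] : k++;
	}
	return k;
}

int
main(void)
{
	uint64_t dfa[DFA_SIZE];
	uint8_t gr64[GR64_NUM];
	uint32_t n;

	/* toy node: groups 0 and 2 are identical, as are groups 1 and 3 */
	for (n = 0; n != DFA_SIZE; n++)
		dfa[n] = (n / GR64_SIZE) & 1;

	n = count_gr64(dfa, gr64);
	printf("fanout=%u map=[%u %u %u %u]\n", n,
		(unsigned)gr64[0], (unsigned)gr64[1],
		(unsigned)gr64[2], (unsigned)gr64[3]);
	/* prints: fanout=2 map=[0 1 0 1] -> 128 entries instead of 256 */
	return 0;
}

With the map in hand, a node stores fanout * 64 entries instead of a
fixed 256, and the per-group offsets are folded into the upper 32 bits
of the node index (see acl_dfa_gen_idx() below), so the run-time
lookup stays branch-free.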

Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
---
 lib/librte_acl/acl.h            |  12 ++-
 lib/librte_acl/acl_gen.c        | 195 ++++++++++++++++++++++++++++------------
 lib/librte_acl/acl_run_scalar.c |  38 ++++----
 lib/librte_acl/acl_run_sse.c    |  99 ++++++--------------
 4 files changed, 196 insertions(+), 148 deletions(-)

diff --git a/lib/librte_acl/acl.h b/lib/librte_acl/acl.h
index 102fa51..3f6ac79 100644
--- a/lib/librte_acl/acl.h
+++ b/lib/librte_acl/acl.h
@@ -47,6 +47,11 @@ extern"C" {
 #define RTE_ACL_DFA_MAX		UINT8_MAX
 #define RTE_ACL_DFA_SIZE	(UINT8_MAX + 1)
 
+#define	RTE_ACL_DFA_GR64_SIZE	64
+#define	RTE_ACL_DFA_GR64_NUM	(RTE_ACL_DFA_SIZE / RTE_ACL_DFA_GR64_SIZE)
+#define	RTE_ACL_DFA_GR64_BIT	\
+	(CHAR_BIT * sizeof(uint32_t) / RTE_ACL_DFA_GR64_NUM)
+
 typedef int bits_t;
 
 #define	RTE_ACL_BIT_SET_SIZE	((UINT8_MAX + 1) / (sizeof(bits_t) * CHAR_BIT))
@@ -100,8 +105,11 @@ struct rte_acl_node {
 	/* number of ranges (transitions w/ consecutive bits) */
 	int32_t                 id;
 	struct rte_acl_match_results *mrt; /* only valid when match_flag != 0 */
-	char                         transitions[RTE_ACL_QUAD_SIZE];
-	/* boundaries for ranged node */
+	union {
+		char            transitions[RTE_ACL_QUAD_SIZE];
+		/* boundaries for ranged node */
+		uint8_t         dfa_gr64[RTE_ACL_DFA_GR64_NUM];
+	};
 	struct rte_acl_node     *next;
 	/* free list link or pointer to duplicate node during merge */
 	struct rte_acl_node     *prev;
diff --git a/lib/librte_acl/acl_gen.c b/lib/librte_acl/acl_gen.c
index b1f766b..c9b7839 100644
--- a/lib/librte_acl/acl_gen.c
+++ b/lib/librte_acl/acl_gen.c
@@ -43,13 +43,14 @@
 } while (0)
 
 struct acl_node_counters {
-	int                match;
-	int                match_used;
-	int                single;
-	int                quad;
-	int                quad_vectors;
-	int                dfa;
-	int                smallest_match;
+	int32_t match;
+	int32_t match_used;
+	int32_t single;
+	int32_t quad;
+	int32_t quad_vectors;
+	int32_t dfa;
+	int32_t dfa_gr64;
+	int32_t smallest_match;
 };
 
 struct rte_acl_indices {
@@ -61,24 +62,118 @@ struct rte_acl_indices {
 
 static void
 acl_gen_log_stats(const struct rte_acl_ctx *ctx,
-	const struct acl_node_counters *counts)
+	const struct acl_node_counters *counts,
+	const struct rte_acl_indices *indices)
 {
 	RTE_LOG(DEBUG, ACL, "Gen phase for ACL \"%s\":\n"
 		"runtime memory footprint on socket %d:\n"
 		"single nodes/bytes used: %d/%zu\n"
-		"quad nodes/bytes used: %d/%zu\n"
-		"DFA nodes/bytes used: %d/%zu\n"
+		"quad nodes/vectors/bytes used: %d/%d/%zu\n"
+		"DFA nodes/group64/bytes used: %d/%d/%zu\n"
 		"match nodes/bytes used: %d/%zu\n"
 		"total: %zu bytes\n",
 		ctx->name, ctx->socket_id,
 		counts->single, counts->single * sizeof(uint64_t),
-		counts->quad, counts->quad_vectors * sizeof(uint64_t),
-		counts->dfa, counts->dfa * RTE_ACL_DFA_SIZE * sizeof(uint64_t),
+		counts->quad, counts->quad_vectors,
+		(indices->quad_index - indices->dfa_index) * sizeof(uint64_t),
+		counts->dfa, counts->dfa_gr64,
+		indices->dfa_index * sizeof(uint64_t),
 		counts->match,
 		counts->match * sizeof(struct rte_acl_match_results),
 		ctx->mem_sz);
 }
 
+static uint64_t
+acl_dfa_gen_idx(const struct rte_acl_node *node, uint32_t index)
+{
+	uint64_t idx;
+	uint32_t i;
+
+	idx = 0;
+	for (i = 0; i != RTE_DIM(node->dfa_gr64); i++) {
+		RTE_ACL_VERIFY(node->dfa_gr64[i] < RTE_ACL_DFA_GR64_NUM);
+		RTE_ACL_VERIFY(node->dfa_gr64[i] < node->fanout);
+		idx |= (i - node->dfa_gr64[i]) <<
+			(6 + RTE_ACL_DFA_GR64_BIT * i);
+	}
+
+	return idx << (CHAR_BIT * sizeof(index)) | index | node->node_type;
+}
+
+static void
+acl_dfa_fill_gr64(const struct rte_acl_node *node,
+	const uint64_t src[RTE_ACL_DFA_SIZE], uint64_t dst[RTE_ACL_DFA_SIZE])
+{
+	uint32_t i;
+
+	for (i = 0; i != RTE_DIM(node->dfa_gr64); i++) {
+		memcpy(dst + node->dfa_gr64[i] * RTE_ACL_DFA_GR64_SIZE,
+			src + i * RTE_ACL_DFA_GR64_SIZE,
+			RTE_ACL_DFA_GR64_SIZE * sizeof(dst[0]));
+	}
+}
+
+static uint32_t
+acl_dfa_count_gr64(const uint64_t array_ptr[RTE_ACL_DFA_SIZE],
+	uint8_t gr64[RTE_ACL_DFA_GR64_NUM])
+{
+	uint32_t i, j, k;
+
+	k = 0;
+	for (i = 0; i != RTE_ACL_DFA_GR64_NUM; i++) {
+		gr64[i] = i;
+		for (j = 0; j != i; j++) {
+			if (memcmp(array_ptr + i * RTE_ACL_DFA_GR64_SIZE,
+					array_ptr + j * RTE_ACL_DFA_GR64_SIZE,
+					RTE_ACL_DFA_GR64_SIZE *
+					sizeof(array_ptr[0])) == 0)
+				break;
+		}
+		gr64[i] = (j != i) ? gr64[j] : k++;
+	}
+
+	return k;
+}
+
+static uint32_t
+acl_node_fill_dfa(const struct rte_acl_node *node,
+	uint64_t dfa[RTE_ACL_DFA_SIZE], uint64_t no_match, int32_t resolved)
+{
+	uint32_t n, x;
+	uint32_t ranges, last_bit;
+	struct rte_acl_node *child;
+	struct rte_acl_bitset *bits;
+
+	ranges = 0;
+	last_bit = 0;
+
+	for (n = 0; n < RTE_ACL_DFA_SIZE; n++)
+		dfa[n] = no_match;
+
+	for (x = 0; x < node->num_ptrs; x++) {
+
+		child = node->ptrs[x].ptr;
+		if (child == NULL)
+			continue;
+
+		bits = &node->ptrs[x].values;
+		for (n = 0; n < RTE_ACL_DFA_SIZE; n++) {
+
+			if (bits->bits[n / (sizeof(bits_t) * CHAR_BIT)] &
+				(1 << (n % (sizeof(bits_t) * CHAR_BIT)))) {
+
+				dfa[n] = resolved ? child->node_index : x;
+				ranges += (last_bit == 0);
+				last_bit = 1;
+			} else {
+				last_bit = 0;
+			}
+		}
+	}
+
+	return ranges;
+}
+
 /*
 *  Counts the number of groups of sequential bits that are
 *  either 0 or 1, as specified by the zero_one parameter. This is used to
@@ -150,10 +245,11 @@ acl_count_fanout(struct rte_acl_node *node)
  */
 static int
 acl_count_trie_types(struct acl_node_counters *counts,
-	struct rte_acl_node *node, int match, int force_dfa)
+	struct rte_acl_node *node, uint64_t no_match, int match, int force_dfa)
 {
 	uint32_t n;
 	int num_ptrs;
+	uint64_t dfa[RTE_ACL_DFA_SIZE];
 
 	/* skip if this node has been counted */
 	if (node->node_type != (uint32_t)RTE_ACL_NODE_UNDEFINED)
@@ -186,6 +282,16 @@ acl_count_trie_types(struct acl_node_counters *counts,
 	} else {
 		counts->dfa++;
 		node->node_type = RTE_ACL_NODE_DFA;
+		if (force_dfa != 0) {
+			/* always expand to a max number of nodes. */
+			for (n = 0; n != RTE_DIM(node->dfa_gr64); n++)
+				node->dfa_gr64[n] = n;
+			node->fanout = n;
+		} else {
+			acl_node_fill_dfa(node, dfa, no_match, 0);
+			node->fanout = acl_dfa_count_gr64(dfa, node->dfa_gr64);
+		}
+		counts->dfa_gr64 += node->fanout;
 	}
 
 	/*
@@ -194,7 +300,7 @@ acl_count_trie_types(struct acl_node_counters *counts,
 	for (n = 0; n < node->num_ptrs; n++) {
 		if (node->ptrs[n].ptr != NULL)
 			match = acl_count_trie_types(counts, node->ptrs[n].ptr,
-				match, 0);
+				no_match, match, 0);
 	}
 
 	return match;
@@ -204,38 +310,11 @@ static void
 acl_add_ptrs(struct rte_acl_node *node, uint64_t *node_array, uint64_t no_match,
 	int resolved)
 {
-	uint32_t n, x;
-	int m, ranges, last_bit;
-	struct rte_acl_node *child;
-	struct rte_acl_bitset *bits;
+	uint32_t x;
+	int32_t m;
 	uint64_t *node_a, index, dfa[RTE_ACL_DFA_SIZE];
 
-	ranges = 0;
-	last_bit = 0;
-
-	for (n = 0; n < RTE_DIM(dfa); n++)
-		dfa[n] = no_match;
-
-	for (x = 0; x < node->num_ptrs; x++) {
-
-		child = node->ptrs[x].ptr;
-		if (child == NULL)
-			continue;
-
-		bits = &node->ptrs[x].values;
-		for (n = 0; n < RTE_DIM(dfa); n++) {
-
-			if (bits->bits[n / (sizeof(bits_t) * CHAR_BIT)] &
-				(1 << (n % (sizeof(bits_t) * CHAR_BIT)))) {
-
-				dfa[n] = resolved ? child->node_index : x;
-				ranges += (last_bit == 0);
-				last_bit = 1;
-			} else {
-				last_bit = 0;
-			}
-		}
-	}
+	acl_node_fill_dfa(node, dfa, no_match, resolved);
 
 	/*
 	 * Rather than going from 0 to 256, the range count and
@@ -272,8 +351,7 @@ acl_add_ptrs(struct rte_acl_node *node, uint64_t *node_array, uint64_t no_match,
 		RTE_ACL_VERIFY(m <= RTE_ACL_QUAD_SIZE);
 
 	} else if (node->node_type == RTE_ACL_NODE_DFA && resolved) {
-		for (n = 0; n < RTE_DIM(dfa); n++)
-			node_array[n] = dfa[n];
+		acl_dfa_fill_gr64(node, dfa, node_array);
 	}
 }
 
@@ -286,7 +364,7 @@ static void
 acl_gen_node(struct rte_acl_node *node, uint64_t *node_array,
 	uint64_t no_match, struct rte_acl_indices *index, int num_categories)
 {
-	uint32_t n, *qtrp;
+	uint32_t n, sz, *qtrp;
 	uint64_t *array_ptr;
 	struct rte_acl_match_results *match;
 
@@ -297,10 +375,11 @@ acl_gen_node(struct rte_acl_node *node, uint64_t *node_array,
 
 	switch (node->node_type) {
 	case RTE_ACL_NODE_DFA:
-		node->node_index = index->dfa_index | node->node_type;
 		array_ptr = &node_array[index->dfa_index];
-		index->dfa_index += RTE_ACL_DFA_SIZE;
-		for (n = 0; n < RTE_ACL_DFA_SIZE; n++)
+		node->node_index = acl_dfa_gen_idx(node, index->dfa_index);
+		sz = node->fanout * RTE_ACL_DFA_GR64_SIZE;
+		index->dfa_index += sz;
+		for (n = 0; n < sz; n++)
 			array_ptr[n] = no_match;
 		break;
 	case RTE_ACL_NODE_SINGLE:
@@ -312,7 +391,7 @@ acl_gen_node(struct rte_acl_node *node, uint64_t *node_array,
 		break;
 	case RTE_ACL_NODE_QRANGE:
 		array_ptr = &node_array[index->quad_index];
-		acl_add_ptrs(node, array_ptr, no_match,  0);
+		acl_add_ptrs(node, array_ptr, no_match, 0);
 		qtrp = (uint32_t *)node->transitions;
 		node->node_index = qtrp[0];
 		node->node_index <<= sizeof(index->quad_index) * CHAR_BIT;
@@ -368,7 +447,7 @@ static int
 acl_calc_counts_indices(struct acl_node_counters *counts,
 	struct rte_acl_indices *indices, struct rte_acl_trie *trie,
 	struct rte_acl_bld_trie *node_bld_trie, uint32_t num_tries,
-	int match_num)
+	int match_num, uint64_t no_match)
 {
 	uint32_t n;
 
@@ -379,13 +458,13 @@ acl_calc_counts_indices(struct acl_node_counters *counts,
 	for (n = 0; n < num_tries; n++) {
 		counts->smallest_match = INT32_MAX;
 		match_num = acl_count_trie_types(counts, node_bld_trie[n].trie,
-			match_num, 1);
+			no_match, match_num, 1);
 		trie[n].smallest = counts->smallest_match;
 	}
 
 	indices->dfa_index = RTE_ACL_DFA_SIZE + 1;
 	indices->quad_index = indices->dfa_index +
-		counts->dfa * RTE_ACL_DFA_SIZE;
+		counts->dfa_gr64 * RTE_ACL_DFA_GR64_SIZE;
 	indices->single_index = indices->quad_index + counts->quad_vectors;
 	indices->match_index = indices->single_index + counts->single + 1;
 	indices->match_index = RTE_ALIGN(indices->match_index,
@@ -410,9 +489,11 @@ rte_acl_gen(struct rte_acl_ctx *ctx, struct rte_acl_trie *trie,
 	struct acl_node_counters counts;
 	struct rte_acl_indices indices;
 
+	no_match = RTE_ACL_NODE_MATCH;
+
 	/* Fill counts and indices arrays from the nodes. */
 	match_num = acl_calc_counts_indices(&counts, &indices, trie,
-		node_bld_trie, num_tries, match_num);
+		node_bld_trie, num_tries, match_num, no_match);
 
 	/* Allocate runtime memory (align to cache boundary) */
 	total_size = RTE_ALIGN(data_index_sz, RTE_CACHE_LINE_SIZE) +
@@ -440,11 +521,11 @@ rte_acl_gen(struct rte_acl_ctx *ctx, struct rte_acl_trie *trie,
 	 */
 
 	node_array[RTE_ACL_DFA_SIZE] = RTE_ACL_DFA_SIZE | RTE_ACL_NODE_SINGLE;
-	no_match = RTE_ACL_NODE_MATCH;
 
 	for (n = 0; n < RTE_ACL_DFA_SIZE; n++)
 		node_array[n] = no_match;
 
+	/* NOMATCH result at index 0 */
 	match = ((struct rte_acl_match_results *)(node_array + match_index));
 	memset(match, 0, sizeof(*match));
 
@@ -470,6 +551,6 @@ rte_acl_gen(struct rte_acl_ctx *ctx, struct rte_acl_trie *trie,
 	ctx->trans_table = node_array;
 	memcpy(ctx->trie, trie, sizeof(ctx->trie));
 
-	acl_gen_log_stats(ctx, &counts);
+	acl_gen_log_stats(ctx, &counts, &indices);
 	return 0;
 }
diff --git a/lib/librte_acl/acl_run_scalar.c b/lib/librte_acl/acl_run_scalar.c
index 43c8fc3..40691ce 100644
--- a/lib/librte_acl/acl_run_scalar.c
+++ b/lib/librte_acl/acl_run_scalar.c
@@ -94,15 +94,6 @@ resolve_priority_scalar(uint64_t transition, int n,
 	}
 }
 
-/*
- * When processing the transition, rather than using if/else
- * construct, the offset is calculated for DFA and QRANGE and
- * then conditionally added to the address based on node type.
- * This is done to avoid branch mis-predictions. Since the
- * offset is rather simple calculation it is more efficient
- * to do the calculation and do a condition move rather than
- * a conditional branch to determine which calculation to do.
- */
 static inline uint32_t
 scan_forward(uint32_t input, uint32_t max)
 {
@@ -117,18 +108,27 @@ scalar_transition(const uint64_t *trans_table, uint64_t transition,
 
 	/* break transition into component parts */
 	ranges = transition >> (sizeof(index) * CHAR_BIT);
-
-	/* calc address for a QRANGE node */
-	c = input * SCALAR_QRANGE_MULT;
-	a = ranges | SCALAR_QRANGE_MIN;
 	index = transition & ~RTE_ACL_NODE_INDEX;
-	a -= (c & SCALAR_QRANGE_MASK);
-	b = c & SCALAR_QRANGE_MIN;
 	addr = transition ^ index;
-	a &= SCALAR_QRANGE_MIN;
-	a ^= (ranges ^ b) & (a ^ b);
-	x = scan_forward(a, 32) >> 3;
-	addr += (index == RTE_ACL_NODE_DFA) ? input : x;
+
+	if (index != RTE_ACL_NODE_DFA) {
+		/* calc address for a QRANGE/SINGLE node */
+		c = (uint32_t)input * SCALAR_QRANGE_MULT;
+		a = ranges | SCALAR_QRANGE_MIN;
+		a -= (c & SCALAR_QRANGE_MASK);
+		b = c & SCALAR_QRANGE_MIN;
+		a &= SCALAR_QRANGE_MIN;
+		a ^= (ranges ^ b) & (a ^ b);
+		x = scan_forward(a, 32) >> 3;
+	} else {
+		/* calc address for a DFA node */
+		x = ranges >> (input /
+			RTE_ACL_DFA_GR64_SIZE * RTE_ACL_DFA_GR64_BIT);
+		x &= UINT8_MAX;
+		x = input - x;
+	}
+
+	addr += x;
 
 	/* pickup next transition */
 	transition = *(trans_table + addr);
diff --git a/lib/librte_acl/acl_run_sse.c b/lib/librte_acl/acl_run_sse.c
index 69a9d77..576c92b 100644
--- a/lib/librte_acl/acl_run_sse.c
+++ b/lib/librte_acl/acl_run_sse.c
@@ -40,24 +40,6 @@ enum {
 	SHUFFLE32_SWAP64 = 0x4e,
 };
 
-static const rte_xmm_t mm_type_quad_range = {
-	.u32 = {
-		RTE_ACL_NODE_QRANGE,
-		RTE_ACL_NODE_QRANGE,
-		RTE_ACL_NODE_QRANGE,
-		RTE_ACL_NODE_QRANGE,
-	},
-};
-
-static const rte_xmm_t mm_type_quad_range64 = {
-	.u32 = {
-		RTE_ACL_NODE_QRANGE,
-		RTE_ACL_NODE_QRANGE,
-		0,
-		0,
-	},
-};
-
 static const rte_xmm_t mm_shuffle_input = {
 	.u32 = {0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c},
 };
@@ -70,14 +52,6 @@ static const rte_xmm_t mm_ones_16 = {
 	.u16 = {1, 1, 1, 1, 1, 1, 1, 1},
 };
 
-static const rte_xmm_t mm_bytes = {
-	.u32 = {UINT8_MAX, UINT8_MAX, UINT8_MAX, UINT8_MAX},
-};
-
-static const rte_xmm_t mm_bytes64 = {
-	.u32 = {UINT8_MAX, UINT8_MAX, 0, 0},
-};
-
 static const rte_xmm_t mm_match_mask = {
 	.u32 = {
 		RTE_ACL_NODE_MATCH,
@@ -236,10 +210,14 @@ acl_match_check_x4(int slot, const struct rte_acl_ctx *ctx, struct parms *parms,
  */
 static inline xmm_t
 acl_calc_addr(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input,
-	xmm_t ones_16, xmm_t bytes, xmm_t type_quad_range,
-	xmm_t *indices1, xmm_t *indices2)
+	xmm_t ones_16, xmm_t indices1, xmm_t indices2)
 {
-	xmm_t addr, node_types, temp;
+	xmm_t addr, node_types, range, temp;
+	xmm_t dfa_msk, dfa_ofs, quad_ofs;
+	xmm_t in, r, t;
+
+	const xmm_t range_base = _mm_set_epi32(0xffffff0c, 0xffffff08,
+		0xffffff04, 0xffffff00);
 
 	/*
 	 * Note that no transition is done for a match
@@ -248,10 +226,13 @@ acl_calc_addr(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input,
 	 */
 
 	/* Shuffle low 32 into temp and high 32 into indices2 */
-	temp = (xmm_t)MM_SHUFFLEPS((__m128)*indices1, (__m128)*indices2,
-		0x88);
-	*indices2 = (xmm_t)MM_SHUFFLEPS((__m128)*indices1,
-		(__m128)*indices2, 0xdd);
+	temp = (xmm_t)MM_SHUFFLEPS((__m128)indices1, (__m128)indices2, 0x88);
+	range = (xmm_t)MM_SHUFFLEPS((__m128)indices1, (__m128)indices2, 0xdd);
+
+	t = MM_XOR(index_mask, index_mask);
+
+	/* shuffle input byte to all 4 positions of 32 bit value */
+	in = MM_SHUFFLE8(next_input, shuffle_input);
 
 	/* Calc node type and node addr */
 	node_types = MM_ANDNOT(index_mask, temp);
@@ -262,17 +243,15 @@ acl_calc_addr(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input,
 	 */
 
 	/* mask for DFA type (0) nodes */
-	temp = MM_CMPEQ32(node_types, MM_XOR(node_types, node_types));
+	dfa_msk = MM_CMPEQ32(node_types, t);
 
-	/* add input byte to DFA position */
-	temp = MM_AND(temp, bytes);
-	temp = MM_AND(temp, next_input);
-	addr = MM_ADD32(addr, temp);
+	r = _mm_srli_epi32(in, 30);
+	r = _mm_add_epi8(r, range_base);
 
-	/*
-	 * Calc addr for Range nodes -> range_index + range(input)
-	 */
-	node_types = MM_CMPEQ32(node_types, type_quad_range);
+	t = _mm_srli_epi32(in, 24);
+	r = _mm_shuffle_epi8(range, r);
+
+	dfa_ofs = _mm_sub_epi32(t, r);
 
 	/*
 	 * Calculate number of range boundaries that are less than the
@@ -282,11 +261,8 @@ acl_calc_addr(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input,
 	 * input byte.
 	 */
 
-	/* shuffle input byte to all 4 positions of 32 bit value */
-	temp = MM_SHUFFLE8(next_input, shuffle_input);
-
 	/* check ranges */
-	temp = MM_CMPGT8(temp, *indices2);
+	temp = MM_CMPGT8(in, range);
 
 	/* convert -1 to 1 (bytes greater than input byte */
 	temp = MM_SIGN8(temp, temp);
@@ -295,10 +271,10 @@ acl_calc_addr(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input,
 	temp = MM_MADD8(temp, temp);
 
 	/* horizontal add pairs of words into dwords */
-	temp = MM_MADD16(temp, ones_16);
+	quad_ofs = MM_MADD16(temp, ones_16);
 
 	/* mask to range type nodes */
-	temp = MM_AND(temp, node_types);
+	temp = _mm_blendv_epi8(quad_ofs, dfa_ofs, dfa_msk);
 
 	/* add index into node position */
 	return MM_ADD32(addr, temp);
@@ -309,8 +285,8 @@ acl_calc_addr(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input,
  */
 static inline xmm_t
 transition4(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input,
-	xmm_t ones_16, xmm_t bytes, xmm_t type_quad_range,
-	const uint64_t *trans, xmm_t *indices1, xmm_t *indices2)
+	xmm_t ones_16, const uint64_t *trans,
+	xmm_t *indices1, xmm_t *indices2)
 {
 	xmm_t addr;
 	uint64_t trans0, trans2;
@@ -318,7 +294,7 @@ transition4(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input,
 	 /* Calculate the address (array index) for all 4 transitions. */
 
 	addr = acl_calc_addr(index_mask, next_input, shuffle_input, ones_16,
-		bytes, type_quad_range, indices1, indices2);
+		*indices1, *indices2);
 
 	 /* Gather 64 bit transitions and pack back into 2 registers. */
 
@@ -408,42 +384,34 @@ search_sse_8(const struct rte_acl_ctx *ctx, const uint8_t **data,
 
 		input0 = transition4(mm_index_mask.m, input0,
 			mm_shuffle_input.m, mm_ones_16.m,
-			mm_bytes.m, mm_type_quad_range.m,
 			flows.trans, &indices1, &indices2);
 
 		input1 = transition4(mm_index_mask.m, input1,
 			mm_shuffle_input.m, mm_ones_16.m,
-			mm_bytes.m, mm_type_quad_range.m,
 			flows.trans, &indices3, &indices4);
 
 		input0 = transition4(mm_index_mask.m, input0,
 			mm_shuffle_input.m, mm_ones_16.m,
-			mm_bytes.m, mm_type_quad_range.m,
 			flows.trans, &indices1, &indices2);
 
 		input1 = transition4(mm_index_mask.m, input1,
 			mm_shuffle_input.m, mm_ones_16.m,
-			mm_bytes.m, mm_type_quad_range.m,
 			flows.trans, &indices3, &indices4);
 
 		input0 = transition4(mm_index_mask.m, input0,
 			mm_shuffle_input.m, mm_ones_16.m,
-			mm_bytes.m, mm_type_quad_range.m,
 			flows.trans, &indices1, &indices2);
 
 		input1 = transition4(mm_index_mask.m, input1,
 			mm_shuffle_input.m, mm_ones_16.m,
-			mm_bytes.m, mm_type_quad_range.m,
 			flows.trans, &indices3, &indices4);
 
 		input0 = transition4(mm_index_mask.m, input0,
 			mm_shuffle_input.m, mm_ones_16.m,
-			mm_bytes.m, mm_type_quad_range.m,
 			flows.trans, &indices1, &indices2);
 
 		input1 = transition4(mm_index_mask.m, input1,
 			mm_shuffle_input.m, mm_ones_16.m,
-			mm_bytes.m, mm_type_quad_range.m,
 			flows.trans, &indices3, &indices4);
 
 		 /* Check for any matches. */
@@ -496,22 +464,18 @@ search_sse_4(const struct rte_acl_ctx *ctx, const uint8_t **data,
 		/* Process the 4 bytes of input on each stream. */
 		input = transition4(mm_index_mask.m, input,
 			mm_shuffle_input.m, mm_ones_16.m,
-			mm_bytes.m, mm_type_quad_range.m,
 			flows.trans, &indices1, &indices2);
 
 		 input = transition4(mm_index_mask.m, input,
 			mm_shuffle_input.m, mm_ones_16.m,
-			mm_bytes.m, mm_type_quad_range.m,
 			flows.trans, &indices1, &indices2);
 
 		 input = transition4(mm_index_mask.m, input,
 			mm_shuffle_input.m, mm_ones_16.m,
-			mm_bytes.m, mm_type_quad_range.m,
 			flows.trans, &indices1, &indices2);
 
 		 input = transition4(mm_index_mask.m, input,
 			mm_shuffle_input.m, mm_ones_16.m,
-			mm_bytes.m, mm_type_quad_range.m,
 			flows.trans, &indices1, &indices2);
 
 		/* Check for any matches. */
@@ -524,8 +488,7 @@ search_sse_4(const struct rte_acl_ctx *ctx, const uint8_t **data,
 
 static inline xmm_t
 transition2(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input,
-	xmm_t ones_16, xmm_t bytes, xmm_t type_quad_range,
-	const uint64_t *trans, xmm_t *indices1)
+	xmm_t ones_16, const uint64_t *trans, xmm_t *indices1)
 {
 	uint64_t t;
 	xmm_t addr, indices2;
@@ -533,7 +496,7 @@ transition2(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input,
 	indices2 = MM_XOR(ones_16, ones_16);
 
 	addr = acl_calc_addr(index_mask, next_input, shuffle_input, ones_16,
-		bytes, type_quad_range, indices1, &indices2);
+		*indices1, indices2);
 
 	/* Gather 64 bit transitions and pack 2 per register. */
 
@@ -583,22 +546,18 @@ search_sse_2(const struct rte_acl_ctx *ctx, const uint8_t **data,
 
 		input = transition2(mm_index_mask64.m, input,
 			mm_shuffle_input64.m, mm_ones_16.m,
-			mm_bytes64.m, mm_type_quad_range64.m,
 			flows.trans, &indices);
 
 		input = transition2(mm_index_mask64.m, input,
 			mm_shuffle_input64.m, mm_ones_16.m,
-			mm_bytes64.m, mm_type_quad_range64.m,
 			flows.trans, &indices);
 
 		input = transition2(mm_index_mask64.m, input,
 			mm_shuffle_input64.m, mm_ones_16.m,
-			mm_bytes64.m, mm_type_quad_range64.m,
 			flows.trans, &indices);
 
 		input = transition2(mm_index_mask64.m, input,
 			mm_shuffle_input64.m, mm_ones_16.m,
-			mm_bytes64.m, mm_type_quad_range64.m,
 			flows.trans, &indices);
 
 		/* Check for any matches. */
-- 
1.8.5.3
