From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <kananye1@ecsmtp.ir.intel.com>
Received: from mga11.intel.com (mga11.intel.com [192.55.52.93])
 by dpdk.org (Postfix) with ESMTP id CDCFB5A9E
 for <dev@dpdk.org>; Mon, 12 Jan 2015 20:16:39 +0100 (CET)
Received: from orsmga002.jf.intel.com ([10.7.209.21])
 by fmsmga102.fm.intel.com with ESMTP; 12 Jan 2015 11:16:37 -0800
X-ExtLoop1: 1
X-IronPort-AV: E=Sophos;i="5.07,745,1413270000"; d="scan'208";a="668605495"
Received: from irvmail001.ir.intel.com ([163.33.26.43])
 by orsmga002.jf.intel.com with ESMTP; 12 Jan 2015 11:16:36 -0800
Received: from sivswdev02.ir.intel.com (sivswdev02.ir.intel.com
 [10.237.217.46])
 by irvmail001.ir.intel.com (8.14.3/8.13.6/MailSET/Hub) with ESMTP id
 t0CJGa5J008640; Mon, 12 Jan 2015 19:16:36 GMT
Received: from sivswdev02.ir.intel.com (localhost [127.0.0.1])
 by sivswdev02.ir.intel.com with ESMTP id t0CJGZ0t017293;
 Mon, 12 Jan 2015 19:16:35 GMT
Received: (from kananye1@localhost)
 by sivswdev02.ir.intel.com with  id t0CJGZJ7017289;
 Mon, 12 Jan 2015 19:16:35 GMT
From: Konstantin Ananyev <konstantin.ananyev@intel.com>
To: dev@dpdk.org
Date: Mon, 12 Jan 2015 19:16:17 +0000
Message-Id: <1421090181-17150-14-git-send-email-konstantin.ananyev@intel.com>
X-Mailer: git-send-email 1.7.4.1
In-Reply-To: <1421090181-17150-1-git-send-email-konstantin.ananyev@intel.com>
References: <1421090181-17150-1-git-send-email-konstantin.ananyev@intel.com>
Subject: [dpdk-dev] [PATCH v2 13/17] librte_acl: Remove search_sse_2 and
	relatives.
X-BeenThere: dev@dpdk.org
X-Mailman-Version: 2.1.15
Precedence: list
List-Id: patches and discussions about DPDK <dev.dpdk.org>
List-Unsubscribe: <http://dpdk.org/ml/options/dev>,
 <mailto:dev-request@dpdk.org?subject=unsubscribe>
List-Archive: <http://dpdk.org/ml/archives/dev/>
List-Post: <mailto:dev@dpdk.org>
List-Help: <mailto:dev-request@dpdk.org?subject=help>
List-Subscribe: <http://dpdk.org/ml/listinfo/dev>,
 <mailto:dev-request@dpdk.org?subject=subscribe>
X-List-Received-Date: Mon, 12 Jan 2015 19:16:48 -0000

Previous improvements made scalar method the fastest one
for tiny bunch of packets (< 4).
That allows us to remove specific vector code-path for small number of packets
(search_sse_2)
and always use scalar method for such cases.

Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
---
 lib/librte_acl/acl_run_avx2.c |   2 +-
 lib/librte_acl/acl_run_sse.c  |   3 +-
 lib/librte_acl/acl_run_sse.h  | 110 ------------------------------------------
 3 files changed, 3 insertions(+), 112 deletions(-)

diff --git a/lib/librte_acl/acl_run_avx2.c b/lib/librte_acl/acl_run_avx2.c
index 0a42f72..79ebbd6 100644
--- a/lib/librte_acl/acl_run_avx2.c
+++ b/lib/librte_acl/acl_run_avx2.c
@@ -49,6 +49,6 @@ rte_acl_classify_avx2(const struct rte_acl_ctx *ctx, const uint8_t **data,
 	else if (num >= MAX_SEARCHES_SSE4)
 		return search_sse_4(ctx, data, results, num, categories);
 	else
-		return search_sse_2(ctx, data, results, num,
+		return rte_acl_classify_scalar(ctx, data, results, num,
 			categories);
 }
diff --git a/lib/librte_acl/acl_run_sse.c b/lib/librte_acl/acl_run_sse.c
index 77b32b3..a5a7d36 100644
--- a/lib/librte_acl/acl_run_sse.c
+++ b/lib/librte_acl/acl_run_sse.c
@@ -42,5 +42,6 @@ rte_acl_classify_sse(const struct rte_acl_ctx *ctx, const uint8_t **data,
 	else if (num >= MAX_SEARCHES_SSE4)
 		return search_sse_4(ctx, data, results, num, categories);
 	else
-		return search_sse_2(ctx, data, results, num, categories);
+		return rte_acl_classify_scalar(ctx, data, results, num,
+			categories);
 }
diff --git a/lib/librte_acl/acl_run_sse.h b/lib/librte_acl/acl_run_sse.h
index e33e16b..1b7870e 100644
--- a/lib/librte_acl/acl_run_sse.h
+++ b/lib/librte_acl/acl_run_sse.h
@@ -45,10 +45,6 @@ static const rte_xmm_t xmm_shuffle_input = {
 	.u32 = {0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c},
 };
 
-static const rte_xmm_t xmm_shuffle_input64 = {
-	.u32 = {0x00000000, 0x04040404, 0x80808080, 0x80808080},
-};
-
 static const rte_xmm_t xmm_ones_16 = {
 	.u16 = {1, 1, 1, 1, 1, 1, 1, 1},
 };
@@ -62,15 +58,6 @@ static const rte_xmm_t xmm_match_mask = {
 	},
 };
 
-static const rte_xmm_t xmm_match_mask64 = {
-	.u32 = {
-		RTE_ACL_NODE_MATCH,
-		0,
-		RTE_ACL_NODE_MATCH,
-		0,
-	},
-};
-
 static const rte_xmm_t xmm_index_mask = {
 	.u32 = {
 		RTE_ACL_NODE_INDEX,
@@ -80,16 +67,6 @@ static const rte_xmm_t xmm_index_mask = {
 	},
 };
 
-static const rte_xmm_t xmm_index_mask64 = {
-	.u32 = {
-		RTE_ACL_NODE_INDEX,
-		RTE_ACL_NODE_INDEX,
-		0,
-		0,
-	},
-};
-
-
 /*
  * Resolve priority for multiple results (sse version).
  * This consists comparing the priority of the current traversal with the
@@ -161,22 +138,6 @@ acl_process_matches(xmm_t *indices, int slot, const struct rte_acl_ctx *ctx,
 }
 
 /*
- * Check for a match in 2 transitions (contained in SSE register)
- */
-static inline __attribute__((always_inline)) void
-acl_match_check_x2(int slot, const struct rte_acl_ctx *ctx, struct parms *parms,
-	struct acl_flow_data *flows, xmm_t *indices, xmm_t match_mask)
-{
-	xmm_t temp;
-
-	temp = MM_AND(match_mask, *indices);
-	while (!MM_TESTZ(temp, temp)) {
-		acl_process_matches(indices, slot, ctx, parms, flows);
-		temp = MM_AND(match_mask, *indices);
-	}
-}
-
-/*
  * Check for any match in 4 transitions (contained in 2 SSE registers)
  */
 static inline __attribute__((always_inline)) void
@@ -460,74 +421,3 @@ search_sse_4(const struct rte_acl_ctx *ctx, const uint8_t **data,
 
 	return 0;
 }
-
-static inline __attribute__((always_inline)) xmm_t
-transition2(xmm_t next_input, const uint64_t *trans, xmm_t *indices1)
-{
-	uint64_t t;
-	xmm_t addr, indices2;
-
-	indices2 = _mm_setzero_si128();
-
-	addr = calc_addr_sse(xmm_index_mask.x, next_input, xmm_shuffle_input.x,
-		xmm_ones_16.x, *indices1, indices2);
-
-	/* Gather 64 bit transitions and pack 2 per register. */
-
-	t = trans[MM_CVT32(addr)];
-
-	/* get slot 1 */
-	addr = MM_SHUFFLE32(addr, SHUFFLE32_SLOT1);
-	*indices1 = MM_SET64(trans[MM_CVT32(addr)], t);
-
-	return MM_SRL32(next_input, CHAR_BIT);
-}
-
-/*
- * Execute trie traversal with 2 traversals in parallel.
- */
-static inline int
-search_sse_2(const struct rte_acl_ctx *ctx, const uint8_t **data,
-	uint32_t *results, uint32_t total_packets, uint32_t categories)
-{
-	int n;
-	struct acl_flow_data flows;
-	uint64_t index_array[MAX_SEARCHES_SSE2];
-	struct completion cmplt[MAX_SEARCHES_SSE2];
-	struct parms parms[MAX_SEARCHES_SSE2];
-	xmm_t input, indices;
-
-	acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results,
-		total_packets, categories, ctx->trans_table);
-
-	for (n = 0; n < MAX_SEARCHES_SSE2; n++) {
-		cmplt[n].count = 0;
-		index_array[n] = acl_start_next_trie(&flows, parms, n, ctx);
-	}
-
-	indices = MM_LOADU((xmm_t *) &index_array[0]);
-
-	/* Check for any matches. */
-	acl_match_check_x2(0, ctx, parms, &flows, &indices,
-		xmm_match_mask64.x);
-
-	while (flows.started > 0) {
-
-		/* Gather 4 bytes of input data for each stream. */
-		input = _mm_cvtsi32_si128(GET_NEXT_4BYTES(parms, 0));
-		input = MM_INSERT32(input, GET_NEXT_4BYTES(parms, 1), 1);
-
-		/* Process the 4 bytes of input on each stream. */
-
-		input = transition2(input, flows.trans, &indices);
-		input = transition2(input, flows.trans, &indices);
-		input = transition2(input, flows.trans, &indices);
-		input = transition2(input, flows.trans, &indices);
-
-		/* Check for any matches. */
-		acl_match_check_x2(0, ctx, parms, &flows, &indices,
-			xmm_match_mask64.x);
-	}
-
-	return 0;
-}
-- 
1.8.5.3