DPDK patches and discussions
 help / color / mirror / Atom feed
From: Neil Horman <nhorman@tuxdriver.com>
To: Konstantin Ananyev <konstantin.ananyev@intel.com>
Cc: dev@dpdk.org
Subject: Re: [dpdk-dev] [PATCH 09/17] EAL: introduce rte_ymm and relatives in rte_common_vect.h.
Date: Mon, 15 Dec 2014 10:56:30 -0500	[thread overview]
Message-ID: <20141215155630.GB3803@hmsreliant.think-freely.org> (raw)
In-Reply-To: <1418580659-12595-10-git-send-email-konstantin.ananyev@intel.com>

On Sun, Dec 14, 2014 at 06:10:51PM +0000, Konstantin Ananyev wrote:
> New data type to manipulate 256 bit AVX values.
> Rename field in the rte_xmm to keep common naming across SSE/AVX fields.
> 
> Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
> ---
>  examples/l3fwd/main.c                           |  2 +-
>  lib/librte_acl/acl_run_sse.c                    | 88 ++++++++++++-------------
>  lib/librte_eal/common/include/rte_common_vect.h | 27 +++++++-
>  lib/librte_lpm/rte_lpm.h                        |  2 +-
>  4 files changed, 71 insertions(+), 48 deletions(-)
> 
> diff --git a/examples/l3fwd/main.c b/examples/l3fwd/main.c
> index bf0fcdb..dc6cae2 100644
> --- a/examples/l3fwd/main.c
> +++ b/examples/l3fwd/main.c
> @@ -1168,7 +1168,7 @@ processx4_step2(const struct lcore_conf *qconf, __m128i dip, uint32_t flag,
>  	if (likely(flag != 0)) {
>  		rte_lpm_lookupx4(qconf->ipv4_lookup_struct, dip, dprt, portid);
>  	} else {
> -		dst.m = dip;
> +		dst.x = dip;
>  		dprt[0] = get_dst_port(qconf, pkt[0], dst.u32[0], portid);
>  		dprt[1] = get_dst_port(qconf, pkt[1], dst.u32[1], portid);
>  		dprt[2] = get_dst_port(qconf, pkt[2], dst.u32[2], portid);
> diff --git a/lib/librte_acl/acl_run_sse.c b/lib/librte_acl/acl_run_sse.c
> index 09e32be..4605b58 100644
> --- a/lib/librte_acl/acl_run_sse.c
> +++ b/lib/librte_acl/acl_run_sse.c
> @@ -359,16 +359,16 @@ search_sse_8(const struct rte_acl_ctx *ctx, const uint8_t **data,
>  
>  	 /* Check for any matches. */
>  	acl_match_check_x4(0, ctx, parms, &flows,
> -		&indices1, &indices2, mm_match_mask.m);
> +		&indices1, &indices2, mm_match_mask.x);
>  	acl_match_check_x4(4, ctx, parms, &flows,
> -		&indices3, &indices4, mm_match_mask.m);
> +		&indices3, &indices4, mm_match_mask.x);
>  
>  	while (flows.started > 0) {
>  
>  		/* Gather 4 bytes of input data for each stream. */
> -		input0 = MM_INSERT32(mm_ones_16.m, GET_NEXT_4BYTES(parms, 0),
> +		input0 = MM_INSERT32(mm_ones_16.x, GET_NEXT_4BYTES(parms, 0),
>  			0);
> -		input1 = MM_INSERT32(mm_ones_16.m, GET_NEXT_4BYTES(parms, 4),
> +		input1 = MM_INSERT32(mm_ones_16.x, GET_NEXT_4BYTES(parms, 4),
>  			0);
>  
>  		input0 = MM_INSERT32(input0, GET_NEXT_4BYTES(parms, 1), 1);
> @@ -382,43 +382,43 @@ search_sse_8(const struct rte_acl_ctx *ctx, const uint8_t **data,
>  
>  		 /* Process the 4 bytes of input on each stream. */
>  
> -		input0 = transition4(mm_index_mask.m, input0,
> -			mm_shuffle_input.m, mm_ones_16.m,
> +		input0 = transition4(mm_index_mask.x, input0,
> +			mm_shuffle_input.x, mm_ones_16.x,
>  			flows.trans, &indices1, &indices2);
>  
> -		input1 = transition4(mm_index_mask.m, input1,
> -			mm_shuffle_input.m, mm_ones_16.m,
> +		input1 = transition4(mm_index_mask.x, input1,
> +			mm_shuffle_input.x, mm_ones_16.x,
>  			flows.trans, &indices3, &indices4);
>  
> -		input0 = transition4(mm_index_mask.m, input0,
> -			mm_shuffle_input.m, mm_ones_16.m,
> +		input0 = transition4(mm_index_mask.x, input0,
> +			mm_shuffle_input.x, mm_ones_16.x,
>  			flows.trans, &indices1, &indices2);
>  
> -		input1 = transition4(mm_index_mask.m, input1,
> -			mm_shuffle_input.m, mm_ones_16.m,
> +		input1 = transition4(mm_index_mask.x, input1,
> +			mm_shuffle_input.x, mm_ones_16.x,
>  			flows.trans, &indices3, &indices4);
>  
> -		input0 = transition4(mm_index_mask.m, input0,
> -			mm_shuffle_input.m, mm_ones_16.m,
> +		input0 = transition4(mm_index_mask.x, input0,
> +			mm_shuffle_input.x, mm_ones_16.x,
>  			flows.trans, &indices1, &indices2);
>  
> -		input1 = transition4(mm_index_mask.m, input1,
> -			mm_shuffle_input.m, mm_ones_16.m,
> +		input1 = transition4(mm_index_mask.x, input1,
> +			mm_shuffle_input.x, mm_ones_16.x,
>  			flows.trans, &indices3, &indices4);
>  
> -		input0 = transition4(mm_index_mask.m, input0,
> -			mm_shuffle_input.m, mm_ones_16.m,
> +		input0 = transition4(mm_index_mask.x, input0,
> +			mm_shuffle_input.x, mm_ones_16.x,
>  			flows.trans, &indices1, &indices2);
>  
> -		input1 = transition4(mm_index_mask.m, input1,
> -			mm_shuffle_input.m, mm_ones_16.m,
> +		input1 = transition4(mm_index_mask.x, input1,
> +			mm_shuffle_input.x, mm_ones_16.x,
>  			flows.trans, &indices3, &indices4);
>  
>  		 /* Check for any matches. */
>  		acl_match_check_x4(0, ctx, parms, &flows,
> -			&indices1, &indices2, mm_match_mask.m);
> +			&indices1, &indices2, mm_match_mask.x);
>  		acl_match_check_x4(4, ctx, parms, &flows,
> -			&indices3, &indices4, mm_match_mask.m);
> +			&indices3, &indices4, mm_match_mask.x);
>  	}
>  
>  	return 0;
> @@ -451,36 +451,36 @@ search_sse_4(const struct rte_acl_ctx *ctx, const uint8_t **data,
>  
>  	/* Check for any matches. */
>  	acl_match_check_x4(0, ctx, parms, &flows,
> -		&indices1, &indices2, mm_match_mask.m);
> +		&indices1, &indices2, mm_match_mask.x);
>  
>  	while (flows.started > 0) {
>  
>  		/* Gather 4 bytes of input data for each stream. */
> -		input = MM_INSERT32(mm_ones_16.m, GET_NEXT_4BYTES(parms, 0), 0);
> +		input = MM_INSERT32(mm_ones_16.x, GET_NEXT_4BYTES(parms, 0), 0);
>  		input = MM_INSERT32(input, GET_NEXT_4BYTES(parms, 1), 1);
>  		input = MM_INSERT32(input, GET_NEXT_4BYTES(parms, 2), 2);
>  		input = MM_INSERT32(input, GET_NEXT_4BYTES(parms, 3), 3);
>  
>  		/* Process the 4 bytes of input on each stream. */
> -		input = transition4(mm_index_mask.m, input,
> -			mm_shuffle_input.m, mm_ones_16.m,
> +		input = transition4(mm_index_mask.x, input,
> +			mm_shuffle_input.x, mm_ones_16.x,
>  			flows.trans, &indices1, &indices2);
>  
> -		 input = transition4(mm_index_mask.m, input,
> -			mm_shuffle_input.m, mm_ones_16.m,
> +		 input = transition4(mm_index_mask.x, input,
> +			mm_shuffle_input.x, mm_ones_16.x,
>  			flows.trans, &indices1, &indices2);
>  
> -		 input = transition4(mm_index_mask.m, input,
> -			mm_shuffle_input.m, mm_ones_16.m,
> +		 input = transition4(mm_index_mask.x, input,
> +			mm_shuffle_input.x, mm_ones_16.x,
>  			flows.trans, &indices1, &indices2);
>  
> -		 input = transition4(mm_index_mask.m, input,
> -			mm_shuffle_input.m, mm_ones_16.m,
> +		 input = transition4(mm_index_mask.x, input,
> +			mm_shuffle_input.x, mm_ones_16.x,
>  			flows.trans, &indices1, &indices2);
>  
>  		/* Check for any matches. */
>  		acl_match_check_x4(0, ctx, parms, &flows,
> -			&indices1, &indices2, mm_match_mask.m);
> +			&indices1, &indices2, mm_match_mask.x);
>  	}
>  
>  	return 0;
> @@ -534,35 +534,35 @@ search_sse_2(const struct rte_acl_ctx *ctx, const uint8_t **data,
>  	indices = MM_LOADU((xmm_t *) &index_array[0]);
>  
>  	/* Check for any matches. */
> -	acl_match_check_x2(0, ctx, parms, &flows, &indices, mm_match_mask64.m);
> +	acl_match_check_x2(0, ctx, parms, &flows, &indices, mm_match_mask64.x);
>  
>  	while (flows.started > 0) {
>  
>  		/* Gather 4 bytes of input data for each stream. */
> -		input = MM_INSERT32(mm_ones_16.m, GET_NEXT_4BYTES(parms, 0), 0);
> +		input = MM_INSERT32(mm_ones_16.x, GET_NEXT_4BYTES(parms, 0), 0);
>  		input = MM_INSERT32(input, GET_NEXT_4BYTES(parms, 1), 1);
>  
>  		/* Process the 4 bytes of input on each stream. */
>  
> -		input = transition2(mm_index_mask64.m, input,
> -			mm_shuffle_input64.m, mm_ones_16.m,
> +		input = transition2(mm_index_mask64.x, input,
> +			mm_shuffle_input64.x, mm_ones_16.x,
>  			flows.trans, &indices);
>  
> -		input = transition2(mm_index_mask64.m, input,
> -			mm_shuffle_input64.m, mm_ones_16.m,
> +		input = transition2(mm_index_mask64.x, input,
> +			mm_shuffle_input64.x, mm_ones_16.x,
>  			flows.trans, &indices);
>  
> -		input = transition2(mm_index_mask64.m, input,
> -			mm_shuffle_input64.m, mm_ones_16.m,
> +		input = transition2(mm_index_mask64.x, input,
> +			mm_shuffle_input64.x, mm_ones_16.x,
>  			flows.trans, &indices);
>  
> -		input = transition2(mm_index_mask64.m, input,
> -			mm_shuffle_input64.m, mm_ones_16.m,
> +		input = transition2(mm_index_mask64.x, input,
> +			mm_shuffle_input64.x, mm_ones_16.x,
>  			flows.trans, &indices);
>  
>  		/* Check for any matches. */
>  		acl_match_check_x2(0, ctx, parms, &flows, &indices,
> -			mm_match_mask64.m);
> +			mm_match_mask64.x);
>  	}
>  
>  	return 0;
> diff --git a/lib/librte_eal/common/include/rte_common_vect.h b/lib/librte_eal/common/include/rte_common_vect.h
> index 95bf4b1..617470b 100644
> --- a/lib/librte_eal/common/include/rte_common_vect.h
> +++ b/lib/librte_eal/common/include/rte_common_vect.h
> @@ -54,6 +54,10 @@
>  #include <smmintrin.h>
>  #endif
>  
> +#if defined(__AVX__)
> +#include <immintrin.h>
> +#endif
> +
>  #else
>  
>  #include <x86intrin.h>
> @@ -70,7 +74,7 @@ typedef __m128i xmm_t;
>  #define	XMM_MASK	(XMM_SIZE - 1)
>  
>  typedef union rte_xmm {
> -	xmm_t    m;
> +	xmm_t    x;
>  	uint8_t  u8[XMM_SIZE / sizeof(uint8_t)];
>  	uint16_t u16[XMM_SIZE / sizeof(uint16_t)];
>  	uint32_t u32[XMM_SIZE / sizeof(uint32_t)];
> @@ -78,10 +82,29 @@ typedef union rte_xmm {
>  	double   pd[XMM_SIZE / sizeof(double)];
>  } rte_xmm_t;
>  
> +#ifdef __AVX__
> +
Why are you excluding this type based on instruction availability?  I don't see
anything in the definition that makes any of the types included dependent on AVX
availability.

> +typedef __m256i ymm_t;
> +
> +#define	YMM_SIZE	(sizeof(ymm_t))
> +#define	YMM_MASK	(YMM_SIZE - 1)
> +
> +typedef union rte_ymm {
> +	ymm_t    y;
> +	xmm_t    x[YMM_SIZE / sizeof(xmm_t)];
> +	uint8_t  u8[YMM_SIZE / sizeof(uint8_t)];
> +	uint16_t u16[YMM_SIZE / sizeof(uint16_t)];
> +	uint32_t u32[YMM_SIZE / sizeof(uint32_t)];
> +	uint64_t u64[YMM_SIZE / sizeof(uint64_t)];
> +	double   pd[YMM_SIZE / sizeof(double)];
> +} rte_ymm_t;
> +
> +#endif /* __AVX__ */

  reply	other threads:[~2014-12-15 15:56 UTC|newest]

Thread overview: 31+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2014-12-14 18:10 [dpdk-dev] [PATCH 00/17] ACL: New AVX2 classify method and several other enhancements Konstantin Ananyev
2014-12-14 18:10 ` [dpdk-dev] [PATCH 01/17] app/test: few small fixes fot test_acl.c Konstantin Ananyev
2014-12-14 18:10 ` [dpdk-dev] [PATCH 02/17] librte_acl: make data_indexes long enough to survive idle transitions Konstantin Ananyev
2014-12-14 18:10 ` [dpdk-dev] [PATCH 03/17] librte_acl: remove build phase heuristsic with negative perfomance effect Konstantin Ananyev
2014-12-14 18:10 ` [dpdk-dev] [PATCH 04/17] librte_acl: fix a bug at build phase that can cause matches beeing overwirtten Konstantin Ananyev
2014-12-14 18:10 ` [dpdk-dev] [PATCH 05/17] librte_acl: introduce DFA nodes compression (group64) for identical entries Konstantin Ananyev
2014-12-14 18:10 ` [dpdk-dev] [PATCH 06/17] librte_acl: build/gen phase - simplify the way match nodes are allocated Konstantin Ananyev
2014-12-14 18:10 ` [dpdk-dev] [PATCH 07/17] librte_acl: make scalar RT code to be more similar to vector one Konstantin Ananyev
2014-12-14 18:10 ` [dpdk-dev] [PATCH 08/17] librte_acl: a bit of RT code deduplication Konstantin Ananyev
2014-12-14 18:10 ` [dpdk-dev] [PATCH 09/17] EAL: introduce rte_ymm and relatives in rte_common_vect.h Konstantin Ananyev
2014-12-15 15:56   ` Neil Horman [this message]
2014-12-14 18:10 ` [dpdk-dev] [PATCH 10/17] librte_acl: add AVX2 as new rte_acl_classify() method Konstantin Ananyev
2014-12-15 16:00   ` Neil Horman
2014-12-15 16:33     ` Ananyev, Konstantin
2014-12-15 20:20       ` Neil Horman
2014-12-16 16:16         ` Ananyev, Konstantin
2014-12-17 15:32           ` Neil Horman
2014-12-17 19:22             ` Ananyev, Konstantin
2014-12-17 20:27               ` Neil Horman
2014-12-18 15:01                 ` Ananyev, Konstantin
2015-01-06  9:57                   ` Ananyev, Konstantin
2015-01-06 12:40                     ` Neil Horman
2014-12-17  0:38         ` Ananyev, Konstantin
2014-12-14 18:10 ` [dpdk-dev] [PATCH 11/17] test-acl: add ability to manually select RT method Konstantin Ananyev
2014-12-14 18:10 ` [dpdk-dev] [PATCH 12/17] librte_acl: Remove search_sse_2 and relatives Konstantin Ananyev
2014-12-14 18:10 ` [dpdk-dev] [PATCH 13/17] libter_acl: move lo/hi dwords shuffle out from calc_addr Konstantin Ananyev
2014-12-14 18:10 ` [dpdk-dev] [PATCH 14/17] libte_acl: make calc_addr a define to deduplicate the code Konstantin Ananyev
2014-12-14 18:10 ` [dpdk-dev] [PATCH 15/17] libte_acl: introduce max_size into rte_acl_config Konstantin Ananyev
2014-12-14 18:10 ` [dpdk-dev] [PATCH 16/17] libte_acl: remove unused macros Konstantin Ananyev
2014-12-14 18:10 ` [dpdk-dev] [PATCH 17/17] libte_acl: fix compilation issues with RTE_LIBRTE_ACL_STANDALONE=y Konstantin Ananyev
2014-12-16 13:51   ` Neil Horman

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20141215155630.GB3803@hmsreliant.think-freely.org \
    --to=nhorman@tuxdriver.com \
    --cc=dev@dpdk.org \
    --cc=konstantin.ananyev@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).