From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <zhihong.wang@intel.com>
Received: from mga01.intel.com (mga01.intel.com [192.55.52.88])
 by dpdk.org (Postfix) with ESMTP id C6718C40C
 for <dev@dpdk.org>; Thu, 28 Jan 2016 11:11:51 +0100 (CET)
Received: from fmsmga002.fm.intel.com ([10.253.24.26])
 by fmsmga101.fm.intel.com with ESMTP; 28 Jan 2016 02:11:52 -0800
X-ExtLoop1: 1
X-IronPort-AV: E=Sophos;i="5.22,358,1449561600"; d="scan'208";a="902865121"
Received: from unknown (HELO dpdk5.sh.intel.com) ([10.239.129.244])
 by fmsmga002.fm.intel.com with ESMTP; 28 Jan 2016 02:11:49 -0800
From: Zhihong Wang <zhihong.wang@intel.com>
To: rkerur@gmail.com
Date: Wed, 27 Jan 2016 22:08:26 -0500
Message-Id: <1453950506-28067-1-git-send-email-zhihong.wang@intel.com>
X-Mailer: git-send-email 2.5.0
In-Reply-To: <1431979303-1346-2-git-send-email-rkerur@gmail.com>
References: <1431979303-1346-2-git-send-email-rkerur@gmail.com>
Cc: dev@dpdk.org
Subject: Re: [dpdk-dev] [dpdk-dev,
	v3] Implement memcmp using Intel SIMD instrinsics.
X-BeenThere: dev@dpdk.org
X-Mailman-Version: 2.1.15
Precedence: list
List-Id: patches and discussions about DPDK <dev.dpdk.org>
List-Unsubscribe: <http://dpdk.org/ml/options/dev>,
 <mailto:dev-request@dpdk.org?subject=unsubscribe>
List-Archive: <http://dpdk.org/ml/archives/dev/>
List-Post: <mailto:dev@dpdk.org>
List-Help: <mailto:dev-request@dpdk.org?subject=help>
List-Subscribe: <http://dpdk.org/ml/listinfo/dev>,
 <mailto:dev-request@dpdk.org?subject=subscribe>
X-List-Received-Date: Thu, 28 Jan 2016 10:11:52 -0000

> diff --git a/lib/librte_eal/common/include/arch/x86/rte_memcmp.h b/lib
> /librte_eal/common/include/arch/x86/rte_memcmp.h

[...]

> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +/**
> + * Compare bytes between two locations. The locations must not overlap.
> + *

Parameter names should be kept consistent as they are in function body.

> + * @param src_1
> + *   Pointer to the first source of the data.
> + * @param src_2
> + *   Pointer to the second source of the data.
> + * @param n
> + *   Number of bytes to compare.
> + * @return
> + *   zero if src_1 equal src_2
> + *   -ve if src_1 less than src_2
> + *   +ve if src_1 greater than src_2
> + */
> +static inline int
> +rte_memcmp(const void *src_1, const void *src,
> +		size_t n) __attribute__((always_inline));
> +
> +/**
> + * Find the first different bit for comparison.
> + */
> +static inline int
> +rte_cmpffd (uint32_t x, uint32_t y)
> +{
> +	int i;
> +	int pos = x ^ y;
> +	for (i = 0; i < 32; i++)
> +		if (pos & (1<<i))

Coding style check :-)
BTW, does the bsf instruction provide this check?

> +			return i;
> +	return -1;
> +}
> +

[...]

> +/**
> + * Compare 48 bytes between two locations.
> + * Locations should not overlap.
> + */
> +static inline int
> +rte_cmp48(const void *src_1, const void *src_2)

Guess this is not used.

[...]

> +/**
> + * Compare 256 bytes between two locations.
> + * Locations should not overlap.
> + */
> +static inline int
> +rte_cmp256(const void *src_1, const void *src_2)
> +{
> +	int ret;
> +
> +	ret = rte_cmp64((const uint8_t *)src_1 + 0 * 64,
> +			(const uint8_t *)src_2 + 0 * 64);

Why not just use rte_cmp128?


[...]

> +static inline int
> +rte_memcmp(const void *_src_1, const void *_src_2, size_t n)
> +{
> +	const uint8_t *src_1 = (const uint8_t *)_src_1;
> +	const uint8_t *src_2 = (const uint8_t *)_src_2;
> +	int ret = 0;
> +
> +	if (n < 16)
> +		return rte_memcmp_regular(src_1, src_2, n);
> +
> +	if (n <= 32) {
> +		ret = rte_cmp16(src_1, src_2);
> +		if (unlikely(ret != 0))
> +			return ret;
> +
> +		return rte_cmp16(src_1 - 16 + n, src_2 - 16 + n);
> +	}
> +

Too many conditions here may harm the overall performance.
It's a trade-off thing, all about balancing the overhead.
Just make sure this is tuned based on actual test numbers.


> +	if (n <= 48) {
> +		ret = rte_cmp32(src_1, src_2);
> +		if (unlikely(ret != 0))
> +			return ret;
> +
> +		return rte_cmp16(src_1 - 16 + n, src_2 - 16 + n);
> +	}
> +
> +	if (n <= 64) {
> +		ret = rte_cmp32(src_1, src_2);
> +		if (unlikely(ret != 0))
> +			return ret;
> +
> +		ret = rte_cmp16(src_1 + 32, src_2 + 32);
> +
> +		if (unlikely(ret != 0))
> +			return ret;
> +
> +		return rte_cmp16(src_1 - 16 + n, src_2 - 16 + n);
> +	}
> +
> +	if (n <= 96) {
> +		ret = rte_cmp64(src_1, src_2);
> +		if (unlikely(ret != 0))
> +			return ret;
> +
> +		ret = rte_cmp16(src_1 + 64, src_2 + 64);
> +		if (unlikely(ret != 0))
> +			return ret;
> +
> +		return rte_cmp16(src_1 - 16 + n, src_2 - 16 + n);
> +	}
> +
> +	if (n <= 128) {
> +		ret = rte_cmp64(src_1, src_2);
> +		if (unlikely(ret != 0))
> +			return ret;
> +
> +		ret = rte_cmp32(src_1 + 64, src_2 + 64);
> +		if (unlikely(ret != 0))
> +			return ret;
> +
> +		ret = rte_cmp16(src_1 + 96, src_2 + 96);
> +		if (unlikely(ret != 0))
> +			return ret;
> +
> +		return rte_cmp16(src_1 - 16 + n, src_2 - 16 + n);
> +	}

[...]

> +/**
> + * Compare 48 bytes between two locations.
> + * Locations should not overlap.
> + */
> +static inline int
> +rte_cmp48(const void *src_1, const void *src_2)

Not used.

> +{
> +	int ret;
> +
> +	ret = rte_cmp16((const uint8_t *)src_1 + 0 * 16,
> +			(const uint8_t *)src_2 + 0 * 16);
> +
> +	if (unlikely(ret != 0))
> +		return ret;
> +
> +	ret = rte_cmp16((const uint8_t *)src_1 + 1 * 16,
> +			(const uint8_t *)src_2 + 1 * 16);
> +
> +	if (unlikely(ret != 0))
> +		return ret;
> +
> +	return rte_cmp16((const uint8_t *)src_1 + 2 * 16,
> +			(const uint8_t *)src_2 + 2 * 16);
> +}
> +
> +/**
> + * Compare 64 bytes between two locations.
> + * Locations should not overlap.
> + */
> +static inline int
> +rte_cmp64(const void *src_1, const void *src_2)
> +{
> +	int ret;
> +
> +	ret = rte_cmp16((const uint8_t *)src_1 + 0 * 16,
> +			(const uint8_t *)src_2 + 0 * 16);

Why not rte_cmp32? And use rte_cmp64 for rte_cmp128, and so on.
That should make the code looks clearer.


It'd be great if you could format this patch into a patch set with several
little ones. :-)
Also, the kernel checkpatch is very helpful.
Good coding style and patch organization make it easy for in-depth reviews.