Date: Tue, 9 Jun 2015 17:25:23 +0200
From: Ondřej Bílka
To: Ravi Kerur
Cc: dev@dpdk.org
Subject: [dpdk-dev] rte_memcmp: comments from glibc side.
Message-ID: <20150609152523.GA26355@domone>

Hi,

as the glibc developer who wrote the current strcmp code, I have some comments.

First, the gcc builtins for *cmp are garbage: they produce rep cmpsb, which is slower than a byte-by-byte loop. So compile your test again with -fno-builtin-memcmp and your performance gain will probably disappear.

Then there is inlining. It is correct to do that for the first 32 bytes, and I plan to add a header that does that check to improve performance. However, not for bytes after the 32nd. That is very cold code: only 5.6% of calls reach the 17th byte and 1.7% of calls read the 33rd byte, so past that point just do a libcall to save size.

That also makes avx2 pointless. For most string functions avx2 doesn't give you gains, since xmm has better latency for the first 64 bytes, and while the avx2 loop is faster, it is also relatively cold as it is almost never reached.

For memcmp I posted on the gcc list a sample implementation of how the inlining should be done. I found that gcc optimizes it better than expected and produces a probably optimal header (see below, and feel free to use it). When you care about the sign, it is better to load the first 8 bytes and convert them to big endian, where you can compare them directly. When you don't, gcc manages to optimize the bswap away, so you check 8 bytes with the three instructions below. Now I think that in the header we shouldn't use sse at all.
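To make the bswap trick concrete, here is a minimal standalone sketch (my own illustration, not part of the header below; cmp8 and eq8 are made-up names, and it assumes gcc on a little-endian target where unaligned 8-byte loads are cheap):

#include <stdint.h>
#include <string.h>

/* Ordered compare of the first 8 bytes: byte-swap to big endian so a
   plain integer comparison matches memcmp's lexicographic order.  */
static int
cmp8 (const void *x, const void *y)
{
  uint64_t a, b;
  memcpy (&a, x, 8);		/* gcc turns these into single loads */
  memcpy (&b, y, 8);
  a = __builtin_bswap64 (a);
  b = __builtin_bswap64 (b);
  return a == b ? 0 : (a > b ? 1 : -1);
}

/* Equality-only compare: the bswap on both sides cancels out and gcc
   deletes it, leaving just a load, a cmp and a branch.  */
static int
eq8 (const void *x, const void *y)
{
  uint64_t a, b;
  memcpy (&a, x, 8);
  memcpy (&b, y, 8);
  return a != b;
}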
That 8-byte equality check compiles to just:

 190:	48 8b 4e 08	mov    0x8(%rsi),%rcx
 194:	48 39 4f 08	cmp    %rcx,0x8(%rdi)
 198:	75 f3		jne    18d

As I mentioned, the statistics for memcmp on my computer are the following:

calls 1430827
average n: 7.4
n <= 0: 0.1%   n <= 4: 36.3%   n <= 8: 78.4%   n <= 16: 94.4%
n <= 24: 97.3%   n <= 32: 98.3%   n <= 48: 98.6%   n <= 64: 99.9%

s aligned to 4 bytes: 99.8%   8 bytes: 97.5%   16 bytes: 59.5%
average *s access cache latency: 3.6
l <= 8: 92.0%   l <= 16: 96.1%   l <= 32: 98.9%   l <= 64: 99.4%   l <= 128: 99.5%

s2 aligned to 4 bytes: 24.1%   8 bytes: 13.1%   16 bytes: 8.2%
s-s2 aligned to 4 bytes: 24.1%   8 bytes: 15.4%   16 bytes: 10.3%
average *s2 access cache latency: 1.5
l <= 8: 98.0%   l <= 16: 99.6%   l <= 32: 99.9%   l <= 64: 100.0%   l <= 128: 100.0%

average capacity: 8.5
c <= 0: 0.0%   c <= 4: 36.0%   c <= 8: 78.3%   c <= 16: 91.8%
c <= 24: 94.8%   c <= 32: 95.7%   c <= 48: 96.1%   c <= 64: 99.9%
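This is not how those numbers were produced (my profiling setup is separate), but for the curious, a minimal sketch of one way to collect a similar size histogram, with made-up names memcmp_counted and dump_hist:

#include <stdio.h>
#include <string.h>

static unsigned long hist[65];	/* bucket 64 catches n >= 64 */
static unsigned long total;

/* Route calls through this wrapper (e.g. via a macro) to record n.  */
static int
memcmp_counted (const void *x, const void *y, size_t n)
{
  total++;
  hist[n < 64 ? n : 64]++;
  return memcmp (x, y, n);
}

/* Print cumulative percentages in the style of the table above.  */
static void
dump_hist (void)
{
  unsigned long cum = 0;
  if (total == 0)
    return;
  printf ("calls %lu\n", total);
  for (int i = 0; i <= 64; i++)
    {
      cum += hist[i];
      printf ("n <= %d: %.1f%%\n", i, 100.0 * cum / total);
    }
}

And here is the header I mentioned; feel free to use it: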
#include <stdint.h>	/* the header names were lost in the mail; stdint.h
			   and string.h are my best guess at what was meant */
#include <string.h>
#include <endian.h>	/* added for __BYTE_ORDER so this compiles standalone */

#ifndef __always_inline	/* added fallback for non-glibc environments */
# define __always_inline inline __attribute__ ((__always_inline__))
#endif

#undef memcmp
/* Divert constant sizes below 64 bytes to the inline version,
   everything else to the real libcall.  */
#define memcmp(x, y, n) (__builtin_constant_p (n) && n < 64	\
			 ? __memcmp_inline (x, y, n)		\
			 : memcmp (x, y, n))

#define LOAD8(x)  (*((uint8_t *) (x)))
#define LOAD32(x) (*((uint32_t *) (x)))
#define LOAD64(x) (*((uint64_t *) (x)))

#define CHECK(tp, n)	/* unused in this sample */

/* Byte-swap to big endian so that an integer comparison gives the
   lexicographic order memcmp needs; big endian loads already do.  */
#if __BYTE_ORDER == __LITTLE_ENDIAN
# define SWAP32(x) __builtin_bswap32 (LOAD32 (x))
# define SWAP64(x) __builtin_bswap64 (LOAD64 (x))
#else
# define SWAP32(x) LOAD32 (x)
# define SWAP64(x) LOAD64 (x)
#endif

#define __ARCH_64BIT 1

static __always_inline int
check (uint64_t x, uint64_t y)
{
  if (x == y)
    return 0;
  if (x > y)
    return 1;
  return -1;
}

static __always_inline int
check_nonzero (uint64_t x, uint64_t y)
{
  if (x > y)
    return 1;
  return -1;
}

static __always_inline int
__memcmp_inline (void *x, void *y, size_t n)
{
#define CHECK1 if (LOAD8 (x + i) - LOAD8 (y + i))		\
    return check_nonzero (LOAD8 (x + i), LOAD8 (y + i));	\
  i = i + 1;

#define CHECK4 if (i == 0 ? SWAP32 (x + i) - SWAP32 (y + i)	\
		   : LOAD32 (x + i) - LOAD32 (y + i))		\
    return check_nonzero (SWAP32 (x + i), SWAP32 (y + i));	\
  i = i + 4;

#define CHECK8 if (i == 0 ? SWAP64 (x + i) - SWAP64 (y + i)	\
		   : LOAD64 (x + i) - LOAD64 (y + i))		\
    return check_nonzero (SWAP64 (x + i), SWAP64 (y + i));	\
  i = i + 8;

#define CHECK1FINAL(o) return check (LOAD8 (x + i + o), LOAD8 (y + i + o));
#define CHECK4FINAL(o) return check (SWAP32 (x + i + o), SWAP32 (y + i + o));
#define CHECK8FINAL(o) return check (SWAP64 (x + i + o), SWAP64 (y + i + o));

#if __ARCH_64BIT == 0
# undef CHECK8
# undef CHECK8FINAL
# define CHECK8 CHECK4 CHECK4
# define CHECK8FINAL(o) CHECK4 CHECK4FINAL (o)
#endif

#define LOOP if (i + 8 < n) { CHECK8 }	\
	     if (i + 8 < n) { CHECK8 }	\
	     if (i + 8 < n) { CHECK8 }	\
	     if (i + 8 < n) { CHECK8 }	\
	     if (i + 8 < n) { CHECK8 }	\
	     if (i + 8 < n) { CHECK8 }	\
	     if (i + 8 < n) { CHECK8 }	\
	     if (i + 8 < n) { CHECK8 }

  long i = 0;
  switch (n % 8)
    {
    case 0:
      if (n == 0)
	return 0;
      LOOP; CHECK8FINAL (0);
    case 1:
      LOOP CHECK1FINAL (0);
    case 2:
      if (n == 2)
	{ CHECK1 CHECK1FINAL (0); }
      LOOP CHECK4FINAL (-2);
    case 3:
      if (n == 3)
	{ CHECK1 CHECK1 CHECK1FINAL (0); }
      LOOP CHECK4FINAL (-1);
    case 4:
      LOOP CHECK4FINAL (0);
    case 5:
      if (n == 5)
	{ CHECK4 CHECK1FINAL (0); }
#if __ARCH_64BIT
      LOOP CHECK8FINAL (-3);
#else
      LOOP CHECK4 CHECK1FINAL (0);
#endif
    case 6:
      if (n == 6)
	{ CHECK4 CHECK4FINAL (-2); }
      LOOP CHECK8FINAL (-2);
    case 7:
      if (n == 7)
	{ CHECK4 CHECK4FINAL (-1); }
      LOOP CHECK8FINAL (-1);
    }
}

int memcmp1 (char *x, char *y) { return memcmp (x, y, 1); }
int memcmp10 (char *x, char *y) { return memcmp (x, y, 10); }
int memcmp20 (char *x, char *y) { return memcmp (x, y, 20); }
int memcmp30 (char *x, char *y) { return memcmp (x, y, 30); }
int memeq1 (char *x, char *y) { return memcmp (x, y, 1) != 0; }
int memeq10 (char *x, char *y) { return memcmp (x, y, 10) != 0; }
int memeq20 (char *x, char *y) { return memcmp (x, y, 20) != 0; }
int memeq30 (char *x, char *y) { return memcmp (x, y, 30) != 0; }
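If you want to play with it, a tiny driver like this (my addition, not from the gcc list post) exercises the wrappers above; build with gcc -O2 -fno-builtin-memcmp and look at the generated code with -S:

#include <assert.h>

int
main (void)
{
  char a[32] = "abcdefghijklmnopqrstuvwxyz12345";
  char b[32] = "abcdefghijklmnopqrstuvwxyz12345";

  assert (memcmp1 (a, b) == 0);
  b[9] = 'z';			/* difference in the 10th byte */
  assert (memcmp10 (a, b) < 0);	/* 'j' < 'z', so a sorts first */
  assert (memeq30 (a, b) != 0);
  return 0;
}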