DPDK patches and discussions
From: <zbigniew.bodek@caviumnetworks.com>
To: <pablo.de.lara.guarch@intel.com>, <jerin.jacob@caviumnetworks.com>
Cc: <dev@dpdk.org>,
	Zbigniew Bodek <zbigniew.bodek@caviumnetworks.com>,
	Emery Davis <emery.davis@caviumnetworks.com>
Subject: [dpdk-dev] [PATCH v2 03/12] crypto/armv8: Add core crypto operations for ARMv8
Date: Tue, 6 Dec 2016 18:32:56 -0800	[thread overview]
Message-ID: <1481077985-4224-4-git-send-email-zbigniew.bodek@caviumnetworks.com> (raw)
In-Reply-To: <1481077985-4224-1-git-send-email-zbigniew.bodek@caviumnetworks.com>

From: Zbigniew Bodek <zbigniew.bodek@caviumnetworks.com>

This patch adds the core low-level crypto operations
for ARMv8 processors. The assembly code is the basis
for an optimized PMD and is currently excluded
from the build.

Standalone SHA1 and SHA256 routines are provided to
support partial hashing of the inner/outer key+padding
blocks and of authentication keys longer than 160/256 bits.
An optimized AES key schedule is also included.
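
For illustration only (not part of this patch): the routines are meant
to be called from C via the prototypes documented in the sources below.
A minimal sketch of an HMAC-SHA1 inner-state precompute, with an assumed
helper name and buffer layout, could look like:

    #include <stdint.h>
    #include <string.h>

    int sha1_block_partial(uint8_t *init, uint8_t *dsrc,
                           uint8_t *ddst, uint64_t len);

    /* Precompute the HMAC-SHA1 inner partial state (key <= 64 bytes). */
    static void hmac_sha1_inner_state(const uint8_t *key, size_t key_len,
                                      uint8_t inner_state[32])
    {
            uint8_t ipad[64];
            size_t i;

            memset(ipad, 0x36, sizeof(ipad)); /* standard HMAC inner pad */
            for (i = 0; i < key_len; i++)
                    ipad[i] ^= key[i];
            /* NULL selects the default SHA-1 init state; len = 1 block */
            sha1_block_partial(NULL, ipad, inner_state, sizeof(ipad));
    }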

Signed-off-by: Zbigniew Bodek <zbigniew.bodek@caviumnetworks.com>
Signed-off-by: Emery Davis <emery.davis@caviumnetworks.com>
---
 drivers/crypto/armv8/asm/aes_core.S    | 151 ++++++++++
 drivers/crypto/armv8/asm/sha1_core.S   | 518 ++++++++++++++++++++++++++++++++
 drivers/crypto/armv8/asm/sha256_core.S | 525 +++++++++++++++++++++++++++++++++
 3 files changed, 1194 insertions(+)
 create mode 100644 drivers/crypto/armv8/asm/aes_core.S
 create mode 100644 drivers/crypto/armv8/asm/sha1_core.S
 create mode 100644 drivers/crypto/armv8/asm/sha256_core.S

diff --git a/drivers/crypto/armv8/asm/aes_core.S b/drivers/crypto/armv8/asm/aes_core.S
new file mode 100644
index 0000000..b7ceae6
--- /dev/null
+++ b/drivers/crypto/armv8/asm/aes_core.S
@@ -0,0 +1,151 @@
+/*
+ *   BSD LICENSE
+ *
+ *   Copyright (C) Cavium networks Ltd. 2016.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Cavium networks nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+	.file	"aes_core.S"
+	.text
+	.cpu generic+fp+simd+crypto+crc
+	.align	4
+	.global	aes128_key_sched_enc
+	.type	aes128_key_sched_enc, %function
+	.global	aes128_key_sched_dec
+	.type	aes128_key_sched_dec, %function
+
+	/*
+	 * AES key expansion for a single round.
+	 */
+	.macro	key_expand res, key, shuffle_mask, rcon, tq0, tq1, td
+	/* temp = rotword(key[3]) */
+	tbl	\td\().8b,{\key\().16b},\shuffle_mask\().8b
+	dup	\tq0\().2d,\td\().d[0]
+	/* temp = subbytes(temp) */
+	aese	\tq0\().16b,v19\().16b			/* q19 := 0 */
+	/* temp = temp + rcon */
+	mov	w11,\rcon
+	dup	\tq1\().4s,w11
+	eor	\tq0\().16b,\tq0\().16b,\tq1\().16b
+	/* tq1 = [0, a, b, c] */
+	ext	\tq1\().16b,v19\().16b,\key\().16b,12  	/* q19 := 0 */
+	eor	\res\().16b,\key\().16b,\tq1\().16b
+	/* tq1 = [0, 0, a, b] */
+	ext	\tq1\().16b,v19\().16b,\tq1\().16b,12  	/* q19 := 0 */
+	eor	\res\().16b,\res\().16b,\tq1\().16b
+	/* tq1 = [0, 0, 0, a] */
+	ext	\tq1\().16b,v19\().16b,\tq1\().16b,12	/* q19 := 0 */
+	eor	\res\().16b,\res\().16b,\tq1\().16b
+	/* + temp */
+	eor	\res\().16b,\res\().16b,\tq0\().16b
+	.endm
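+
+/*
+ * For reference only (not executed): one expansion round above is the
+ * SIMD form of the scalar AES-128 key schedule step, assuming key/res
+ * hold four 32-bit words:
+ *	t      = subword(rotword(key[3])) ^ rcon
+ *	res[0] = key[0] ^ t
+ *	res[1] = key[1] ^ key[0] ^ t
+ *	res[2] = key[2] ^ key[1] ^ key[0] ^ t
+ *	res[3] = key[3] ^ key[2] ^ key[1] ^ key[0] ^ t
+ * The three ext/eor pairs compute the cascading XORs in one register.
+ */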
+/*
+ * aes128_key_sched_enc(*expanded_key, *user_key)
+ * Expand a 128-bit user key into 11 encryption round keys (176 bytes).
+ */
+	.align	4
+aes128_key_sched_enc:
+	sub	sp,sp,4*16
+	st1	{v8.16b - v11.16b},[sp]
+	ld1	{v0.16b},[x1]				/* user_key */
+	mov	w10,0x0e0d				/* form shuffle_word */
+	mov	w11,0x0c0f
+	orr	w10,w10,w11,lsl 16
+	dup	v20.4s,w10				/* shuffle_mask */
+	eor	v19.16b,v19.16b,v19.16b			/* zero */
+	/* Expand key */
+	key_expand v1,v0,v20,0x1,v21,v16,v17
+	key_expand v2,v1,v20,0x2,v21,v16,v17
+	key_expand v3,v2,v20,0x4,v21,v16,v17
+	key_expand v4,v3,v20,0x8,v21,v16,v17
+	key_expand v5,v4,v20,0x10,v21,v16,v17
+	key_expand v6,v5,v20,0x20,v21,v16,v17
+	key_expand v7,v6,v20,0x40,v21,v16,v17
+	key_expand v8,v7,v20,0x80,v21,v16,v17
+	key_expand v9,v8,v20,0x1b,v21,v16,v17
+	key_expand v10,v9,v20,0x36,v21,v16,v17
+	/* Store round keys in the correct order */
+	st1	{v0.16b - v3.16b},[x0],64
+	st1	{v4.16b - v7.16b},[x0],64
+	st1	{v8.16b - v10.16b},[x0],48
+
+	ld1	{v8.16b - v11.16b},[sp]
+	add	sp,sp,4*16
+	ret
+
+	.size	aes128_key_sched_enc, .-aes128_key_sched_enc
+
+/*
+ * aes128_key_sched_dec(*expanded_key, *user_key)
+ * Expand a 128-bit user key into 11 decryption round keys (176 bytes),
+ * with inverse MixColumns applied to the middle round keys.
+ */
+	.align	4
+aes128_key_sched_dec:
+	sub	sp,sp,4*16
+	st1	{v8.16b-v11.16b},[sp]
+	ld1	{v0.16b},[x1]				/* user_key */
+	mov	w10,0x0e0d				/* form shuffle_word */
+	mov	w11,0x0c0f
+	orr	w10,w10,w11,lsl 16
+	dup	v20.4s,w10				/* shuffle_mask */
+	eor	v19.16b,v19.16b,v19.16b			/* zero */
+	/*
+	 * Expand the key.
+	 * The register order is intentionally reversed so that the
+	 * round keys can later be written with multi-register stores,
+	 * which require ascending register order.
+	 */
+	key_expand v10,v0,v20,0x1,v21,v16,v17
+	key_expand v9,v10,v20,0x2,v21,v16,v17
+	key_expand v8,v9,v20,0x4,v21,v16,v17
+	key_expand v7,v8,v20,0x8,v21,v16,v17
+	key_expand v6,v7,v20,0x10,v21,v16,v17
+	key_expand v5,v6,v20,0x20,v21,v16,v17
+	key_expand v4,v5,v20,0x40,v21,v16,v17
+	key_expand v3,v4,v20,0x80,v21,v16,v17
+	key_expand v2,v3,v20,0x1b,v21,v16,v17
+	key_expand v1,v2,v20,0x36,v21,v16,v17
+	/* Inverse mixcolumns for keys 1-9 (registers v10-v2) */
+	aesimc	v10.16b, v10.16b
+	aesimc	v9.16b, v9.16b
+	aesimc	v8.16b, v8.16b
+	aesimc	v7.16b, v7.16b
+	aesimc	v6.16b, v6.16b
+	aesimc	v5.16b, v5.16b
+	aesimc	v4.16b, v4.16b
+	aesimc	v3.16b, v3.16b
+	aesimc	v2.16b, v2.16b
+	/* Store round keys in the correct order */
+	st1	{v1.16b - v4.16b},[x0],64
+	st1	{v5.16b - v8.16b},[x0],64
+	st1	{v9.16b, v10.16b},[x0],32
+	st1	{v0.16b},[x0],16
+
+	ld1	{v8.16b - v11.16b},[sp]
+	add	sp,sp,4*16
+	ret
+
+	.size	aes128_key_sched_dec, .-aes128_key_sched_dec
diff --git a/drivers/crypto/armv8/asm/sha1_core.S b/drivers/crypto/armv8/asm/sha1_core.S
new file mode 100644
index 0000000..283c946
--- /dev/null
+++ b/drivers/crypto/armv8/asm/sha1_core.S
@@ -0,0 +1,518 @@
+/*
+ *   BSD LICENSE
+ *
+ *   Copyright (C) Cavium networks Ltd. 2016.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Cavium networks nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "assym.s"
+
+/*
+ * Description:
+ *
+ * Core SHA-1 Primitives
+ *
+ * Operations:
+ * sha1_block_partial:
+ * 	out = partial_sha1(init, in, len)	<- no final block
+ *
+ * sha1_block:
+ * 	out = sha1(init, in, len)
+ *
+ * Prototype:
+ *
+ * int sha1_block_partial(uint8_t *init,
+ *			uint8_t *dsrc, uint8_t *ddst, uint64_t len)
+ *
+ * int sha1_block(uint8_t *init,
+ *			uint8_t *dsrc, uint8_t *ddst, uint64_t len)
+ *
+ * returns: 0 (success), -1 (failure)
+ *
+ * Registers used:
+ *
+ * sha1_block_partial(
+ *	init,			x0	(hash init state - NULL for default)
+ *	dsrc,			x1	(digest src address)
+ *	ddst,			x2	(digest dst address)
+ *	len,			x3	(length)
+ *	)
+ *
+ * sha1_block(
+ *	init,			x0	(hash init state - NULL for default)
+ *	dsrc,			x1	(digest src address)
+ *	ddst,			x2	(digest dst address)
+ *	len,			x3	(length)
+ *	)
+ *
+ * Routine register definitions:
+ *
+ * v4 - v7 -- round consts for sha
+ * v22 -- sha working state ABCD (q22)
+ * v24 -- reg_sha_stateABCD
+ * v25 -- reg_sha_stateEFGH
+ * v26 -- sha block 0
+ * v27 -- sha block 1
+ * v28 -- sha block 2
+ * v29 -- sha block 3
+ * v30 -- reserved
+ * v31 -- reserved
+ *
+ * Constraints:
+ *
+ * For sha1_block, "len" must be a multiple of 16 (or a multiple of 16
+ * plus the 20-byte HMAC digest); for sha1_block_partial, "len" must be
+ * a multiple of 64 (one SHA block). Otherwise -1 is returned.
+ *
+ */
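+
+/*
+ * Note on the round structure below: each 64-byte block is processed
+ * as four "quads" of 20 rounds, matching the SHA-1 round functions:
+ * quad 0 uses sha1c (Ch), quads 1 and 3 use sha1p (parity) and quad 2
+ * uses sha1m (Maj), with .Lrcon providing the per-quad round constant.
+ */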
+	.file "sha1_core.S"
+	.text
+	.cpu generic+fp+simd+crypto+crc
+	.align	4
+	.global sha1_block_partial
+	.type	sha1_block_partial,%function
+	.global sha1_block
+	.type	sha1_block,%function
+
+	.align	4
+.Lrcon:
+	.word		0x5a827999, 0x5a827999, 0x5a827999, 0x5a827999
+	.word		0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1
+	.word		0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc
+	.word		0xca62c1d6, 0xca62c1d6, 0xca62c1d6, 0xca62c1d6
+
+	.align	4
+.Linit_sha_state:
+	.word		0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476
+	.word		0xc3d2e1f0, 0x00000000, 0x00000000, 0x00000000
+
+	.align	4
+
+sha1_block_partial:
+	mov		x6, #1			/* indicate partial hash */
+	ands		x5, x3, #0x3f		/* Check size mod 1 SHA block */
+	b.ne		.Lsha1_error
+	cbnz		x0, 1f
+	/* address of sha init state consts */
+	adr		x0,.Linit_sha_state
+1:
+	ld1		{v24.4s},[x0],16	/* init ABCD */
+	ld1		{v25.4s},[x0]		/* and E */
+
+	/* Load SHA-1 constants */
+	adr		x4,.Lrcon
+	ld1		{v4.16b},[x4],16	/* key0 */
+	ld1		{v5.16b},[x4],16	/* key1 */
+	ld1		{v6.16b},[x4],16	/* key2 */
+	ld1		{v7.16b},[x4],16	/* key3 */
+
+	lsr		x5, x3, 2		/* number of 4B blocks */
+	b		.Lsha1_loop
+
+sha1_block:
+	mov		x6, xzr		/* indicate full hash */
+	and		x5, x3, #0xf	/* check size mod 16B block */
+	cmp		x5, #4		/* additional word is accepted */
+	b.eq		1f
+	cbnz		x5, .Lsha1_error
+1:
+	cbnz		x0, 2f
+	/* address of sha init state consts */
+	adr		x0,.Linit_sha_state
+2:
+	ld1		{v24.4s},[x0],16	/* init ABCD */
+	ld1		{v25.4s},[x0]		/* and E */
+
+	/* Load SHA-1 constants */
+	adr		x4,.Lrcon
+	ld1		{v4.16b},[x4],16	/* key0 */
+	ld1		{v5.16b},[x4],16	/* key1 */
+	ld1		{v6.16b},[x4],16	/* key2 */
+	ld1		{v7.16b},[x4],16	/* key3 */
+
+	lsr		x5, x3, 2		/* number of 4B blocks */
+	/* at least 16 4B blocks give 1 SHA block */
+	cmp		x5, #16
+	b.lo		.Lsha1_last
+
+	.align	4
+
+.Lsha1_loop:
+	sub		x5, x5, #16		/* subtract 1 SHA block */
+
+	ld1		{v26.16b},[x1],16	/* dsrc[0] */
+	ld1		{v27.16b},[x1],16	/* dsrc[1] */
+	ld1		{v28.16b},[x1],16	/* dsrc[2] */
+	ld1		{v29.16b},[x1],16	/* dsrc[3] */
+
+	rev32		v26.16b,v26.16b		/* fix endian w0 */
+	rev32		v27.16b,v27.16b		/* fix endian w1 */
+	rev32		v28.16b,v28.16b		/* fix endian w2 */
+	rev32		v29.16b,v29.16b		/* fix endian w3 */
+
+	mov		v22.16b,v24.16b		/* working ABCD <- ABCD */
+/* quad 0 */
+	add		v16.4s,v4.4s,v26.4s
+	sha1h		s19,s24
+	sha1c		q24,s25,v16.4s
+	sha1su0		v26.4s,v27.4s,v28.4s
+	sha1su1		v26.4s,v29.4s
+
+	add		v17.4s,v4.4s,v27.4s
+	sha1h		s18,s24
+	sha1c		q24,s19,v17.4s
+	sha1su0		v27.4s,v28.4s,v29.4s
+	sha1su1		v27.4s,v26.4s
+
+	add		v16.4s,v4.4s,v28.4s
+	sha1h		s19,s24
+	sha1c		q24,s18,v16.4s
+	sha1su0		v28.4s,v29.4s,v26.4s
+	sha1su1		v28.4s,v27.4s
+
+	add		v17.4s,v4.4s,v29.4s
+	sha1h		s18,s24
+	sha1c		q24,s19,v17.4s
+	sha1su0		v29.4s,v26.4s,v27.4s
+	sha1su1		v29.4s,v28.4s
+
+	add		v16.4s,v4.4s,v26.4s
+	sha1h		s19,s24
+	sha1c		q24,s18,v16.4s
+	sha1su0		v26.4s,v27.4s,v28.4s
+	sha1su1		v26.4s,v29.4s
+/* quad 1 */
+	add		v17.4s,v5.4s,v27.4s
+	sha1h		s18,s24
+	sha1p		q24,s19,v17.4s
+	sha1su0		v27.4s,v28.4s,v29.4s
+	sha1su1		v27.4s,v26.4s
+
+	add		v16.4s,v5.4s,v28.4s
+	sha1h		s19,s24
+	sha1p		q24,s18,v16.4s
+	sha1su0		v28.4s,v29.4s,v26.4s
+	sha1su1		v28.4s,v27.4s
+
+	add		v17.4s,v5.4s,v29.4s
+	sha1h		s18,s24
+	sha1p		q24,s19,v17.4s
+	sha1su0		v29.4s,v26.4s,v27.4s
+	sha1su1		v29.4s,v28.4s
+
+	add		v16.4s,v5.4s,v26.4s
+	sha1h		s19,s24
+	sha1p		q24,s18,v16.4s
+	sha1su0		v26.4s,v27.4s,v28.4s
+	sha1su1		v26.4s,v29.4s
+
+	add		v17.4s,v5.4s,v27.4s
+	sha1h		s18,s24
+	sha1p		q24,s19,v17.4s
+	sha1su0		v27.4s,v28.4s,v29.4s
+	sha1su1		v27.4s,v26.4s
+/* quad 2 */
+	add		v16.4s,v6.4s,v28.4s
+	sha1h		s19,s24
+	sha1m		q24,s18,v16.4s
+	sha1su0		v28.4s,v29.4s,v26.4s
+	sha1su1		v28.4s,v27.4s
+
+	add		v17.4s,v6.4s,v29.4s
+	sha1h		s18,s24
+	sha1m		q24,s19,v17.4s
+	sha1su0		v29.4s,v26.4s,v27.4s
+	sha1su1		v29.4s,v28.4s
+
+	add		v16.4s,v6.4s,v26.4s
+	sha1h		s19,s24
+	sha1m		q24,s18,v16.4s
+	sha1su0		v26.4s,v27.4s,v28.4s
+	sha1su1		v26.4s,v29.4s
+
+	add		v17.4s,v6.4s,v27.4s
+	sha1h		s18,s24
+	sha1m		q24,s19,v17.4s
+	sha1su0		v27.4s,v28.4s,v29.4s
+	sha1su1		v27.4s,v26.4s
+
+	add		v16.4s,v6.4s,v28.4s
+	sha1h		s19,s24
+	sha1m		q24,s18,v16.4s
+	sha1su0		v28.4s,v29.4s,v26.4s
+	sha1su1		v28.4s,v27.4s
+/* quad 3 */
+	add		v17.4s,v7.4s,v29.4s
+	sha1h		s18,s24
+	sha1p		q24,s19,v17.4s
+	sha1su0		v29.4s,v26.4s,v27.4s
+	sha1su1		v29.4s,v28.4s
+
+	add		v16.4s,v7.4s,v26.4s
+	sha1h		s19,s24
+	sha1p		q24,s18,v16.4s
+
+	add		v17.4s,v7.4s,v27.4s
+	sha1h		s18,s24
+	sha1p		q24,s19,v17.4s
+
+	add		v16.4s,v7.4s,v28.4s
+	sha1h		s19,s24
+	sha1p		q24,s18,v16.4s
+
+	add		v17.4s,v7.4s,v29.4s
+	sha1h		s18,s24
+	sha1p		q24,s19,v17.4s
+
+	add		v24.4s,v24.4s,v22.4s
+	add		v25.4s,v25.4s,v18.4s
+
+	cmp		x5, #16
+	b.hs		.Lsha1_loop
+
+	/* Store partial hash and return or complete hash */
+	cbz		x6, .Lsha1_last
+
+	st1		{v24.16b},[x2],16
+	st1		{v25.16b},[x2]
+
+	mov		x0, xzr
+	ret
+
+	/*
+	 * Last block with padding: append the 0x80 byte, zero-fill the
+	 * rest and place the message bit length in the last two words.
+	 * v24-v25[0] contain the hash state.
+	 */
+.Lsha1_last:
+
+	eor		v26.16b, v26.16b, v26.16b
+	eor		v27.16b, v27.16b, v27.16b
+	eor		v28.16b, v28.16b, v28.16b
+	eor		v29.16b, v29.16b, v29.16b
+
+	adr		x4,.Lrcon
+	/* Number of bits in message */
+	lsl		x3, x3, 3
+
+	mov		v22.16b,v24.16b		/* working ABCD <- ABCD */
+	/* move length to the end of the block */
+	mov		v29.s[3], w3
+	lsr		x3, x3, 32
+	/* and the higher part */
+	mov		v29.s[2], w3
+
+	/* The remaining part is up to 3 16B blocks and up to 1 4B block */
+	mov		w6, #0x80		/* that's the 1 of the pad */
+	mov		v26.b[3], w6
+	cbz		x5,.Lsha1_final
+	/* Are there 3 16B blocks? */
+	cmp		x5, #12
+	b.lo		1f
+	ld1		{v26.16b},[x1],16
+	ld1		{v27.16b},[x1],16
+	ld1		{v28.16b},[x1],16
+	rev32		v26.16b, v26.16b
+	rev32		v27.16b, v27.16b
+	rev32		v28.16b, v28.16b
+	sub		x5,x5,#12
+	mov		v29.b[7], w6
+	cbz		x5,.Lsha1_final
+	mov		v29.b[7], wzr
+	ld1		{v29.s}[0],[x1],4
+	rev32		v29.16b,v29.16b
+	mov		v29.b[7], w6
+	b		.Lsha1_final
+1:
+	/* Are there 2 16B blocks? */
+	cmp		x5, #8
+	b.lo		2f
+	ld1		{v26.16b},[x1],16
+	ld1		{v27.16b},[x1],16
+	rev32		v26.16b,v26.16b
+	rev32		v27.16b,v27.16b
+	sub		x5,x5,#8
+	mov		v28.b[7], w6
+	cbz		x5,.Lsha1_final
+	mov		v28.b[7], wzr
+	ld1		{v28.s}[0],[x1],4
+	rev32		v28.16b,v28.16b
+	mov		v28.b[7], w6
+	b		.Lsha1_final
+2:
+	/* Is there 1 16B block? */
+	cmp		x5, #4
+	b.lo		3f
+	ld1		{v26.16b},[x1],16
+	rev32		v26.16b,v26.16b
+	sub		x5,x5,#4
+	mov		v27.b[7], w6
+	cbz		x5,.Lsha1_final
+	mov		v27.b[7], wzr
+	ld1		{v27.s}[0],[x1],4
+	rev32		v27.16b,v27.16b
+	mov		v27.b[7], w6
+	b		.Lsha1_final
+3:
+	ld1		{v26.s}[0],[x1],4
+	rev32		v26.16b,v26.16b
+	mov		v26.b[7], w6
+
+.Lsha1_final:
+	ld1		{v4.16b},[x4],16	/* key0 */
+	ld1		{v5.16b},[x4],16	/* key1 */
+	ld1		{v6.16b},[x4],16	/* key2 */
+	ld1		{v7.16b},[x4],16	/* key3 */
+/* quad 0 */
+	add		v16.4s,v4.4s,v26.4s
+	sha1h		s19,s24
+	sha1c		q24,s25,v16.4s
+	sha1su0		v26.4s,v27.4s,v28.4s
+	sha1su1		v26.4s,v29.4s
+
+	add		v17.4s,v4.4s,v27.4s
+	sha1h		s18,s24
+	sha1c		q24,s19,v17.4s
+	sha1su0		v27.4s,v28.4s,v29.4s
+	sha1su1		v27.4s,v26.4s
+
+	add		v16.4s,v4.4s,v28.4s
+	sha1h		s19,s24
+	sha1c		q24,s18,v16.4s
+	sha1su0		v28.4s,v29.4s,v26.4s
+	sha1su1		v28.4s,v27.4s
+
+	add		v17.4s,v4.4s,v29.4s
+	sha1h		s18,s24
+	sha1c		q24,s19,v17.4s
+	sha1su0		v29.4s,v26.4s,v27.4s
+	sha1su1		v29.4s,v28.4s
+
+	add		v16.4s,v4.4s,v26.4s
+	sha1h		s19,s24
+	sha1c		q24,s18,v16.4s
+	sha1su0		v26.4s,v27.4s,v28.4s
+	sha1su1		v26.4s,v29.4s
+/* quad 1 */
+	add		v17.4s,v5.4s,v27.4s
+	sha1h		s18,s24
+	sha1p		q24,s19,v17.4s
+	sha1su0		v27.4s,v28.4s,v29.4s
+	sha1su1		v27.4s,v26.4s
+
+	add		v16.4s,v5.4s,v28.4s
+	sha1h		s19,s24
+	sha1p		q24,s18,v16.4s
+	sha1su0		v28.4s,v29.4s,v26.4s
+	sha1su1		v28.4s,v27.4s
+
+	add		v17.4s,v5.4s,v29.4s
+	sha1h		s18,s24
+	sha1p		q24,s19,v17.4s
+	sha1su0		v29.4s,v26.4s,v27.4s
+	sha1su1		v29.4s,v28.4s
+
+	add		v16.4s,v5.4s,v26.4s
+	sha1h		s19,s24
+	sha1p		q24,s18,v16.4s
+	sha1su0		v26.4s,v27.4s,v28.4s
+	sha1su1		v26.4s,v29.4s
+
+	add		v17.4s,v5.4s,v27.4s
+	sha1h		s18,s24
+	sha1p		q24,s19,v17.4s
+	sha1su0		v27.4s,v28.4s,v29.4s
+	sha1su1		v27.4s,v26.4s
+/* quad 2 */
+	add		v16.4s,v6.4s,v28.4s
+	sha1h		s19,s24
+	sha1m		q24,s18,v16.4s
+	sha1su0		v28.4s,v29.4s,v26.4s
+	sha1su1		v28.4s,v27.4s
+
+	add		v17.4s,v6.4s,v29.4s
+	sha1h		s18,s24
+	sha1m		q24,s19,v17.4s
+	sha1su0		v29.4s,v26.4s,v27.4s
+	sha1su1		v29.4s,v28.4s
+
+	add		v16.4s,v6.4s,v26.4s
+	sha1h		s19,s24
+	sha1m		q24,s18,v16.4s
+	sha1su0		v26.4s,v27.4s,v28.4s
+	sha1su1		v26.4s,v29.4s
+
+	add		v17.4s,v6.4s,v27.4s
+	sha1h		s18,s24
+	sha1m		q24,s19,v17.4s
+	sha1su0		v27.4s,v28.4s,v29.4s
+	sha1su1		v27.4s,v26.4s
+
+	add		v16.4s,v6.4s,v28.4s
+	sha1h		s19,s24
+	sha1m		q24,s18,v16.4s
+	sha1su0		v28.4s,v29.4s,v26.4s
+	sha1su1		v28.4s,v27.4s
+/* quad 3 */
+	add		v17.4s,v7.4s,v29.4s
+	sha1h		s18,s24
+	sha1p		q24,s19,v17.4s
+	sha1su0		v29.4s,v26.4s,v27.4s
+	sha1su1		v29.4s,v28.4s
+
+	add		v16.4s,v7.4s,v26.4s
+	sha1h		s19,s24
+	sha1p		q24,s18,v16.4s
+
+	add		v17.4s,v7.4s,v27.4s
+	sha1h		s18,s24
+	sha1p		q24,s19,v17.4s
+
+	add		v16.4s,v7.4s,v28.4s
+	sha1h		s19,s24
+	sha1p		q24,s18,v16.4s
+
+	add		v17.4s,v7.4s,v29.4s
+	sha1h		s18,s24
+	sha1p		q24,s19,v17.4s
+
+	add		v25.4s,v25.4s,v18.4s
+	add		v24.4s,v24.4s,v22.4s
+
+	rev32		v24.16b,v24.16b
+	rev32		v25.16b,v25.16b
+
+	st1		{v24.16b}, [x2],16
+	st1		{v25.s}[0], [x2]
+
+	mov		x0, xzr
+	ret
+
+.Lsha1_error:
+	mov		x0, #-1
+	ret
+
+	.size	sha1_block_partial, .-sha1_block_partial
+	.size	sha1_block, .-sha1_block
diff --git a/drivers/crypto/armv8/asm/sha256_core.S b/drivers/crypto/armv8/asm/sha256_core.S
new file mode 100644
index 0000000..2b2da7f
--- /dev/null
+++ b/drivers/crypto/armv8/asm/sha256_core.S
@@ -0,0 +1,525 @@
+/*
+ *   BSD LICENSE
+ *
+ *   Copyright (C) Cavium networks Ltd. 2016.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Cavium networks nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "assym.s"
+
+/*
+ * Description:
+ *
+ * Core SHA-2 Primitives
+ *
+ * Operations:
+ * sha256_block_partial:
+ * 	out = partial_sha256(init, in, len)	<- no final block
+ *
+ * sha256_block:
+ * 	out = sha256(init, in, len)
+ *
+ * Prototype:
+ *
+ * int sha256_block_partial(uint8_t *init,
+ *			uint8_t *dsrc, uint8_t *ddst, uint64_t len)
+ *
+ * int sha256_block(uint8_t *init,
+ *			uint8_t *dsrc, uint8_t *ddst, uint64_t len)
+ *
+ * returns: 0 (success), -1 (failure)
+ *
+ * Registers used:
+ *
+ * sha256_block_partial(
+ *	init,			x0	(hash init state - NULL for default)
+ *	dsrc,			x1	(digest src address)
+ *	ddst,			x2	(digest dst address)
+ *	len,			x3	(length)
+ *	)
+ *
+ * sha256_block(
+ *	init,			x0	(hash init state - NULL for default)
+ *	dsrc,			x1	(digest src address)
+ *	ddst,			x2	(digest dst address)
+ *	len,			x3	(length)
+ *	)
+ *
+ * Routine register definitions:
+ *
+ * v4 - v7 -- round consts for sha
+ * v21 -- ABCD tmp
+ * v22 -- sha working state ABCD (q22)
+ * v23 -- sha working state EFGH (q23)
+ * v24 -- reg_sha_stateABCD
+ * v25 -- reg_sha_stateEFGH
+ * v26 -- sha block 0
+ * v27 -- sha block 1
+ * v28 -- sha block 2
+ * v29 -- sha block 3
+ * v30 -- reserved
+ * v31 -- reserved
+ *
+ * Constraints:
+ *
+ * For sha256_block, "len" must be a multiple of 16; for
+ * sha256_block_partial, "len" must be a multiple of 64 (one SHA block).
+ * Otherwise -1 is returned.
+ *
+ */
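+
+/*
+ * Note on the round structure below: each 64-byte block is processed
+ * as 16 groups of 4 rounds. Every group loads 4 round constants from
+ * .Lrcon and runs sha256h/sha256h2 on the ABCD/EFGH halves; the first
+ * 12 groups also extend the message schedule with sha256su0/sha256su1.
+ */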
+	.file "sha256_core.S"
+	.text
+	.cpu generic+fp+simd+crypto+crc
+	.align	4
+	.global sha256_block_partial
+	.type	sha256_block_partial,%function
+	.global sha256_block
+	.type	sha256_block,%function
+
+	.align	4
+.Lrcon:
+	.word		0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
+	.word		0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
+	.word		0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
+	.word		0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
+	.word		0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
+	.word		0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
+	.word		0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
+	.word		0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
+	.word		0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
+	.word		0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
+	.word		0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
+	.word		0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
+	.word		0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
+	.word		0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
+	.word		0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
+	.word		0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+
+	.align	4
+.Linit_sha_state:
+	.word		0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a
+	.word		0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
+
+	.align	4
+
+sha256_block_partial:
+	mov		x6, #1			/* indicate partial hash */
+	ands		x5, x3, #0x3f		/* check size mod 1 SHA block */
+	b.ne		.Lsha256_error
+	cbnz		x0, 1f
+	/* address of sha init state consts */
+	adr		x0,.Linit_sha_state
+1:
+	ld1		{v24.4s, v25.4s},[x0]	/* init ABCD, EFGH */
+	/* number of 16B blocks (will be at least 4) */
+	lsr		x5, x3, 4
+	b		.Lsha256_loop
+
+sha256_block:
+	mov		x6, xzr			/* indicate full hash */
+	ands		x5, x3, #0xf		/* check size mod 16B block */
+	b.ne		.Lsha256_error
+	cbnz		x0, 1f
+	/* address of sha init state consts */
+	adr		x0,.Linit_sha_state
+1:
+	ld1		{v24.4s, v25.4s},[x0]	/* init ABCD, EFGH. (2 cycs) */
+	lsr		x5, x3, 4		/* number of 16B blocks */
+	cmp		x5, #4	/* at least 4 16B blocks give 1 SHA block */
+	b.lo		.Lsha256_last
+
+	.align	4
+.Lsha256_loop:
+	sub		x5, x5, #4		/* subtract 1 SHA block */
+	adr		x4,.Lrcon
+
+	ld1		{v26.16b},[x1],16	/* dsrc[0] */
+	ld1		{v27.16b},[x1],16	/* dsrc[1] */
+	ld1		{v28.16b},[x1],16	/* dsrc[2] */
+	ld1		{v29.16b},[x1],16	/* dsrc[3] */
+
+	rev32		v26.16b,v26.16b		/* fix endian w0 */
+	rev32		v27.16b,v27.16b		/* fix endian w1 */
+	rev32		v28.16b,v28.16b		/* fix endian w2 */
+	rev32		v29.16b,v29.16b		/* fix endian w3 */
+
+	mov		v22.16b,v24.16b		/* working ABCD <- ABCD */
+	mov		v23.16b,v25.16b		/* working EFGH <- EFGH */
+
+	ld1		{v4.16b},[x4],16	/* key0 */
+	ld1		{v5.16b},[x4],16	/* key1 */
+	ld1		{v6.16b},[x4],16	/* key2 */
+	ld1		{v7.16b},[x4],16	/* key3 */
+
+	mov		v21.16b, v22.16b	/* copy abcd */
+
+	add		v4.4s,v4.4s,v26.4s	/* wk = key0+w0 */
+	sha256h		q22, q23, v4.4s
+	sha256h2	q23, q21, v4.4s
+	sha256su0	v26.4s,v27.4s
+	sha256su1	v26.4s,v28.4s,v29.4s
+
+	mov		v21.16b, v22.16b	/* copy abcd */
+
+	add		v5.4s,v5.4s,v27.4s	/* wk = key1+w1 */
+	sha256h		q22, q23, v5.4s
+	sha256h2	q23, q21, v5.4s
+	sha256su0	v27.4s,v28.4s
+	sha256su1	v27.4s,v29.4s,v26.4s
+
+	mov		v21.16b, v22.16b	/* copy abcd */
+
+	add		v6.4s,v6.4s,v28.4s	/* wk = key2+w2 */
+	sha256h		q22, q23, v6.4s
+	sha256h2	q23, q21, v6.4s
+	sha256su0	v28.4s,v29.4s
+	sha256su1	v28.4s,v26.4s,v27.4s
+
+	mov		v21.16b, v22.16b	/* copy abcd */
+
+	add		v7.4s,v7.4s,v29.4s	/* wk = key3+w3 */
+	sha256h		q22, q23, v7.4s
+	sha256h2	q23, q21, v7.4s
+	sha256su0	v29.4s,v26.4s
+	sha256su1	v29.4s,v27.4s,v28.4s
+
+	ld1		{v4.16b},[x4],16	/* key4 */
+	ld1		{v5.16b},[x4],16	/* key5 */
+	ld1		{v6.16b},[x4],16	/* key6 */
+	ld1		{v7.16b},[x4],16	/* key7 */
+
+	mov		v21.16b, v22.16b	/* copy abcd */
+
+	add		v4.4s,v4.4s,v26.4s	/* wk = key4+w0 */
+	sha256h		q22, q23, v4.4s
+	sha256h2	q23, q21, v4.4s
+	sha256su0	v26.4s,v27.4s
+	sha256su1	v26.4s,v28.4s,v29.4s
+
+	mov		v21.16b, v22.16b	/* copy abcd */
+
+	add		v5.4s,v5.4s,v27.4s	/* wk = key5+w1 */
+	sha256h		q22, q23, v5.4s
+	sha256h2	q23, q21, v5.4s
+	sha256su0	v27.4s,v28.4s
+	sha256su1	v27.4s,v29.4s,v26.4s
+
+	mov		v21.16b, v22.16b	/* copy abcd */
+
+	add		v6.4s,v6.4s,v28.4s	/* wk = key6+w2 */
+	sha256h		q22, q23, v6.4s
+	sha256h2	q23, q21, v6.4s
+	sha256su0	v28.4s,v29.4s
+	sha256su1	v28.4s,v26.4s,v27.4s
+
+	mov		v21.16b, v22.16b	/* copy abcd */
+
+	add		v7.4s,v7.4s,v29.4s	/* wk = key7+w3 */
+	sha256h		q22, q23, v7.4s
+	sha256h2	q23, q21, v7.4s
+	sha256su0	v29.4s,v26.4s
+	sha256su1	v29.4s,v27.4s,v28.4s
+
+	ld1		{v4.16b},[x4],16	/* key8 */
+	ld1		{v5.16b},[x4],16	/* key9 */
+	ld1		{v6.16b},[x4],16	/* key10 */
+	ld1		{v7.16b},[x4],16	/* key11 */
+
+	mov		v21.16b, v22.16b	/* copy abcd */
+
+	add		v4.4s,v4.4s,v26.4s	/* wk = key8+w0 */
+	sha256h		q22, q23, v4.4s
+	sha256h2	q23, q21, v4.4s
+	sha256su0	v26.4s,v27.4s
+	sha256su1	v26.4s,v28.4s,v29.4s
+
+	mov		v21.16b, v22.16b	/* copy abcd */
+
+	add		v5.4s,v5.4s,v27.4s	/* wk = key9+w1 */
+	sha256h		q22, q23, v5.4s
+	sha256h2	q23, q21, v5.4s
+	sha256su0	v27.4s,v28.4s
+	sha256su1	v27.4s,v29.4s,v26.4s
+
+	mov		v21.16b, v22.16b	/* copy abcd */
+
+	add		v6.4s,v6.4s,v28.4s	/* wk = key10+w2 */
+	sha256h		q22, q23, v6.4s
+	sha256h2	q23, q21, v6.4s
+	sha256su0	v28.4s,v29.4s
+	sha256su1	v28.4s,v26.4s,v27.4s
+
+	mov		v21.16b, v22.16b	/* copy abcd */
+
+	add		v7.4s,v7.4s,v29.4s	/* wk = key11+w3 */
+	sha256h		q22, q23, v7.4s
+	sha256h2	q23, q21, v7.4s
+	sha256su0	v29.4s,v26.4s
+	sha256su1	v29.4s,v27.4s,v28.4s
+
+	ld1		{v4.16b},[x4],16	/* key12 */
+	ld1		{v5.16b},[x4],16	/* key13 */
+	ld1		{v6.16b},[x4],16	/* key14 */
+	ld1		{v7.16b},[x4],16	/* key15 */
+
+	mov		v21.16b, v22.16b	/* copy abcd */
+
+	add		v4.4s,v4.4s,v26.4s	/* wk = key12+w0 */
+	sha256h		q22, q23, v4.4s
+	sha256h2	q23, q21, v4.4s
+
+	mov		v21.16b, v22.16b	/* copy abcd */
+
+	add		v5.4s,v5.4s,v27.4s	/* wk = key13+w1 */
+	sha256h		q22, q23, v5.4s
+	sha256h2	q23, q21, v5.4s
+
+	mov		v21.16b, v22.16b	/* copy abcd */
+
+	add		v6.4s,v6.4s,v28.4s	/* wk = key14+w2 */
+	sha256h		q22, q23, v6.4s
+	sha256h2	q23, q21, v6.4s
+
+	mov		v21.16b, v22.16b	/* copy abcd */
+
+	add		v7.4s,v7.4s,v29.4s	/* wk = key15+w3 */
+	sha256h		q22, q23, v7.4s
+	sha256h2	q23, q21, v7.4s
+
+	add		v24.4s,v24.4s,v22.4s	/* ABCD += working copy */
+	add		v25.4s,v25.4s,v23.4s	/* EFGH += working copy */
+
+	cmp		x5, #4
+	b.hs		.Lsha256_loop
+
+	/* Store partial hash and return or complete hash */
+	cbz		x6, .Lsha256_last
+
+	st1		{v24.16b, v25.16b}, [x2]
+
+	mov		x0, xzr
+	ret
+
+	/*
+	 * Last block with padding: append the 0x80 byte, zero-fill the
+	 * rest and place the message bit length in the last two words.
+	 * v24-v25 contain the hash state.
+	 */
+.Lsha256_last:
+	eor		v26.16b, v26.16b, v26.16b
+	eor		v27.16b, v27.16b, v27.16b
+	eor		v28.16b, v28.16b, v28.16b
+	eor		v29.16b, v29.16b, v29.16b
+
+	adr		x4,.Lrcon
+	lsl		x3, x3, 3
+
+	mov		v22.16b,v24.16b		/* working ABCD <- ABCD */
+	mov		v23.16b,v25.16b		/* working EFGH <- EFGH */
+
+	/* Fill out the first vector register and the end of the block */
+
+	/* move length to the end of the block */
+	mov		v29.s[3], w3
+	lsr		x3, x3, 32
+	mov		v29.s[2], w3		/* and the higher part */
+	/* set padding 1 to the first reg */
+	mov		w6, #0x80		/* that's the 1 of the pad */
+	mov		v26.b[3], w6
+	cbz		x5,.Lsha256_final
+
+	sub		x5, x5, #1
+	mov		v27.16b, v26.16b
+	ld1		{v26.16b},[x1],16
+	rev32		v26.16b,v26.16b		/* fix endian w0 */
+	cbz		x5,.Lsha256_final
+
+	sub		x5, x5, #1
+	mov		v28.16b, v27.16b
+	ld1		{v27.16b},[x1],16
+	rev32		v27.16b,v27.16b		/* fix endian w1 */
+	cbz		x5,.Lsha256_final
+
+	mov		v29.b[0], w6
+	ld1		{v28.16b},[x1],16
+	rev32		v28.16b,v28.16b		/* fix endian w2 */
+
+.Lsha256_final:
+
+	ld1		{v4.16b},[x4],16	/* key0 */
+	ld1		{v5.16b},[x4],16	/* key1 */
+	ld1		{v6.16b},[x4],16	/* key2 */
+	ld1		{v7.16b},[x4],16	/* key3 */
+
+	mov		v21.16b, v22.16b	/* copy abcd */
+
+	add		v4.4s,v4.4s,v26.4s	/* wk = key0+w0 */
+	sha256h		q22, q23, v4.4s
+	sha256h2	q23, q21, v4.4s
+	sha256su0	v26.4s,v27.4s
+	sha256su1	v26.4s,v28.4s,v29.4s
+
+	mov		v21.16b, v22.16b	/* copy abcd */
+
+	add		v5.4s,v5.4s,v27.4s	/* wk = key1+w1 */
+	sha256h		q22, q23, v5.4s
+	sha256h2	q23, q21, v5.4s
+	sha256su0	v27.4s,v28.4s
+	sha256su1	v27.4s,v29.4s,v26.4s
+
+	mov		v21.16b, v22.16b	/* copy abcd */
+
+	add		v6.4s,v6.4s,v28.4s	/* wk = key2+w2 */
+	sha256h		q22, q23, v6.4s
+	sha256h2	q23, q21, v6.4s
+	sha256su0	v28.4s,v29.4s
+	sha256su1	v28.4s,v26.4s,v27.4s
+
+	mov		v21.16b, v22.16b	/* copy abcd */
+
+	add		v7.4s,v7.4s,v29.4s	/* wk = key3+w3 */
+	sha256h		q22, q23, v7.4s
+	sha256h2	q23, q21, v7.4s
+	sha256su0	v29.4s,v26.4s
+	sha256su1	v29.4s,v27.4s,v28.4s
+
+	ld1		{v4.16b},[x4],16	/* key4 */
+	ld1		{v5.16b},[x4],16	/* key5 */
+	ld1		{v6.16b},[x4],16	/* key6 */
+	ld1		{v7.16b},[x4],16	/* key7 */
+
+	mov		v21.16b, v22.16b	/* copy abcd */
+
+	add		v4.4s,v4.4s,v26.4s	/* wk = key4+w0 */
+	sha256h		q22, q23, v4.4s
+	sha256h2	q23, q21, v4.4s
+	sha256su0	v26.4s,v27.4s
+	sha256su1	v26.4s,v28.4s,v29.4s
+
+	mov		v21.16b, v22.16b	/* copy abcd */
+
+	add		v5.4s,v5.4s,v27.4s	/* wk = key5+w1 */
+	sha256h		q22, q23, v5.4s
+	sha256h2	q23, q21, v5.4s
+	sha256su0	v27.4s,v28.4s
+	sha256su1	v27.4s,v29.4s,v26.4s
+
+	mov		v21.16b, v22.16b	/* copy abcd */
+
+	add		v6.4s,v6.4s,v28.4s	/* wk = key6+w2 */
+	sha256h		q22, q23, v6.4s
+	sha256h2	q23, q21, v6.4s
+	sha256su0	v28.4s,v29.4s
+	sha256su1	v28.4s,v26.4s,v27.4s
+
+	mov		v21.16b, v22.16b	/* copy abcd */
+
+	add		v7.4s,v7.4s,v29.4s	/* wk = key7+w3 */
+	sha256h		q22, q23, v7.4s
+	sha256h2	q23, q21, v7.4s
+	sha256su0	v29.4s,v26.4s
+	sha256su1	v29.4s,v27.4s,v28.4s
+
+	ld1		{v4.16b},[x4],16	/* key8 */
+	ld1		{v5.16b},[x4],16	/* key9 */
+	ld1		{v6.16b},[x4],16	/* key10 */
+	ld1		{v7.16b},[x4],16	/* key11 */
+
+	mov		v21.16b, v22.16b	/* copy abcd */
+
+	add		v4.4s,v4.4s,v26.4s	/* wk = key8+w0 */
+	sha256h		q22, q23, v4.4s
+	sha256h2	q23, q21, v4.4s
+	sha256su0	v26.4s,v27.4s
+	sha256su1	v26.4s,v28.4s,v29.4s
+
+	mov		v21.16b, v22.16b	/* copy abcd */
+
+	add		v5.4s,v5.4s,v27.4s	/* wk = key9+w1 */
+	sha256h		q22, q23, v5.4s
+	sha256h2	q23, q21, v5.4s
+	sha256su0	v27.4s,v28.4s
+	sha256su1	v27.4s,v29.4s,v26.4s
+
+	mov		v21.16b, v22.16b	/* copy abcd */
+
+	add		v6.4s,v6.4s,v28.4s	/* wk = key10+w2 */
+	sha256h		q22, q23, v6.4s
+	sha256h2	q23, q21, v6.4s
+	sha256su0	v28.4s,v29.4s
+	sha256su1	v28.4s,v26.4s,v27.4s
+
+	mov		v21.16b, v22.16b	/* copy abcd */
+
+	add		v7.4s,v7.4s,v29.4s	/* wk = key11+w3 */
+	sha256h		q22, q23, v7.4s
+	sha256h2	q23, q21, v7.4s
+	sha256su0	v29.4s,v26.4s
+	sha256su1	v29.4s,v27.4s,v28.4s
+
+	ld1		{v4.16b},[x4],16	/* key12 */
+	ld1		{v5.16b},[x4],16	/* key13 */
+	ld1		{v6.16b},[x4],16	/* key14 */
+	ld1		{v7.16b},[x4],16	/* key15 */
+
+	mov		v21.16b, v22.16b	/* copy abcd */
+
+	add		v4.4s,v4.4s,v26.4s	/* wk = key12+w0 */
+	sha256h		q22, q23, v4.4s
+	sha256h2	q23, q21, v4.4s
+
+	mov		v21.16b, v22.16b	/* copy abcd */
+
+	add		v5.4s,v5.4s,v27.4s	/* wk = key13+w1 */
+	sha256h		q22, q23, v5.4s
+	sha256h2	q23, q21, v5.4s
+
+	mov		v21.16b, v22.16b	/* copy abcd */
+
+	add		v6.4s,v6.4s,v28.4s	/* wk = key14+w2 */
+	sha256h		q22, q23, v6.4s
+	sha256h2	q23, q21, v6.4s
+
+	mov		v21.16b, v22.16b	/* copy abcd */
+
+	add		v7.4s,v7.4s,v29.4s	/* wk = key15+w3 */
+	sha256h		q22, q23, v7.4s
+	sha256h2	q23, q21, v7.4s
+
+	add		v24.4s,v24.4s,v22.4s	/* ABCD += working copy */
+	add		v25.4s,v25.4s,v23.4s	/* EFGH += working copy */
+
+	rev32		v24.16b, v24.16b
+	rev32		v25.16b, v25.16b
+	st1		{v24.4s,v25.4s},[x2]	/* save them both */
+
+	mov		x0, xzr
+	ret
+
+.Lsha256_error:
+	mov		x0, #-1
+	ret
+
+	.size	sha256_block_partial, .-sha256_block_partial
-- 
1.9.1

Thread overview: 100+ messages
2016-12-04 11:33 [dpdk-dev] [PATCH] Add crypto PMD optimized " zbigniew.bodek
2016-12-04 11:33 ` [dpdk-dev] [PATCH 1/3] mk: fix build of assembly files for ARM64 zbigniew.bodek
2016-12-04 11:33 ` [dpdk-dev] [PATCH 2/3] crypto/armv8: add PMD optimized for ARMv8 processors zbigniew.bodek
2016-12-04 11:33 ` [dpdk-dev] [PATCH 3/3] app/test: add ARMv8 crypto tests and test vectors zbigniew.bodek
2016-12-07  2:32 ` [dpdk-dev] [PATCH v2 00/12] Add crypto PMD optimized for ARMv8 zbigniew.bodek
2016-12-07  2:32   ` [dpdk-dev] [PATCH v2 01/12] mk: fix build of assembly files for ARM64 zbigniew.bodek
2016-12-21 14:46     ` De Lara Guarch, Pablo
2017-01-04 17:33     ` [dpdk-dev] [PATCH v3 0/8] Add crypto PMD optimized for ARMv8 zbigniew.bodek
2017-01-04 17:33       ` [dpdk-dev] [PATCH v3 1/8] mk: fix build of assembly files for ARM64 zbigniew.bodek
2017-01-13  8:13         ` Hemant Agrawal
2017-01-04 17:33       ` [dpdk-dev] [PATCH v3 2/8] lib: add cryptodev type for the upcoming ARMv8 PMD zbigniew.bodek
2017-01-13  8:16         ` Hemant Agrawal
2017-01-13 15:50           ` Zbigniew Bodek
2017-01-16  5:57           ` Jianbo Liu
2017-01-04 17:33       ` [dpdk-dev] [PATCH v3 3/8] crypto/armv8: add PMD optimized for ARMv8 processors zbigniew.bodek
2017-01-06  2:45         ` Jianbo Liu
2017-01-12 13:12           ` Zbigniew Bodek
2017-01-13  7:41             ` Jianbo Liu
2017-01-13 19:09               ` Zbigniew Bodek
2017-01-13  7:57         ` Hemant Agrawal
2017-01-13 19:15           ` Zbigniew Bodek
2017-01-17 15:48         ` [dpdk-dev] [PATCH v4 0/7] Add crypto PMD optimized for ARMv8 zbigniew.bodek
2017-01-17 15:48           ` [dpdk-dev] [PATCH v4 1/7] lib: add cryptodev type for the upcoming ARMv8 PMD zbigniew.bodek
2017-01-18  2:24             ` Jerin Jacob
2017-01-17 15:48           ` [dpdk-dev] [PATCH v4 2/7] crypto/armv8: add PMD optimized for ARMv8 processors zbigniew.bodek
2017-01-18 14:27             ` [dpdk-dev] [PATCH v5 0/7] Add crypto PMD optimized for ARMv8 zbigniew.bodek
2017-01-18 14:27               ` [dpdk-dev] [PATCH v5 1/7] cryptodev: add cryptodev type for the ARMv8 PMD zbigniew.bodek
2017-01-18 14:27               ` [dpdk-dev] [PATCH v5 2/7] crypto/armv8: add PMD optimized for ARMv8 processors zbigniew.bodek
2017-01-18 20:01                 ` [dpdk-dev] [PATCH v6 0/8] Add crypto PMD optimized for ARMv8 zbigniew.bodek
2017-01-18 20:01                   ` [dpdk-dev] [PATCH v6 1/8] cryptodev: add cryptodev type for the ARMv8 PMD zbigniew.bodek
2017-01-18 20:01                   ` [dpdk-dev] [PATCH v6 2/8] crypto/armv8: add PMD optimized for ARMv8 processors zbigniew.bodek
2017-01-18 20:01                   ` [dpdk-dev] [PATCH v6 3/8] mk: add PMD to the build system zbigniew.bodek
2017-01-18 20:01                   ` [dpdk-dev] [PATCH v6 4/8] cryptodev/armv8: introduce ARM-specific feature flags zbigniew.bodek
2017-01-18 20:01                   ` [dpdk-dev] [PATCH v6 5/8] doc: update documentation about ARMv8 crypto PMD zbigniew.bodek
2017-01-18 20:01                   ` [dpdk-dev] [PATCH v6 6/8] crypto/armv8: enable ARMv8 PMD in the configuration zbigniew.bodek
2017-01-18 20:02                   ` [dpdk-dev] [PATCH v6 7/8] MAINTAINERS: update MAINTAINERS entry for ARMv8 crypto zbigniew.bodek
2017-01-18 20:02                   ` [dpdk-dev] [PATCH v6 8/8] app/test: add ARMv8 crypto tests and test vectors zbigniew.bodek
2017-01-18 21:14                   ` [dpdk-dev] [PATCH v6 0/8] Add crypto PMD optimized for ARMv8 De Lara Guarch, Pablo
2017-01-19 10:36                     ` Zbigniew Bodek
2017-01-18 14:27               ` [dpdk-dev] [PATCH v5 3/7] mk: add PMD to the build system zbigniew.bodek
2017-01-18 14:27               ` [dpdk-dev] [PATCH v5 4/7] doc: update documentation about ARMv8 crypto PMD zbigniew.bodek
2017-01-18 17:05                 ` De Lara Guarch, Pablo
2017-01-18 19:52                   ` Zbigniew Bodek
2017-01-18 19:54                     ` De Lara Guarch, Pablo
2017-01-18 14:27               ` [dpdk-dev] [PATCH v5 5/7] crypto/armv8: enable ARMv8 PMD in the configuration zbigniew.bodek
2017-01-18 14:27               ` [dpdk-dev] [PATCH v5 6/7] MAINTAINERS: update MAINTAINERS entry for ARMv8 crypto zbigniew.bodek
2017-01-18 14:27               ` [dpdk-dev] [PATCH v5 7/7] app/test: add ARMv8 crypto tests and test vectors zbigniew.bodek
2017-01-18 15:23               ` [dpdk-dev] [PATCH v5 0/7] Add crypto PMD optimized for ARMv8 Jerin Jacob
2017-01-17 15:48           ` [dpdk-dev] [PATCH v4 3/7] mk: add PMD to the build system zbigniew.bodek
2017-01-17 15:49           ` [dpdk-dev] [PATCH v4 4/7] doc: update documentation about ARMv8 crypto PMD zbigniew.bodek
2017-01-17 15:49           ` [dpdk-dev] [PATCH v4 5/7] crypto/armv8: enable ARMv8 PMD in the configuration zbigniew.bodek
2017-01-17 15:49           ` [dpdk-dev] [PATCH v4 6/7] MAINTAINERS: update MAINTAINERS entry for ARMv8 crypto zbigniew.bodek
2017-01-17 15:49           ` [dpdk-dev] [PATCH v4 7/7] app/test: add ARMv8 crypto tests and test vectors zbigniew.bodek
2017-01-18  2:26             ` Jerin Jacob
2017-01-04 17:33       ` [dpdk-dev] [PATCH v3 4/8] mk/crypto/armv8: add PMD to the build system zbigniew.bodek
2017-01-04 17:33       ` [dpdk-dev] [PATCH v3 5/8] doc/armv8: update documentation about crypto PMD zbigniew.bodek
2017-01-04 17:33       ` [dpdk-dev] [PATCH v3 6/8] crypto/armv8: enable ARMv8 PMD in the configuration zbigniew.bodek
2017-01-04 17:33       ` [dpdk-dev] [PATCH v3 7/8] crypto/armv8: update MAINTAINERS entry for ARMv8 crypto zbigniew.bodek
2017-01-04 17:33       ` [dpdk-dev] [PATCH v3 8/8] app/test: add ARMv8 crypto tests and test vectors zbigniew.bodek
2017-01-12 10:48         ` De Lara Guarch, Pablo
2017-01-12 11:50           ` Zbigniew Bodek
2017-01-12 12:07             ` De Lara Guarch, Pablo
2017-01-13  9:28         ` Hemant Agrawal
2017-01-10 17:11       ` [dpdk-dev] [PATCH v3 0/8] Add crypto PMD optimized for ARMv8 De Lara Guarch, Pablo
2017-01-10 17:50         ` Zbigniew Bodek
2017-01-13  8:07       ` Hemant Agrawal
2017-01-13 18:59         ` Zbigniew Bodek
2017-01-16  6:57           ` Hemant Agrawal
2017-01-16  8:02             ` Jerin Jacob
2016-12-07  2:32   ` [dpdk-dev] [PATCH v2 02/12] lib: add cryptodev type for the upcoming ARMv8 PMD zbigniew.bodek
2016-12-06 20:27     ` Thomas Monjalon
2016-12-07 19:04       ` Zbigniew Bodek
2016-12-07 20:09         ` Thomas Monjalon
2016-12-09 12:06           ` Declan Doherty
2016-12-07  2:32   ` zbigniew.bodek [this message]
2016-12-06 20:29     ` [dpdk-dev] [PATCH v2 03/12] crypto/armv8: Add core crypto operations for ARMv8 Thomas Monjalon
2016-12-06 21:18       ` Jerin Jacob
2016-12-06 21:42         ` Thomas Monjalon
2016-12-06 22:05           ` Jerin Jacob
2016-12-06 22:41             ` Thomas Monjalon
2016-12-06 23:24               ` Jerin Jacob
2016-12-07 15:00                 ` Thomas Monjalon
2016-12-07 16:30                   ` Jerin Jacob
2016-12-07  2:32   ` [dpdk-dev] [PATCH v2 04/12] crypto/armv8: Add AES+SHA256 " zbigniew.bodek
2016-12-07  2:32   ` [dpdk-dev] [PATCH v2 05/12] crypto/armv8: Add AES+SHA1 " zbigniew.bodek
2016-12-07  2:32   ` [dpdk-dev] [PATCH v2 06/12] crypto/armv8: add PMD optimized for ARMv8 processors zbigniew.bodek
2016-12-21 14:55     ` De Lara Guarch, Pablo
2016-12-07  2:33   ` [dpdk-dev] [PATCH v2 07/12] crypto/armv8: generate ASM symbols automatically zbigniew.bodek
2016-12-07  2:33   ` [dpdk-dev] [PATCH v2 08/12] mk/crypto/armv8: add PMD to the build system zbigniew.bodek
2016-12-21 15:01     ` De Lara Guarch, Pablo
2016-12-07  2:33   ` [dpdk-dev] [PATCH v2 09/12] doc/armv8: update documentation about crypto PMD zbigniew.bodek
2016-12-07 21:13     ` Mcnamara, John
2016-12-07  2:33   ` [dpdk-dev] [PATCH v2 10/12] crypto/armv8: enable ARMv8 PMD in the configuration zbigniew.bodek
2016-12-08 10:24   ` [dpdk-dev] [PATCH v2 00/12] Add crypto PMD optimized for ARMv8 Bruce Richardson
2016-12-08 11:32     ` Zbigniew Bodek
2016-12-08 17:45       ` Jerin Jacob
2016-12-21 15:34         ` Declan Doherty
2016-12-22  4:57           ` Jerin Jacob
2016-12-07  2:36 ` [dpdk-dev] [PATCH v2 11/12] crypto/armv8: update MAINTAINERS entry for ARMv8 crypto zbigniew.bodek
2016-12-07  2:37 ` [dpdk-dev] [PATCH v2 12/12] app/test: add ARMv8 crypto tests and test vectors zbigniew.bodek
