From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from NAM03-DM3-obe.outbound.protection.outlook.com (mail-dm3nam03on0064.outbound.protection.outlook.com [104.47.41.64]) by dpdk.org (Postfix) with ESMTP id CA2095583 for ; Tue, 6 Dec 2016 18:34:51 +0100 (CET) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=CAVIUMNETWORKS.onmicrosoft.com; s=selector1-cavium-com; h=From:Date:Subject:Message-ID:Content-Type:MIME-Version; bh=gDm5Tz1nG7tB0W2QcFessYN46H/y9Yby+IekzqWIEeE=; b=X3YBzf+qiErlhf/HwGI6yujSJzFBGYOk9CyUMk+hZmE7v2eavVjxucqiWDuBzb6fzWVD3y1E10iNOciKIpMoXy1gq3vBxAQH6Nbdl5DZIMJEGg35Lj0BQ9gJqpAvELndLbP/CFV+3LLdsFcrJaCGcvTQOhivC+YCCzGnS7RJ7Oc= Authentication-Results: spf=none (sender IP is ) smtp.mailfrom=Zbigniew.Bodek@cavium.com; Received: from localhost.localdomain (31.172.191.173) by BLUPR07MB547.namprd07.prod.outlook.com (10.141.205.17) with Microsoft SMTP Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA384_P384) id 15.1.761.9; Tue, 6 Dec 2016 17:34:47 +0000 From: To: , CC: , Zbigniew Bodek , Emery Davis Date: Tue, 6 Dec 2016 18:32:57 -0800 Message-ID: <1481077985-4224-5-git-send-email-zbigniew.bodek@caviumnetworks.com> X-Mailer: git-send-email 1.9.1 In-Reply-To: <1481077985-4224-1-git-send-email-zbigniew.bodek@caviumnetworks.com> References: <1480851219-45071-1-git-send-email-zbigniew.bodek@caviumnetworks.com> <1481077985-4224-1-git-send-email-zbigniew.bodek@caviumnetworks.com> MIME-Version: 1.0 Content-Type: text/plain X-Originating-IP: [31.172.191.173] X-ClientProxiedBy: VI1PR01CA0023.eurprd01.prod.exchangelabs.com (10.162.116.33) To BLUPR07MB547.namprd07.prod.outlook.com (10.141.205.17) X-MS-Office365-Filtering-Correlation-Id: cf654dd3-7905-4911-d4f3-08d41dfe2d10 X-Microsoft-Antispam: UriScan:;BCL:0;PCL:0;RULEID:(22001);SRVR:BLUPR07MB547; X-Microsoft-Exchange-Diagnostics: 1; BLUPR07MB547; 
3:FgRjf3qcGHEai+QOXlpkGGUtVgy2+h2Zwf/y2BdYAofXHxeyeYrOhIO6ggo0n1tlRHATXf+YPxpUdjW0vWz88GflRdOw7vVxsaUGkzZCKw60PHVRMKZ5ujttUQyGDjpW5kCkc5x7HhJbdXNruc8TQFsyUbOX6z8scSXo405jekqjR8b3IOajukQ/dvYIAHZcCzsU6OzITFGbN07yRAKLZ1v8HUBrOpDIU9JjNalltG2MoGt/UK/XtSude/9Iq5iF9BCobG5OQZ4UVU9ahtZr1w== X-Microsoft-Exchange-Diagnostics: 1; BLUPR07MB547; 25:0B6jwUTFTvLjbu+HQqcemMCR6DmdKKHD8w96/1H1a0slqRdjDH2s2JOYydBK2aL8GFDgfdc+podfoTqqtHuxMQvUps6vjG71pnyXRg5Fcro/67ci/LnHGTn1Sm62gpEh2yGl0ukOwwE/sOLOYu5mucsPf5tjADtPDNmNoGI8WvJWrg9PJnPJkKIL7x1kYb3LjhWshWC5jrSrLWKNV3+EEjzJGBRpjP36HK1QJX0vyeYwWy8GExbWeXK48llLNCDxQLiS2mQnKnEJ1spHxLu0PLGmfY+5gPTUVDq99pZ65msCKYvKPV/alOLm8SBb5B6tWqJ8vWvu6NEzvH1/UwyUL5OcKMe8QATlxcSReNgQbRfCznWTyvkTATtUZ8pV9vFtc+o8TG5nIzrBqwodCgkMvr2CQaPgLTM1lOHRT4Zi96Sixi+g0505McZC4FSz3nCjhsVKVlweCIbB13S15vHb+HuYRgnvgijg9iQ5Ee08jJ71qKUe27BWJAk9lKq9r6SfV25TRRo0gDSlC9WXNtBTXbnCnpd9V3AzRkdOuUcOOsgiK5ybagLGCuHOh0M8RzmjXB0EXY6hdRzO6IpYtlqWuJ1ZHMxw5OyLjo+eqaO7qj5+TDLVKg+Tm3WF+lutDWxLdvBHVVXMveX2T4p4jWc/vGGcptT9TaOE89UhfD9rp2vbA1/hrOQvWFRDHPicjbBbYvBJAPpwHFhqhmH3rK9dpwDWI80T4DBtlrU9DgLjlU0= X-Microsoft-Exchange-Diagnostics: 1; BLUPR07MB547; 31:m5cguWEh9uwxDiefXQeIRDHRz/WuiePZQIjms3QUrf/W6KY4CNkUkmGjM9BwIT1JoGip4jCPabvq1SHQzezQS3ecxYskCnBZac0pYVHYdLWuPDKyBFBj7libzdQGScycLNA5DuoQtJS+G2DKa69IDQO619MMq/qyhnk3Gk6Rvsm/M3IvSyWWOruqA4k23JKQnoRQdh946k+N4Ad12P04ffJCYVGQpOXrjYSrYoAtf5148OXlRifzT1295mHLobWDGEq8uJFnilZA7SuDJFk0PA==; 
20:SS/cDQwCfD3qFb08/Rj1uL+gQIoC5J2o3g4qjMSlSwIsflSDeuOyYOyc79iLF6G2xHwDrVAYvCaDa9VzrzULLgRLkA+PNtTlEnLGr63omilZ3wZiRTIqjZzY/l4VQTYShvf4ZWPMZosUHnxrbTRnO9355aY4tuLLvqdnqV04OLWLr26jxWatIaH4UB7x0Zx9n1/yVogilP5PZrjZGARTGRVjt6qJ03t5uht0MbJEc7cmlbJu5/7u5rOPxM/RU4TvH3YR3Umr7x5C5cMim43oX38iGIhDy+kFZpACqVkLLBXMSSLDgb67uH/xGgHI6V0etqntIL2T/9HUk3M37NSsGLE/B9lHjcre9Qncg0e0xL9n6eex8enY1Zoxh/AZAkFFzHmvM50VHIe3Hu0fwanJuDypCJNJdjBt3jQzJuGiBH8EkQKsykJMT1NSkhWKsOQ4Oyw9kpyB8kQZ8ev2DpGZ8APzPrqo15DC4VZL946zU+eW45e58tCJYjAsdoiWghwmvkp9FS/EZ8VeLcU/r0xWxsUCfilo2l+f+Pe89TJfDcOaLTp1qlfoB/hCENHzmlmogb/lib+Pl8HjIDb4EvWxfx0AKUx5q7ZxiVY7WGfvbNQ= X-Microsoft-Antispam-PRVS: X-Exchange-Antispam-Report-Test: UriScan:; X-Exchange-Antispam-Report-CFA-Test: BCL:0; PCL:0; RULEID:(6040375)(601004)(2401047)(5005006)(8121501046)(3002001)(10201501046)(6041248)(20161123560025)(20161123562025)(20161123564025)(20161123555025)(6072148); SRVR:BLUPR07MB547; BCL:0; PCL:0; RULEID:; SRVR:BLUPR07MB547; X-Microsoft-Exchange-Diagnostics: 1; BLUPR07MB547; 4:d5SA2HlqmtcZd6l7anGHXsXaByvjVyFB8r6xn7v8b2d1MdhXIHJk0tlNLAycKqiIPiUQ6u0ufxJ2q49hqajm/zBuRObcrXvsRx9U0uxg7/Pw20/hA6KduPH7XNWPxfY2QLisEbsK6+MxC20s8XKvgrqf7nRfrcNn8RKzuK6T4S07IbCS8lRufxqrJk4qyGhLV7EitWGLApgFs8PabkrI7Y92zzRwPSjEzMDDI6LWBYr1TDuSgtDy9KuUjA9eMhaIepu959Sfhu+V4oB3LroWHK4sDylzPG4iikmnNp1saF57fxxaBls56uvpcWWgM8BU9j3yzWLrRZnDQ9ko3yhydWgtxwGzMDPMp2PRvMx8Ao7suJXxZvMo93/QKrA6z3oRLQ63FZsGypOX+Svx8R9J8IZfEmnRfE3MiKr5V8fQil+5MkH1oMI7zQa2+dPvAWkiAZuwjQGl+opYRqsdxr1s/RJqDonfACvOoMlmYnnfU0x/1ZEOk3IuVjlRATSd/xS3On1QDvTaxbG0wkr4vBrLP+0rKZ/U075Vy4sGIZZhsHJpuq0Ysy3aHO98OsB+bbPX X-Forefront-PRVS: 01480965DA X-Forefront-Antispam-Report: SFV:NSPM; 
SFS:(10009020)(4630300001)(6009001)(6069001)(7916002)(50944005)(189002)(199003)(86152002)(33646002)(97736004)(5001770100001)(39450400002)(107886002)(575784001)(4326007)(6506006)(6636002)(50466002)(6862003)(50226002)(6512006)(6486002)(4001430100002)(189998001)(39840400001)(39850400001)(2906002)(2950100002)(2876002)(6666003)(39410400001)(42882006)(5660300001)(733004)(68736007)(8676002)(81156014)(50986999)(3846002)(7736002)(76176999)(106356001)(81166006)(7846002)(92566002)(101416001)(105586002)(48376002)(36756003)(38730400001)(5003940100001)(47776003)(6116002)(42186005)(305945005)(66066001)(559001)(579004)(569005); DIR:OUT; SFP:1101; SCL:1; SRVR:BLUPR07MB547; H:localhost.localdomain; FPR:; SPF:None; PTR:InfoNoRecords; MX:1; A:1; LANG:en; Received-SPF: None (protection.outlook.com: cavium.com does not designate permitted sender hosts) X-Microsoft-Exchange-Diagnostics: =?us-ascii?Q?1; BLUPR07MB547; 23:hHOohHg0Rhx062iLs+kWXKtRsAgcBu9k6/SPdWoYVV?= =?us-ascii?Q?AiLyT+YMkCrBelzs7J6vFSmTD0j9KKpz66DANL7S125+T3EcVTRKEkOWW3MG?= =?us-ascii?Q?zwQNUUylaFG3733vczR/1SnlT0tXmB/rghB4/J2RrkrHWi1a/CF6dXsaUoVA?= =?us-ascii?Q?k4SVKAsZjWIVwm8jl/aSzCR7klGA+WmbyqMu8A5AOZvkNuthTLfIbB0L0NL7?= =?us-ascii?Q?K+U+R7TjVFe9Y6Kx7EmhGdn4+CLT4Hg2aCKyGe23kwhKNSe39RweMHBBCarw?= =?us-ascii?Q?lexNGw2PCVrAh0Vg3UEleJ8SiwRa9cYlJ3nRSR0NGfbMVW0njCR5Ojd1L2ij?= =?us-ascii?Q?+xX61YZ+3Cw4wsZ50udjwLqsC5ehF0QStJ79+kjJ0HWrB097jwPtQDJUPxzu?= =?us-ascii?Q?QU4EHw//WOPegFCsD6e4h/yGya3epHx7CA5aoXyJjI35GDrXVkn/SUNmewRl?= =?us-ascii?Q?d+BbKnQupVeYFxSwcySOldP8F/V+HRR5E4Gf3C2ks55eS/ml0x0F1P0STHq4?= =?us-ascii?Q?KtqJ5pk4V2uYv/mlMZQdsgFa4MYY0+W7tNRvSCIaYyeOnOwZ0aNdK5l4WRp0?= =?us-ascii?Q?frqH2fd2JZ6058cV6LXmpgYc0moNpbL8djCGebHI5M52xm9ej/NS9NBtYoEh?= =?us-ascii?Q?COTIVcOSmx51iBODoaLI4T9B8NL7uRztA+ZOZjE/9UQA9zuf+11mEv0Xlekp?= =?us-ascii?Q?bzqdJwvMlg87BswmlOJG3Bo4vFH0O8AJzE+FajwlefCwop37qkmc3XwyvIdc?= =?us-ascii?Q?Rc8gtx7biBpQwwM72IX7H9haWzIlEalqh3NZTJ94zlzrjrBlKWSlwHioooF9?= 
=?us-ascii?Q?X/4PKfUJowJUyZwd5n9wcWtZssf+kDNpKSnPrwhutAxIYZ2Jm3W/F2hh+z5F?= =?us-ascii?Q?AhiONQJkvULjKujENixfXLLT5q8KY9pLhep5106NCSlAmcOc9t35Kozj89JH?= =?us-ascii?Q?yN8iqYrpfZAP4M6FEc1lPKdv1lnWpA7GLSh7mpyzwSt2vWtttoYmH/pUWdMc?= =?us-ascii?Q?r5ZZHrVFQyRRTSi/8uQBNNataRpGeke5fK0Kxwn/AoUGSh2vSUXd+3oXV6tr?= =?us-ascii?Q?den7Q6qFCgYYpYAlHNjHTh6f2kLfHTKlk0KvVGGo0/yMR8Q/fGhfyzrJy4zW?= =?us-ascii?Q?mE+nLZ2+8Q5OIGuwhWabUrD8l8K+AjsQ7PtCow0vz6WABwa3+oY+yO3qSZGu?= =?us-ascii?Q?dbOAHlmlmxY491AlHoz39b+iCUuKwlymxWYOggOGoj9nC660hjMiynT21IGD?= =?us-ascii?Q?kANMHBfTM7p2mVDEC6Tv/q0tGtodKVaUlivGcmXNIGAKV0kFywHkwjcfIsEx?= =?us-ascii?Q?PYrSFBM4esKoH2aRXuBDYDpY/2g+Ro4xwqKtQRxcnm8fn9DJW8ezzgwzVwvX?= =?us-ascii?Q?73Y2gsyVhGpCokMNqpLJc21mb/Ruq4pIccN9tRu7a9a+Mx?= X-Microsoft-Exchange-Diagnostics: 1; BLUPR07MB547; 6:YBWPYH3z/io1g5gKaGTKCWf8jBCk6wqjaz/ZJ22wuuVuaT1ql3p5/4xNpGC/WonSi1HAbcDxUwmE8qv0/816K1KiZsOrqhHB14ta30Ac/9oC0XzUzo18/qF502B4w0+9ABwIC0tNbx5jC26F4/wUTi2VIKExoPTDcTqtE3G+sOE3nM2JbunJC5wLUeCh2XjO6k/qU7l/dTwQbcG0kiELe8SbIna+y46Jw2qf4E+C3MEasTzfcUJNODlrKCxm9i9pGc0fmHNbBgL3qEguikHFZIdPWrCGvwkI6qIw4cvJ0iME86Kaw2v1RmtwbOd+HaN5uhP5ZYLKkxFph9TCjtatLyvsovzwbjqEOzlxufzH3vf0+59r1y+OST44zYLrCLYU22PY7YiVIDMjKeCw/C+JbwsHr7TsV/B7p15IW//QXzk=; 5:/raqLF33FYOAdMdy6cZNvFF1CO+4foYzQOqDsNsUUgjgFUG+mQX3DDmoQarf5AQW4gf8VHktjFtkiCR7TIIr9zR/QEl20QPh2znPlvv4Q9X+nA8WYZvlZAxiUMbcbyro7OdY4V0xT3N1KvdwmwmJJw==; 24:xfdFQm2ggF+w3VgiXGl9xP68vTBS26DXLDpE/aRawnlXS8haaWY42oPs2umwm6ivld28xV+MtpXUvfyQfVx2DolejLdNaJfCouxEqiCmiKw= SpamDiagnosticOutput: 1:99 SpamDiagnosticMetadata: NSPM X-Microsoft-Exchange-Diagnostics: 1; BLUPR07MB547; 
7:A3uuPZef5MLpaqGU/2/32kkEP1keOG0tmC9Y0wKs7z7tnoEyrqZkL6Ic0BJh0hDhltHJ67KY2sX2BqFy+NvYX3KC+oTomnYpzAuGOsoM1fiKWDZvhNFo92nifLRwrboUSky4ftEFzANANnr5rkq0BjMFWsHnrJHiuGDb8uNSTcnZW8t544M0BCbinTMxs9t4LvJsIi2Hoq2HqClygWXSk3ehUuYy1hn2bCk2MQ+6tDbWj3tnSkSBABhyuncmz1CLt22N3Po0wLJvCxWXABKYoZ2vYerK+Wskd5vVDfZaSAUYntnlVQh/dU5PfTo39TnMI21jjbMv+ovY29Ld8AVIHUuOHkfkpktmCadh+/Tx8ViFAuIvnFMDbBTC7d1lemjZLLE+vfWAek7JsfNMdPBWKBPaiJ/Pjq3nEDA35FvcpzWY1AqkrQeLVIXdJ56utg9iLRP3Bt9sB43cu2RRSCjeEg== X-OriginatorOrg: caviumnetworks.com X-MS-Exchange-CrossTenant-OriginalArrivalTime: 06 Dec 2016 17:34:47.3052 (UTC) X-MS-Exchange-CrossTenant-FromEntityHeader: Hosted X-MS-Exchange-Transport-CrossTenantHeadersStamped: BLUPR07MB547 Subject: [dpdk-dev] [PATCH v2 04/12] crypto/armv8: Add AES+SHA256 crypto operations for ARMv8 X-BeenThere: dev@dpdk.org X-Mailman-Version: 2.1.15 Precedence: list List-Id: DPDK patches and discussions List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Tue, 06 Dec 2016 17:34:52 -0000 From: Zbigniew Bodek This patch adds AES-128-CBC + SHA256 low-level crypto operations for ARMv8 processors. The assembly code is a base for an optimized PMD and is currently excluded from the build. This code is optimized to provide a performance boost for combined operations such as encryption + HMAC generation and decryption + HMAC validation. 
Introduced operations add support for AES-128-CBC in combination with: SHA256 MAC, SHA256 HMAC Signed-off-by: Zbigniew Bodek Signed-off-by: Emery Davis --- drivers/crypto/armv8/asm/aes128cbc_sha256.S | 1544 ++++++++++++++++ drivers/crypto/armv8/asm/aes128cbc_sha256_hmac.S | 1879 ++++++++++++++++++++ drivers/crypto/armv8/asm/sha256_aes128cbc_dec.S | 1658 +++++++++++++++++ .../crypto/armv8/asm/sha256_hmac_aes128cbc_dec.S | 1832 +++++++++++++++++++ 4 files changed, 6913 insertions(+) create mode 100644 drivers/crypto/armv8/asm/aes128cbc_sha256.S create mode 100644 drivers/crypto/armv8/asm/aes128cbc_sha256_hmac.S create mode 100644 drivers/crypto/armv8/asm/sha256_aes128cbc_dec.S create mode 100644 drivers/crypto/armv8/asm/sha256_hmac_aes128cbc_dec.S diff --git a/drivers/crypto/armv8/asm/aes128cbc_sha256.S b/drivers/crypto/armv8/asm/aes128cbc_sha256.S new file mode 100644 index 0000000..caed87d --- /dev/null +++ b/drivers/crypto/armv8/asm/aes128cbc_sha256.S @@ -0,0 +1,1544 @@ +/* + * BSD LICENSE + * + * Copyright (C) Cavium networks Ltd. 2016. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Cavium networks nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "assym.s" + +/* + * Description: + * + * Combined Enc/Auth Primitive = aes128cbc/sha256 + * + * Operations: + * + * out = encrypt-AES128CBC(in) + * return_hash_ptr = SHA256(out) + * + * Prototype: + * void aes128cbc_sha256(uint8_t *csrc, uint8_t *cdst, + * uint8_t *dsrc, uint8_t *ddst, + * uint64_t len, crypto_arg_t *arg) + * + * Registers used: + * + * aes128cbc_sha256( + * csrc, x0 (cipher src address) + * cdst, x1 (cipher dst address) + * dsrc, x2 (digest src address - ignored) + * ddst, x3 (digest dst address) + * len, x4 (length) + * arg x5 : + * arg->cipher.key (round keys) + * arg->cipher.iv (initialization vector) + * ) + * + * Routine register definitions: + * + * v0 - v3 -- aes results + * v4 - v7 -- round consts for sha + * v8 - v18 -- round keys + * v19 - v20 -- round keys + * v21 -- ABCD tmp + * v22 -- sha working state ABCD (q22) + * v23 -- sha working state EFGH (q23) + * v24 -- regShaStateABCD + * v25 -- regShaStateEFGH + * v26 -- sha block 0 + * v27 -- sha block 1 + * v28 -- sha block 2 + * v29 -- sha block 3 + * v30 -- reserved + * v31 -- reserved + * + * Constraints: + * + * The variable "len" must be a multiple of 16, otherwise results + * 
are not defined. For AES partial blocks the user is required + * to pad the input so that its length is a multiple of 16 bytes. + * + * Short inputs (< 12 AES blocks) are not optimized. + */ + + .file "aes128cbc_sha256.S" + .text + .cpu generic+fp+simd+crypto+crc + .global aes128cbc_sha256 + .type aes128cbc_sha256,%function + + + .align 4 +.Lrcon: + .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 + .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 + .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 + .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 + .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc + .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da + .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 + .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 + .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 + .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 + .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 + .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 + .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 + .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 + .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 + .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 + +.Linit_sha_state: + .word 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a + .word 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 + +aes128cbc_sha256: +/* fetch args */ + ldr x2, [x5, #CIPHER_KEY] + ldr x5, [x5, #CIPHER_IV] + +/* + * init sha state, prefetch, check for small cases. + * Note that the output is prefetched as a load, for the in-place case + */ + prfm PLDL1KEEP,[x0,0] /* pref next aes_ptr_in */ + /* address of sha init state consts */ + adr x12,.Linit_sha_state + prfm PLDL1KEEP,[x1,0] /* pref next aes_ptr_out */ + lsr x10,x4,4 /* aes_blocks = len/16 */ + cmp x10,12 /* no main loop if <12 */ + ld1 {v24.4s, v25.4s},[x12] /* init ABCD, EFGH. 
(2 cycs) */ + b.lt .Lshort_cases /* branch if < 12 */ + + /* protect registers */ + sub sp,sp,8*16 + mov x9,sp /* copy for address mode */ + st1 {v8.16b - v11.16b},[x9],4*16 + st1 {v12.16b - v15.16b},[x9] + + /* proceed */ + ld1 {v3.16b},[x5] /* get 1st ivec */ + /* read first aes block, bump aes_ptr_in */ + ld1 {v0.16b},[x0],16 + mov x11,x4 /* len -> x11 needed at end */ + lsr x12,x11,6 /* total_blocks */ + +/* + * now we can do the loop prolog, 1st aes sequence of 4 blocks + */ + ld1 {v8.16b},[x2],16 /* rk[0] */ + ld1 {v9.16b},[x2],16 /* rk[1] */ + eor v0.16b,v0.16b,v3.16b /* xor w/ ivec (modeop) */ + ld1 {v10.16b},[x2],16 /* rk[2] */ + +/* aes xform 0 */ + aese v0.16b,v8.16b + prfm PLDL1KEEP,[x0,64] /* pref next aes_ptr_in */ + aesmc v0.16b,v0.16b + ld1 {v11.16b},[x2],16 /* rk[3] */ + aese v0.16b,v9.16b + prfm PLDL1KEEP,[x1,64] /* pref next aes_ptr_out */ + /* base address for sha round consts */ + adr x8,.Lrcon + aesmc v0.16b,v0.16b + ld1 {v12.16b},[x2],16 /* rk[4] */ + aese v0.16b,v10.16b + /* read next aes block, update aes_ptr_in */ + ld1 {v1.16b},[x0],16 + aesmc v0.16b,v0.16b + ld1 {v13.16b},[x2],16 /* rk[5] */ + aese v0.16b,v11.16b + aesmc v0.16b,v0.16b + ld1 {v14.16b},[x2],16 /* rk[6] */ + aese v0.16b,v12.16b + aesmc v0.16b,v0.16b + ld1 {v15.16b},[x2],16 /* rk[7] */ + aese v0.16b,v13.16b + aesmc v0.16b,v0.16b + ld1 {v16.16b},[x2],16 /* rk[8] */ + aese v0.16b,v14.16b + aesmc v0.16b,v0.16b + ld1 {v17.16b},[x2],16 /* rk[9] */ + aese v0.16b,v15.16b + aesmc v0.16b,v0.16b + ld1 {v18.16b},[x2],16 /* rk[10] */ + aese v0.16b,v16.16b + mov x4,x1 /* sha_ptr_in = aes_ptr_out */ + aesmc v0.16b,v0.16b + aese v0.16b,v17.16b + eor v0.16b,v0.16b,v18.16b /* res 0 */ + + eor v1.16b,v1.16b,v0.16b /* xor w/ ivec (modeop) */ + +/* aes xform 1 */ + aese v1.16b,v8.16b + /* read next aes block, update aes_ptr_in */ + ld1 {v2.16b},[x0],16 + aesmc v1.16b,v1.16b + aese v1.16b,v9.16b + prfm PLDL1KEEP,[x8,0*64] /* rcon */ + aesmc v1.16b,v1.16b + aese v1.16b,v10.16b + aesmc 
v1.16b,v1.16b + aese v1.16b,v11.16b + /* save aes res, bump aes_out_ptr */ + st1 {v0.16b},[x1],16 + aesmc v1.16b,v1.16b + aese v1.16b,v12.16b + prfm PLDL1KEEP,[x8,2*64] /* rcon */ + aesmc v1.16b,v1.16b + aese v1.16b,v13.16b + aesmc v1.16b,v1.16b + aese v1.16b,v14.16b + prfm PLDL1KEEP,[x8,4*64] /* rcon */ + aesmc v1.16b,v1.16b + aese v1.16b,v15.16b + aesmc v1.16b,v1.16b + aese v1.16b,v16.16b + prfm PLDL1KEEP,[x8,6*64] /* rcon */ + aesmc v1.16b,v1.16b + aese v1.16b,v17.16b + prfm PLDL1KEEP,[x8,8*64] /* rcon */ + eor v1.16b,v1.16b,v18.16b /* res 1 */ + + eor v2.16b,v2.16b,v1.16b /* xor w/ ivec (modeop) */ + +/* aes xform 2 */ + aese v2.16b,v8.16b + /* read next aes block, update aes_ptr_in */ + ld1 {v3.16b},[x0],16 + aesmc v2.16b,v2.16b + aese v2.16b,v9.16b + mov x2,x0 /* lead_ptr = aes_ptr_in */ + aesmc v2.16b,v2.16b + aese v2.16b,v10.16b + prfm PLDL1KEEP,[x8,10*64] /* rcon */ + aesmc v2.16b,v2.16b + aese v2.16b,v11.16b + /* save aes res, bump aes_out_ptr */ + st1 {v1.16b},[x1],16 + aesmc v2.16b,v2.16b + aese v2.16b,v12.16b + prfm PLDL1KEEP,[x8,12*64] /* rcon */ + aesmc v2.16b,v2.16b + aese v2.16b,v13.16b + aesmc v2.16b,v2.16b + aese v2.16b,v14.16b + prfm PLDL1KEEP,[x8,14*64] /* rcon */ + aesmc v2.16b,v2.16b + aese v2.16b,v15.16b + aesmc v2.16b,v2.16b + aese v2.16b,v16.16b + aesmc v2.16b,v2.16b + aese v2.16b,v17.16b + eor v2.16b,v2.16b,v18.16b /* res 2 */ + + eor v3.16b,v3.16b,v2.16b /* xor w/ ivec (modeop) */ + +/* aes xform 3 */ + aese v3.16b,v8.16b + aesmc v3.16b,v3.16b + aese v3.16b,v9.16b + aesmc v3.16b,v3.16b + aese v3.16b,v10.16b + aesmc v3.16b,v3.16b + aese v3.16b,v11.16b + /* save aes res, bump aes_out_ptr */ + st1 {v2.16b},[x1],16 + aesmc v3.16b,v3.16b + aese v3.16b,v12.16b + aesmc v3.16b,v3.16b + aese v3.16b,v13.16b + aesmc v3.16b,v3.16b + aese v3.16b,v14.16b + aesmc v3.16b,v3.16b + aese v3.16b,v15.16b + aesmc v3.16b,v3.16b + aese v3.16b,v16.16b + sub x7,x12,1 /* main_blocks = total_blocks - 1 */ + and x13,x10,3 /* aes_blocks_left */ + aesmc v3.16b,v3.16b 
+ aese v3.16b,v17.16b + eor v3.16b,v3.16b,v18.16b /* res 3 */ +/* + * Note, aes_blocks_left := number after the main (sha) + * block is done. Can be 0 + */ +/* save aes res, bump aes_out_ptr */ + st1 {v3.16b},[x1],16 +/* + * main combined loop CBC + */ +.Lmain_loop: + +/* + * because both mov, rev32 and eor have a busy cycle, + * this takes longer than it looks. + * Thats OK since there are 6 cycles before we can use + * the load anyway; so this goes as fast as it can without + * SW pipelining (too complicated given the code size) + */ + rev32 v26.16b,v0.16b /* fix endian w0, aes res 0 */ +/* next aes block, update aes_ptr_in */ + ld1 {v0.16b},[x0],16 + mov v22.16b,v24.16b /* working ABCD <- ABCD */ + prfm PLDL1KEEP,[x2,64] /* pref next lead_ptr */ + rev32 v27.16b,v1.16b /* fix endian w1, aes res 1 */ +/* pref next aes_ptr_out, streaming */ + prfm PLDL1KEEP,[x1,64] + mov v23.16b,v25.16b /* working EFGH <- EFGH */ + mov x9,x8 /* top of rcon */ + ld1 {v4.16b},[x9],16 /* key0 */ + eor v0.16b,v0.16b,v3.16b /* xor w/ prev value */ + ld1 {v5.16b},[x9],16 /* key1 */ +/* + * aes xform 0, sha quad 0 + */ + aese v0.16b,v8.16b + ld1 {v6.16b},[x9],16 /* key2 */ + rev32 v28.16b,v2.16b /* fix endian w2, aes res 2 */ + ld1 {v7.16b},[x9],16 /* key3 */ + aesmc v0.16b,v0.16b + /* read next aes block, update aes_ptr_in */ + ld1 {v1.16b},[x0],16 + aese v0.16b,v9.16b + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + aesmc v0.16b,v0.16b + sha256su0 v26.4s,v27.4s + aese v0.16b,v10.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + aesmc v0.16b,v0.16b + aese v0.16b,v11.16b + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + /* no place to get rid of this stall */ + rev32 v29.16b,v3.16b /* fix endian w3, aes res 3 */ + aesmc v0.16b,v0.16b + sha256h2 q23, q21, v4.4s + aese v0.16b,v12.16b + sha256su1 v26.4s,v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + ld1 {v4.16b},[x9],16 /* key4 */ + aesmc v0.16b,v0.16b + sha256su0 v27.4s,v28.4s + aese v0.16b,v13.16b + sha256h q22, q23, 
v5.4s + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + aesmc v0.16b,v0.16b + sha256h2 q23, q21, v5.4s + aese v0.16b,v14.16b + ld1 {v5.16b},[x9],16 /* key5 */ + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + sha256su1 v27.4s,v29.4s,v26.4s + aesmc v0.16b,v0.16b + sha256su0 v28.4s,v29.4s + aese v0.16b,v15.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + aesmc v0.16b,v0.16b + sha256h2 q23, q21, v6.4s + aese v0.16b,v16.16b + sha256su1 v28.4s,v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256su0 v29.4s,v26.4s + aesmc v0.16b,v0.16b + sha256h q22, q23, v7.4s + aese v0.16b,v17.16b + sha256h2 q23, q21, v7.4s + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + sha256su1 v29.4s,v27.4s,v28.4s + eor v0.16b,v0.16b,v18.16b /* final res 0 */ + ld1 {v6.16b},[x9],16 /* key6 */ + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + +/* aes xform 1, sha quad 1 */ + sha256su0 v26.4s,v27.4s + eor v1.16b,v1.16b,v0.16b /* mode op 1 xor w/prev value */ + ld1 {v7.16b},[x9],16 /* key7 */ + mov v21.16b, v22.16b /* copy abcd */ + /* save aes res, bump aes_out_ptr */ + st1 {v0.16b},[x1],16 + aese v1.16b,v8.16b + sha256h q22, q23, v4.4s + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + sha256h2 q23, q21, v4.4s + aesmc v1.16b,v1.16b + sha256su1 v26.4s,v28.4s,v29.4s + aese v1.16b,v9.16b + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + aesmc v1.16b,v1.16b + sha256h2 q23, q21, v5.4s + aese v1.16b,v10.16b + /* read next aes block, update aes_ptr_in */ + ld1 {v2.16b},[x0],16 + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + sha256su1 v27.4s,v29.4s,v26.4s + aesmc v1.16b,v1.16b + ld1 {v4.16b},[x9],16 /* key4 */ + aese v1.16b,v11.16b + ld1 {v5.16b},[x9],16 /* key5 */ + mov v21.16b, v22.16b /* copy abcd */ + sha256su0 v28.4s,v29.4s + aesmc v1.16b,v1.16b + sha256h q22, q23, v6.4s + aese v1.16b,v12.16b + sha256h2 q23, q21, v6.4s + ld1 {v6.16b},[x9],16 /* key6 */ + sha256su1 v28.4s,v26.4s,v27.4s + aesmc v1.16b,v1.16b + sha256su0 v29.4s,v26.4s + aese 
v1.16b,v13.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + aesmc v1.16b,v1.16b + sha256h2 q23, q21, v7.4s + aese v1.16b,v14.16b + ld1 {v7.16b},[x9],16 /* key7 */ + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + sha256su1 v29.4s,v27.4s,v28.4s + aesmc v1.16b,v1.16b + add x2,x2,64 /* bump lead_ptr */ + aese v1.16b,v15.16b + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + aesmc v1.16b,v1.16b + aese v1.16b,v16.16b + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + aesmc v1.16b,v1.16b + aese v1.16b,v17.16b + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + eor v1.16b,v1.16b,v18.16b /* res xf 1 */ + +/* mode op 2 */ + eor v2.16b,v2.16b,v1.16b /* mode of 2 xor w/prev value */ + +/* aes xform 2, sha quad 2 */ + sha256su0 v26.4s,v27.4s + aese v2.16b,v8.16b + /* save aes res, bump aes_out_ptr */ + st1 {v1.16b},[x1],16 + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + aesmc v2.16b,v2.16b + sha256h2 q23, q21, v4.4s + aese v2.16b,v9.16b + sha256su1 v26.4s,v28.4s,v29.4s + ld1 {v4.16b},[x9],16 /* key4 */ + aesmc v2.16b,v2.16b + sha256su0 v27.4s,v28.4s + aese v2.16b,v10.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + aesmc v2.16b,v2.16b + sha256h2 q23, q21, v5.4s + aese v2.16b,v11.16b + sha256su1 v27.4s,v29.4s,v26.4s + ld1 {v5.16b},[x9],16 /* key5 */ + sha256su0 v28.4s,v29.4s + aesmc v2.16b,v2.16b + aese v2.16b,v12.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + aesmc v2.16b,v2.16b + sha256h2 q23, q21, v6.4s + aese v2.16b,v13.16b + sha256su1 v28.4s,v26.4s,v27.4s + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + sha256su0 v29.4s,v26.4s + aesmc v2.16b,v2.16b + /* read next aes block, update aes_ptr_in */ + ld1 {v3.16b},[x0],16 + aese v2.16b,v14.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + aesmc v2.16b,v2.16b + sha256h2 q23, q21, v7.4s + aese v2.16b,v15.16b + sha256su1 v29.4s,v27.4s,v28.4s + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + ld1 {v6.16b},[x9],16 /* key6 */ + aesmc v2.16b,v2.16b + ld1 
{v7.16b},[x9],16 /* key7 */ + aese v2.16b,v16.16b + aesmc v2.16b,v2.16b + aese v2.16b,v17.16b + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + eor v2.16b,v2.16b,v18.16b /* res 2 */ + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + +/* mode op 3 */ + eor v3.16b,v3.16b,v2.16b /* xor w/ prev value */ + +/* aes xform 3, sha quad 3 (hash only) */ + aese v3.16b,v8.16b + aesmc v3.16b,v3.16b + /* save aes res, bump aes_out_ptr */ + st1 {v2.16b},[x1],16 + aese v3.16b,v9.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + aesmc v3.16b,v3.16b + sha256h2 q23, q21, v4.4s + aese v3.16b,v10.16b + aesmc v3.16b,v3.16b + aese v3.16b,v11.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + aesmc v3.16b,v3.16b + sha256h2 q23, q21, v5.4s + aese v3.16b,v12.16b + aesmc v3.16b,v3.16b + aese v3.16b,v13.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + aesmc v3.16b,v3.16b + sha256h2 q23, q21, v6.4s + aese v3.16b,v14.16b + sub x7,x7,1 /* dec block count */ + aesmc v3.16b,v3.16b + aese v3.16b,v15.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + aesmc v3.16b,v3.16b + sha256h2 q23, q21, v7.4s + aese v3.16b,v16.16b + aesmc v3.16b,v3.16b + aese v3.16b,v17.16b + add v24.4s,v24.4s,v22.4s /* ABCD += working copy */ + eor v3.16b,v3.16b,v18.16b /* aes res 3 */ + add v25.4s,v25.4s,v23.4s /* EFGH += working copy */ + /* save aes res, bump aes_out_ptr */ + st1 {v3.16b},[x1],16 + cbnz x7,.Lmain_loop /* loop if more to do */ +/* + * epilog, process remaining aes blocks and b-2 sha block + * do this inline (no loop) to overlap with the sha part + * note there are 0-3 aes blocks left. 
+ */ + rev32 v26.16b,v0.16b /* fix endian w0 */ + rev32 v27.16b,v1.16b /* fix endian w1 */ + rev32 v28.16b,v2.16b /* fix endian w2 */ + rev32 v29.16b,v3.16b /* fix endian w3 */ + mov v22.16b,v24.16b /* working ABCD <- ABCD */ + mov v23.16b,v25.16b /* working EFGH <- EFGH */ + cbz x13, .Lbm2fromQ0 /* skip if none left */ + subs x14,x13,1 /* local copy of aes_blocks_left */ + +/* + * mode op 0 + * read next aes block, update aes_ptr_in + */ + ld1 {v0.16b},[x0],16 + mov x9,x8 /* top of rcon */ + ld1 {v4.16b},[x9],16 /* key0 */ + ld1 {v5.16b},[x9],16 /* key1 */ + ld1 {v6.16b},[x9],16 /* key2 */ + ld1 {v7.16b},[x9],16 /* key3 */ + eor v0.16b,v0.16b,v3.16b /* xor w/ prev value */ + +/* aes xform 0, sha quad 0 */ + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + aese v0.16b,v8.16b + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + aesmc v0.16b,v0.16b + sha256su0 v26.4s,v27.4s + aese v0.16b,v9.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + aesmc v0.16b,v0.16b + sha256h2 q23, q21, v4.4s + aese v0.16b,v10.16b + sha256su1 v26.4s,v28.4s,v29.4s + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + sha256su0 v27.4s,v28.4s + aesmc v0.16b,v0.16b + aese v0.16b,v11.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + aesmc v0.16b,v0.16b + sha256h2 q23, q21, v5.4s + aese v0.16b,v12.16b + sha256su1 v27.4s,v29.4s,v26.4s + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + sha256su0 v28.4s,v29.4s + aesmc v0.16b,v0.16b + aese v0.16b,v13.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + aesmc v0.16b,v0.16b + sha256h2 q23, q21, v6.4s + aese v0.16b,v14.16b + sha256su1 v28.4s,v26.4s,v27.4s + sha256su0 v29.4s,v26.4s + aesmc v0.16b,v0.16b + aese v0.16b,v15.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + aesmc v0.16b,v0.16b + aese v0.16b,v16.16b + sha256h2 q23, q21, v7.4s + sha256su1 v29.4s,v27.4s,v28.4s + aesmc v0.16b,v0.16b + aese v0.16b,v17.16b + eor v0.16b,v0.16b,v18.16b + /* save aes res, bump aes_out_ptr */ + st1 
{v0.16b},[x1],16 + /* if aes_blocks_left_count == 0 */ + beq .Lbm2fromQ1 +/* + * mode op 1 + * read next aes block, update aes_ptr_in + */ + ld1 {v1.16b},[x0],16 + ld1 {v4.16b},[x9],16 /* key4 */ + ld1 {v5.16b},[x9],16 /* key5 */ + ld1 {v6.16b},[x9],16 /* key6 */ + ld1 {v7.16b},[x9],16 /* key7 */ + + eor v1.16b,v1.16b,v0.16b /* xor w/prev value */ + +/* aes xform 1, sha quad 1 */ + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + aese v1.16b,v8.16b + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + aesmc v1.16b,v1.16b + sha256su0 v26.4s,v27.4s + aese v1.16b,v9.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + aesmc v1.16b,v1.16b + sha256h2 q23, q21, v4.4s + aese v1.16b,v10.16b + sha256su1 v26.4s,v28.4s,v29.4s + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + sha256su0 v27.4s,v28.4s + aesmc v1.16b,v1.16b + subs x14,x14,1 /* dec counter */ + aese v1.16b,v11.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + aesmc v1.16b,v1.16b + sha256h2 q23, q21, v5.4s + aese v1.16b,v12.16b + sha256su1 v27.4s,v29.4s,v26.4s + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + sha256su0 v28.4s,v29.4s + aesmc v1.16b,v1.16b + aese v1.16b,v13.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + aesmc v1.16b,v1.16b + sha256h2 q23, q21, v6.4s + aese v1.16b,v14.16b + sha256su1 v28.4s,v26.4s,v27.4s + sha256su0 v29.4s,v26.4s + aesmc v1.16b,v1.16b + aese v1.16b,v15.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + aesmc v1.16b,v1.16b + sha256h2 q23, q21, v7.4s + aese v1.16b,v16.16b + sha256su1 v29.4s,v27.4s,v28.4s + aesmc v1.16b,v1.16b + aese v1.16b,v17.16b + eor v1.16b,v1.16b,v18.16b + /* save aes res, bump aes_out_ptr */ + st1 {v1.16b},[x1],16 + /* if aes_blocks_left_count == 0 */ + beq .Lbm2fromQ2 + +/* + * mode op 2 + * read next aes block, update aes_ptr_in + */ + ld1 {v2.16b},[x0],16 + ld1 {v4.16b},[x9],16 /* key4 */ + ld1 {v5.16b},[x9],16 /* key5 */ + ld1 {v6.16b},[x9],16 /* key6 */ + ld1 {v7.16b},[x9],16 /* key7 */ + eor 
v2.16b,v2.16b,v1.16b /* xor w/ prev value */ + +/* aes xform 2, sha quad 2 */ + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + aese v2.16b,v8.16b + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + aesmc v2.16b,v2.16b + sha256su0 v26.4s,v27.4s + aese v2.16b,v9.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + aesmc v2.16b,v2.16b + sha256h2 q23, q21, v4.4s + aese v2.16b,v10.16b + sha256su1 v26.4s,v28.4s,v29.4s + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + sha256su0 v27.4s,v28.4s + aesmc v2.16b,v2.16b + aese v2.16b,v11.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + aesmc v2.16b,v2.16b + sha256h2 q23, q21, v5.4s + aese v2.16b,v12.16b + sha256su1 v27.4s,v29.4s,v26.4s + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + sha256su0 v28.4s,v29.4s + aesmc v2.16b,v2.16b + aese v2.16b,v13.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + aesmc v2.16b,v2.16b + sha256h2 q23, q21, v6.4s + aese v2.16b,v14.16b + sha256su1 v28.4s,v26.4s,v27.4s + sha256su0 v29.4s,v26.4s + aesmc v2.16b,v2.16b + aese v2.16b,v15.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + aesmc v2.16b,v2.16b + aese v2.16b,v16.16b + sha256h2 q23, q21, v7.4s + sha256su1 v29.4s,v27.4s,v28.4s + aesmc v2.16b,v2.16b + aese v2.16b,v17.16b + eor v2.16b,v2.16b,v18.16b + /* save aes res, bump aes_out_ptr */ + st1 {v2.16b},[x1],16 + /* join common code at Quad 3 */ + b .Lbm2fromQ3 + +/* + * now there is the b-2 sha block before the final one. Execution takes over + * in the appropriate part of this depending on how many aes blocks were left. + * If there were none, the whole thing is executed. 
+ */ +/* quad 0 */ +.Lbm2fromQ0: + mov x9,x8 /* top of rcon */ + ld1 {v4.16b},[x9],16 /* key0 */ + ld1 {v5.16b},[x9],16 /* key1 */ + ld1 {v6.16b},[x9],16 /* key2 */ + ld1 {v7.16b},[x9],16 /* key3 */ + + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + sha256su1 v26.4s,v28.4s,v29.4s + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + sha256su1 v27.4s,v29.4s,v26.4s + + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + sha256su1 v28.4s,v26.4s,v27.4s + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + sha256su1 v29.4s,v27.4s,v28.4s + +/* quad 1 */ +.Lbm2fromQ1: + ld1 {v4.16b},[x9],16 /* key4 */ + ld1 {v5.16b},[x9],16 /* key5 */ + ld1 {v6.16b},[x9],16 /* key6 */ + ld1 {v7.16b},[x9],16 /* key7 */ + + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + sha256su1 v26.4s,v28.4s,v29.4s + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + sha256su1 v27.4s,v29.4s,v26.4s + + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + sha256su1 v28.4s,v26.4s,v27.4s + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + sha256su1 v29.4s,v27.4s,v28.4s + +/* quad 2 */ +.Lbm2fromQ2: + ld1 {v4.16b},[x9],16 
/* key4 */ + ld1 {v5.16b},[x9],16 /* key5 */ + ld1 {v6.16b},[x9],16 /* key6 */ + ld1 {v7.16b},[x9],16 /* key7 */ + + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + sha256su1 v26.4s,v28.4s,v29.4s + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + sha256su1 v27.4s,v29.4s,v26.4s + + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + sha256su1 v28.4s,v26.4s,v27.4s + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + sha256su1 v29.4s,v27.4s,v28.4s + +/* quad 3 */ +.Lbm2fromQ3: + ld1 {v4.16b},[x9],16 /* key4 */ + ld1 {v5.16b},[x9],16 /* key5 */ + ld1 {v6.16b},[x9],16 /* key6 */ + ld1 {v7.16b},[x9],16 /* key7 */ + + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + sha256h2 q23, q21, v4.4s + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + eor v26.16b,v26.16b,v26.16b /* zero reg */ + sha256h2 q23, q21, v5.4s + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + eor v27.16b,v27.16b,v27.16b /* zero reg */ + sha256h2 q23, q21, v6.4s + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + eor v28.16b,v28.16b,v28.16b /* zero reg */ + sha256h2 q23, q21, v7.4s + + add v24.4s,v24.4s,v22.4s /* ABCD += working copy */ + add v25.4s,v25.4s,v23.4s /* EFGH += working copy */ + +/* + * now we can do the final block, either all padding or 1-3 aes blocks + * len in x11, aes_blocks_left in x13. 
should move the aes data setup of this + * to the last aes bit. + */ + + mov v22.16b,v24.16b /* working ABCD <- ABCD */ + mov w15,0x80 /* that's the 1 of the pad */ + lsr x12,x11,32 /* len_hi */ + and x9,x11,0xffffffff /* len_lo */ + mov v23.16b,v25.16b /* working EFGH <- EFGH */ + mov v26.b[0],w15 /* assume block 0 is dst */ + lsl x12,x12,3 /* len_hi in bits */ + lsl x9,x9,3 /* len_lo in bits */ + eor v29.16b,v29.16b,v29.16b /* zero reg */ +/* + * places the 0x80 in the correct block, copies the appropriate data + */ + cbz x13,.Lpad100 /* no data to get */ + mov v26.16b,v0.16b + sub x14,x13,1 /* dec amount left */ + mov v27.b[0],w15 /* assume block 1 is dst */ + cbz x14,.Lpad100 /* branch if done */ + mov v27.16b,v1.16b + sub x14,x14,1 /* dec amount left */ + mov v28.b[0],w15 /* assume block 2 is dst */ + cbz x14,.Lpad100 /* branch if done */ + mov v28.16b,v2.16b + mov v29.b[3],w15 /* block 3, doesn't get rev'd */ +/* + * get the len_hi, len_lo in bits according to + * len_hi = (uint32_t)(((len>>32) & 0xffffffff)<<3); (x12) + * len_lo = (uint32_t)((len & 0xffffffff)<<3); (x9) + * this is done before the if/else above + */ +.Lpad100: + mov v29.s[3],w9 /* len_lo */ + mov v29.s[2],w12 /* len_hi */ +/* + * note that q29 is already built in the correct format, so no swap required + */ + rev32 v26.16b,v26.16b /* fix endian w0 */ + rev32 v27.16b,v27.16b /* fix endian w1 */ + rev32 v28.16b,v28.16b /* fix endian w2 */ + +/* + * do last sha of pad block + */ + +/* quad 0 */ + mov x9,x8 /* top of rcon */ + ld1 {v4.16b},[x9],16 /* key0 */ + ld1 {v5.16b},[x9],16 /* key1 */ + ld1 {v6.16b},[x9],16 /* key2 */ + ld1 {v7.16b},[x9],16 /* key3 */ + + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + ld1 {v4.16b},[x9],16 /* key4 */ + sha256su1 v26.4s,v28.4s,v29.4s + + sha256su0 
v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + ld1 {v5.16b},[x9],16 /* key5 */ + sha256su1 v27.4s,v29.4s,v26.4s + + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + sha256h2 q23, q21, v6.4s + ld1 {v6.16b},[x9],16 /* key6 */ + sha256su1 v28.4s,v26.4s,v27.4s + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + sha256h2 q23, q21, v7.4s + ld1 {v7.16b},[x9],16 /* key7 */ + sha256su1 v29.4s,v27.4s,v28.4s + +/* quad 1 */ + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + ld1 {v4.16b},[x9],16 /* key4 */ + sha256su1 v26.4s,v28.4s,v29.4s + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + ld1 {v5.16b},[x9],16 /* key5 */ + sha256su1 v27.4s,v29.4s,v26.4s + + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + sha256h2 q23, q21, v6.4s + ld1 {v6.16b},[x9],16 /* key6 */ + sha256su1 v28.4s,v26.4s,v27.4s + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + sha256h2 q23, q21, v7.4s + ld1 {v7.16b},[x9],16 /* key7 */ + sha256su1 v29.4s,v27.4s,v28.4s + +/* quad 2 */ + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + ld1 {v4.16b},[x9],16 /* key4 */ + sha256su1 v26.4s,v28.4s,v29.4s + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + ld1 
{v5.16b},[x9],16 /* key5 */ + sha256su1 v27.4s,v29.4s,v26.4s + + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + sha256h2 q23, q21, v6.4s + ld1 {v6.16b},[x9],16 /* key6 */ + sha256su1 v28.4s,v26.4s,v27.4s + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + sha256h2 q23, q21, v7.4s + ld1 {v7.16b},[x9],16 /* key7 */ + sha256su1 v29.4s,v27.4s,v28.4s + +/* quad 3 */ + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + sha256h2 q23, q21, v4.4s + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + sha256h2 q23, q21, v5.4s + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + + mov x9,sp + add v24.4s,v24.4s,v22.4s /* ABCD += working copy */ + add sp,sp,8*16 + add v25.4s,v25.4s,v23.4s /* EFGH += working copy */ +/* + * now we just have to put this into big endian and store! + */ + ld1 {v8.16b - v11.16b},[x9],4*16 + rev32 v24.16b,v24.16b /* big endian ABCD */ + ld1 {v12.16b - v15.16b},[x9] + rev32 v25.16b,v25.16b /* big endian EFGH */ + + st1 {v24.4s,v25.4s},[x3] /* save them both */ + ret + +/* + * These are the short cases (less efficient), here used for 1-11 aes blocks. 
+ * x10 = aes_blocks + */ +.Lshort_cases: + sub sp,sp,8*16 + mov x9,sp /* copy for address mode */ + st1 {v8.16b - v11.16b},[x9],4*16 + st1 {v12.16b - v15.16b},[x9] + + ld1 {v3.16b},[x5] /* get ivec */ + ld1 {v8.16b-v11.16b},[x2],64 /* rk[0-3] */ + ld1 {v12.16b-v15.16b},[x2],64 /* rk[4-7] */ + ld1 {v16.16b-v18.16b},[x2] /* rk[8-10] */ + adr x8,.Lrcon /* rcon */ + mov w15,0x80 /* sha padding word */ + + lsl x11,x10,4 /* len = aes_blocks*16 */ + + eor v26.16b,v26.16b,v26.16b /* zero sha src 0 */ + eor v27.16b,v27.16b,v27.16b /* zero sha src 1 */ + eor v28.16b,v28.16b,v28.16b /* zero sha src 2 */ + eor v29.16b,v29.16b,v29.16b /* zero sha src 3 */ +/* + * the idea in the short loop (at least 1) is to break out with the padding + * already in place excepting the final word. + */ +.Lshort_loop: + /* read next aes block, update aes_ptr_in */ + ld1 {v0.16b},[x0],16 + eor v0.16b,v0.16b,v3.16b /* xor w/prev value */ + +/* aes xform 0 */ + aese v0.16b,v8.16b + aesmc v0.16b,v0.16b + aese v0.16b,v9.16b + aesmc v0.16b,v0.16b + aese v0.16b,v10.16b + aesmc v0.16b,v0.16b + aese v0.16b,v11.16b + aesmc v0.16b,v0.16b + aese v0.16b,v12.16b + aesmc v0.16b,v0.16b + aese v0.16b,v13.16b + aesmc v0.16b,v0.16b + aese v0.16b,v14.16b + aesmc v0.16b,v0.16b + aese v0.16b,v15.16b + aesmc v0.16b,v0.16b + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + aese v0.16b,v17.16b + eor v0.16b,v0.16b,v18.16b + /* assume this was final block */ + mov v27.b[3],w15 + /* save aes res, bump aes_out_ptr */ + st1 {v0.16b},[x1],16 + /* load res to sha 0, endian swap */ + rev32 v26.16b,v0.16b + sub x10,x10,1 /* dec num_blocks */ + cbz x10,.Lpost_short_loop /* break if no more */ + /* read next aes block, update aes_ptr_in */ + ld1 {v1.16b},[x0],16 + eor v1.16b,v1.16b,v0.16b /* xor w/ prev value */ + +/* aes xform 1 */ + aese v1.16b,v8.16b + aesmc v1.16b,v1.16b + aese v1.16b,v9.16b + aesmc v1.16b,v1.16b + aese v1.16b,v10.16b + aesmc v1.16b,v1.16b + aese v1.16b,v11.16b + aesmc v1.16b,v1.16b + aese v1.16b,v12.16b + aesmc 
v1.16b,v1.16b + aese v1.16b,v13.16b + aesmc v1.16b,v1.16b + aese v1.16b,v14.16b + aesmc v1.16b,v1.16b + aese v1.16b,v15.16b + aesmc v1.16b,v1.16b + aese v1.16b,v16.16b + aesmc v1.16b,v1.16b + aese v1.16b,v17.16b + eor v1.16b,v1.16b,v18.16b + /* assume this was final block */ + mov v28.b[3],w15 + /* save aes res, bump aes_out_ptr */ + st1 {v1.16b},[x1],16 + rev32 v27.16b,v1.16b /* load res to sha 0, endian swap */ + sub x10,x10,1 /* dec num_blocks */ + cbz x10,.Lpost_short_loop /* break if no more */ + /* read next aes block, update aes_ptr_in */ + ld1 {v2.16b},[x0],16 + eor v2.16b,v2.16b,v1.16b /* xor w/ prev value */ + +/* aes xform 2 */ + aese v2.16b,v8.16b + aesmc v2.16b,v2.16b + aese v2.16b,v9.16b + aesmc v2.16b,v2.16b + aese v2.16b,v10.16b + aesmc v2.16b,v2.16b + aese v2.16b,v11.16b + aesmc v2.16b,v2.16b + aese v2.16b,v12.16b + aesmc v2.16b,v2.16b + aese v2.16b,v13.16b + aesmc v2.16b,v2.16b + aese v2.16b,v14.16b + aesmc v2.16b,v2.16b + aese v2.16b,v15.16b + aesmc v2.16b,v2.16b + aese v2.16b,v16.16b + aesmc v2.16b,v2.16b + aese v2.16b,v17.16b + eor v2.16b,v2.16b,v18.16b + /* assume this was final block */ + mov v29.b[3],w15 + /* save aes res, bump aes_out_ptr */ + st1 {v2.16b},[x1],16 + rev32 v28.16b,v2.16b /* load res to sha 0, endian swap */ + sub x10,x10,1 /* dec num_blocks */ + cbz x10,.Lpost_short_loop /* break if no more */ + /* read next aes block, update aes_ptr_in */ + ld1 {v3.16b},[x0],16 + eor v3.16b,v3.16b,v2.16b /* xor w/prev value */ + +/* aes xform 3 */ + aese v3.16b,v8.16b + aesmc v3.16b,v3.16b + aese v3.16b,v9.16b + aesmc v3.16b,v3.16b + aese v3.16b,v10.16b + aesmc v3.16b,v3.16b + aese v3.16b,v11.16b + aesmc v3.16b,v3.16b + aese v3.16b,v12.16b + aesmc v3.16b,v3.16b + aese v3.16b,v13.16b + aesmc v3.16b,v3.16b + aese v3.16b,v14.16b + aesmc v3.16b,v3.16b + aese v3.16b,v15.16b + aesmc v3.16b,v3.16b + aese v3.16b,v16.16b + aesmc v3.16b,v3.16b + aese v3.16b,v17.16b + eor v3.16b,v3.16b,v18.16b + + rev32 v29.16b,v3.16b /* load res to sha 0, endian swap 
*/ + /* save aes res, bump aes_out_ptr */ + st1 {v3.16b},[x1],16 +/* + * now we have the sha256 to do for these 4 aes blocks + */ + + mov v22.16b,v24.16b /* working ABCD <- ABCD */ + mov v23.16b,v25.16b /* working EFGH <- EFGH */ + +/* quad 0 */ + mov x9,x8 /* top of rcon */ + ld1 {v4.16b},[x9],16 /* key0 */ + ld1 {v5.16b},[x9],16 /* key1 */ + ld1 {v6.16b},[x9],16 /* key2 */ + ld1 {v7.16b},[x9],16 /* key3 */ + + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + sha256su1 v26.4s,v28.4s,v29.4s + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + sha256su1 v27.4s,v29.4s,v26.4s + + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + sha256su1 v28.4s,v26.4s,v27.4s + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + sha256su1 v29.4s,v27.4s,v28.4s + +/* quad 1 */ + ld1 {v4.16b},[x9],16 /* key4 */ + ld1 {v5.16b},[x9],16 /* key5 */ + ld1 {v6.16b},[x9],16 /* key6 */ + ld1 {v7.16b},[x9],16 /* key7 */ + + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + sha256su1 v26.4s,v28.4s,v29.4s + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + sha256su1 v27.4s,v29.4s,v26.4s + + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + sha256su1 v28.4s,v26.4s,v27.4s + + 
sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + sha256su1 v29.4s,v27.4s,v28.4s + +/* quad 2 */ + ld1 {v4.16b},[x9],16 /* key4 */ + ld1 {v5.16b},[x9],16 /* key5 */ + ld1 {v6.16b},[x9],16 /* key6 */ + ld1 {v7.16b},[x9],16 /* key7 */ + + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + sha256su1 v26.4s,v28.4s,v29.4s + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + sha256su1 v27.4s,v29.4s,v26.4s + + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + sha256su1 v28.4s,v26.4s,v27.4s + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + sha256su1 v29.4s,v27.4s,v28.4s + +/* quad 3 */ + ld1 {v4.16b},[x9],16 /* key4 */ + ld1 {v5.16b},[x9],16 /* key5 */ + ld1 {v6.16b},[x9],16 /* key6 */ + ld1 {v7.16b},[x9],16 /* key7 */ + + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + + add v24.4s,v24.4s,v22.4s /* ABCD += working copy */ + add v25.4s,v25.4s,v23.4s /* EFGH += working copy */ + + eor v26.16b,v26.16b,v26.16b /* zero sha src 0 */ + eor v27.16b,v27.16b,v27.16b /* zero 
sha src 1 */ + eor v28.16b,v28.16b,v28.16b /* zero sha src 2 */ + eor v29.16b,v29.16b,v29.16b /* zero sha src 3 */ + /* assume this was final block */ + mov v26.b[3],w15 + + sub x10,x10,1 /* dec num_blocks */ + cbnz x10,.Lshort_loop /* keep looping if more */ +/* + * there are between 0 and 3 aes blocks in the final sha256 blocks + */ +.Lpost_short_loop: + lsr x12,x11,32 /* len_hi */ + and x13,x11,0xffffffff /* len_lo */ + lsl x12,x12,3 /* len_hi in bits */ + lsl x13,x13,3 /* len_lo in bits */ + + mov v29.s[3],w13 /* len_lo */ + mov v29.s[2],w12 /* len_hi */ + +/* do final block */ + mov v22.16b,v24.16b /* working ABCD <- ABCD */ + mov v23.16b,v25.16b /* working EFGH <- EFGH */ + +/* quad 0 */ + mov x9,x8 /* top of rcon */ + ld1 {v4.16b},[x9],16 /* key0 */ + ld1 {v5.16b},[x9],16 /* key1 */ + ld1 {v6.16b},[x9],16 /* key2 */ + ld1 {v7.16b},[x9],16 /* key3 */ + + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + sha256su1 v26.4s,v28.4s,v29.4s + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + sha256su1 v27.4s,v29.4s,v26.4s + + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + sha256su1 v28.4s,v26.4s,v27.4s + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + sha256su1 v29.4s,v27.4s,v28.4s + +/* quad 1 */ + ld1 {v4.16b},[x9],16 /* key4 */ + ld1 {v5.16b},[x9],16 /* key5 */ + ld1 {v6.16b},[x9],16 /* key6 */ + ld1 {v7.16b},[x9],16 /* key7 */ + + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + + 
sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + sha256su1 v26.4s,v28.4s,v29.4s + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + sha256su1 v27.4s,v29.4s,v26.4s + + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + sha256su1 v28.4s,v26.4s,v27.4s + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + sha256su1 v29.4s,v27.4s,v28.4s + +/* quad 2 */ + ld1 {v4.16b},[x9],16 /* key4 */ + ld1 {v5.16b},[x9],16 /* key5 */ + ld1 {v6.16b},[x9],16 /* key6 */ + ld1 {v7.16b},[x9],16 /* key7 */ + + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + sha256su1 v26.4s,v28.4s,v29.4s + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + sha256su1 v27.4s,v29.4s,v26.4s + + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + sha256su1 v28.4s,v26.4s,v27.4s + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + sha256su1 v29.4s,v27.4s,v28.4s + +/* quad 3 */ + ld1 {v4.16b},[x9],16 /* key4 */ + ld1 {v5.16b},[x9],16 /* key5 */ + ld1 {v6.16b},[x9],16 /* key6 */ + ld1 {v7.16b},[x9],16 /* key7 */ + + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + + mov v21.16b, v22.16b 
/* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + + mov x9,sp + add v24.4s,v24.4s,v22.4s /* ABCD += working copy */ + add sp,sp,8*16 + add v25.4s,v25.4s,v23.4s /* EFGH += working copy */ + ld1 {v8.16b - v11.16b},[x9],4*16 + rev32 v24.16b,v24.16b /* big endian ABCD */ + ld1 {v12.16b - v15.16b},[x9] + rev32 v25.16b,v25.16b /* big endian EFGH */ + + st1 {v24.4s,v25.4s},[x3] /* save them both */ + ret + + .size aes128cbc_sha256, .-aes128cbc_sha256 diff --git a/drivers/crypto/armv8/asm/aes128cbc_sha256_hmac.S b/drivers/crypto/armv8/asm/aes128cbc_sha256_hmac.S new file mode 100644 index 0000000..499e8eb --- /dev/null +++ b/drivers/crypto/armv8/asm/aes128cbc_sha256_hmac.S @@ -0,0 +1,1879 @@ +/* + * BSD LICENSE + * + * Copyright (C) Cavium networks Ltd. 2016. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Cavium networks nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "assym.s" + +/* + * Description: + * + * Combined Enc/Auth Primitive = aes128cbc/sha256_hmac + * + * Operations: + * + * out = encrypt-AES128CBC(in) + * return_hash_ptr = SHA256(o_key_pad | SHA256(i_key_pad | out)) + * + * Prototype: + * void aes128cbc_sha256_hmac(uint8_t *csrc, uint8_t *cdst, + * uint8_t *dsrc, uint8_t *ddst, + * uint64_t len, crypto_arg_t *arg) + * + * Registers used: + * + * aes128cbc_sha256_hmac( + * csrc, x0 (cipher src address) + * cdst, x1 (cipher dst address) + * dsrc, x2 (digest src address - ignored) + * ddst, x3 (digest dst address) + * len, x4 (length) + * arg x5 : + * arg->cipher.key (round keys) + * arg->cipher.iv (initialization vector) + * arg->digest.hmac.i_key_pad (partially hashed i_key_pad) + * arg->digest.hmac.o_key_pad (partially hashed o_key_pad) + * ) + * + * Routine register definitions: + * + * v0 - v3 -- aes results + * v4 - v7 -- round consts for sha + * v8 - v18 -- round keys + * v19 - v20 -- round keys + * v21 -- ABCD tmp + * v22 -- sha working state ABCD (q22) + * v23 -- sha working state EFGH (q23) + * v24 -- sha state ABCD + * v25 -- sha state EFGH + * v26 -- sha block 0 + * v27 -- sha block 1 + * v28 -- sha block 2 + * v29 -- sha block 3 + * v30 -- reserved + * v31 -- reserved + * + * Constraints: + * + * The variable "len" must be a multiple of 16, otherwise results + * are not defined. 
For AES partial blocks the user is required + * to pad the input to modulus 16 = 0. + * + * Short lengths are not optimized at < 12 AES blocks + */ + + .file "aes128cbc_sha256_hmac.S" + .text + .cpu generic+fp+simd+crypto+crc + .global aes128cbc_sha256_hmac + .type aes128cbc_sha256_hmac,%function + + .align 4 +.Lrcon: /* SHA-256 round constants K[0..63], four 32-bit words per row; read 16 bytes at a time as the sha "keyN" operands */ + .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 + .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 + .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 + .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 + .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc + .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da + .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 + .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 + .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 + .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 + .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 + .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 + .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 + .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 + .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 + .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 + +.Linit_sha_state: /* SHA-256 initial hash values H0..H7 (ABCD row, then EFGH row) */ + .word 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a + .word 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 + +aes128cbc_sha256_hmac: +/* fetch args */ + ldr x6, [x5, #HMAC_IKEYPAD] + /* init ABCD, EFGH. */ + ld1 {v24.4s, v25.4s},[x6] + /* save pointer to o_key_pad partial hash */ + ldr x6, [x5, #HMAC_OKEYPAD] + + ldr x2, [x5, #CIPHER_KEY] + ldr x5, [x5, #CIPHER_IV] + +/* + * init sha state, prefetch, check for small cases. 
+ * Note that the output is prefetched as a load, for the in-place case + */ + prfm PLDL1KEEP,[x0,0] /* pref next aes_ptr_in */ + /* address of sha init state consts */ + adr x12,.Linit_sha_state + prfm PLDL1KEEP,[x1,0] /* pref next aes_ptr_out */ + lsr x10,x4,4 /* aes_blocks = len/16 */ + cmp x10,12 /* no main loop if <12 */ + b.lt .Lshort_cases /* branch if < 12 */ + + /* protect registers */ + sub sp,sp,8*16 + mov x9,sp /* copy for address mode */ + st1 {v8.16b - v11.16b},[x9],4*16 + st1 {v12.16b - v15.16b},[x9] + +/* proceed */ + ld1 {v3.16b},[x5] /* get 1st ivec */ + /* read first aes block, bump aes_ptr_in */ + ld1 {v0.16b},[x0],16 + mov x11,x4 /* len -> x11 needed at end */ + lsr x12,x11,6 /* total_blocks */ +/* + * now we can do the loop prolog, 1st aes sequence of 4 blocks + */ + ld1 {v8.16b},[x2],16 /* rk[0] */ + ld1 {v9.16b},[x2],16 /* rk[1] */ + eor v0.16b,v0.16b,v3.16b /* xor w/ ivec (modeop) */ + ld1 {v10.16b},[x2],16 /* rk[2] */ + +/* aes xform 0 */ + aese v0.16b,v8.16b + prfm PLDL1KEEP,[x0,64] /* pref next aes_ptr_in */ + aesmc v0.16b,v0.16b + ld1 {v11.16b},[x2],16 /* rk[3] */ + aese v0.16b,v9.16b + prfm PLDL1KEEP,[x1,64] /* pref next aes_ptr_out */ + /* base address for sha round consts */ + adr x8,.Lrcon + aesmc v0.16b,v0.16b + ld1 {v12.16b},[x2],16 /* rk[4] */ + aese v0.16b,v10.16b + /* read next aes block, update aes_ptr_in */ + ld1 {v1.16b},[x0],16 + aesmc v0.16b,v0.16b + ld1 {v13.16b},[x2],16 /* rk[5] */ + aese v0.16b,v11.16b + aesmc v0.16b,v0.16b + ld1 {v14.16b},[x2],16 /* rk[6] */ + aese v0.16b,v12.16b + aesmc v0.16b,v0.16b + ld1 {v15.16b},[x2],16 /* rk[7] */ + aese v0.16b,v13.16b + aesmc v0.16b,v0.16b + ld1 {v16.16b},[x2],16 /* rk[8] */ + aese v0.16b,v14.16b + aesmc v0.16b,v0.16b + ld1 {v17.16b},[x2],16 /* rk[9] */ + aese v0.16b,v15.16b + aesmc v0.16b,v0.16b + ld1 {v18.16b},[x2],16 /* rk[10] */ + aese v0.16b,v16.16b + mov x4,x1 /* sha_ptr_in = aes_ptr_out */ + aesmc v0.16b,v0.16b + aese v0.16b,v17.16b + eor v0.16b,v0.16b,v18.16b /* res 0 */ 
+ + eor v1.16b,v1.16b,v0.16b /* xor w/ ivec (modeop) */ + +/* aes xform 1 */ + aese v1.16b,v8.16b + /* read next aes block, update aes_ptr_in */ + ld1 {v2.16b},[x0],16 + aesmc v1.16b,v1.16b + aese v1.16b,v9.16b + prfm PLDL1KEEP,[x8,0*64] /* rcon */ + aesmc v1.16b,v1.16b + aese v1.16b,v10.16b + aesmc v1.16b,v1.16b + aese v1.16b,v11.16b + /* save aes res, bump aes_out_ptr */ + st1 {v0.16b},[x1],16 + aesmc v1.16b,v1.16b + aese v1.16b,v12.16b + prfm PLDL1KEEP,[x8,2*64] /* rcon */ + aesmc v1.16b,v1.16b + aese v1.16b,v13.16b + aesmc v1.16b,v1.16b + aese v1.16b,v14.16b + prfm PLDL1KEEP,[x8,4*64] /* rcon */ + aesmc v1.16b,v1.16b + aese v1.16b,v15.16b + aesmc v1.16b,v1.16b + aese v1.16b,v16.16b + prfm PLDL1KEEP,[x8,6*64] /* rcon */ + aesmc v1.16b,v1.16b + aese v1.16b,v17.16b + prfm PLDL1KEEP,[x8,8*64] /* rcon */ + eor v1.16b,v1.16b,v18.16b /* res 1 */ + + eor v2.16b,v2.16b,v1.16b /* xor w/ ivec (modeop) */ + +/* aes xform 2 */ + aese v2.16b,v8.16b + /* read next aes block, update aes_ptr_in */ + ld1 {v3.16b},[x0],16 + aesmc v2.16b,v2.16b + aese v2.16b,v9.16b + mov x2,x0 /* lead_ptr = aes_ptr_in */ + aesmc v2.16b,v2.16b + aese v2.16b,v10.16b + prfm PLDL1KEEP,[x8,10*64] /* rcon */ + aesmc v2.16b,v2.16b + aese v2.16b,v11.16b + /* save aes res, bump aes_out_ptr */ + st1 {v1.16b},[x1],16 + aesmc v2.16b,v2.16b + aese v2.16b,v12.16b + prfm PLDL1KEEP,[x8,12*64] /* rcon */ + aesmc v2.16b,v2.16b + aese v2.16b,v13.16b + aesmc v2.16b,v2.16b + aese v2.16b,v14.16b + prfm PLDL1KEEP,[x8,14*64] /* rcon */ + aesmc v2.16b,v2.16b + aese v2.16b,v15.16b + aesmc v2.16b,v2.16b + aese v2.16b,v16.16b + aesmc v2.16b,v2.16b + aese v2.16b,v17.16b + eor v2.16b,v2.16b,v18.16b /* res 2 */ + + eor v3.16b,v3.16b,v2.16b /* xor w/ivec (modeop) */ + +/* aes xform 3 */ + aese v3.16b,v8.16b + aesmc v3.16b,v3.16b + aese v3.16b,v9.16b + aesmc v3.16b,v3.16b + aese v3.16b,v10.16b + aesmc v3.16b,v3.16b + aese v3.16b,v11.16b + /* save aes res, bump aes_out_ptr */ + st1 {v2.16b},[x1],16 + aesmc v3.16b,v3.16b + aese 
v3.16b,v12.16b + aesmc v3.16b,v3.16b + aese v3.16b,v13.16b + aesmc v3.16b,v3.16b + aese v3.16b,v14.16b + aesmc v3.16b,v3.16b + aese v3.16b,v15.16b + aesmc v3.16b,v3.16b + aese v3.16b,v16.16b + sub x7,x12,1 /* main_blocks = total_blocks - 1 */ + and x13,x10,3 /* aes_blocks_left */ + aesmc v3.16b,v3.16b + aese v3.16b,v17.16b + eor v3.16b,v3.16b,v18.16b /* res 3 */ + +/* + * Note, aes_blocks_left := number after the main (sha) + * block is done. Can be 0 + */ + /* save aes res, bump aes_out_ptr */ + st1 {v3.16b},[x1],16 + +/* + * main combined loop CBC + */ +.Lmain_loop: + +/* + * because both mov, rev32 and eor have a busy cycle, + * this takes longer than it looks. Thats OK since there are 6 cycles + * before we can use the load anyway; so this goes as fast as it can without + * SW pipelining (too complicated given the code size) + */ + rev32 v26.16b,v0.16b /* fix endian w0, aes res 0 */ + /* next aes block, update aes_ptr_in */ + ld1 {v0.16b},[x0],16 + mov v22.16b,v24.16b /* working ABCD <- ABCD */ + prfm PLDL1KEEP,[x2,64] /* pref next lead_ptr */ + rev32 v27.16b,v1.16b /* fix endian w1, aes res 1 */ + /* pref next aes_ptr_out, streaming */ + prfm PLDL1KEEP,[x1,64] + mov v23.16b,v25.16b /* working EFGH <- EFGH */ + mov x9,x8 /* top of rcon */ + ld1 {v4.16b},[x9],16 /* key0 */ + eor v0.16b,v0.16b,v3.16b /* xor w/ prev value */ + ld1 {v5.16b},[x9],16 /* key1 */ +/* + * aes xform 0, sha quad 0 + */ + aese v0.16b,v8.16b + ld1 {v6.16b},[x9],16 /* key2 */ + rev32 v28.16b,v2.16b /* fix endian w2, aes res 2 */ + ld1 {v7.16b},[x9],16 /* key3 */ + aesmc v0.16b,v0.16b + /* read next aes block, update aes_ptr_in */ + ld1 {v1.16b},[x0],16 + aese v0.16b,v9.16b + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + aesmc v0.16b,v0.16b + sha256su0 v26.4s,v27.4s + aese v0.16b,v10.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + aesmc v0.16b,v0.16b + aese v0.16b,v11.16b + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + /* no place to get rid of this stall */ + rev32 
v29.16b,v3.16b /* fix endian w3, aes res 3 */ + aesmc v0.16b,v0.16b + sha256h2 q23, q21, v4.4s + aese v0.16b,v12.16b + sha256su1 v26.4s,v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + ld1 {v4.16b},[x9],16 /* key4 */ + aesmc v0.16b,v0.16b + sha256su0 v27.4s,v28.4s + aese v0.16b,v13.16b + sha256h q22, q23, v5.4s + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + aesmc v0.16b,v0.16b + sha256h2 q23, q21, v5.4s + aese v0.16b,v14.16b + ld1 {v5.16b},[x9],16 /* key5 */ + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + sha256su1 v27.4s,v29.4s,v26.4s + aesmc v0.16b,v0.16b + sha256su0 v28.4s,v29.4s + aese v0.16b,v15.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + aesmc v0.16b,v0.16b + sha256h2 q23, q21, v6.4s + aese v0.16b,v16.16b + sha256su1 v28.4s,v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256su0 v29.4s,v26.4s + aesmc v0.16b,v0.16b + sha256h q22, q23, v7.4s + aese v0.16b,v17.16b + sha256h2 q23, q21, v7.4s + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + sha256su1 v29.4s,v27.4s,v28.4s + eor v0.16b,v0.16b,v18.16b /* final res 0 */ + ld1 {v6.16b},[x9],16 /* key6 */ + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + +/* aes xform 1, sha quad 1 */ + sha256su0 v26.4s,v27.4s + eor v1.16b,v1.16b,v0.16b /* mode op 1 xor w/prev value */ + ld1 {v7.16b},[x9],16 /* key7 */ + mov v21.16b, v22.16b /* copy abcd */ + /* save aes res, bump aes_out_ptr */ + st1 {v0.16b},[x1],16 + aese v1.16b,v8.16b + sha256h q22, q23, v4.4s + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + sha256h2 q23, q21, v4.4s + aesmc v1.16b,v1.16b + sha256su1 v26.4s,v28.4s,v29.4s + aese v1.16b,v9.16b + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + aesmc v1.16b,v1.16b + sha256h2 q23, q21, v5.4s + aese v1.16b,v10.16b + /* read next aes block, update aes_ptr_in */ + ld1 {v2.16b},[x0],16 + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + sha256su1 v27.4s,v29.4s,v26.4s + aesmc v1.16b,v1.16b + ld1 {v4.16b},[x9],16 /* key4 */ + aese v1.16b,v11.16b + ld1 
{v5.16b},[x9],16 /* key5 */ + mov v21.16b, v22.16b /* copy abcd */ + sha256su0 v28.4s,v29.4s + aesmc v1.16b,v1.16b + sha256h q22, q23, v6.4s + aese v1.16b,v12.16b + sha256h2 q23, q21, v6.4s + ld1 {v6.16b},[x9],16 /* key6 */ + sha256su1 v28.4s,v26.4s,v27.4s + aesmc v1.16b,v1.16b + sha256su0 v29.4s,v26.4s + aese v1.16b,v13.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + aesmc v1.16b,v1.16b + sha256h2 q23, q21, v7.4s + aese v1.16b,v14.16b + ld1 {v7.16b},[x9],16 /* key7 */ + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + sha256su1 v29.4s,v27.4s,v28.4s + aesmc v1.16b,v1.16b + add x2,x2,64 /* bump lead_ptr */ + aese v1.16b,v15.16b + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + aesmc v1.16b,v1.16b + aese v1.16b,v16.16b + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + aesmc v1.16b,v1.16b + aese v1.16b,v17.16b + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + eor v1.16b,v1.16b,v18.16b /* res xf 1 */ + + +/* mode op 2 */ + eor v2.16b,v2.16b,v1.16b /* mode of 2 xor w/prev value */ + +/* aes xform 2, sha quad 2 */ + + sha256su0 v26.4s,v27.4s + aese v2.16b,v8.16b + /* save aes res, bump aes_out_ptr */ + st1 {v1.16b},[x1],16 + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + aesmc v2.16b,v2.16b + sha256h2 q23, q21, v4.4s + aese v2.16b,v9.16b + sha256su1 v26.4s,v28.4s,v29.4s + ld1 {v4.16b},[x9],16 /* key4 */ + aesmc v2.16b,v2.16b + sha256su0 v27.4s,v28.4s + aese v2.16b,v10.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + aesmc v2.16b,v2.16b + sha256h2 q23, q21, v5.4s + aese v2.16b,v11.16b + sha256su1 v27.4s,v29.4s,v26.4s + ld1 {v5.16b},[x9],16 /* key5 */ + sha256su0 v28.4s,v29.4s + aesmc v2.16b,v2.16b + aese v2.16b,v12.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + aesmc v2.16b,v2.16b + sha256h2 q23, q21, v6.4s + aese v2.16b,v13.16b + sha256su1 v28.4s,v26.4s,v27.4s + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + sha256su0 v29.4s,v26.4s + aesmc v2.16b,v2.16b + /* read next aes block, update aes_ptr_in */ + ld1 
{v3.16b},[x0],16 + aese v2.16b,v14.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + aesmc v2.16b,v2.16b + sha256h2 q23, q21, v7.4s + aese v2.16b,v15.16b + sha256su1 v29.4s,v27.4s,v28.4s + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + ld1 {v6.16b},[x9],16 /* key6 */ + aesmc v2.16b,v2.16b + ld1 {v7.16b},[x9],16 /* key7 */ + aese v2.16b,v16.16b + aesmc v2.16b,v2.16b + aese v2.16b,v17.16b + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + eor v2.16b,v2.16b,v18.16b /* res 2 */ + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + +/* mode op 3 */ + eor v3.16b,v3.16b,v2.16b /* xor w/prev value */ + +/* aes xform 3, sha quad 3 (hash only) */ + aese v3.16b,v8.16b + aesmc v3.16b,v3.16b + /* save aes res, bump aes_out_ptr */ + st1 {v2.16b},[x1],16 + aese v3.16b,v9.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + aesmc v3.16b,v3.16b + sha256h2 q23, q21, v4.4s + aese v3.16b,v10.16b + aesmc v3.16b,v3.16b + aese v3.16b,v11.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + aesmc v3.16b,v3.16b + sha256h2 q23, q21, v5.4s + aese v3.16b,v12.16b + aesmc v3.16b,v3.16b + aese v3.16b,v13.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + aesmc v3.16b,v3.16b + sha256h2 q23, q21, v6.4s + aese v3.16b,v14.16b + sub x7,x7,1 /* dec block count */ + aesmc v3.16b,v3.16b + aese v3.16b,v15.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + aesmc v3.16b,v3.16b + sha256h2 q23, q21, v7.4s + aese v3.16b,v16.16b + aesmc v3.16b,v3.16b + aese v3.16b,v17.16b + add v24.4s,v24.4s,v22.4s /* ABCD += working copy */ + eor v3.16b,v3.16b,v18.16b /* aes res 3 */ + add v25.4s,v25.4s,v23.4s /* EFGH += working copy */ + /* save aes res, bump aes_out_ptr */ + st1 {v3.16b},[x1],16 + cbnz x7,.Lmain_loop /* loop if more to do */ + +/* + * epilog, process remaining aes blocks and b-2 sha block + * do this inline (no loop) to overlap with the sha part + * note there are 0-3 aes blocks left. 
+ */ + rev32 v26.16b,v0.16b /* fix endian w0 */ + rev32 v27.16b,v1.16b /* fix endian w1 */ + rev32 v28.16b,v2.16b /* fix endian w2 */ + rev32 v29.16b,v3.16b /* fix endian w3 */ + mov v22.16b,v24.16b /* working ABCD <- ABCD */ + mov v23.16b,v25.16b /* working EFGH <- EFGH */ + cbz x13, .Lbm2fromQ0 /* skip if none left */ + subs x14,x13,1 /* local copy of aes_blocks_left */ +/* + * mode op 0 + * read next aes block, update aes_ptr_in + */ + ld1 {v0.16b},[x0],16 + mov x9,x8 /* top of rcon */ + ld1 {v4.16b},[x9],16 /* key0 */ + ld1 {v5.16b},[x9],16 /* key1 */ + ld1 {v6.16b},[x9],16 /* key2 */ + ld1 {v7.16b},[x9],16 /* key3 */ + eor v0.16b,v0.16b,v3.16b /* xor w/ prev value */ + +/* aes xform 0, sha quad 0 */ + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + aese v0.16b,v8.16b + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + aesmc v0.16b,v0.16b + sha256su0 v26.4s,v27.4s + aese v0.16b,v9.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + aesmc v0.16b,v0.16b + sha256h2 q23, q21, v4.4s + aese v0.16b,v10.16b + sha256su1 v26.4s,v28.4s,v29.4s + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + sha256su0 v27.4s,v28.4s + aesmc v0.16b,v0.16b + aese v0.16b,v11.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + aesmc v0.16b,v0.16b + sha256h2 q23, q21, v5.4s + aese v0.16b,v12.16b + sha256su1 v27.4s,v29.4s,v26.4s + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + sha256su0 v28.4s,v29.4s + aesmc v0.16b,v0.16b + aese v0.16b,v13.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + aesmc v0.16b,v0.16b + sha256h2 q23, q21, v6.4s + aese v0.16b,v14.16b + sha256su1 v28.4s,v26.4s,v27.4s + sha256su0 v29.4s,v26.4s + aesmc v0.16b,v0.16b + aese v0.16b,v15.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + aesmc v0.16b,v0.16b + aese v0.16b,v16.16b + sha256h2 q23, q21, v7.4s + sha256su1 v29.4s,v27.4s,v28.4s + aesmc v0.16b,v0.16b + aese v0.16b,v17.16b + eor v0.16b,v0.16b,v18.16b + /* save aes res, bump aes_out_ptr */ + st1 {v0.16b},[x1],16 
+ /* if aes_blocks_left_count == 0 */ + beq .Lbm2fromQ1 +/* + * mode op 1 + * read next aes block, update aes_ptr_in + */ + ld1 {v1.16b},[x0],16 + ld1 {v4.16b},[x9],16 /* key4 */ + ld1 {v5.16b},[x9],16 /* key5 */ + ld1 {v6.16b},[x9],16 /* key6 */ + ld1 {v7.16b},[x9],16 /* key7 */ + + eor v1.16b,v1.16b,v0.16b /* xor w/prev value */ + +/* aes xform 1, sha quad 1 */ + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + aese v1.16b,v8.16b + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + aesmc v1.16b,v1.16b + sha256su0 v26.4s,v27.4s + aese v1.16b,v9.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + aesmc v1.16b,v1.16b + sha256h2 q23, q21, v4.4s + aese v1.16b,v10.16b + sha256su1 v26.4s,v28.4s,v29.4s + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + sha256su0 v27.4s,v28.4s + aesmc v1.16b,v1.16b + subs x14,x14,1 /* dec counter */ + aese v1.16b,v11.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + aesmc v1.16b,v1.16b + sha256h2 q23, q21, v5.4s + aese v1.16b,v12.16b + sha256su1 v27.4s,v29.4s,v26.4s + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + sha256su0 v28.4s,v29.4s + aesmc v1.16b,v1.16b + aese v1.16b,v13.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + aesmc v1.16b,v1.16b + sha256h2 q23, q21, v6.4s + aese v1.16b,v14.16b + sha256su1 v28.4s,v26.4s,v27.4s + sha256su0 v29.4s,v26.4s + aesmc v1.16b,v1.16b + aese v1.16b,v15.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + aesmc v1.16b,v1.16b + sha256h2 q23, q21, v7.4s + aese v1.16b,v16.16b + sha256su1 v29.4s,v27.4s,v28.4s + aesmc v1.16b,v1.16b + aese v1.16b,v17.16b + eor v1.16b,v1.16b,v18.16b + /* save aes res, bump aes_out_ptr */ + st1 {v1.16b},[x1],16 + /* if aes_blocks_left_count == 0 */ + beq .Lbm2fromQ2 +/* + * mode op 2 + * read next aes block, update aes_ptr_in + */ + ld1 {v2.16b},[x0],16 + ld1 {v4.16b},[x9],16 /* key4 */ + ld1 {v5.16b},[x9],16 /* key5 */ + ld1 {v6.16b},[x9],16 /* key6 */ + ld1 {v7.16b},[x9],16 /* key7 */ + eor v2.16b,v2.16b,v1.16b /* 
xor w/prev value */ + +/* aes xform 2, sha quad 2 */ + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + aese v2.16b,v8.16b + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + aesmc v2.16b,v2.16b + sha256su0 v26.4s,v27.4s + aese v2.16b,v9.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + aesmc v2.16b,v2.16b + sha256h2 q23, q21, v4.4s + aese v2.16b,v10.16b + sha256su1 v26.4s,v28.4s,v29.4s + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + sha256su0 v27.4s,v28.4s + aesmc v2.16b,v2.16b + aese v2.16b,v11.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + aesmc v2.16b,v2.16b + sha256h2 q23, q21, v5.4s + aese v2.16b,v12.16b + sha256su1 v27.4s,v29.4s,v26.4s + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + sha256su0 v28.4s,v29.4s + aesmc v2.16b,v2.16b + aese v2.16b,v13.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + aesmc v2.16b,v2.16b + sha256h2 q23, q21, v6.4s + aese v2.16b,v14.16b + sha256su1 v28.4s,v26.4s,v27.4s + sha256su0 v29.4s,v26.4s + aesmc v2.16b,v2.16b + aese v2.16b,v15.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + aesmc v2.16b,v2.16b + aese v2.16b,v16.16b + sha256h2 q23, q21, v7.4s + sha256su1 v29.4s,v27.4s,v28.4s + aesmc v2.16b,v2.16b + aese v2.16b,v17.16b + eor v2.16b,v2.16b,v18.16b + /* save aes res, bump aes_out_ptr */ + st1 {v2.16b},[x1],16 + /* join common code at Quad 3 */ + b .Lbm2fromQ3 +/* + * now there is the b-2 sha block before the final one. Execution takes over + * in the appropriate part of this depending on how many aes blocks were left. + * If there were none, the whole thing is executed. 
+ */ +/* quad 0 */ +.Lbm2fromQ0: + mov x9,x8 /* top of rcon */ + ld1 {v4.16b},[x9],16 /* key0 */ + ld1 {v5.16b},[x9],16 /* key1 */ + ld1 {v6.16b},[x9],16 /* key2 */ + ld1 {v7.16b},[x9],16 /* key3 */ + + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + sha256su1 v26.4s,v28.4s,v29.4s + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + sha256su1 v27.4s,v29.4s,v26.4s + + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + sha256su1 v28.4s,v26.4s,v27.4s + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + sha256su1 v29.4s,v27.4s,v28.4s + +/* quad 1 */ +.Lbm2fromQ1: + ld1 {v4.16b},[x9],16 /* key4 */ + ld1 {v5.16b},[x9],16 /* key5 */ + ld1 {v6.16b},[x9],16 /* key6 */ + ld1 {v7.16b},[x9],16 /* key7 */ + + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + sha256su1 v26.4s,v28.4s,v29.4s + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + sha256su1 v27.4s,v29.4s,v26.4s + + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + sha256su1 v28.4s,v26.4s,v27.4s + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + sha256su1 v29.4s,v27.4s,v28.4s + +/* quad 2 */ +.Lbm2fromQ2: + ld1 {v4.16b},[x9],16 
/* key8 */ + ld1 {v5.16b},[x9],16 /* key9 */ + ld1 {v6.16b},[x9],16 /* key10 */ + ld1 {v7.16b},[x9],16 /* key11 */ + + add v4.4s,v4.4s,v26.4s /* wk = key8+w0 */ + add v5.4s,v5.4s,v27.4s /* wk = key9+w1 */ + add v6.4s,v6.4s,v28.4s /* wk = key10+w2 */ + + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + sha256su1 v26.4s,v28.4s,v29.4s + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + sha256su1 v27.4s,v29.4s,v26.4s + + add v7.4s,v7.4s,v29.4s /* wk = key11+w3 */ + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + sha256su1 v28.4s,v26.4s,v27.4s + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + sha256su1 v29.4s,v27.4s,v28.4s + +/* quad 3: last 16 rounds, no further schedule update; x9 is 12 quads past x8 here */ +.Lbm2fromQ3: + ld1 {v4.16b},[x9],16 /* key12 */ + ld1 {v5.16b},[x9],16 /* key13 */ + ld1 {v6.16b},[x9],16 /* key14 */ + ld1 {v7.16b},[x9],16 /* key15 */ + + add v4.4s,v4.4s,v26.4s /* wk = key12+w0 */ + add v5.4s,v5.4s,v27.4s /* wk = key13+w1 */ + add v6.4s,v6.4s,v28.4s /* wk = key14+w2 */ + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + add v7.4s,v7.4s,v29.4s /* wk = key15+w3 */ + sha256h2 q23, q21, v4.4s + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + eor v26.16b,v26.16b,v26.16b /* zero w0 for the pad block */ + sha256h2 q23, q21, v5.4s + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + eor v27.16b,v27.16b,v27.16b /* zero w1 for the pad block */ + sha256h2 q23, q21, v6.4s + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + eor v28.16b,v28.16b,v28.16b /* zero w2 for the pad block */ + sha256h2 q23, q21, v7.4s + + add v24.4s,v24.4s,v22.4s /* ABCD += working copy */ + add v25.4s,v25.4s,v23.4s /* EFGH += working copy */ + +/* + * now we can do the final block, either all padding or 1-3 aes blocks + * len in x11, aes_blocks_left in x13. 
should move the aes data setup of this + * to the last aes bit. + */ + + mov v22.16b,v24.16b /* working ABCD <- ABCD */ + mov w15,0x80 /* that's the 1 of the pad */ + /* Add one SHA-2 block since hash is calculated including i_key_pad */ + add x11, x11, #64 + lsr x12,x11,32 /* len_hi */ + and x9,x11,0xffffffff /* len_lo */ + mov v23.16b,v25.16b /* working EFGH <- EFGH */ + mov v26.b[0],w15 /* assume block 0 is dst */ + lsl x12,x12,3 /* len_hi in bits */ + lsl x9,x9,3 /* len_lo in bits */ + eor v29.16b,v29.16b,v29.16b /* zero reg */ +/* + * places the 0x80 in the correct block, copies the appropriate data + */ + cbz x13,.Lpad100 /* no data to get */ + mov v26.16b,v0.16b + sub x14,x13,1 /* dec amount left */ + mov v27.b[0],w15 /* assume block 1 is dst */ + cbz x14,.Lpad100 /* branch if done */ + mov v27.16b,v1.16b + sub x14,x14,1 /* dec amount left */ + mov v28.b[0],w15 /* assume block 2 is dst */ + cbz x14,.Lpad100 /* branch if done */ + mov v28.16b,v2.16b + mov v29.b[3],w15 /* block 3, doesn't get rev'd */ +/* + * get the len_hi,LenLo in bits according to + * len_hi = (uint32_t)(((len>>32) & 0xffffffff)<<3); (x12) + * len_lo = (uint32_t)((len & 0xffffffff)<<3); (x9) + * this is done before the if/else above + */ +.Lpad100: + mov v29.s[3],w9 /* len_lo */ + mov v29.s[2],w12 /* len_hi */ +/* + * note that q29 is already built in the correct format, so no swap required + */ + rev32 v26.16b,v26.16b /* fix endian w0 */ + rev32 v27.16b,v27.16b /* fix endian w1 */ + rev32 v28.16b,v28.16b /* fix endian w2 */ +/* + * do last sha of pad block + */ +/* quad 0 */ + mov x9,x8 /* top of rcon */ + ld1 {v4.16b},[x9],16 /* key0 */ + ld1 {v5.16b},[x9],16 /* key1 */ + ld1 {v6.16b},[x9],16 /* key2 */ + ld1 {v7.16b},[x9],16 /* key3 */ + + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + 
ld1 {v4.16b},[x9],16 /* key4 */ + sha256su1 v26.4s,v28.4s,v29.4s + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + ld1 {v5.16b},[x9],16 /* key5 */ + sha256su1 v27.4s,v29.4s,v26.4s + + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + sha256h2 q23, q21, v6.4s + ld1 {v6.16b},[x9],16 /* key6 */ + sha256su1 v28.4s,v26.4s,v27.4s + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + sha256h2 q23, q21, v7.4s + ld1 {v7.16b},[x9],16 /* key7 */ + sha256su1 v29.4s,v27.4s,v28.4s + +/* quad 1 */ + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + ld1 {v4.16b},[x9],16 /* key4 */ + sha256su1 v26.4s,v28.4s,v29.4s + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + ld1 {v5.16b},[x9],16 /* key5 */ + sha256su1 v27.4s,v29.4s,v26.4s + + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + sha256h2 q23, q21, v6.4s + ld1 {v6.16b},[x9],16 /* key6 */ + sha256su1 v28.4s,v26.4s,v27.4s + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + sha256h2 q23, q21, v7.4s + ld1 {v7.16b},[x9],16 /* key7 */ + sha256su1 v29.4s,v27.4s,v28.4s + +/* quad 2 */ + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + ld1 {v4.16b},[x9],16 /* key4 */ + sha256su1 v26.4s,v28.4s,v29.4s + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy 
abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + ld1 {v5.16b},[x9],16 /* key5 */ + sha256su1 v27.4s,v29.4s,v26.4s + + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + sha256h2 q23, q21, v6.4s + ld1 {v6.16b},[x9],16 /* key6 */ + sha256su1 v28.4s,v26.4s,v27.4s + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + sha256h2 q23, q21, v7.4s + ld1 {v7.16b},[x9],16 /* key7 */ + sha256su1 v29.4s,v27.4s,v28.4s + +/* quad 3 */ + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + sha256h2 q23, q21, v4.4s + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + sha256h2 q23, q21, v5.4s + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + + add v26.4s,v24.4s,v22.4s /* ABCD += working copy */ + add v27.4s,v25.4s,v23.4s /* EFGH += working copy */ + + /* Calculate final HMAC */ + eor v28.16b, v28.16b, v28.16b + eor v29.16b, v29.16b, v29.16b + /* base address for sha round consts */ + adr x8,.Lrcon + /* load o_key_pad partial hash */ + ld1 {v24.16b,v25.16b}, [x6] + + mov v22.16b,v24.16b /* working ABCD <- ABCD */ + mov v23.16b,v25.16b /* working EFGH <- EFGH */ + + /* Set padding 1 to the first reg */ + mov w11, #0x80 /* that's the 1 of the pad */ + mov v28.b[3], w11 + /* size of o_key_pad + inner hash */ + mov x11, #64+32 + lsl x11, x11, 3 + /* move length to the end of the block */ + mov v29.s[3], w11 + + ld1 {v4.16b},[x8],16 /* key0 */ + ld1 {v5.16b},[x8],16 /* key1 */ + ld1 {v6.16b},[x8],16 /* key2 */ + ld1 {v7.16b},[x8],16 /* key3 */ + + mov v21.16b, v22.16b /* copy abcd */ + + add v4.4s,v4.4s,v26.4s /* wk 
= key0+w0 */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + sha256su0 v26.4s,v27.4s + sha256su1 v26.4s,v28.4s,v29.4s + + mov v21.16b, v22.16b /* copy abcd */ + + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + sha256su0 v27.4s,v28.4s + sha256su1 v27.4s,v29.4s,v26.4s + + mov v21.16b, v22.16b /* copy abcd */ + + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + sha256su0 v28.4s,v29.4s + sha256su1 v28.4s,v26.4s,v27.4s + + mov v21.16b, v22.16b /* copy abcd */ + + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + sha256su0 v29.4s,v26.4s + sha256su1 v29.4s,v27.4s,v28.4s + + ld1 {v4.16b},[x8],16 /* key4 */ + ld1 {v5.16b},[x8],16 /* key5 */ + ld1 {v6.16b},[x8],16 /* key6 */ + ld1 {v7.16b},[x8],16 /* key7 */ + + mov v21.16b, v22.16b /* copy abcd */ + + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + sha256su0 v26.4s,v27.4s + sha256su1 v26.4s,v28.4s,v29.4s + + mov v21.16b, v22.16b /* copy abcd */ + + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + sha256su0 v27.4s,v28.4s + sha256su1 v27.4s,v29.4s,v26.4s + + mov v21.16b, v22.16b /* copy abcd */ + + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + sha256su0 v28.4s,v29.4s + sha256su1 v28.4s,v26.4s,v27.4s + + mov v21.16b, v22.16b /* copy abcd */ + + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + sha256su0 v29.4s,v26.4s + sha256su1 v29.4s,v27.4s,v28.4s + + ld1 {v4.16b},[x8],16 /* key8 */ + ld1 {v5.16b},[x8],16 /* key9 */ + ld1 {v6.16b},[x8],16 /* key10 */ + ld1 {v7.16b},[x8],16 /* key11 */ + + mov v21.16b, v22.16b /* copy abcd */ + + add v4.4s,v4.4s,v26.4s /* wk = key8+w0 */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + sha256su0 v26.4s,v27.4s + sha256su1 v26.4s,v28.4s,v29.4s + + 
mov v21.16b, v22.16b /* copy abcd */ + + add v5.4s,v5.4s,v27.4s /* wk = key9+w1 */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + sha256su0 v27.4s,v28.4s + sha256su1 v27.4s,v29.4s,v26.4s + + mov v21.16b, v22.16b /* copy abcd */ + + add v6.4s,v6.4s,v28.4s /* wk = key10+w2 */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + sha256su0 v28.4s,v29.4s + sha256su1 v28.4s,v26.4s,v27.4s + + mov v21.16b, v22.16b /* copy abcd */ + + add v7.4s,v7.4s,v29.4s /* wk = key11+w3 */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + sha256su0 v29.4s,v26.4s + sha256su1 v29.4s,v27.4s,v28.4s + + ld1 {v4.16b},[x8],16 /* key12 */ + ld1 {v5.16b},[x8],16 /* key13 */ + ld1 {v6.16b},[x8],16 /* key14 */ + ld1 {v7.16b},[x8],16 /* key15 */ + + mov v21.16b, v22.16b /* copy abcd */ + + add v4.4s,v4.4s,v26.4s /* wk = key12+w0 */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + + mov v21.16b, v22.16b /* copy abcd */ + + add v5.4s,v5.4s,v27.4s /* wk = key13+w1 */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + + mov v21.16b, v22.16b /* copy abcd */ + + add v6.4s,v6.4s,v28.4s /* wk = key14+w2 */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + + mov v21.16b, v22.16b /* copy abcd */ + + add v7.4s,v7.4s,v29.4s /* wk = key15+w3 */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + + add v24.4s,v24.4s,v22.4s /* ABCD += working copy */ + add v25.4s,v25.4s,v23.4s /* EFGH += working copy */ + + rev32 v24.16b, v24.16b /* byte-swap each 32-bit word of ABCD */ + rev32 v25.16b, v25.16b /* byte-swap each 32-bit word of EFGH */ + st1 {v24.4s,v25.4s},[x3] /* save them both */ + + mov x9,sp /* x9 walks the 8*16-byte register save area */ + ld1 {v8.16b - v11.16b},[x9],4*16 /* reload v8-v11 while sp still covers them */ + ld1 {v12.16b - v15.16b},[x9] /* reload v12-v15 */ + add sp,sp,8*16 /* release the area only after the loads: memory below sp may be overwritten (e.g. by a signal frame) */ + + ret + +/* + * These are the short cases (less efficient), here used for 1-11 aes blocks. 
+ * x10 = aes_blocks + */ +.Lshort_cases: + sub sp,sp,8*16 + mov x9,sp /* copy for address mode */ + st1 {v8.16b - v11.16b},[x9],4*16 + st1 {v12.16b - v15.16b},[x9] + + ld1 {v3.16b},[x5] /* get ivec */ + ld1 {v8.16b-v11.16b},[x2],64 /* rk[0-3] */ + ld1 {v12.16b-v15.16b},[x2],64 /* rk[4-7] */ + ld1 {v16.16b-v18.16b},[x2] /* rk[8-10] */ + adr x8,.Lrcon /* rcon */ + mov w15,0x80 /* sha padding word */ + + lsl x11,x10,4 /* len = aes_blocks*16 */ + + eor v26.16b,v26.16b,v26.16b /* zero sha src 0 */ + eor v27.16b,v27.16b,v27.16b /* zero sha src 1 */ + eor v28.16b,v28.16b,v28.16b /* zero sha src 2 */ + eor v29.16b,v29.16b,v29.16b /* zero sha src 3 */ +/* + * the idea in the short loop (at least 1) is to break out with the padding + * already in place excepting the final word. + */ +.Lshort_loop: + /* read next aes block, update aes_ptr_in */ + ld1 {v0.16b},[x0],16 + eor v0.16b,v0.16b,v3.16b /* xor w/prev value */ + +/* aes xform 0 */ + aese v0.16b,v8.16b + aesmc v0.16b,v0.16b + aese v0.16b,v9.16b + aesmc v0.16b,v0.16b + aese v0.16b,v10.16b + aesmc v0.16b,v0.16b + aese v0.16b,v11.16b + aesmc v0.16b,v0.16b + aese v0.16b,v12.16b + aesmc v0.16b,v0.16b + aese v0.16b,v13.16b + aesmc v0.16b,v0.16b + aese v0.16b,v14.16b + aesmc v0.16b,v0.16b + aese v0.16b,v15.16b + aesmc v0.16b,v0.16b + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + aese v0.16b,v17.16b + eor v0.16b,v0.16b,v18.16b + /* assume this was final block */ + mov v27.b[3],w15 + /* save aes res, bump aes_out_ptr */ + st1 {v0.16b},[x1],16 + rev32 v26.16b,v0.16b /* load res to sha 0, endian swap */ + sub x10,x10,1 /* dec num_blocks */ + cbz x10,.Lpost_short_loop /* break if no more */ + /* read next aes block, update aes_ptr_in */ + ld1 {v1.16b},[x0],16 + eor v1.16b,v1.16b,v0.16b /* xor w/prev value */ + +/* aes xform 1 */ + aese v1.16b,v8.16b + aesmc v1.16b,v1.16b + aese v1.16b,v9.16b + aesmc v1.16b,v1.16b + aese v1.16b,v10.16b + aesmc v1.16b,v1.16b + aese v1.16b,v11.16b + aesmc v1.16b,v1.16b + aese v1.16b,v12.16b + aesmc 
v1.16b,v1.16b + aese v1.16b,v13.16b + aesmc v1.16b,v1.16b + aese v1.16b,v14.16b + aesmc v1.16b,v1.16b + aese v1.16b,v15.16b + aesmc v1.16b,v1.16b + aese v1.16b,v16.16b + aesmc v1.16b,v1.16b + aese v1.16b,v17.16b + eor v1.16b,v1.16b,v18.16b + /* assume this was final block */ + mov v28.b[3],w15 + /* save aes res, bump aes_out_ptr */ + st1 {v1.16b},[x1],16 + rev32 v27.16b,v1.16b /* load res to sha 0, endian swap */ + sub x10,x10,1 /* dec num_blocks */ + cbz x10,.Lpost_short_loop /* break if no more */ + /* read next aes block, update aes_ptr_in */ + ld1 {v2.16b},[x0],16 + eor v2.16b,v2.16b,v1.16b /* xor w/prev value */ + +/* aes xform 2 */ + aese v2.16b,v8.16b + aesmc v2.16b,v2.16b + aese v2.16b,v9.16b + aesmc v2.16b,v2.16b + aese v2.16b,v10.16b + aesmc v2.16b,v2.16b + aese v2.16b,v11.16b + aesmc v2.16b,v2.16b + aese v2.16b,v12.16b + aesmc v2.16b,v2.16b + aese v2.16b,v13.16b + aesmc v2.16b,v2.16b + aese v2.16b,v14.16b + aesmc v2.16b,v2.16b + aese v2.16b,v15.16b + aesmc v2.16b,v2.16b + aese v2.16b,v16.16b + aesmc v2.16b,v2.16b + aese v2.16b,v17.16b + eor v2.16b,v2.16b,v18.16b + /* assume this was final block */ + mov v29.b[3],w15 + /* save aes res, bump aes_out_ptr */ + st1 {v2.16b},[x1],16 + rev32 v28.16b,v2.16b /* load res to sha 0, endian swap */ + sub x10,x10,1 /* dec num_blocks */ + cbz x10,.Lpost_short_loop /* break if no more */ + /* read next aes block, update aes_ptr_in */ + ld1 {v3.16b},[x0],16 + eor v3.16b,v3.16b,v2.16b /* xor w/ prev value */ + +/* aes xform 3 */ + aese v3.16b,v8.16b + aesmc v3.16b,v3.16b + aese v3.16b,v9.16b + aesmc v3.16b,v3.16b + aese v3.16b,v10.16b + aesmc v3.16b,v3.16b + aese v3.16b,v11.16b + aesmc v3.16b,v3.16b + aese v3.16b,v12.16b + aesmc v3.16b,v3.16b + aese v3.16b,v13.16b + aesmc v3.16b,v3.16b + aese v3.16b,v14.16b + aesmc v3.16b,v3.16b + aese v3.16b,v15.16b + aesmc v3.16b,v3.16b + aese v3.16b,v16.16b + aesmc v3.16b,v3.16b + aese v3.16b,v17.16b + eor v3.16b,v3.16b,v18.16b + + rev32 v29.16b,v3.16b /* load res to sha 0, endian swap 
*/ + /* save aes res, bump aes_out_ptr */ + st1 {v3.16b},[x1],16 +/* + * now we have the sha256 to do for these 4 aes blocks + */ + mov v22.16b,v24.16b /* working ABCD <- ABCD */ + mov v23.16b,v25.16b /* working EFGH <- EFGH */ + +/* quad 0 */ + mov x9,x8 /* top of rcon */ + ld1 {v4.16b},[x9],16 /* key0 */ + ld1 {v5.16b},[x9],16 /* key1 */ + ld1 {v6.16b},[x9],16 /* key2 */ + ld1 {v7.16b},[x9],16 /* key3 */ + + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + sha256su1 v26.4s,v28.4s,v29.4s + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + sha256su1 v27.4s,v29.4s,v26.4s + + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + sha256su1 v28.4s,v26.4s,v27.4s + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + sha256su1 v29.4s,v27.4s,v28.4s + +/* quad 1 */ + ld1 {v4.16b},[x9],16 /* key4 */ + ld1 {v5.16b},[x9],16 /* key5 */ + ld1 {v6.16b},[x9],16 /* key6 */ + ld1 {v7.16b},[x9],16 /* key7 */ + + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + sha256su1 v26.4s,v28.4s,v29.4s + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + sha256su1 v27.4s,v29.4s,v26.4s + + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + sha256su1 v28.4s,v26.4s,v27.4s + + 
sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + sha256su1 v29.4s,v27.4s,v28.4s + +/* quad 2 */ + ld1 {v4.16b},[x9],16 /* key4 */ + ld1 {v5.16b},[x9],16 /* key5 */ + ld1 {v6.16b},[x9],16 /* key6 */ + ld1 {v7.16b},[x9],16 /* key7 */ + + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + sha256su1 v26.4s,v28.4s,v29.4s + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + sha256su1 v27.4s,v29.4s,v26.4s + + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + sha256su1 v28.4s,v26.4s,v27.4s + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + sha256su1 v29.4s,v27.4s,v28.4s + +/* quad 3 */ + ld1 {v4.16b},[x9],16 /* key4 */ + ld1 {v5.16b},[x9],16 /* key5 */ + ld1 {v6.16b},[x9],16 /* key6 */ + ld1 {v7.16b},[x9],16 /* key7 */ + + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + + add v24.4s,v24.4s,v22.4s /* ABCD += working copy */ + add v25.4s,v25.4s,v23.4s /* EFGH += working copy */ + + eor v26.16b,v26.16b,v26.16b /* zero sha src 0 */ + eor v27.16b,v27.16b,v27.16b /* zero 
sha src 1 */ + eor v28.16b,v28.16b,v28.16b /* zero sha src 2 */ + eor v29.16b,v29.16b,v29.16b /* zero sha src 3 */ + /* assume this was final block */ + mov v26.b[3],w15 + + sub x10,x10,1 /* dec num_blocks */ + cbnz x10,.Lshort_loop /* keep looping if more */ +/* + * there are between 0 and 3 aes blocks in the final sha256 blocks + */ +.Lpost_short_loop: + /* Add one SHA-2 block since hash is calculated including i_key_pad */ + add x11, x11, #64 + lsr x12,x11,32 /* len_hi */ + and x13,x11,0xffffffff /* len_lo */ + lsl x12,x12,3 /* len_hi in bits */ + lsl x13,x13,3 /* len_lo in bits */ + + mov v29.s[3],w13 /* len_lo */ + mov v29.s[2],w12 /* len_hi */ + +/* do final block */ + mov v22.16b,v24.16b /* working ABCD <- ABCD */ + mov v23.16b,v25.16b /* working EFGH <- EFGH */ + +/* quad 0 */ + mov x9,x8 /* top of rcon */ + ld1 {v4.16b},[x9],16 /* key0 */ + ld1 {v5.16b},[x9],16 /* key1 */ + ld1 {v6.16b},[x9],16 /* key2 */ + ld1 {v7.16b},[x9],16 /* key3 */ + + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + sha256su1 v26.4s,v28.4s,v29.4s + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + sha256su1 v27.4s,v29.4s,v26.4s + + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + sha256su1 v28.4s,v26.4s,v27.4s + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + sha256su1 v29.4s,v27.4s,v28.4s + +/* quad 1 */ + ld1 {v4.16b},[x9],16 /* key4 */ + ld1 {v5.16b},[x9],16 /* key5 */ + ld1 {v6.16b},[x9],16 /* key6 */ + ld1 {v7.16b},[x9],16 /* key7 */ + + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + add 
v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + sha256su1 v26.4s,v28.4s,v29.4s + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + sha256su1 v27.4s,v29.4s,v26.4s + + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + sha256su1 v28.4s,v26.4s,v27.4s + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + sha256su1 v29.4s,v27.4s,v28.4s + +/* quad 2 */ + ld1 {v4.16b},[x9],16 /* key4 */ + ld1 {v5.16b},[x9],16 /* key5 */ + ld1 {v6.16b},[x9],16 /* key6 */ + ld1 {v7.16b},[x9],16 /* key7 */ + + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + sha256su1 v26.4s,v28.4s,v29.4s + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + sha256su1 v27.4s,v29.4s,v26.4s + + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + sha256su1 v28.4s,v26.4s,v27.4s + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + sha256su1 v29.4s,v27.4s,v28.4s + +/* quad 3 */ + ld1 {v4.16b},[x9],16 /* key4 */ + ld1 {v5.16b},[x9],16 /* key5 */ + ld1 {v6.16b},[x9],16 /* key6 */ + ld1 {v7.16b},[x9],16 /* key7 */ + + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + + mov v21.16b, v22.16b /* copy 
abcd */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + + add v26.4s,v24.4s,v22.4s /* ABCD += working copy */ + add v27.4s,v25.4s,v23.4s /* EFGH += working copy */ + + /* Calculate final HMAC */ + eor v28.16b, v28.16b, v28.16b + eor v29.16b, v29.16b, v29.16b + /* base address for sha round consts */ + adr x8,.Lrcon + /* load o_key_pad partial hash */ + ld1 {v24.16b,v25.16b}, [x6] + + mov v22.16b,v24.16b /* working ABCD <- ABCD */ + mov v23.16b,v25.16b /* working EFGH <- EFGH */ + + /* Set padding 1 to the first reg */ + mov w11, #0x80 /* that's the 1 of the pad */ + mov v28.b[3], w11 + /* size of o_key_pad + inner hash */ + mov x11, #64+32 + lsl x11, x11, 3 + /* move length to the end of the block */ + mov v29.s[3], w11 + lsr x11, x11, 32 + mov v29.s[2], w11 /* and the higher part */ + + ld1 {v4.16b},[x8],16 /* key0 */ + ld1 {v5.16b},[x8],16 /* key1 */ + ld1 {v6.16b},[x8],16 /* key2 */ + ld1 {v7.16b},[x8],16 /* key3 */ + + mov v21.16b, v22.16b /* copy abcd */ + + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + sha256su0 v26.4s,v27.4s + sha256su1 v26.4s,v28.4s,v29.4s + + mov v21.16b, v22.16b /* copy abcd */ + + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + sha256su0 v27.4s,v28.4s + sha256su1 v27.4s,v29.4s,v26.4s + + mov v21.16b, v22.16b /* copy abcd */ + + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + sha256su0 v28.4s,v29.4s + sha256su1 v28.4s,v26.4s,v27.4s + + mov v21.16b, v22.16b /* copy abcd */ + + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + sha256su0 v29.4s,v26.4s + sha256su1 
v29.4s,v27.4s,v28.4s + + ld1 {v4.16b},[x8],16 /* key4 */ + ld1 {v5.16b},[x8],16 /* key5 */ + ld1 {v6.16b},[x8],16 /* key6 */ + ld1 {v7.16b},[x8],16 /* key7 */ + + mov v21.16b, v22.16b /* copy abcd */ + + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + sha256su0 v26.4s,v27.4s + sha256su1 v26.4s,v28.4s,v29.4s + + mov v21.16b, v22.16b /* copy abcd */ + + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + sha256su0 v27.4s,v28.4s + sha256su1 v27.4s,v29.4s,v26.4s + + mov v21.16b, v22.16b /* copy abcd */ + + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + sha256su0 v28.4s,v29.4s + sha256su1 v28.4s,v26.4s,v27.4s + + mov v21.16b, v22.16b /* copy abcd */ + + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + sha256su0 v29.4s,v26.4s + sha256su1 v29.4s,v27.4s,v28.4s + + ld1 {v4.16b},[x8],16 /* key8 */ + ld1 {v5.16b},[x8],16 /* key9 */ + ld1 {v6.16b},[x8],16 /* key10 */ + ld1 {v7.16b},[x8],16 /* key11 */ + + mov v21.16b, v22.16b /* copy abcd */ + + add v4.4s,v4.4s,v26.4s /* wk = key8+w0 */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + sha256su0 v26.4s,v27.4s + sha256su1 v26.4s,v28.4s,v29.4s + + mov v21.16b, v22.16b /* copy abcd */ + + add v5.4s,v5.4s,v27.4s /* wk = key9+w1 */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + sha256su0 v27.4s,v28.4s + sha256su1 v27.4s,v29.4s,v26.4s + + mov v21.16b, v22.16b /* copy abcd */ + + add v6.4s,v6.4s,v28.4s /* wk = key10+w2 */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + sha256su0 v28.4s,v29.4s + sha256su1 v28.4s,v26.4s,v27.4s + + mov v21.16b, v22.16b /* copy abcd */ + + add v7.4s,v7.4s,v29.4s /* wk = key11+w3 */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + sha256su0 v29.4s,v26.4s + sha256su1 v29.4s,v27.4s,v28.4s + + ld1 {v4.16b},[x8],16 /* key12 */ + ld1 {v5.16b},[x8],16 /* key13 */ + ld1 {v6.16b},[x8],16 /* key14 */ 
+ ld1 {v7.16b},[x8],16 /* key15 */ + + mov v21.16b, v22.16b /* copy abcd */ + + add v4.4s,v4.4s,v26.4s /* wk = key12+w0 */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + + mov v21.16b, v22.16b /* copy abcd */ + + add v5.4s,v5.4s,v27.4s /* wk = key13+w1 */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + + mov v21.16b, v22.16b /* copy abcd */ + + add v6.4s,v6.4s,v28.4s /* wk = key14+w2 */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + + mov v21.16b, v22.16b /* copy abcd */ + + add v7.4s,v7.4s,v29.4s /* wk = key15+w3 */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + + add v24.4s,v24.4s,v22.4s /* ABCD += working copy */ + add v25.4s,v25.4s,v23.4s /* EFGH += working copy */ + + rev32 v24.16b, v24.16b + rev32 v25.16b, v25.16b + st1 {v24.4s,v25.4s},[x3] /* save them both */ + + mov x9,sp + add sp,sp,8*16 + ld1 {v8.16b - v11.16b},[x9],4*16 + ld1 {v12.16b - v15.16b},[x9] + + ret + + .size aes128cbc_sha256_hmac, .-aes128cbc_sha256_hmac diff --git a/drivers/crypto/armv8/asm/sha256_aes128cbc_dec.S b/drivers/crypto/armv8/asm/sha256_aes128cbc_dec.S new file mode 100644 index 0000000..e33c77b --- /dev/null +++ b/drivers/crypto/armv8/asm/sha256_aes128cbc_dec.S @@ -0,0 +1,1658 @@ +/* + * BSD LICENSE + * + * Copyright (C) Cavium networks Ltd. 2016. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Cavium networks nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "assym.s" + +/* + * Description: + * + * Combined Auth/Dec Primitive = sha256/aes128cbc + * + * Operations: + * + * out = decrypt-AES128CBC(in) + * return_hash_ptr = SHA256(in) + * + * Prototype: + * + * void sha256_aes128cbc_dec(uint8_t *csrc, uint8_t *cdst, + * uint8_t *dsrc, uint8_t *ddst, + * uint64_t len, crypto_arg_t *arg) + * + * Registers used: + * + * sha256_aes128cbc_dec( + * csrc, x0 (cipher src address) + * cdst, x1 (cipher dst address) + * dsrc, x2 (digest src address - ignored) + * ddst, x3 (digest dst address) + * len, x4 (length) + * arg x5 : + * arg->cipher.key (round keys) + * arg->cipher.iv (initialization vector) + * ) + * + * Routine register definitions: + * + * v0 - v3 -- aes results + * v4 - v7 -- round consts for sha + * v8 - v18 -- round keys + * v19 - v20 -- round keys + * v21 -- ABCD tmp + * v22 -- sha working state ABCD (q22) + * v23 -- sha working state EFGH (q23) + * v24 -- regShaStateABCD + * v25 -- regShaStateEFGH + * v26 -- sha block 0 + * v27 -- sha block 1 + * v28 -- sha block 2 + * v29 -- sha block 3 + * v30 -- ivec / previous ciphertext block (CBC chaining) + * v31 -- previous ciphertext block (CBC chaining) + * + * + * Constraints: + * + * The variable "len" must be a multiple of 16, + * 
otherwise results are not defined. For AES partial blocks the user + * is required to pad the input to modulus 16 = 0. + * + * Short lengths are less optimized at < 16 AES blocks, + * however they are somewhat optimized, and more so than the enc/auth versions. + */ + .file "sha256_aes128cbc_dec.S" + .text + .cpu generic+fp+simd+crypto+crc + .global sha256_aes128cbc_dec + .type sha256_aes128cbc_dec,%function + + + .align 4 +.Lrcon: + .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 + .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 + .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 + .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 + .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc + .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da + .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 + .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 + .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 + .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 + .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 + .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 + .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 + .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 + .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 + .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 + +.Linit_sha_state: + .word 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a + .word 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 + +sha256_aes128cbc_dec: +/* fetch args */ + ldr x2, [x5, #CIPHER_KEY] + ldr x5, [x5, #CIPHER_IV] +/* + * init sha state, prefetch, check for small cases. + * Note that the output is prefetched as a load, for the in-place case + */ + prfm PLDL1KEEP,[x0,0] /* pref next *in */ + /* address of sha init state consts */ + adr x12,.Linit_sha_state + prfm PLDL1KEEP,[x1,0] /* pref next aes_ptr_out */ + lsr x10,x4,4 /* aes_blocks = len/16 */ + cmp x10,16 /* no main loop if <16 */ + ld1 {v24.4s, v25.4s},[x12] /* init ABCD, EFGH. 
(2 cycs) */ + blt .Lshort_cases /* branch if < 12 */ + +/* protect registers */ + sub sp,sp,8*16 + mov x11,x4 /* len -> x11 needed at end */ + mov x7,sp /* copy for address mode */ + ld1 {v30.16b},[x5] /* get 1st ivec */ + lsr x12,x11,6 /* total_blocks (sha) */ + mov x4,x0 /* sha_ptr_in = *in */ + ld1 {v26.16b},[x4],16 /* next w0 */ + ld1 {v27.16b},[x4],16 /* next w1 */ + ld1 {v28.16b},[x4],16 /* next w2 */ + ld1 {v29.16b},[x4],16 /* next w3 */ + +/* + * now we can do the loop prolog, 1st sha256 block + */ + prfm PLDL1KEEP,[x0,64] /* pref next aes_ptr_in */ + prfm PLDL1KEEP,[x1,64] /* pref next aes_ptr_out */ + /* base address for sha round consts */ + adr x8,.Lrcon +/* + * do the first sha256 block on the plaintext + */ + mov v22.16b,v24.16b /* init working ABCD */ + st1 {v8.16b},[x7],16 + mov v23.16b,v25.16b /* init working EFGH */ + st1 {v9.16b},[x7],16 + + rev32 v26.16b,v26.16b /* endian swap w0 */ + st1 {v10.16b},[x7],16 + rev32 v27.16b,v27.16b /* endian swap w1 */ + st1 {v11.16b},[x7],16 + rev32 v28.16b,v28.16b /* endian swap w2 */ + st1 {v12.16b},[x7],16 + rev32 v29.16b,v29.16b /* endian swap w3 */ + st1 {v13.16b},[x7],16 +/* quad 0 */ + mov x9,x8 /* top of rcon */ + ld1 {v4.16b},[x9],16 /* key0 */ + ld1 {v5.16b},[x9],16 /* key1 */ + ld1 {v6.16b},[x9],16 /* key2 */ + ld1 {v7.16b},[x9],16 /* key3 */ + + + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + st1 {v14.16b},[x7],16 + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + st1 {v15.16b},[x7],16 + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + ld1 {v8.16b},[x2],16 /* rk[0] */ + sha256h2 q23, q21, v4.4s + ld1 {v4.16b},[x9],16 /* key4 */ + sha256su1 v26.4s,v28.4s,v29.4s + ld1 {v9.16b},[x2],16 /* rk[1] */ + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + ld1 {v10.16b},[x2],16 /* rk[2] */ + sha256h2 q23, q21, v5.4s + ld1 {v5.16b},[x9],16 /* key5 */ + sha256su1 v27.4s,v29.4s,v26.4s + + add 
v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + sha256h2 q23, q21, v6.4s + ld1 {v6.16b},[x9],16 /* key6 */ + sha256su1 v28.4s,v26.4s,v27.4s + ld1 {v11.16b},[x2],16 /* rk[3] */ + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + sha256h2 q23, q21, v7.4s + ld1 {v7.16b},[x9],16 /* key7 */ + sha256su1 v29.4s,v27.4s,v28.4s + +/* quad 1 */ + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + ld1 {v12.16b},[x2],16 /* rk[4] */ + sha256h2 q23, q21, v4.4s + ld1 {v4.16b},[x9],16 /* key4 */ + sha256su1 v26.4s,v28.4s,v29.4s + ld1 {v13.16b},[x2],16 /* rk[5] */ + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + ld1 {v14.16b},[x2],16 /* rk[6] */ + sha256h2 q23, q21, v5.4s + ld1 {v5.16b},[x9],16 /* key5 */ + sha256su1 v27.4s,v29.4s,v26.4s + + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + sha256h2 q23, q21, v6.4s + ld1 {v6.16b},[x9],16 /* key6 */ + sha256su1 v28.4s,v26.4s,v27.4s + ld1 {v15.16b},[x2],16 /* rk[7] */ + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + sha256h2 q23, q21, v7.4s + ld1 {v7.16b},[x9],16 /* key7 */ + sha256su1 v29.4s,v27.4s,v28.4s + +/* quad 2 */ + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + ld1 {v16.16b},[x2],16 /* rk[8] */ + sha256h2 q23, q21, v4.4s + ld1 {v4.16b},[x9],16 /* key4 */ + sha256su1 v26.4s,v28.4s,v29.4s + ld1 {v17.16b},[x2],16 /* rk[9] */ + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ 
+ sha256h q22, q23, v5.4s + ld1 {v18.16b},[x2],16 /* rk[10] */ + sha256h2 q23, q21, v5.4s + ld1 {v5.16b},[x9],16 /* key5 */ + sha256su1 v27.4s,v29.4s,v26.4s + + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + sha256h2 q23, q21, v6.4s + ld1 {v6.16b},[x9],16 /* key6 */ + sha256su1 v28.4s,v26.4s,v27.4s + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + sha256h2 q23, q21, v7.4s + ld1 {v7.16b},[x9],16 /* key7 */ + sha256su1 v29.4s,v27.4s,v28.4s + +/* quad 3 */ + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + sha256h2 q23, q21, v4.4s + ld1 {v26.16b},[x4],16 /* next w0 */ + ld1 {v27.16b},[x4],16 /* next w1 */ + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + sha256h2 q23, q21, v5.4s + ld1 {v28.16b},[x4],16 /* next w2 */ + ld1 {v29.16b},[x4],16 /* next w3 */ + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + +/* + * aes_blocks_left := number after the main (sha) block is done. + * can be 0 note we account for the extra unwind in main_blocks + */ + sub x7,x12,2 /* main_blocks=total_blocks-5 */ + + add v24.4s,v24.4s,v22.4s /* ABCD += working copy */ + and x13,x10,3 /* aes_blocks_left */ + ld1 {v0.16b},[x0] /* next aes block, no update */ + add v25.4s,v25.4s,v23.4s /* EFGH += working copy */ + add x2,x0,128 /* lead_ptr = *in */ + /* next aes block, update aes_ptr_in */ + ld1 {v31.16b},[x0],16 + +/* + * main combined loop CBC, can be used by auth/enc version + */ +.Lmain_loop: + +/* + * Because both mov, rev32 and eor have a busy cycle, + * this takes longer than it looks. 
+ */ + rev32 v26.16b,v26.16b /* fix endian w0 */ + mov v22.16b,v24.16b /* working ABCD <- ABCD */ + prfm PLDL1KEEP,[x2,64] /* pref next lead_ptr */ + rev32 v27.16b,v27.16b /* fix endian w1 */ + /* pref next aes_ptr_out, streaming */ + prfm PLDL1KEEP,[x1,64] + mov v23.16b,v25.16b /* working EFGH <- EFGH */ + mov x9,x8 /* top of rcon */ + +/* + * aes xform 0, sha quad 0 + */ + aesd v0.16b,v8.16b + ld1 {v4.16b},[x9],16 /* key0 */ + rev32 v28.16b,v28.16b /* fix endian w2 */ + aesimc v0.16b,v0.16b + aesd v0.16b,v9.16b + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + ld1 {v5.16b},[x9],16 /* key1 */ + aesimc v0.16b,v0.16b + sha256su0 v26.4s,v27.4s + aesd v0.16b,v10.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + aesimc v0.16b,v0.16b + aesd v0.16b,v11.16b + ld1 {v6.16b},[x9],16 /* key2 */ + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + ld1 {v7.16b},[x9],16 /* key3 */ + rev32 v29.16b,v29.16b /* fix endian w3 */ + /* read next aes block, no update */ + ld1 {v1.16b},[x0] + aesimc v0.16b,v0.16b + sha256h2 q23, q21, v4.4s + aesd v0.16b,v12.16b + sha256su1 v26.4s,v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + ld1 {v4.16b},[x9],16 /* key4 */ + aesimc v0.16b,v0.16b + sha256su0 v27.4s,v28.4s + aesd v0.16b,v13.16b + sha256h q22, q23, v5.4s + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + aesimc v0.16b,v0.16b + sha256h2 q23, q21, v5.4s + aesd v0.16b,v14.16b + ld1 {v5.16b},[x9],16 /* key5 */ + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + sha256su1 v27.4s,v29.4s,v26.4s + aesimc v0.16b,v0.16b + sha256su0 v28.4s,v29.4s + aesd v0.16b,v15.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + aesimc v0.16b,v0.16b + sha256h2 q23, q21, v6.4s + aesd v0.16b,v16.16b + sha256su1 v28.4s,v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256su0 v29.4s,v26.4s + aesimc v0.16b,v0.16b + sha256h q22, q23, v7.4s + aesd v0.16b,v17.16b + sha256h2 q23, q21, v7.4s + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + sha256su1 v29.4s,v27.4s,v28.4s + eor 
v0.16b,v0.16b,v18.16b /* final res 0 */ + ld1 {v6.16b},[x9],16 /* key6 */ + eor v0.16b,v0.16b,v30.16b /* xor w/ prev value */ + /* get next aes block, with update */ + ld1 {v30.16b},[x0],16 + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + +/* aes xform 1, sha quad 1 */ + sha256su0 v26.4s,v27.4s + ld1 {v7.16b},[x9],16 /* key7 */ + mov v21.16b, v22.16b /* copy abcd */ + /* save aes res, bump aes_out_ptr */ + st1 {v0.16b},[x1],16 + aesd v1.16b,v8.16b + sha256h q22, q23, v4.4s + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + sha256h2 q23, q21, v4.4s + aesimc v1.16b,v1.16b + sha256su1 v26.4s,v28.4s,v29.4s + aesd v1.16b,v9.16b + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + aesimc v1.16b,v1.16b + sha256h2 q23, q21, v5.4s + aesd v1.16b,v10.16b + /* read next aes block, no update */ + ld1 {v2.16b},[x0] + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + sha256su1 v27.4s,v29.4s,v26.4s + aesimc v1.16b,v1.16b + ld1 {v4.16b},[x9],16 /* key4 */ + aesd v1.16b,v11.16b + ld1 {v5.16b},[x9],16 /* key5 */ + mov v21.16b, v22.16b /* copy abcd */ + sha256su0 v28.4s,v29.4s + aesimc v1.16b,v1.16b + sha256h q22, q23, v6.4s + aesd v1.16b,v12.16b + sha256h2 q23, q21, v6.4s + ld1 {v6.16b},[x9],16 /* key6 */ + sha256su1 v28.4s,v26.4s,v27.4s + aesimc v1.16b,v1.16b + sha256su0 v29.4s,v26.4s + aesd v1.16b,v13.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + aesimc v1.16b,v1.16b + sha256h2 q23, q21, v7.4s + aesd v1.16b,v14.16b + ld1 {v7.16b},[x9],16 /* key7 */ + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + sha256su1 v29.4s,v27.4s,v28.4s + aesimc v1.16b,v1.16b + add x2,x2,64 /* bump lead_ptr */ + aesd v1.16b,v15.16b + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + aesimc v1.16b,v1.16b + aesd v1.16b,v16.16b + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + aesimc v1.16b,v1.16b + aesd v1.16b,v17.16b + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + eor v1.16b,v1.16b,v18.16b /* res xf 1 */ + eor v1.16b,v1.16b,v31.16b /* mode op 1 xor w/prev value */ + /* read 
next aes block, update aes_ptr_in */ + ld1 {v31.16b},[x0],16 + +/* aes xform 2, sha quad 2 */ + sha256su0 v26.4s,v27.4s + aesd v2.16b,v8.16b + /* save aes res, bump aes_out_ptr */ + st1 {v1.16b},[x1],16 + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + aesimc v2.16b,v2.16b + sha256h2 q23, q21, v4.4s + aesd v2.16b,v9.16b + sha256su1 v26.4s,v28.4s,v29.4s + ld1 {v4.16b},[x9],16 /* key4 */ + aesimc v2.16b,v2.16b + sha256su0 v27.4s,v28.4s + aesd v2.16b,v10.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + aesimc v2.16b,v2.16b + sha256h2 q23, q21, v5.4s + aesd v2.16b,v11.16b + sha256su1 v27.4s,v29.4s,v26.4s + ld1 {v5.16b},[x9],16 /* key5 */ + sha256su0 v28.4s,v29.4s + aesimc v2.16b,v2.16b + aesd v2.16b,v12.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + aesimc v2.16b,v2.16b + sha256h2 q23, q21, v6.4s + aesd v2.16b,v13.16b + sha256su1 v28.4s,v26.4s,v27.4s + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + sha256su0 v29.4s,v26.4s + aesimc v2.16b,v2.16b + /* read next aes block, no update */ + ld1 {v3.16b},[x0] + aesd v2.16b,v14.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + aesimc v2.16b,v2.16b + sha256h2 q23, q21, v7.4s + aesd v2.16b,v15.16b + sha256su1 v29.4s,v27.4s,v28.4s + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + ld1 {v6.16b},[x9],16 /* key6 */ + aesimc v2.16b,v2.16b + ld1 {v7.16b},[x9],16 /* key7 */ + aesd v2.16b,v16.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v17.16b + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + eor v2.16b,v2.16b,v18.16b /* res 2 */ + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + eor v2.16b,v2.16b,v30.16b /* mode of 2 xor w/prev value */ + /* read next aes block, update aes_ptr_in */ + ld1 {v30.16b},[x0],16 + +/* aes xform 3, sha quad 3 (hash only) */ + + aesd v3.16b,v8.16b + aesimc v3.16b,v3.16b + /* save aes res, bump aes_out_ptr */ + st1 {v2.16b},[x1],16 + aesd v3.16b,v9.16b + ld1 {v26.16b},[x4],16 /* next w0 */ + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, 
v4.4s + aesimc v3.16b,v3.16b + sha256h2 q23, q21, v4.4s + aesd v3.16b,v10.16b + ld1 {v27.16b},[x4],16 /* next w1 */ + aesimc v3.16b,v3.16b + aesd v3.16b,v11.16b + ld1 {v28.16b},[x4],16 /* next w2 */ + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + aesimc v3.16b,v3.16b + sha256h2 q23, q21, v5.4s + aesd v3.16b,v12.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v13.16b + ld1 {v29.16b},[x4],16 /* next w3 */ + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + aesimc v3.16b,v3.16b + sha256h2 q23, q21, v6.4s + aesd v3.16b,v14.16b + sub x7,x7,1 /* dec block count */ + aesimc v3.16b,v3.16b + aesd v3.16b,v15.16b + ld1 {v0.16b},[x0] /* next aes block, no update */ + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + aesimc v3.16b,v3.16b + sha256h2 q23, q21, v7.4s + aesd v3.16b,v16.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v17.16b + add v24.4s,v24.4s,v22.4s /* ABCD += working copy */ + eor v3.16b,v3.16b,v18.16b /* aes res 3 */ + eor v3.16b,v3.16b,v31.16b /* xor w/prev value */ + /* next aes block, update aes_ptr_in */ + ld1 {v31.16b},[x0],16 + add v25.4s,v25.4s,v23.4s /* EFGH += working copy */ + /* save aes res, bump aes_out_ptr */ + st1 {v3.16b},[x1],16 + cbnz x7,.Lmain_loop /* loop if more to do */ +/* + * now the loop epilog. Since the reads for sha have already been done + * in advance, we have to have an extra unwind. + * This is why the test for the short cases is 16 and not 12. + * + * the unwind, which is just the main loop without the tests or final reads. 
+ */ + + rev32 v26.16b,v26.16b /* fix endian w0 */ + mov v22.16b,v24.16b /* working ABCD <- ABCD */ + prfm PLDL1KEEP,[x2,64] /* pref next lead_ptr */ + rev32 v27.16b,v27.16b /* fix endian w1 */ + /* pref next aes_ptr_out, streaming */ + prfm PLDL1KEEP,[x1,64] + mov v23.16b,v25.16b /* working EFGH <- EFGH */ + mov x9,x8 /* top of rcon */ + ld1 {v4.16b},[x9],16 /* key0 */ + ld1 {v5.16b},[x9],16 /* key1 */ +/* + * aes xform 0, sha quad 0 + */ + aesd v0.16b,v8.16b + ld1 {v6.16b},[x9],16 /* key2 */ + rev32 v28.16b,v28.16b /* fix endian w2 */ + ld1 {v7.16b},[x9],16 /* key3 */ + aesimc v0.16b,v0.16b + /* read next aes block, no update */ + ld1 {v1.16b},[x0] + aesd v0.16b,v9.16b + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + aesimc v0.16b,v0.16b + sha256su0 v26.4s,v27.4s + aesd v0.16b,v10.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + aesimc v0.16b,v0.16b + aesd v0.16b,v11.16b + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + rev32 v29.16b,v29.16b /* fix endian w3 */ + aesimc v0.16b,v0.16b + sha256h2 q23, q21, v4.4s + aesd v0.16b,v12.16b + sha256su1 v26.4s,v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + ld1 {v4.16b},[x9],16 /* key4 */ + aesimc v0.16b,v0.16b + sha256su0 v27.4s,v28.4s + aesd v0.16b,v13.16b + sha256h q22, q23, v5.4s + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + aesimc v0.16b,v0.16b + sha256h2 q23, q21, v5.4s + aesd v0.16b,v14.16b + ld1 {v5.16b},[x9],16 /* key5 */ + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + sha256su1 v27.4s,v29.4s,v26.4s + aesimc v0.16b,v0.16b + sha256su0 v28.4s,v29.4s + aesd v0.16b,v15.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + aesimc v0.16b,v0.16b + sha256h2 q23, q21, v6.4s + aesd v0.16b,v16.16b + sha256su1 v28.4s,v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256su0 v29.4s,v26.4s + aesimc v0.16b,v0.16b + sha256h q22, q23, v7.4s + aesd v0.16b,v17.16b + sha256h2 q23, q21, v7.4s + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + sha256su1 v29.4s,v27.4s,v28.4s + eor 
v0.16b,v0.16b,v18.16b /* final res 0 */ + ld1 {v6.16b},[x9],16 /* key6 */ + eor v0.16b,v0.16b,v30.16b /* xor w/ prev value */ + /* read next aes block, update aes_ptr_in */ + ld1 {v30.16b},[x0],16 + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + +/* aes xform 1, sha quad 1 */ + sha256su0 v26.4s,v27.4s + ld1 {v7.16b},[x9],16 /* key7 */ + mov v21.16b, v22.16b /* copy abcd */ + /* save aes res, bump aes_out_ptr */ + st1 {v0.16b},[x1],16 + aesd v1.16b,v8.16b + sha256h q22, q23, v4.4s + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + sha256h2 q23, q21, v4.4s + aesimc v1.16b,v1.16b + sha256su1 v26.4s,v28.4s,v29.4s + aesd v1.16b,v9.16b + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + aesimc v1.16b,v1.16b + sha256h2 q23, q21, v5.4s + aesd v1.16b,v10.16b + /* read next aes block, no update */ + ld1 {v2.16b},[x0] + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + sha256su1 v27.4s,v29.4s,v26.4s + aesimc v1.16b,v1.16b + ld1 {v4.16b},[x9],16 /* key4 */ + aesd v1.16b,v11.16b + ld1 {v5.16b},[x9],16 /* key5 */ + mov v21.16b, v22.16b /* copy abcd */ + sha256su0 v28.4s,v29.4s + aesimc v1.16b,v1.16b + sha256h q22, q23, v6.4s + aesd v1.16b,v12.16b + sha256h2 q23, q21, v6.4s + ld1 {v6.16b},[x9],16 /* key6 */ + sha256su1 v28.4s,v26.4s,v27.4s + aesimc v1.16b,v1.16b + sha256su0 v29.4s,v26.4s + aesd v1.16b,v13.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + aesimc v1.16b,v1.16b + sha256h2 q23, q21, v7.4s + aesd v1.16b,v14.16b + ld1 {v7.16b},[x9],16 /* key7 */ + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + sha256su1 v29.4s,v27.4s,v28.4s + aesimc v1.16b,v1.16b + add x2,x2,64 /* bump lead_ptr */ + aesd v1.16b,v15.16b + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + aesimc v1.16b,v1.16b + aesd v1.16b,v16.16b + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + aesimc v1.16b,v1.16b + aesd v1.16b,v17.16b + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + eor v1.16b,v1.16b,v18.16b /* res xf 1 */ + eor v1.16b,v1.16b,v31.16b /* mode op 1 xor w/prev value */ + 
/* read next aes block, update aes_ptr_in */ + ld1 {v31.16b},[x0],16 + +/* mode op 2 */ + +/* aes xform 2, sha quad 2 */ + sha256su0 v26.4s,v27.4s + aesd v2.16b,v8.16b + /* save aes res, bump aes_out_ptr */ + st1 {v1.16b},[x1],16 + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + aesimc v2.16b,v2.16b + sha256h2 q23, q21, v4.4s + aesd v2.16b,v9.16b + sha256su1 v26.4s,v28.4s,v29.4s + ld1 {v4.16b},[x9],16 /* key4 */ + aesimc v2.16b,v2.16b + sha256su0 v27.4s,v28.4s + aesd v2.16b,v10.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + aesimc v2.16b,v2.16b + sha256h2 q23, q21, v5.4s + aesd v2.16b,v11.16b + sha256su1 v27.4s,v29.4s,v26.4s + ld1 {v5.16b},[x9],16 /* key5 */ + sha256su0 v28.4s,v29.4s + aesimc v2.16b,v2.16b + aesd v2.16b,v12.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + aesimc v2.16b,v2.16b + sha256h2 q23, q21, v6.4s + aesd v2.16b,v13.16b + sha256su1 v28.4s,v26.4s,v27.4s + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + sha256su0 v29.4s,v26.4s + aesimc v2.16b,v2.16b + /* read next aes block, no update */ + ld1 {v3.16b},[x0] + aesd v2.16b,v14.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + aesimc v2.16b,v2.16b + sha256h2 q23, q21, v7.4s + aesd v2.16b,v15.16b + sha256su1 v29.4s,v27.4s,v28.4s + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + ld1 {v6.16b},[x9],16 /* key6 */ + aesimc v2.16b,v2.16b + ld1 {v7.16b},[x9],16 /* key7 */ + aesd v2.16b,v16.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v17.16b + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + eor v2.16b,v2.16b,v18.16b /* res 2 */ + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + eor v2.16b,v2.16b,v30.16b /* mode of 2 xor w/prev value */ + /* read next aes block, update aes_ptr_in */ + ld1 {v30.16b},[x0],16 + +/* mode op 3 */ + +/* aes xform 3, sha quad 3 (hash only) */ + aesd v3.16b,v8.16b + aesimc v3.16b,v3.16b + /* save aes res, bump aes_out_ptr */ + st1 {v2.16b},[x1],16 + aesd v3.16b,v9.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, 
q23, v4.4s + aesimc v3.16b,v3.16b + sha256h2 q23, q21, v4.4s + aesd v3.16b,v10.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v11.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + aesimc v3.16b,v3.16b + sha256h2 q23, q21, v5.4s + aesd v3.16b,v12.16b + /* read first aes block, no bump */ + ld1 {v0.16b},[x0] + aesimc v3.16b,v3.16b + aesd v3.16b,v13.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + aesimc v3.16b,v3.16b + sha256h2 q23, q21, v6.4s + aesd v3.16b,v14.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v15.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + aesimc v3.16b,v3.16b + sha256h2 q23, q21, v7.4s + aesd v3.16b,v16.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v17.16b + add v24.4s,v24.4s,v22.4s /* ABCD += working copy */ + eor v3.16b,v3.16b,v18.16b /* aes res 3 */ + add v25.4s,v25.4s,v23.4s /* EFGH += working copy */ + eor v3.16b,v3.16b,v31.16b /* xor w/ prev value */ + /* read first aes block, bump aes_ptr_in */ + ld1 {v31.16b},[x0],16 + + +/* + * now we have to do the 4 aes blocks (b-2) that catch up to where sha is + */ + +/* aes xform 0 */ + aesd v0.16b,v8.16b + /* save aes res, bump aes_out_ptr */ + st1 {v3.16b},[x1],16 + aesimc v0.16b,v0.16b + aesd v0.16b,v9.16b + /* read next aes block, no update */ + ld1 {v1.16b},[x0] + aesimc v0.16b,v0.16b + aesd v0.16b,v10.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v11.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v12.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v13.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v14.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v15.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v16.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v17.16b + eor v0.16b,v0.16b,v18.16b /* res 0 */ + eor v0.16b,v0.16b,v30.16b /* xor w/ ivec (modeop) */ + /* read next aes block, update aes_ptr_in */ + ld1 {v30.16b},[x0],16 + /* save aes res, bump aes_out_ptr */ + st1 {v0.16b},[x1],16 + +/* aes xform 1 */ + aesd v1.16b,v8.16b + /* read next aes block, no update */ + ld1 {v2.16b},[x0] + aesimc 
v1.16b,v1.16b + aesd v1.16b,v9.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v10.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v11.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v12.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v13.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v14.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v15.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v16.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v17.16b + eor v1.16b,v1.16b,v18.16b /* res 1 */ + eor v1.16b,v1.16b,v31.16b /* xor w/ ivec (modeop) */ + /* read next aes block, update aes_ptr_in */ + ld1 {v31.16b},[x0],16 + /* save aes res, bump aes_out_ptr */ + st1 {v1.16b},[x1],16 + +/* aes xform 2 */ + aesd v2.16b,v8.16b + /* read next aes block, no update */ + ld1 {v3.16b},[x0] + aesimc v2.16b,v2.16b + aesd v2.16b,v9.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v10.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v11.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v12.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v13.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v14.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v15.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v16.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v17.16b + eor v2.16b,v2.16b,v18.16b /* res 2 */ + eor v2.16b,v2.16b,v30.16b /* xor w/ ivec (modeop) */ + /* read next aes block, update aes_ptr_in */ + ld1 {v30.16b},[x0],16 + /* save aes res, bump aes_out_ptr */ + st1 {v2.16b},[x1],16 + +/* aes xform 3 */ + aesd v3.16b,v8.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v9.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v10.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v11.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v12.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v13.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v14.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v15.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v16.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v17.16b + eor v3.16b,v3.16b,v18.16b /* res 3 */ + eor v3.16b,v3.16b,v31.16b /* xor w/ ivec (modeop) */ + /* save aes res, bump aes_out_ptr */ + st1 {v3.16b},[x1],16 +/* + * Now, there is the final b-1 
sha256 padded block. + * This contains between 0-3 aes blocks. We take some pains to avoid read spill + * by only reading the blocks that are actually defined. + * this is also the final sha block code for the short_cases. + */ +.Ljoin_common: + mov w15,0x80 /* that's the 1 of the pad */ + cbnz x13,.Lpad100 /* branch if there is some real data */ + eor v26.16b,v26.16b,v26.16b /* zero the rest */ + eor v27.16b,v27.16b,v27.16b /* zero the rest */ + eor v28.16b,v28.16b,v28.16b /* zero the rest */ + eor v29.16b,v29.16b,v29.16b /* zero the rest */ + mov v26.b[0],w15 /* all data is bogus */ + b .Lpad_done /* go do rest */ + +.Lpad100: + sub x14,x13,1 /* dec amount left */ + ld1 {v26.16b},[x4],16 /* next w0 */ + cbnz x14,.Lpad200 /* branch if there is some real data */ + eor v27.16b,v27.16b,v27.16b /* zero the rest */ + eor v28.16b,v28.16b,v28.16b /* zero the rest */ + eor v29.16b,v29.16b,v29.16b /* zero the rest */ + mov v27.b[0],w15 /* all data is bogus */ + b .Lpad_done /* go do rest */ + +.Lpad200: + sub x14,x14,1 /* dec amount left */ + ld1 {v27.16b},[x4],16 /* next w1 */ + cbnz x14,.Lpad300 /* branch if there is some real data */ + eor v28.16b,v28.16b,v28.16b /* zero the rest */ + eor v29.16b,v29.16b,v29.16b /* zero the rest */ + mov v28.b[0],w15 /* all data is bogus */ + b .Lpad_done /* go do rest */ + +.Lpad300: + ld1 {v28.16b},[x4],16 /* next w2 */ + eor v29.16b,v29.16b,v29.16b /* zero the rest */ + mov v29.b[3],w15 /* all data is bogus */ + +.Lpad_done: + lsr x12,x11,32 /* len_hi */ + and x14,x11,0xffffffff /* len_lo */ + lsl x12,x12,3 /* len_hi in bits */ + lsl x14,x14,3 /* len_lo in bits */ + + mov v29.s[3],w14 /* len_lo */ + mov v29.s[2],w12 /* len_hi */ + + rev32 v26.16b,v26.16b /* fix endian w0 */ + rev32 v27.16b,v27.16b /* fix endian w1 */ + rev32 v28.16b,v28.16b /* fix endian w2 */ + + mov v22.16b,v24.16b /* working ABCD <- ABCD */ + mov v23.16b,v25.16b /* working EFGH <- EFGH */ +/* + * final sha block + * the strategy is to combine the 0-3 aes blocks, 
which is faster but + * a little gourmand on code space. + */ + cbz x13,.Lzero_aes_blocks_left /* none to do */ + /* read first aes block, bump aes_ptr_in */ + ld1 {v0.16b},[x0] + ld1 {v31.16b},[x0],16 + + mov x9,x8 /* top of rcon */ + ld1 {v4.16b},[x9],16 /* key0 */ + ld1 {v5.16b},[x9],16 /* key1 */ + ld1 {v6.16b},[x9],16 /* key2 */ + aesd v0.16b,v8.16b + ld1 {v7.16b},[x9],16 /* key3 */ + aesimc v0.16b,v0.16b + aesd v0.16b,v9.16b + aesimc v0.16b,v0.16b + + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + aesd v0.16b,v10.16b + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + aesimc v0.16b,v0.16b + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + + aesd v0.16b,v11.16b + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + aesimc v0.16b,v0.16b + sha256h q22, q23, v4.4s + aesd v0.16b,v12.16b + sha256h2 q23, q21, v4.4s + sha256su1 v26.4s,v28.4s,v29.4s + aesimc v0.16b,v0.16b + + sha256su0 v27.4s,v28.4s + aesd v0.16b,v13.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + aesimc v0.16b,v0.16b + sha256h2 q23, q21, v5.4s + aesd v0.16b,v14.16b + sha256su1 v27.4s,v29.4s,v26.4s + + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + aesimc v0.16b,v0.16b + sha256h q22, q23, v6.4s + aesd v0.16b,v15.16b + sha256h2 q23, q21, v6.4s + aesimc v0.16b,v0.16b + sha256su1 v28.4s,v26.4s,v27.4s + + aesd v0.16b,v16.16b + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + aesimc v0.16b,v0.16b + sha256h2 q23, q21, v7.4s + aesd v0.16b,v17.16b + sha256su1 v29.4s,v27.4s,v28.4s + eor v3.16b,v0.16b,v18.16b /* res 0 */ + eor v3.16b,v3.16b,v30.16b /* xor w/ ivec (modeop) */ + + sub x13,x13,1 /* dec counter */ + /* save aes res, bump aes_out_ptr */ + st1 {v3.16b},[x1],16 + cbz x13,.Lfrmquad1 + +/* aes xform 1 */ + + /* read first aes block, bump aes_ptr_in */ + ld1 {v0.16b},[x0] + ld1 {v30.16b},[x0],16 + ld1 {v4.16b},[x9],16 /* key4 */ + ld1 {v5.16b},[x9],16 /* key5 */ + ld1 
{v6.16b},[x9],16 /* key6 */ + ld1 {v7.16b},[x9],16 /* key7 */ + + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + aesd v0.16b,v8.16b + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + aesimc v0.16b,v0.16b + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + + aesd v0.16b,v9.16b + sha256su0 v26.4s,v27.4s + aesimc v0.16b,v0.16b + mov v21.16b, v22.16b /* copy abcd */ + aesd v0.16b,v10.16b + sha256h q22, q23, v4.4s + aesimc v0.16b,v0.16b + sha256h2 q23, q21, v4.4s + aesd v0.16b,v11.16b + sha256su1 v26.4s,v28.4s,v29.4s + aesimc v0.16b,v0.16b + + sha256su0 v27.4s,v28.4s + aesd v0.16b,v12.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + aesimc v0.16b,v0.16b + sha256h2 q23, q21, v5.4s + aesd v0.16b,v13.16b + sha256su1 v27.4s,v29.4s,v26.4s + + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + aesimc v0.16b,v0.16b + sha256su0 v28.4s,v29.4s + aesd v0.16b,v14.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + aesimc v0.16b,v0.16b + sha256h2 q23, q21, v6.4s + aesd v0.16b,v15.16b + sha256su1 v28.4s,v26.4s,v27.4s + aesimc v0.16b,v0.16b + + sha256su0 v29.4s,v26.4s + aesd v0.16b,v16.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + aesimc v0.16b,v0.16b + sha256h2 q23, q21, v7.4s + aesd v0.16b,v17.16b + sha256su1 v29.4s,v27.4s,v28.4s + eor v3.16b,v0.16b,v18.16b /* res 0 */ + eor v3.16b,v3.16b,v31.16b /* xor w/ ivec (modeop) */ + + sub x13,x13,1 /* dec counter */ + /* save aes res, bump aes_out_ptr */ + st1 {v3.16b},[x1],16 + cbz x13,.Lfrmquad2 + +/* aes xform 2 */ + /* read first aes block, bump aes_ptr_in */ + ld1 {v0.16b},[x0],16 + ld1 {v4.16b},[x9],16 /* key4 */ + ld1 {v5.16b},[x9],16 /* key5 */ + ld1 {v6.16b},[x9],16 /* key6 */ + ld1 {v7.16b},[x9],16 /* key7 */ + + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + aesd v0.16b,v8.16b + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + aesimc v0.16b,v0.16b + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + + aesd v0.16b,v9.16b + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + aesimc 
v0.16b,v0.16b + sha256h q22, q23, v4.4s + aesd v0.16b,v10.16b + sha256h2 q23, q21, v4.4s + aesimc v0.16b,v0.16b + sha256su1 v26.4s,v28.4s,v29.4s + aesd v0.16b,v11.16b + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + aesimc v0.16b,v0.16b + sha256h q22, q23, v5.4s + aesd v0.16b,v12.16b + sha256h2 q23, q21, v5.4s + aesimc v0.16b,v0.16b + sha256su1 v27.4s,v29.4s,v26.4s + aesd v0.16b,v13.16b + + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + sha256su0 v28.4s,v29.4s + aesimc v0.16b,v0.16b + mov v21.16b, v22.16b /* copy abcd */ + aesd v0.16b,v14.16b + sha256h q22, q23, v6.4s + aesimc v0.16b,v0.16b + sha256h2 q23, q21, v6.4s + aesd v0.16b,v15.16b + sha256su1 v28.4s,v26.4s,v27.4s + aesimc v0.16b,v0.16b + + aesd v0.16b,v16.16b + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + aesimc v0.16b,v0.16b + sha256h q22, q23, v7.4s + aesd v0.16b,v17.16b + sha256h2 q23, q21, v7.4s + eor v3.16b,v0.16b,v18.16b /* res 0 */ + sha256su1 v29.4s,v27.4s,v28.4s + eor v3.16b,v3.16b,v30.16b /* xor w/ ivec (modeop) */ + /* save aes res, bump aes_out_ptr */ + st1 {v3.16b},[x1],16 + b .Lfrmquad3 +/* + * the final block with no aes component, i.e from here there were zero blocks + */ + +.Lzero_aes_blocks_left: + mov x9,x8 /* top of rcon */ + ld1 {v4.16b},[x9],16 /* key0 */ + ld1 {v5.16b},[x9],16 /* key1 */ + ld1 {v6.16b},[x9],16 /* key2 */ + ld1 {v7.16b},[x9],16 /* key3 */ + + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + sha256su1 v26.4s,v28.4s,v29.4s + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + sha256su1 v27.4s,v29.4s,v26.4s + + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + 
sha256su1 v28.4s,v26.4s,v27.4s + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + sha256su1 v29.4s,v27.4s,v28.4s + +/* quad 1 */ +.Lfrmquad1: + ld1 {v4.16b},[x9],16 /* key4 */ + ld1 {v5.16b},[x9],16 /* key5 */ + ld1 {v6.16b},[x9],16 /* key6 */ + ld1 {v7.16b},[x9],16 /* key7 */ + + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + sha256su1 v26.4s,v28.4s,v29.4s + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + sha256su1 v27.4s,v29.4s,v26.4s + + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + sha256su1 v28.4s,v26.4s,v27.4s + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + sha256su1 v29.4s,v27.4s,v28.4s + +/* quad 2 */ +.Lfrmquad2: + ld1 {v4.16b},[x9],16 /* key4 */ + ld1 {v5.16b},[x9],16 /* key5 */ + ld1 {v6.16b},[x9],16 /* key6 */ + ld1 {v7.16b},[x9],16 /* key7 */ + + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + sha256su1 v26.4s,v28.4s,v29.4s + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + sha256su1 v27.4s,v29.4s,v26.4s + + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + sha256su1 v28.4s,v26.4s,v27.4s + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy 
abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + sha256su1 v29.4s,v27.4s,v28.4s + +/* quad 3 */ +.Lfrmquad3: + ld1 {v4.16b},[x9],16 /* key4 */ + ld1 {v5.16b},[x9],16 /* key5 */ + ld1 {v6.16b},[x9],16 /* key6 */ + ld1 {v7.16b},[x9],16 /* key7 */ + + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + sha256h2 q23, q21, v4.4s + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + eor v26.16b,v26.16b,v26.16b /* zero reg */ + sha256h2 q23, q21, v5.4s + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + eor v27.16b,v27.16b,v27.16b /* zero reg */ + sha256h2 q23, q21, v6.4s + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + eor v28.16b,v28.16b,v28.16b /* zero reg */ + sha256h2 q23, q21, v7.4s + + add v24.4s,v24.4s,v22.4s /* ABCD += working copy */ + add v25.4s,v25.4s,v23.4s /* EFGH += working copy */ + +/* + * now we just have to put this into big endian and store! and clean up stack... + */ + mov x9,sp + add sp,sp,8*16 + ld1 {v8.16b - v11.16b},[x9],4*16 + rev32 v24.16b,v24.16b /* big endian ABCD */ + ld1 {v12.16b - v15.16b},[x9] + rev32 v25.16b,v25.16b /* big endian EFGH */ + + st1 {v24.4s,v25.4s},[x3] /* save them both */ + ret + +/* + * These are the short cases (less efficient), here used for 1-11 aes blocks. 
+ * x10 = aes_blocks + */ +.Lshort_cases: + sub sp,sp,8*16 + mov x9,sp /* copy for address mode */ + st1 {v8.16b - v11.16b},[x9],4*16 + st1 {v12.16b - v15.16b},[x9] + + ld1 {v30.16b},[x5] /* get ivec */ + ld1 {v8.16b-v11.16b},[x2],64 /* rk[0-3] */ + ld1 {v12.16b-v15.16b},[x2],64 /* rk[4-7] */ + ld1 {v16.16b-v18.16b},[x2] /* rk[8-10] */ + adr x8,.Lrcon /* rcon */ + lsl x11,x10,4 /* len = aes_blocks*16 */ + mov x4,x0 /* sha_ptr_in = in */ + +/* + * This loop does 4 at a time, so that at the end there is a final sha block + * and 0-3 aes blocks. Note that everything is done serially + * to avoid complication. + */ +.Lshort_loop: + cmp x10,4 /* check if 4 or more */ + /* if less, bail to last block */ + blt .Llast_sha_block + + ld1 {v31.16b},[x4] /* next w no update */ + /* read next aes block, update aes_ptr_in */ + ld1 {v0.16b},[x4],16 + rev32 v26.16b,v0.16b /* endian swap for sha */ + add x0,x0,64 + +/* aes xform 0 */ + aesd v0.16b,v8.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v9.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v10.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v11.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v12.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v13.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v14.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v15.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v16.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v17.16b + eor v0.16b,v0.16b,v18.16b + eor v0.16b,v0.16b,v30.16b /* xor w/ prev value */ + + ld1 {v30.16b},[x4] /* read no update */ + /* read next aes block, update aes_ptr_in */ + ld1 {v1.16b},[x4],16 + rev32 v27.16b,v1.16b /* endian swap for sha */ + /* save aes res, bump aes_out_ptr */ + st1 {v0.16b},[x1],16 + +/* aes xform 1 */ + aesd v1.16b,v8.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v9.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v10.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v11.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v12.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v13.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v14.16b + aesimc v1.16b,v1.16b + 
aesd v1.16b,v15.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v16.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v17.16b + eor v1.16b,v1.16b,v18.16b + eor v1.16b,v1.16b,v31.16b /* xor w/ prev value */ + + ld1 {v31.16b},[x4] /* read no update */ + /* read next aes block, update aes_ptr_in */ + ld1 {v2.16b},[x4],16 + rev32 v28.16b,v2.16b /* endian swap for sha */ + /* save aes res, bump aes_out_ptr */ + st1 {v1.16b},[x1],16 + +/* aes xform 2 */ + aesd v2.16b,v8.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v9.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v10.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v11.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v12.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v13.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v14.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v15.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v16.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v17.16b + eor v2.16b,v2.16b,v18.16b + eor v2.16b,v2.16b,v30.16b /* xor w/prev value */ + + ld1 {v30.16b},[x4] /* read no update */ + /* read next aes block, update aes_ptr_in */ + ld1 {v3.16b},[x4],16 + rev32 v29.16b,v3.16b /* endian swap for sha */ + /* save aes res, bump aes_out_ptr */ + st1 {v2.16b},[x1],16 + +/* aes xform 3 */ + aesd v3.16b,v8.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v9.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v10.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v11.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v12.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v13.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v14.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v15.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v16.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v17.16b + eor v3.16b,v3.16b,v18.16b + eor v3.16b,v3.16b,v31.16b /* xor w/prev value */ + +/* + * now we have the sha256 to do for these 4 aes blocks. Note that. 
+ */ + + mov x9,x8 /* top of rcon */ + ld1 {v4.16b},[x9],16 /* key0 */ + mov v22.16b,v24.16b /* working ABCD <- ABCD */ + ld1 {v5.16b},[x9],16 /* key1 */ + mov v23.16b,v25.16b /* working EFGH <- EFGH */ + /* save aes res, bump aes_out_ptr */ + st1 {v3.16b},[x1],16 + +/* quad 0 */ + ld1 {v6.16b},[x9],16 /* key2 */ + ld1 {v7.16b},[x9],16 /* key3 */ + + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + sha256su1 v26.4s,v28.4s,v29.4s + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + sha256su1 v27.4s,v29.4s,v26.4s + + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + sha256su1 v28.4s,v26.4s,v27.4s + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + sha256su1 v29.4s,v27.4s,v28.4s + +/* quad 1 */ + ld1 {v4.16b},[x9],16 /* key4 */ + ld1 {v5.16b},[x9],16 /* key5 */ + ld1 {v6.16b},[x9],16 /* key6 */ + ld1 {v7.16b},[x9],16 /* key7 */ + + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + sha256su1 v26.4s,v28.4s,v29.4s + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + sha256su1 v27.4s,v29.4s,v26.4s + + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + sha256su1 v28.4s,v26.4s,v27.4s + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + 
sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + sha256su1 v29.4s,v27.4s,v28.4s + +/* quad 2 */ + ld1 {v4.16b},[x9],16 /* key4 */ + ld1 {v5.16b},[x9],16 /* key5 */ + ld1 {v6.16b},[x9],16 /* key6 */ + ld1 {v7.16b},[x9],16 /* key7 */ + + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + sha256su1 v26.4s,v28.4s,v29.4s + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + sha256su1 v27.4s,v29.4s,v26.4s + + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + sha256su1 v28.4s,v26.4s,v27.4s + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + sha256su1 v29.4s,v27.4s,v28.4s + +/* quad 3 */ + ld1 {v4.16b},[x9],16 /* key4 */ + ld1 {v5.16b},[x9],16 /* key5 */ + ld1 {v6.16b},[x9],16 /* key6 */ + ld1 {v7.16b},[x9],16 /* key7 */ + + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + + add v24.4s,v24.4s,v22.4s /* ABCD += working copy */ + add v25.4s,v25.4s,v23.4s /* EFGH += working copy */ + + sub x10,x10,4 /* 4 less */ + b .Lshort_loop /* keep looping */ +/* + * this is arranged so that we can join the common unwind code that does + * the last 
sha block and the final 0-3 aes blocks + */ +.Llast_sha_block: + mov x13,x10 /* copy aes blocks for common */ + b .Ljoin_common /* join common code */ + + .size sha256_aes128cbc_dec, .-sha256_aes128cbc_dec diff --git a/drivers/crypto/armv8/asm/sha256_hmac_aes128cbc_dec.S b/drivers/crypto/armv8/asm/sha256_hmac_aes128cbc_dec.S new file mode 100644 index 0000000..4ca34c1 --- /dev/null +++ b/drivers/crypto/armv8/asm/sha256_hmac_aes128cbc_dec.S @@ -0,0 +1,1832 @@ +/* + * BSD LICENSE + * + * Copyright (C) Cavium networks Ltd. 2016. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Cavium networks nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "assym.s" + +/* + * Description: + * + * Combined Auth/Dec Primitive = sha256_hmac/aes128cbc + * + * Operations: + * + * out = decrypt-AES128CBC(in) + * return_hash_ptr = SHA256(o_key_pad | SHA256(i_key_pad | in)) + * + * Prototype: + * + * void sha256_hmac_aes128cbc_dec(uint8_t *csrc, uint8_t *cdst, + * uint8_t *dsrc, uint8_t *ddst, + * uint64_t len, crypto_arg_t *arg) + * + * Registers used: + * + * sha256_hmac_aes128cbc_dec( + * csrc, x0 (cipher src address) + * cdst, x1 (cipher dst address) + * dsrc, x2 (digest src address - ignored) + * ddst, x3 (digest dst address) + * len, x4 (length) + * arg x5 : + * arg->cipher.key (round keys) + * arg->cipher.iv (initialization vector) + * arg->digest.hmac.i_key_pad (partially hashed i_key_pad) + * arg->digest.hmac.o_key_pad (partially hashed o_key_pad) + * ) + * + * Routine register definitions: + * + * v0 - v3 -- aes results + * v4 - v7 -- round consts for sha + * v8 - v18 -- round keys + * v19 - v20 -- round keys + * v21 -- ABCD tmp + * v22 -- sha working state ABCD (q22) + * v23 -- sha working state EFGH (q23) + * v24 -- sha state ABCD + * v25 -- sha state EFGH + * v26 -- sha block 0 + * v27 -- sha block 1 + * v28 -- sha block 2 + * v29 -- sha block 3 + * v30 -- reserved + * v31 -- reserved + * + * + * Constraints: + * + * The variable "len" must be a multiple of 16, + * otherwise results are not defined. 
For AES partial blocks the user + * is required to pad the input to modulus 16 = 0. + * + * Short lengths are less optimized at < 16 AES blocks, + * however they are somewhat optimized, and more so than the enc/auth versions. + */ + .file "sha256_hmac_aes128cbc_dec.S" + .text + .cpu generic+fp+simd+crypto+crc + .global sha256_hmac_aes128cbc_dec + .type sha256_hmac_aes128cbc_dec,%function + + + .align 4 +.Lrcon: + .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 + .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 + .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 + .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 + .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc + .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da + .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 + .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 + .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 + .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 + .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 + .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 + .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 + .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 + .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 + .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 + +.Linit_sha_state: + .word 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a + .word 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 + +sha256_hmac_aes128cbc_dec: +/* fetch args */ + ldr x6, [x5, #HMAC_IKEYPAD] + /* init ABCD, EFGH */ + ld1 {v24.4s, v25.4s},[x6] + /* save pointer to o_key_pad partial hash */ + ldr x6, [x5, #HMAC_OKEYPAD] + + ldr x2, [x5, #CIPHER_KEY] + ldr x5, [x5, #CIPHER_IV] +/* + * init sha state, prefetch, check for small cases. 
+ * Note that the output is prefetched as a load, for the in-place case + */ + prfm PLDL1KEEP,[x0,0] /* pref next *in */ + /* address of sha init state consts */ + adr x12,.Linit_sha_state + prfm PLDL1KEEP,[x1,0] /* pref next aes_ptr_out */ + lsr x10,x4,4 /* aes_blocks = len/16 */ + cmp x10,16 /* no main loop if <16 */ + blt .Lshort_cases /* branch if < 12 */ + + /* protect registers */ + sub sp,sp,8*16 + mov x11,x4 /* len -> x11 needed at end */ + mov x7,sp /* copy for address mode */ + ld1 {v30.16b},[x5] /* get 1st ivec */ + lsr x12,x11,6 /* total_blocks (sha) */ + mov x4,x0 /* sha_ptr_in = *in */ + ld1 {v26.16b},[x4],16 /* next w0 */ + ld1 {v27.16b},[x4],16 /* next w1 */ + ld1 {v28.16b},[x4],16 /* next w2 */ + ld1 {v29.16b},[x4],16 /* next w3 */ + +/* + * now we can do the loop prolog, 1st sha256 block + */ + prfm PLDL1KEEP,[x0,64] /* pref next aes_ptr_in */ + prfm PLDL1KEEP,[x1,64] /* pref next aes_ptr_out */ + /* base address for sha round consts */ + adr x8,.Lrcon +/* + * do the first sha256 block on the plaintext + */ + + mov v22.16b,v24.16b /* init working ABCD */ + st1 {v8.16b},[x7],16 + mov v23.16b,v25.16b /* init working EFGH */ + st1 {v9.16b},[x7],16 + + rev32 v26.16b,v26.16b /* endian swap w0 */ + st1 {v10.16b},[x7],16 + rev32 v27.16b,v27.16b /* endian swap w1 */ + st1 {v11.16b},[x7],16 + rev32 v28.16b,v28.16b /* endian swap w2 */ + st1 {v12.16b},[x7],16 + rev32 v29.16b,v29.16b /* endian swap w3 */ + st1 {v13.16b},[x7],16 +/* quad 0 */ + mov x9,x8 /* top of rcon */ + ld1 {v4.16b},[x9],16 /* key0 */ + ld1 {v5.16b},[x9],16 /* key1 */ + ld1 {v6.16b},[x9],16 /* key2 */ + ld1 {v7.16b},[x9],16 /* key3 */ + + + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + st1 {v14.16b},[x7],16 + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + st1 {v15.16b},[x7],16 + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + ld1 {v8.16b},[x2],16 /* rk[0] */ + sha256h2 q23, q21, v4.4s + ld1 
{v4.16b},[x9],16 /* key4 */ + sha256su1 v26.4s,v28.4s,v29.4s + ld1 {v9.16b},[x2],16 /* rk[1] */ + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + ld1 {v10.16b},[x2],16 /* rk[2] */ + sha256h2 q23, q21, v5.4s + ld1 {v5.16b},[x9],16 /* key5 */ + sha256su1 v27.4s,v29.4s,v26.4s + + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + sha256h2 q23, q21, v6.4s + ld1 {v6.16b},[x9],16 /* key6 */ + sha256su1 v28.4s,v26.4s,v27.4s + ld1 {v11.16b},[x2],16 /* rk[3] */ + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + sha256h2 q23, q21, v7.4s + ld1 {v7.16b},[x9],16 /* key7 */ + sha256su1 v29.4s,v27.4s,v28.4s + +/* quad 1 */ + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + ld1 {v12.16b},[x2],16 /* rk[4] */ + sha256h2 q23, q21, v4.4s + ld1 {v4.16b},[x9],16 /* key4 */ + sha256su1 v26.4s,v28.4s,v29.4s + ld1 {v13.16b},[x2],16 /* rk[5] */ + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + ld1 {v14.16b},[x2],16 /* rk[6] */ + sha256h2 q23, q21, v5.4s + ld1 {v5.16b},[x9],16 /* key5 */ + sha256su1 v27.4s,v29.4s,v26.4s + + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + sha256h2 q23, q21, v6.4s + ld1 {v6.16b},[x9],16 /* key6 */ + sha256su1 v28.4s,v26.4s,v27.4s + ld1 {v15.16b},[x2],16 /* rk[7] */ + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + sha256h2 q23, q21, v7.4s + ld1 {v7.16b},[x9],16 /* key7 */ + sha256su1 v29.4s,v27.4s,v28.4s + +/* quad 2 */ + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 
*/ + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + ld1 {v16.16b},[x2],16 /* rk[8] */ + sha256h2 q23, q21, v4.4s + ld1 {v4.16b},[x9],16 /* key4 */ + sha256su1 v26.4s,v28.4s,v29.4s + ld1 {v17.16b},[x2],16 /* rk[9] */ + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + ld1 {v18.16b},[x2],16 /* rk[10] */ + sha256h2 q23, q21, v5.4s + ld1 {v5.16b},[x9],16 /* key5 */ + sha256su1 v27.4s,v29.4s,v26.4s + + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + sha256h2 q23, q21, v6.4s + ld1 {v6.16b},[x9],16 /* key6 */ + sha256su1 v28.4s,v26.4s,v27.4s + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + sha256h2 q23, q21, v7.4s + ld1 {v7.16b},[x9],16 /* key7 */ + sha256su1 v29.4s,v27.4s,v28.4s + +/* quad 3 */ + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + sha256h2 q23, q21, v4.4s + ld1 {v26.16b},[x4],16 /* next w0 */ + ld1 {v27.16b},[x4],16 /* next w1 */ + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + sha256h2 q23, q21, v5.4s + ld1 {v28.16b},[x4],16 /* next w2 */ + ld1 {v29.16b},[x4],16 /* next w3 */ + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + +/* + * aes_blocks_left := number after the main (sha) block is done. 
+ * can be 0 note we account for the extra unwind in main_blocks + */ + sub x7,x12,2 /* main_blocks=total_blocks-5 */ + + add v24.4s,v24.4s,v22.4s /* ABCD += working copy */ + and x13,x10,3 /* aes_blocks_left */ + ld1 {v0.16b},[x0] /* next aes block, no update */ + add v25.4s,v25.4s,v23.4s /* EFGH += working copy */ + add x2,x0,128 /* lead_ptr = *in */ + /* next aes block, update aes_ptr_in */ + ld1 {v31.16b},[x0],16 + +/* + * main combined loop CBC, can be used by auth/enc version + */ +.Lmain_loop: + +/* + * Because both mov, rev32 and eor have a busy cycle, this takes longer + * than it looks. + */ + rev32 v26.16b,v26.16b /* fix endian w0 */ + mov v22.16b,v24.16b /* working ABCD <- ABCD */ + prfm PLDL1KEEP,[x2,64] /* pref next lead_ptr */ + rev32 v27.16b,v27.16b /* fix endian w1 */ + /* pref next aes_ptr_out, streaming */ + prfm PLDL1KEEP,[x1,64] + mov v23.16b,v25.16b /* working EFGH <- EFGH */ + mov x9,x8 /* top of rcon */ + +/* + * aes xform 0, sha quad 0 + */ + aesd v0.16b,v8.16b + ld1 {v4.16b},[x9],16 /* key0 */ + rev32 v28.16b,v28.16b /* fix endian w2 */ + aesimc v0.16b,v0.16b + aesd v0.16b,v9.16b + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + ld1 {v5.16b},[x9],16 /* key1 */ + aesimc v0.16b,v0.16b + sha256su0 v26.4s,v27.4s + aesd v0.16b,v10.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + aesimc v0.16b,v0.16b + aesd v0.16b,v11.16b + ld1 {v6.16b},[x9],16 /* key2 */ + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + ld1 {v7.16b},[x9],16 /* key3 */ + rev32 v29.16b,v29.16b /* fix endian w3 */ + /* read next aes block, no update */ + ld1 {v1.16b},[x0] + aesimc v0.16b,v0.16b + sha256h2 q23, q21, v4.4s + aesd v0.16b,v12.16b + sha256su1 v26.4s,v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + ld1 {v4.16b},[x9],16 /* key4 */ + aesimc v0.16b,v0.16b + sha256su0 v27.4s,v28.4s + aesd v0.16b,v13.16b + sha256h q22, q23, v5.4s + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + aesimc v0.16b,v0.16b + sha256h2 q23, q21, v5.4s + aesd v0.16b,v14.16b + ld1 
{v5.16b},[x9],16 /* key5 */ + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + sha256su1 v27.4s,v29.4s,v26.4s + aesimc v0.16b,v0.16b + sha256su0 v28.4s,v29.4s + aesd v0.16b,v15.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + aesimc v0.16b,v0.16b + sha256h2 q23, q21, v6.4s + aesd v0.16b,v16.16b + sha256su1 v28.4s,v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256su0 v29.4s,v26.4s + aesimc v0.16b,v0.16b + sha256h q22, q23, v7.4s + aesd v0.16b,v17.16b + sha256h2 q23, q21, v7.4s + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + sha256su1 v29.4s,v27.4s,v28.4s + eor v0.16b,v0.16b,v18.16b /* final res 0 */ + ld1 {v6.16b},[x9],16 /* key6 */ + eor v0.16b,v0.16b,v30.16b /* xor w/ prev value */ + /* get next aes block, with update */ + ld1 {v30.16b},[x0],16 + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + +/* aes xform 1, sha quad 1 */ + sha256su0 v26.4s,v27.4s + ld1 {v7.16b},[x9],16 /* key7 */ + mov v21.16b, v22.16b /* copy abcd */ + /* save aes res, bump aes_out_ptr */ + st1 {v0.16b},[x1],16 + aesd v1.16b,v8.16b + sha256h q22, q23, v4.4s + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + sha256h2 q23, q21, v4.4s + aesimc v1.16b,v1.16b + sha256su1 v26.4s,v28.4s,v29.4s + aesd v1.16b,v9.16b + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + aesimc v1.16b,v1.16b + sha256h2 q23, q21, v5.4s + aesd v1.16b,v10.16b + /* read next aes block, no update */ + ld1 {v2.16b},[x0] + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + sha256su1 v27.4s,v29.4s,v26.4s + aesimc v1.16b,v1.16b + ld1 {v4.16b},[x9],16 /* key4 */ + aesd v1.16b,v11.16b + ld1 {v5.16b},[x9],16 /* key5 */ + mov v21.16b, v22.16b /* copy abcd */ + sha256su0 v28.4s,v29.4s + aesimc v1.16b,v1.16b + sha256h q22, q23, v6.4s + aesd v1.16b,v12.16b + sha256h2 q23, q21, v6.4s + ld1 {v6.16b},[x9],16 /* key6 */ + sha256su1 v28.4s,v26.4s,v27.4s + aesimc v1.16b,v1.16b + sha256su0 v29.4s,v26.4s + aesd v1.16b,v13.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + 
aesimc v1.16b,v1.16b + sha256h2 q23, q21, v7.4s + aesd v1.16b,v14.16b + ld1 {v7.16b},[x9],16 /* key7 */ + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + sha256su1 v29.4s,v27.4s,v28.4s + aesimc v1.16b,v1.16b + add x2,x2,64 /* bump lead_ptr */ + aesd v1.16b,v15.16b + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + aesimc v1.16b,v1.16b + aesd v1.16b,v16.16b + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + aesimc v1.16b,v1.16b + aesd v1.16b,v17.16b + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + eor v1.16b,v1.16b,v18.16b /* res xf 1 */ + eor v1.16b,v1.16b,v31.16b /* mode op 1 xor w/prev value */ + /* read next aes block, update aes_ptr_in */ + ld1 {v31.16b},[x0],16 + +/* aes xform 2, sha quad 2 */ + sha256su0 v26.4s,v27.4s + aesd v2.16b,v8.16b + /* save aes res, bump aes_out_ptr */ + st1 {v1.16b},[x1],16 + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + aesimc v2.16b,v2.16b + sha256h2 q23, q21, v4.4s + aesd v2.16b,v9.16b + sha256su1 v26.4s,v28.4s,v29.4s + ld1 {v4.16b},[x9],16 /* key4 */ + aesimc v2.16b,v2.16b + sha256su0 v27.4s,v28.4s + aesd v2.16b,v10.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + aesimc v2.16b,v2.16b + sha256h2 q23, q21, v5.4s + aesd v2.16b,v11.16b + sha256su1 v27.4s,v29.4s,v26.4s + ld1 {v5.16b},[x9],16 /* key5 */ + sha256su0 v28.4s,v29.4s + aesimc v2.16b,v2.16b + aesd v2.16b,v12.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + aesimc v2.16b,v2.16b + sha256h2 q23, q21, v6.4s + aesd v2.16b,v13.16b + sha256su1 v28.4s,v26.4s,v27.4s + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + sha256su0 v29.4s,v26.4s + aesimc v2.16b,v2.16b + /* read next aes block, no update */ + ld1 {v3.16b},[x0] + aesd v2.16b,v14.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + aesimc v2.16b,v2.16b + sha256h2 q23, q21, v7.4s + aesd v2.16b,v15.16b + sha256su1 v29.4s,v27.4s,v28.4s + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + ld1 {v6.16b},[x9],16 /* key6 */ + aesimc v2.16b,v2.16b + ld1 {v7.16b},[x9],16 /* key7 */ + 
aesd v2.16b,v16.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v17.16b + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + eor v2.16b,v2.16b,v18.16b /* res 2 */ + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + eor v2.16b,v2.16b,v30.16b /* mode of 2 xor w/prev value */ + /* read next aes block, update aes_ptr_in */ + ld1 {v30.16b},[x0],16 + +/* aes xform 3, sha quad 3 (hash only) */ + aesd v3.16b,v8.16b + aesimc v3.16b,v3.16b + /* save aes res, bump aes_out_ptr */ + st1 {v2.16b},[x1],16 + aesd v3.16b,v9.16b + ld1 {v26.16b},[x4],16 /* next w0 */ + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + aesimc v3.16b,v3.16b + sha256h2 q23, q21, v4.4s + aesd v3.16b,v10.16b + ld1 {v27.16b},[x4],16 /* next w1 */ + aesimc v3.16b,v3.16b + aesd v3.16b,v11.16b + ld1 {v28.16b},[x4],16 /* next w2 */ + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + aesimc v3.16b,v3.16b + sha256h2 q23, q21, v5.4s + aesd v3.16b,v12.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v13.16b + ld1 {v29.16b},[x4],16 /* next w3 */ + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + aesimc v3.16b,v3.16b + sha256h2 q23, q21, v6.4s + aesd v3.16b,v14.16b + sub x7,x7,1 /* dec block count */ + aesimc v3.16b,v3.16b + aesd v3.16b,v15.16b + ld1 {v0.16b},[x0] /* next aes block, no update */ + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + aesimc v3.16b,v3.16b + sha256h2 q23, q21, v7.4s + aesd v3.16b,v16.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v17.16b + add v24.4s,v24.4s,v22.4s /* ABCD += working copy */ + eor v3.16b,v3.16b,v18.16b /* aes res 3 */ + eor v3.16b,v3.16b,v31.16b /* xor w/ prev value */ + /* next aes block, update aes_ptr_in */ + ld1 {v31.16b},[x0],16 + add v25.4s,v25.4s,v23.4s /* EFGH += working copy */ + /* save aes res, bump aes_out_ptr */ + st1 {v3.16b},[x1],16 + cbnz x7,.Lmain_loop /* loop if more to do */ +/* + * Now the loop epilog. Since the reads for sha have already been done + * in advance, we have to have an extra unwind. 
+ * This is why the test for the short cases is 16 and not 12. + * + * the unwind, which is just the main loop without the tests or final reads. + */ + + rev32 v26.16b,v26.16b /* fix endian w0 */ + mov v22.16b,v24.16b /* working ABCD <- ABCD */ + prfm PLDL1KEEP,[x2,64] /* pref next lead_ptr */ + rev32 v27.16b,v27.16b /* fix endian w1 */ + /* pref next aes_ptr_out, streaming */ + prfm PLDL1KEEP,[x1,64] + mov v23.16b,v25.16b /* working EFGH <- EFGH */ + mov x9,x8 /* top of rcon */ + ld1 {v4.16b},[x9],16 /* key0 */ + ld1 {v5.16b},[x9],16 /* key1 */ + +/* + * aes xform 0, sha quad 0 + */ + aesd v0.16b,v8.16b + ld1 {v6.16b},[x9],16 /* key2 */ + rev32 v28.16b,v28.16b /* fix endian w2 */ + ld1 {v7.16b},[x9],16 /* key3 */ + aesimc v0.16b,v0.16b + /* read next aes block, no update */ + ld1 {v1.16b},[x0] + aesd v0.16b,v9.16b + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + aesimc v0.16b,v0.16b + sha256su0 v26.4s,v27.4s + aesd v0.16b,v10.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + aesimc v0.16b,v0.16b + aesd v0.16b,v11.16b + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + rev32 v29.16b,v29.16b /* fix endian w3 */ + aesimc v0.16b,v0.16b + sha256h2 q23, q21, v4.4s + aesd v0.16b,v12.16b + sha256su1 v26.4s,v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + ld1 {v4.16b},[x9],16 /* key4 */ + aesimc v0.16b,v0.16b + sha256su0 v27.4s,v28.4s + aesd v0.16b,v13.16b + sha256h q22, q23, v5.4s + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + aesimc v0.16b,v0.16b + sha256h2 q23, q21, v5.4s + aesd v0.16b,v14.16b + ld1 {v5.16b},[x9],16 /* key5 */ + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + sha256su1 v27.4s,v29.4s,v26.4s + aesimc v0.16b,v0.16b + sha256su0 v28.4s,v29.4s + aesd v0.16b,v15.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + aesimc v0.16b,v0.16b + sha256h2 q23, q21, v6.4s + aesd v0.16b,v16.16b + sha256su1 v28.4s,v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256su0 v29.4s,v26.4s + aesimc v0.16b,v0.16b + sha256h q22, q23, v7.4s + 
aesd v0.16b,v17.16b + sha256h2 q23, q21, v7.4s + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + sha256su1 v29.4s,v27.4s,v28.4s + eor v0.16b,v0.16b,v18.16b /* final res 0 */ + ld1 {v6.16b},[x9],16 /* key6 */ + eor v0.16b,v0.16b,v30.16b /* xor w/ prev value */ + /* read next aes block, update aes_ptr_in */ + ld1 {v30.16b},[x0],16 + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + +/* aes xform 1, sha quad 1 */ + sha256su0 v26.4s,v27.4s + ld1 {v7.16b},[x9],16 /* key7 */ + mov v21.16b, v22.16b /* copy abcd */ + /* save aes res, bump aes_out_ptr */ + st1 {v0.16b},[x1],16 + aesd v1.16b,v8.16b + sha256h q22, q23, v4.4s + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + sha256h2 q23, q21, v4.4s + aesimc v1.16b,v1.16b + sha256su1 v26.4s,v28.4s,v29.4s + aesd v1.16b,v9.16b + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + aesimc v1.16b,v1.16b + sha256h2 q23, q21, v5.4s + aesd v1.16b,v10.16b + /* read next aes block, no update */ + ld1 {v2.16b},[x0] + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + sha256su1 v27.4s,v29.4s,v26.4s + aesimc v1.16b,v1.16b + ld1 {v4.16b},[x9],16 /* key4 */ + aesd v1.16b,v11.16b + ld1 {v5.16b},[x9],16 /* key5 */ + mov v21.16b, v22.16b /* copy abcd */ + sha256su0 v28.4s,v29.4s + aesimc v1.16b,v1.16b + sha256h q22, q23, v6.4s + aesd v1.16b,v12.16b + sha256h2 q23, q21, v6.4s + ld1 {v6.16b},[x9],16 /* key6 */ + sha256su1 v28.4s,v26.4s,v27.4s + aesimc v1.16b,v1.16b + sha256su0 v29.4s,v26.4s + aesd v1.16b,v13.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + aesimc v1.16b,v1.16b + sha256h2 q23, q21, v7.4s + aesd v1.16b,v14.16b + ld1 {v7.16b},[x9],16 /* key7 */ + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + sha256su1 v29.4s,v27.4s,v28.4s + aesimc v1.16b,v1.16b + add x2,x2,64 /* bump lead_ptr */ + aesd v1.16b,v15.16b + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + aesimc v1.16b,v1.16b + aesd v1.16b,v16.16b + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + aesimc v1.16b,v1.16b + aesd v1.16b,v17.16b + add 
v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + eor v1.16b,v1.16b,v18.16b /* res xf 1 */ + eor v1.16b,v1.16b,v31.16b /* mode op 1 xor w/prev value */ + /* read next aes block, update aes_ptr_in */ + ld1 {v31.16b},[x0],16 + +/* mode op 2 */ + +/* aes xform 2, sha quad 2 */ + sha256su0 v26.4s,v27.4s + aesd v2.16b,v8.16b + /* save aes res, bump aes_out_ptr */ + st1 {v1.16b},[x1],16 + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + aesimc v2.16b,v2.16b + sha256h2 q23, q21, v4.4s + aesd v2.16b,v9.16b + sha256su1 v26.4s,v28.4s,v29.4s + ld1 {v4.16b},[x9],16 /* key4 */ + aesimc v2.16b,v2.16b + sha256su0 v27.4s,v28.4s + aesd v2.16b,v10.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + aesimc v2.16b,v2.16b + sha256h2 q23, q21, v5.4s + aesd v2.16b,v11.16b + sha256su1 v27.4s,v29.4s,v26.4s + ld1 {v5.16b},[x9],16 /* key5 */ + sha256su0 v28.4s,v29.4s + aesimc v2.16b,v2.16b + aesd v2.16b,v12.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + aesimc v2.16b,v2.16b + sha256h2 q23, q21, v6.4s + aesd v2.16b,v13.16b + sha256su1 v28.4s,v26.4s,v27.4s + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + sha256su0 v29.4s,v26.4s + aesimc v2.16b,v2.16b + /* read next aes block, no update */ + ld1 {v3.16b},[x0] + aesd v2.16b,v14.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + aesimc v2.16b,v2.16b + sha256h2 q23, q21, v7.4s + aesd v2.16b,v15.16b + sha256su1 v29.4s,v27.4s,v28.4s + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + ld1 {v6.16b},[x9],16 /* key6 */ + aesimc v2.16b,v2.16b + ld1 {v7.16b},[x9],16 /* key7 */ + aesd v2.16b,v16.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v17.16b + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + eor v2.16b,v2.16b,v18.16b /* res 2 */ + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + eor v2.16b,v2.16b,v30.16b /* mode of 2 xor w/prev value */ + /* read next aes block, update aes_ptr_in */ + ld1 {v30.16b},[x0],16 + +/* mode op 3 */ + +/* aes xform 3, sha quad 3 (hash only) */ + aesd v3.16b,v8.16b + aesimc 
v3.16b,v3.16b + /* save aes res, bump aes_out_ptr */ + st1 {v2.16b},[x1],16 + aesd v3.16b,v9.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + aesimc v3.16b,v3.16b + sha256h2 q23, q21, v4.4s + aesd v3.16b,v10.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v11.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + aesimc v3.16b,v3.16b + sha256h2 q23, q21, v5.4s + aesd v3.16b,v12.16b + /* read first aes block, no bump */ + ld1 {v0.16b},[x0] + aesimc v3.16b,v3.16b + aesd v3.16b,v13.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + aesimc v3.16b,v3.16b + sha256h2 q23, q21, v6.4s + aesd v3.16b,v14.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v15.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + aesimc v3.16b,v3.16b + sha256h2 q23, q21, v7.4s + aesd v3.16b,v16.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v17.16b + add v24.4s,v24.4s,v22.4s /* ABCD += working copy */ + eor v3.16b,v3.16b,v18.16b /* aes res 3 */ + add v25.4s,v25.4s,v23.4s /* EFGH += working copy */ + eor v3.16b,v3.16b,v31.16b /* xor w/prev value */ + /* read first aes block, bump aes_ptr_in */ + ld1 {v31.16b},[x0],16 + + +/* + * now we have to do the 4 aes blocks (b-2) that catch up to where sha is + */ + +/* aes xform 0 */ + aesd v0.16b,v8.16b + /* save aes res, bump aes_out_ptr */ + st1 {v3.16b},[x1],16 + aesimc v0.16b,v0.16b + aesd v0.16b,v9.16b + /* read next aes block, no update */ + ld1 {v1.16b},[x0] + aesimc v0.16b,v0.16b + aesd v0.16b,v10.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v11.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v12.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v13.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v14.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v15.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v16.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v17.16b + eor v0.16b,v0.16b,v18.16b /* res 0 */ + eor v0.16b,v0.16b,v30.16b /* xor w/ ivec (modeop) */ + /* read next aes block, update aes_ptr_in */ + ld1 {v30.16b},[x0],16 + /* save aes res, 
bump aes_out_ptr */ + st1 {v0.16b},[x1],16 + +/* aes xform 1 */ + aesd v1.16b,v8.16b + /* read next aes block, no update */ + ld1 {v2.16b},[x0] + aesimc v1.16b,v1.16b + aesd v1.16b,v9.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v10.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v11.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v12.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v13.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v14.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v15.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v16.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v17.16b + eor v1.16b,v1.16b,v18.16b /* res 1 */ + eor v1.16b,v1.16b,v31.16b /* xor w/ ivec (modeop) */ + /* read next aes block, update aes_ptr_in */ + ld1 {v31.16b},[x0],16 + /* save aes res, bump aes_out_ptr */ + st1 {v1.16b},[x1],16 + +/* aes xform 2 */ + aesd v2.16b,v8.16b + /* read next aes block, no update */ + ld1 {v3.16b},[x0] + aesimc v2.16b,v2.16b + aesd v2.16b,v9.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v10.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v11.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v12.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v13.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v14.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v15.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v16.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v17.16b + eor v2.16b,v2.16b,v18.16b /* res 2 */ + eor v2.16b,v2.16b,v30.16b /* xor w/ ivec (modeop) */ + /* read next aes block, update aes_ptr_in */ + ld1 {v30.16b},[x0],16 + /* save aes res, bump aes_out_ptr */ + st1 {v2.16b},[x1],16 + +/* aes xform 3 */ + aesd v3.16b,v8.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v9.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v10.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v11.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v12.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v13.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v14.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v15.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v16.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v17.16b + eor v3.16b,v3.16b,v18.16b /* res 3 */ + 
eor v3.16b,v3.16b,v31.16b /* xor w/ ivec (modeop) */ + /* save aes res, bump aes_out_ptr */ + st1 {v3.16b},[x1],16 +/* + * Now, there is the final b-1 sha256 padded block. + * This contains between 0-3 aes blocks. We take some pains to avoid read spill + * by only reading the blocks that are actually defined. + * This is also the final sha block code for the shortCases. + * x13 holds the number of aes blocks left; the 0x80 pad marker goes into the + * first word after the last real block and all later words are zeroed. + */ +.Ljoin_common: + mov w15,0x80 /* that's the 1 of the pad */ + cbnz x13,.Lpad100 /* branch if there is some real data */ + eor v26.16b,v26.16b,v26.16b /* zero the rest */ + eor v27.16b,v27.16b,v27.16b /* zero the rest */ + eor v28.16b,v28.16b,v28.16b /* zero the rest */ + eor v29.16b,v29.16b,v29.16b /* zero the rest */ + mov v26.b[0],w15 /* no real data: pad marker is byte 0 of w0 */ + b .Lpad_done /* go do rest */ + +.Lpad100: + sub x14,x13,1 /* dec amount left */ + ld1 {v26.16b},[x4],16 /* next w0 */ + cbnz x14,.Lpad200 /* branch if there is some real data */ + eor v27.16b,v27.16b,v27.16b /* zero the rest */ + eor v28.16b,v28.16b,v28.16b /* zero the rest */ + eor v29.16b,v29.16b,v29.16b /* zero the rest */ + mov v27.b[0],w15 /* one real block: pad marker is byte 0 of w1 */ + b .Lpad_done /* go do rest */ + +.Lpad200: + sub x14,x14,1 /* dec amount left */ + ld1 {v27.16b},[x4],16 /* next w1 */ + cbnz x14,.Lpad300 /* branch if there is some real data */ + eor v28.16b,v28.16b,v28.16b /* zero the rest */ + eor v29.16b,v29.16b,v29.16b /* zero the rest */ + mov v28.b[0],w15 /* two real blocks: pad marker is byte 0 of w2 */ + b .Lpad_done /* go do rest */ + +.Lpad300: + ld1 {v28.16b},[x4],16 /* next w2 */ + eor v29.16b,v29.16b,v29.16b /* zero the rest */ + /* w3 is not byte-swapped below, so the marker goes in the top byte of word 0 */ + mov v29.b[3],w15 + +.Lpad_done: + /* Add one SHA-2 block since hash is calculated including i_key_pad */ + add x11, x11, #64 + /* + * Convert the byte count to bits BEFORE splitting it into 32-bit halves, + * so that bits shifted out of the low half carry into len_hi. + */ + lsl x14,x11,3 /* total length in bits */ + lsr x12,x14,32 /* len_hi = upper 32 bits of bit length */ + + mov v29.s[3],w14 /* len_lo = lower 32 bits of bit length */ + mov v29.s[2],w12 /* len_hi */ + + rev32 v26.16b,v26.16b /* fix endian w0 */ + rev32 
v27.16b,v27.16b /* fix endian w1 */ + rev32 v28.16b,v28.16b /* fix endian w2 */ + + mov v22.16b,v24.16b /* working ABCD <- ABCD */ + mov v23.16b,v25.16b /* working EFGH <- EFGH */ +/* + * final sha block + * the strategy is to combine the 0-3 aes blocks, which is faster but + * a little gourmand on code space. + */ + cbz x13,.Lzero_aes_blocks_left /* none to do */ + /* read first aes block, bump aes_ptr_in */ + ld1 {v0.16b},[x0] + ld1 {v31.16b},[x0],16 + + mov x9,x8 /* top of rcon */ + ld1 {v4.16b},[x9],16 /* key0 */ + ld1 {v5.16b},[x9],16 /* key1 */ + ld1 {v6.16b},[x9],16 /* key2 */ + aesd v0.16b,v8.16b + ld1 {v7.16b},[x9],16 /* key3 */ + aesimc v0.16b,v0.16b + aesd v0.16b,v9.16b + aesimc v0.16b,v0.16b + + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + aesd v0.16b,v10.16b + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + aesimc v0.16b,v0.16b + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + + aesd v0.16b,v11.16b + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + aesimc v0.16b,v0.16b + sha256h q22, q23, v4.4s + aesd v0.16b,v12.16b + sha256h2 q23, q21, v4.4s + sha256su1 v26.4s,v28.4s,v29.4s + aesimc v0.16b,v0.16b + + sha256su0 v27.4s,v28.4s + aesd v0.16b,v13.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + aesimc v0.16b,v0.16b + sha256h2 q23, q21, v5.4s + aesd v0.16b,v14.16b + sha256su1 v27.4s,v29.4s,v26.4s + + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + aesimc v0.16b,v0.16b + sha256h q22, q23, v6.4s + aesd v0.16b,v15.16b + sha256h2 q23, q21, v6.4s + aesimc v0.16b,v0.16b + sha256su1 v28.4s,v26.4s,v27.4s + + aesd v0.16b,v16.16b + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + aesimc v0.16b,v0.16b + sha256h2 q23, q21, v7.4s + aesd v0.16b,v17.16b + sha256su1 v29.4s,v27.4s,v28.4s + eor v3.16b,v0.16b,v18.16b /* res 0 */ + eor v3.16b,v3.16b,v30.16b /* xor w/ ivec (modeop) */ + + sub x13,x13,1 /* dec counter */ + /* save aes res, bump 
aes_out_ptr */ + st1 {v3.16b},[x1],16 + cbz x13,.Lfrmquad1 + +/* aes xform 1 */ + + /* read first aes block, bump aes_ptr_in */ + ld1 {v0.16b},[x0] + ld1 {v30.16b},[x0],16 + ld1 {v4.16b},[x9],16 /* key4 */ + ld1 {v5.16b},[x9],16 /* key5 */ + ld1 {v6.16b},[x9],16 /* key6 */ + ld1 {v7.16b},[x9],16 /* key7 */ + + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + aesd v0.16b,v8.16b + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + aesimc v0.16b,v0.16b + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + + aesd v0.16b,v9.16b + sha256su0 v26.4s,v27.4s + aesimc v0.16b,v0.16b + mov v21.16b, v22.16b /* copy abcd */ + aesd v0.16b,v10.16b + sha256h q22, q23, v4.4s + aesimc v0.16b,v0.16b + sha256h2 q23, q21, v4.4s + aesd v0.16b,v11.16b + sha256su1 v26.4s,v28.4s,v29.4s + aesimc v0.16b,v0.16b + + sha256su0 v27.4s,v28.4s + aesd v0.16b,v12.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + aesimc v0.16b,v0.16b + sha256h2 q23, q21, v5.4s + aesd v0.16b,v13.16b + sha256su1 v27.4s,v29.4s,v26.4s + + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + aesimc v0.16b,v0.16b + sha256su0 v28.4s,v29.4s + aesd v0.16b,v14.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + aesimc v0.16b,v0.16b + sha256h2 q23, q21, v6.4s + aesd v0.16b,v15.16b + sha256su1 v28.4s,v26.4s,v27.4s + aesimc v0.16b,v0.16b + + sha256su0 v29.4s,v26.4s + aesd v0.16b,v16.16b + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + aesimc v0.16b,v0.16b + sha256h2 q23, q21, v7.4s + aesd v0.16b,v17.16b + sha256su1 v29.4s,v27.4s,v28.4s + eor v3.16b,v0.16b,v18.16b /* res 0 */ + eor v3.16b,v3.16b,v31.16b /* xor w/ ivec (modeop) */ + + sub x13,x13,1 /* dec counter */ + /* save aes res, bump aes_out_ptr */ + st1 {v3.16b},[x1],16 + cbz x13,.Lfrmquad2 + +/* aes xform 2 */ + + /* read first aes block, bump aes_ptr_in */ + ld1 {v0.16b},[x0],16 + ld1 {v4.16b},[x9],16 /* key4 */ + ld1 {v5.16b},[x9],16 /* key5 */ + ld1 {v6.16b},[x9],16 /* key6 */ + ld1 {v7.16b},[x9],16 /* key7 */ + + add v4.4s,v4.4s,v26.4s /* wk = 
key4+w0 */ + aesd v0.16b,v8.16b + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + aesimc v0.16b,v0.16b + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + + aesd v0.16b,v9.16b + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + aesimc v0.16b,v0.16b + sha256h q22, q23, v4.4s + aesd v0.16b,v10.16b + sha256h2 q23, q21, v4.4s + aesimc v0.16b,v0.16b + sha256su1 v26.4s,v28.4s,v29.4s + aesd v0.16b,v11.16b + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + aesimc v0.16b,v0.16b + sha256h q22, q23, v5.4s + aesd v0.16b,v12.16b + sha256h2 q23, q21, v5.4s + aesimc v0.16b,v0.16b + sha256su1 v27.4s,v29.4s,v26.4s + aesd v0.16b,v13.16b + + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + sha256su0 v28.4s,v29.4s + aesimc v0.16b,v0.16b + mov v21.16b, v22.16b /* copy abcd */ + aesd v0.16b,v14.16b + sha256h q22, q23, v6.4s + aesimc v0.16b,v0.16b + sha256h2 q23, q21, v6.4s + aesd v0.16b,v15.16b + sha256su1 v28.4s,v26.4s,v27.4s + aesimc v0.16b,v0.16b + + aesd v0.16b,v16.16b + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + aesimc v0.16b,v0.16b + sha256h q22, q23, v7.4s + aesd v0.16b,v17.16b + sha256h2 q23, q21, v7.4s + eor v3.16b,v0.16b,v18.16b /* res 0 */ + sha256su1 v29.4s,v27.4s,v28.4s + eor v3.16b,v3.16b,v30.16b /* xor w/ ivec (modeop) */ + /* save aes res, bump aes_out_ptr */ + st1 {v3.16b},[x1],16 + b .Lfrmquad3 +/* + * the final block with no aes component, i.e from here there were zero blocks + */ + +.Lzero_aes_blocks_left: + mov x9,x8 /* top of rcon */ + ld1 {v4.16b},[x9],16 /* key0 */ + ld1 {v5.16b},[x9],16 /* key1 */ + ld1 {v6.16b},[x9],16 /* key2 */ + ld1 {v7.16b},[x9],16 /* key3 */ + + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + sha256su1 v26.4s,v28.4s,v29.4s + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h 
q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + sha256su1 v27.4s,v29.4s,v26.4s + + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + sha256su1 v28.4s,v26.4s,v27.4s + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + sha256su1 v29.4s,v27.4s,v28.4s + +/* quad 1 */ +.Lfrmquad1: + ld1 {v4.16b},[x9],16 /* key4 */ + ld1 {v5.16b},[x9],16 /* key5 */ + ld1 {v6.16b},[x9],16 /* key6 */ + ld1 {v7.16b},[x9],16 /* key7 */ + + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + sha256su1 v26.4s,v28.4s,v29.4s + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + sha256su1 v27.4s,v29.4s,v26.4s + + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + sha256su1 v28.4s,v26.4s,v27.4s + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + sha256su1 v29.4s,v27.4s,v28.4s + +/* quad 2 */ +.Lfrmquad2: + ld1 {v4.16b},[x9],16 /* key4 */ + ld1 {v5.16b},[x9],16 /* key5 */ + ld1 {v6.16b},[x9],16 /* key6 */ + ld1 {v7.16b},[x9],16 /* key7 */ + + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + sha256su1 v26.4s,v28.4s,v29.4s + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + sha256su1 v27.4s,v29.4s,v26.4s + + add 
v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + sha256su1 v28.4s,v26.4s,v27.4s + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + sha256su1 v29.4s,v27.4s,v28.4s + +/* quad 3 */ +.Lfrmquad3: + ld1 {v4.16b},[x9],16 /* key4 */ + ld1 {v5.16b},[x9],16 /* key5 */ + ld1 {v6.16b},[x9],16 /* key6 */ + ld1 {v7.16b},[x9],16 /* key7 */ + + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + sha256h2 q23, q21, v4.4s + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + eor v26.16b,v26.16b,v26.16b /* zero reg */ + sha256h2 q23, q21, v5.4s + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + eor v27.16b,v27.16b,v27.16b /* zero reg */ + sha256h2 q23, q21, v6.4s + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + eor v28.16b,v28.16b,v28.16b /* zero reg */ + sha256h2 q23, q21, v7.4s + + add v26.4s,v24.4s,v22.4s /* ABCD += working copy */ + add v27.4s,v25.4s,v23.4s /* EFGH += working copy */ + + /* Calculate final HMAC */ + eor v28.16b, v28.16b, v28.16b + eor v29.16b, v29.16b, v29.16b + /* base address for sha round consts */ + adr x8,.Lrcon + /* load o_key_pad partial hash */ + ld1 {v24.16b,v25.16b}, [x6] + + mov v22.16b,v24.16b /* working ABCD <- ABCD */ + mov v23.16b,v25.16b /* working EFGH <- EFGH */ + + /* Set padding 1 to the first reg */ + mov w11, #0x80 /* that's the 1 of the pad */ + mov v28.b[3], w11 + /* size of o_key_pad + inner hash */ + mov x11, #64+32 + lsl x11, x11, 3 + /* move length to the end of the block */ + mov v29.s[3], w11 + lsr x11, x11, 32 + mov v29.s[2], w11 /* and the higher part */ + + ld1 {v4.16b},[x8],16 /* key0 */ + ld1 {v5.16b},[x8],16 /* 
key1 */ + ld1 {v6.16b},[x8],16 /* key2 */ + ld1 {v7.16b},[x8],16 /* key3 */ + + mov v21.16b, v22.16b /* copy abcd */ + + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + sha256su0 v26.4s,v27.4s + sha256su1 v26.4s,v28.4s,v29.4s + + mov v21.16b, v22.16b /* copy abcd */ + + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + sha256su0 v27.4s,v28.4s + sha256su1 v27.4s,v29.4s,v26.4s + + mov v21.16b, v22.16b /* copy abcd */ + + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + sha256su0 v28.4s,v29.4s + sha256su1 v28.4s,v26.4s,v27.4s + + mov v21.16b, v22.16b /* copy abcd */ + + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + sha256su0 v29.4s,v26.4s + sha256su1 v29.4s,v27.4s,v28.4s + + ld1 {v4.16b},[x8],16 /* key4 */ + ld1 {v5.16b},[x8],16 /* key5 */ + ld1 {v6.16b},[x8],16 /* key6 */ + ld1 {v7.16b},[x8],16 /* key7 */ + + mov v21.16b, v22.16b /* copy abcd */ + + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + sha256su0 v26.4s,v27.4s + sha256su1 v26.4s,v28.4s,v29.4s + + mov v21.16b, v22.16b /* copy abcd */ + + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + sha256su0 v27.4s,v28.4s + sha256su1 v27.4s,v29.4s,v26.4s + + mov v21.16b, v22.16b /* copy abcd */ + + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + sha256su0 v28.4s,v29.4s + sha256su1 v28.4s,v26.4s,v27.4s + + mov v21.16b, v22.16b /* copy abcd */ + + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + sha256su0 v29.4s,v26.4s + sha256su1 v29.4s,v27.4s,v28.4s + + ld1 {v4.16b},[x8],16 /* key8 */ + ld1 {v5.16b},[x8],16 /* key9 */ + ld1 {v6.16b},[x8],16 /* key10 */ + ld1 {v7.16b},[x8],16 /* key11 */ + + mov v21.16b, v22.16b /* copy abcd */ + + add 
v4.4s,v4.4s,v26.4s /* wk = key8+w0 */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + sha256su0 v26.4s,v27.4s + sha256su1 v26.4s,v28.4s,v29.4s + + mov v21.16b, v22.16b /* copy abcd */ + + add v5.4s,v5.4s,v27.4s /* wk = key9+w1 */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + sha256su0 v27.4s,v28.4s + sha256su1 v27.4s,v29.4s,v26.4s + + mov v21.16b, v22.16b /* copy abcd */ + + add v6.4s,v6.4s,v28.4s /* wk = key10+w2 */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + sha256su0 v28.4s,v29.4s + sha256su1 v28.4s,v26.4s,v27.4s + + mov v21.16b, v22.16b /* copy abcd */ + + add v7.4s,v7.4s,v29.4s /* wk = key11+w3 */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + sha256su0 v29.4s,v26.4s + sha256su1 v29.4s,v27.4s,v28.4s + + ld1 {v4.16b},[x8],16 /* key12 */ + ld1 {v5.16b},[x8],16 /* key13 */ + ld1 {v6.16b},[x8],16 /* key14 */ + ld1 {v7.16b},[x8],16 /* key15 */ + + mov v21.16b, v22.16b /* copy abcd */ + + add v4.4s,v4.4s,v26.4s /* wk = key12+w0 */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + + mov v21.16b, v22.16b /* copy abcd */ + + add v5.4s,v5.4s,v27.4s /* wk = key13+w1 */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + + mov v21.16b, v22.16b /* copy abcd */ + + add v6.4s,v6.4s,v28.4s /* wk = key14+w2 */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + + mov v21.16b, v22.16b /* copy abcd */ + + add v7.4s,v7.4s,v29.4s /* wk = key15+w3 */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + + add v24.4s,v24.4s,v22.4s /* ABCD += working copy */ + add v25.4s,v25.4s,v23.4s /* EFGH += working copy */ + + rev32 v24.16b, v24.16b /* byte-swap each 32-bit word for output */ + rev32 v25.16b, v25.16b + st1 {v24.4s,v25.4s},[x3] /* save them both (32 bytes at [x3]) */ + + /* + * Reload v8-v15 while the save area is still above sp, then release it. + * Memory below sp may be overwritten asynchronously (e.g. by a signal + * frame), so sp must not be raised before the loads. + */ + mov x9,sp /* copy for address mode */ + ld1 {v8.16b - v11.16b},[x9],4*16 + ld1 {v12.16b - v15.16b},[x9] + add sp,sp,8*16 /* pop the 8*16-byte save area */ + + ret + +/* + * These are the short cases (less efficient), here used for 1-11 aes blocks. 
+ * x10 = aes_blocks (count of 16-byte blocks still to process) + */ +.Lshort_cases: + sub sp,sp,8*16 /* reserve 8*16 bytes to save v8-v15 */ + mov x9,sp /* copy for address mode */ + st1 {v8.16b - v11.16b},[x9],4*16 + st1 {v12.16b - v15.16b},[x9] + + ld1 {v30.16b},[x5] /* get ivec: first CBC chain value */ + ld1 {v8.16b-v11.16b},[x2],64 /* rk[0-3] */ + ld1 {v12.16b-v15.16b},[x2],64 /* rk[4-7] */ + ld1 {v16.16b-v18.16b},[x2] /* rk[8-10] */ + adr x8,.Lrcon /* rcon */ + lsl x11,x10,4 /* len=aes_blocks*16 */ + mov x4,x0 /* sha_ptr_in = in */ + +/* + * This loop does 4 at a time, so that at the end there is a final sha block + * and 0-3 aes blocks. + * Note that everything is done serially to avoid complication. + */ +.Lshort_loop: + cmp x10,4 /* check if 4 or more */ + /* if less, bail to last block */ + blt .Llast_sha_block + + ld1 {v31.16b},[x4] /* keep input block 0 as chain value for block 1, no pointer update */ + /* read next aes block through x4, post-incrementing x4 by 16 */ + ld1 {v0.16b},[x4],16 + rev32 v26.16b,v0.16b /* endian swap for sha: w0 = input block 0 */ + add x0,x0,64 /* advance x0 by the 4 blocks that x4 walks in this iteration */ + +/* aes xform 0 */ + aesd v0.16b,v8.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v9.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v10.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v11.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v12.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v13.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v14.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v15.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v16.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v17.16b + eor v0.16b,v0.16b,v18.16b /* xor with rk[10] held in v18 */ + eor v0.16b,v0.16b,v30.16b /* xor w/prev value (ivec on the first pass) */ + + ld1 {v30.16b},[x4] /* keep input block 1 as chain value for block 2, no pointer update */ + /* read next aes block through x4, post-incrementing x4 by 16 */ + ld1 {v1.16b},[x4],16 + rev32 v27.16b,v1.16b /* endian swap for sha: w1 = input block 1 */ + /* save aes res, bump aes_out_ptr */ + st1 {v0.16b},[x1],16 + +/* aes xform 1 */ + aesd v1.16b,v8.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v9.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v10.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v11.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v12.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v13.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v14.16b + aesimc v1.16b,v1.16b + aesd 
v1.16b,v15.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v16.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v17.16b + eor v1.16b,v1.16b,v18.16b /* xor with the round key held in v18 */ + eor v1.16b,v1.16b,v31.16b /* xor w/prev value (input block 0 kept in v31) */ + + ld1 {v31.16b},[x4] /* keep input block 2 as chain value for block 3, no pointer update */ + /* read next aes block through x4, post-incrementing x4 by 16 */ + ld1 {v2.16b},[x4],16 + rev32 v28.16b,v2.16b /* endian swap for sha: w2 = input block 2 */ + /* save aes res, bump aes_out_ptr */ + st1 {v1.16b},[x1],16 + +/* aes xform 2 */ + aesd v2.16b,v8.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v9.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v10.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v11.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v12.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v13.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v14.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v15.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v16.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v17.16b + eor v2.16b,v2.16b,v18.16b /* xor with the round key held in v18 */ + eor v2.16b,v2.16b,v30.16b /* xor w/prev value (previous input block kept in v30) */ + + ld1 {v30.16b},[x4] /* keep input block 3 in v30 as chain value for the next block decrypted */ + /* read next aes block through x4, post-incrementing x4 by 16 */ + ld1 {v3.16b},[x4],16 + rev32 v29.16b,v3.16b /* endian swap for sha: w3 = input block 3 */ + /* save aes res, bump aes_out_ptr */ + st1 {v2.16b},[x1],16 + +/* aes xform 3 */ + aesd v3.16b,v8.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v9.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v10.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v11.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v12.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v13.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v14.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v15.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v16.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v17.16b + eor v3.16b,v3.16b,v18.16b /* xor with the round key held in v18 */ + eor v3.16b,v3.16b,v31.16b /* xor w/prev value (input block 2 kept in v31) */ + +/* + * Now we have the sha256 to do for these 4 aes blocks. Note that the sha + * input words (v26-v29) are the byte-swapped input blocks captured before + * decryption, not the aes results. 
+ */ + mov x9,x8 /* top of rcon: x9 walks the round constants, x8 stays at the base */ + ld1 {v4.16b},[x9],16 /* key0 */ + mov v22.16b,v24.16b /* working ABCD <- ABCD */ + ld1 {v5.16b},[x9],16 /* key1 */ + mov v23.16b,v25.16b /* working EFGH <- EFGH */ + /* save aes res (block 3), bump aes_out_ptr */ + st1 {v3.16b},[x1],16 + +/* quad 0: key0-key3 with w0-w3 as loaded */ + ld1 {v6.16b},[x9],16 /* key2 */ + ld1 {v7.16b},[x9],16 /* key3 */ + + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + + sha256su0 v26.4s,v27.4s /* schedule update part 1 for w0 (wk already taken above) */ + mov v21.16b, v22.16b /* copy abcd: sha256h2 needs the value sha256h overwrites */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + sha256su1 v26.4s,v28.4s,v29.4s /* schedule update part 2 for w0 */ + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + sha256su1 v27.4s,v29.4s,v26.4s + + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + sha256su1 v28.4s,v26.4s,v27.4s + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + sha256su1 v29.4s,v27.4s,v28.4s + +/* quad 1: key4-key7 with the updated w0-w3 */ + ld1 {v4.16b},[x9],16 /* key4 */ + ld1 {v5.16b},[x9],16 /* key5 */ + ld1 {v6.16b},[x9],16 /* key6 */ + ld1 {v7.16b},[x9],16 /* key7 */ + + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + sha256su1 v26.4s,v28.4s,v29.4s + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + sha256su1 v27.4s,v29.4s,v26.4s + + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + sha256su1 v28.4s,v26.4s,v27.4s + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + 
sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + sha256su1 v29.4s,v27.4s,v28.4s + +/* quad 2: third group of four key vectors read through x9 */ + ld1 {v4.16b},[x9],16 /* key8 */ + ld1 {v5.16b},[x9],16 /* key9 */ + ld1 {v6.16b},[x9],16 /* key10 */ + ld1 {v7.16b},[x9],16 /* key11 */ + + add v4.4s,v4.4s,v26.4s /* wk = key8+w0 */ + add v5.4s,v5.4s,v27.4s /* wk = key9+w1 */ + add v6.4s,v6.4s,v28.4s /* wk = key10+w2 */ + add v7.4s,v7.4s,v29.4s /* wk = key11+w3 */ + + sha256su0 v26.4s,v27.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + sha256su1 v26.4s,v28.4s,v29.4s + + sha256su0 v27.4s,v28.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + sha256su1 v27.4s,v29.4s,v26.4s + + sha256su0 v28.4s,v29.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + sha256su1 v28.4s,v26.4s,v27.4s + + sha256su0 v29.4s,v26.4s + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + sha256su1 v29.4s,v27.4s,v28.4s + +/* quad 3: fourth group of key vectors, no further schedule updates */ + ld1 {v4.16b},[x9],16 /* key12 */ + ld1 {v5.16b},[x9],16 /* key13 */ + ld1 {v6.16b},[x9],16 /* key14 */ + ld1 {v7.16b},[x9],16 /* key15 */ + + add v4.4s,v4.4s,v26.4s /* wk = key12+w0 */ + add v5.4s,v5.4s,v27.4s /* wk = key13+w1 */ + add v6.4s,v6.4s,v28.4s /* wk = key14+w2 */ + add v7.4s,v7.4s,v29.4s /* wk = key15+w3 */ + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + + mov v21.16b, v22.16b /* copy abcd */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + + add v24.4s,v24.4s,v22.4s /* ABCD += working copy */ + add v25.4s,v25.4s,v23.4s /* EFGH += working copy */ + + sub x10,x10,4 /* 4 aes blocks consumed this iteration */ + b .Lshort_loop /* keep looping */ +/* + * This is arranged so that we can join the common unwind code that does + * the last 
sha block and the final 0-3 aes blocks. + */ +.Llast_sha_block: + mov x13,x10 /* x13 = leftover aes block count (fewer than 4 when reached from .Lshort_loop), for the common code */ + b .Ljoin_common /* join common code */ + + .size sha256_hmac_aes128cbc_dec, .-sha256_hmac_aes128cbc_dec -- 1.9.1