diff --git a/cipher/rijndael-vaes-avx2-amd64.S b/cipher/rijndael-vaes-avx2-amd64.S
index 10213bfb..843ad9cf 100644
--- a/cipher/rijndael-vaes-avx2-amd64.S
+++ b/cipher/rijndael-vaes-avx2-amd64.S
@@ -1,3416 +1,3416 @@
/* VAES/AVX2 AMD64 accelerated AES for Libgcrypt
- * Copyright (C) 2021 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ * Copyright (C) 2021,2023 Jussi Kivilinna <jussi.kivilinna@iki.fi>
*
* This file is part of Libgcrypt.
*
* Libgcrypt is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1 of
* the License, or (at your option) any later version.
*
* Libgcrypt is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#if defined(__x86_64__)
#include <config.h>
#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT) && \
defined(HAVE_GCC_INLINE_ASM_VAES_VPCLMUL)
#include "asm-common-amd64.h"
.text
/**********************************************************************
helper macros
**********************************************************************/
#define no(...) /*_*/
#define yes(...) __VA_ARGS__
#define AES_OP8(op, key, b0, b1, b2, b3, b4, b5, b6, b7) \
op key, b0, b0; \
op key, b1, b1; \
op key, b2, b2; \
op key, b3, b3; \
op key, b4, b4; \
op key, b5, b5; \
op key, b6, b6; \
op key, b7, b7;
#define VAESENC8(key, b0, b1, b2, b3, b4, b5, b6, b7) \
AES_OP8(vaesenc, key, b0, b1, b2, b3, b4, b5, b6, b7)
#define VAESDEC8(key, b0, b1, b2, b3, b4, b5, b6, b7) \
AES_OP8(vaesdec, key, b0, b1, b2, b3, b4, b5, b6, b7)
#define XOR8(key, b0, b1, b2, b3, b4, b5, b6, b7) \
AES_OP8(vpxor, key, b0, b1, b2, b3, b4, b5, b6, b7)
#define AES_OP4(op, key, b0, b1, b2, b3) \
op key, b0, b0; \
op key, b1, b1; \
op key, b2, b2; \
op key, b3, b3;
#define VAESENC4(key, b0, b1, b2, b3) \
AES_OP4(vaesenc, key, b0, b1, b2, b3)
#define VAESDEC4(key, b0, b1, b2, b3) \
AES_OP4(vaesdec, key, b0, b1, b2, b3)
#define XOR4(key, b0, b1, b2, b3) \
AES_OP4(vpxor, key, b0, b1, b2, b3)
#define AES_OP2(op, key, b0, b1) \
op key, b0, b0; \
op key, b1, b1;
#define VAESENC2(key, b0, b1) \
AES_OP2(vaesenc, key, b0, b1)
#define VAESDEC2(key, b0, b1) \
AES_OP2(vaesdec, key, b0, b1)
#define XOR2(key, b0, b1) \
AES_OP2(vpxor, key, b0, b1)
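/* Note: each %ymm register holds two 128-bit AES blocks, so the OP8/OP4/OP2
 * helpers above apply one AES round (or a round-key xor) to 16, 8 or 4
 * blocks at a time using a single broadcast round key. */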
/**********************************************************************
CBC-mode decryption
**********************************************************************/
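/* Note: unlike CBC encryption, CBC decryption has no block-to-block data
 * dependency (each plaintext is AESDEC(C[i]) ^ C[i-1]), which is why up to
 * 16 blocks can be processed in parallel below. */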
ELF(.type _gcry_vaes_avx2_cbc_dec_amd64,@function)
.globl _gcry_vaes_avx2_cbc_dec_amd64
.align 16
_gcry_vaes_avx2_cbc_dec_amd64:
/* input:
* %rdi: round keys
* %rsi: iv
* %rdx: dst
* %rcx: src
* %r8: nblocks
* %r9: nrounds
*/
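/* Under the System V AMD64 calling convention these registers carry the
 * first six integer arguments, so the C-side prototype is presumably along
 * these lines (a sketch inferred from the register comment, not copied from
 * the caller):
 *
 *   extern void _gcry_vaes_avx2_cbc_dec_amd64 (const void *keysched,
 *                                              unsigned char *iv,
 *                                              void *dst, const void *src,
 *                                              size_t nblocks,
 *                                              unsigned int nrounds);
 *
 * The same register-to-argument mapping applies to the other entry points
 * in this file. */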
CFI_STARTPROC();
/* Load IV. */
vmovdqu (%rsi), %xmm15;
/* Process 16 blocks per loop. */
.align 8
.Lcbc_dec_blk16:
cmpq $16, %r8;
jb .Lcbc_dec_blk8;
leaq -16(%r8), %r8;
/* Load input and xor first key. Update IV. */
vbroadcasti128 (0 * 16)(%rdi), %ymm8;
vmovdqu (0 * 16)(%rcx), %ymm0;
vmovdqu (2 * 16)(%rcx), %ymm1;
vmovdqu (4 * 16)(%rcx), %ymm2;
vmovdqu (6 * 16)(%rcx), %ymm3;
vmovdqu (8 * 16)(%rcx), %ymm4;
vmovdqu (10 * 16)(%rcx), %ymm5;
vmovdqu (12 * 16)(%rcx), %ymm6;
vmovdqu (14 * 16)(%rcx), %ymm7;
vpxor %ymm8, %ymm0, %ymm0;
vpxor %ymm8, %ymm1, %ymm1;
vpxor %ymm8, %ymm2, %ymm2;
vpxor %ymm8, %ymm3, %ymm3;
vpxor %ymm8, %ymm4, %ymm4;
vpxor %ymm8, %ymm5, %ymm5;
vpxor %ymm8, %ymm6, %ymm6;
vpxor %ymm8, %ymm7, %ymm7;
vbroadcasti128 (1 * 16)(%rdi), %ymm8;
vinserti128 $1, (0 * 16)(%rcx), %ymm15, %ymm9;
vmovdqu (1 * 16)(%rcx), %ymm10;
vmovdqu (3 * 16)(%rcx), %ymm11;
vmovdqu (5 * 16)(%rcx), %ymm12;
vmovdqu (7 * 16)(%rcx), %ymm13;
vmovdqu (9 * 16)(%rcx), %ymm14;
vmovdqu (15 * 16)(%rcx), %xmm15;
leaq (16 * 16)(%rcx), %rcx;
/* AES rounds */
VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (2 * 16)(%rdi), %ymm8;
VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (3 * 16)(%rdi), %ymm8;
VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (4 * 16)(%rdi), %ymm8;
VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (5 * 16)(%rdi), %ymm8;
VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (6 * 16)(%rdi), %ymm8;
VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (7 * 16)(%rdi), %ymm8;
VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (8 * 16)(%rdi), %ymm8;
VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (9 * 16)(%rdi), %ymm8;
VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (10 * 16)(%rdi), %ymm8;
cmpl $12, %r9d;
jb .Lcbc_dec_blk16_last;
VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (11 * 16)(%rdi), %ymm8;
VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (12 * 16)(%rdi), %ymm8;
jz .Lcbc_dec_blk16_last;
VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (13 * 16)(%rdi), %ymm8;
VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (14 * 16)(%rdi), %ymm8;
/* Last round and output handling. */
.Lcbc_dec_blk16_last:
vpxor %ymm8, %ymm9, %ymm9;
vpxor %ymm8, %ymm10, %ymm10;
vpxor %ymm8, %ymm11, %ymm11;
vpxor %ymm8, %ymm12, %ymm12;
vpxor %ymm8, %ymm13, %ymm13;
vpxor %ymm8, %ymm14, %ymm14;
vaesdeclast %ymm9, %ymm0, %ymm0;
vaesdeclast %ymm10, %ymm1, %ymm1;
vpxor (-5 * 16)(%rcx), %ymm8, %ymm9;
vpxor (-3 * 16)(%rcx), %ymm8, %ymm10;
vaesdeclast %ymm11, %ymm2, %ymm2;
vaesdeclast %ymm12, %ymm3, %ymm3;
vaesdeclast %ymm13, %ymm4, %ymm4;
vaesdeclast %ymm14, %ymm5, %ymm5;
vaesdeclast %ymm9, %ymm6, %ymm6;
vaesdeclast %ymm10, %ymm7, %ymm7;
vmovdqu %ymm0, (0 * 16)(%rdx);
vmovdqu %ymm1, (2 * 16)(%rdx);
vmovdqu %ymm2, (4 * 16)(%rdx);
vmovdqu %ymm3, (6 * 16)(%rdx);
vmovdqu %ymm4, (8 * 16)(%rdx);
vmovdqu %ymm5, (10 * 16)(%rdx);
vmovdqu %ymm6, (12 * 16)(%rdx);
vmovdqu %ymm7, (14 * 16)(%rdx);
leaq (16 * 16)(%rdx), %rdx;
jmp .Lcbc_dec_blk16;
/* Handle trailing eight blocks. */
.align 8
.Lcbc_dec_blk8:
cmpq $8, %r8;
jb .Lcbc_dec_blk4;
leaq -8(%r8), %r8;
/* Load input and xor first key. Update IV. */
vbroadcasti128 (0 * 16)(%rdi), %ymm4;
vmovdqu (0 * 16)(%rcx), %ymm0;
vmovdqu (2 * 16)(%rcx), %ymm1;
vmovdqu (4 * 16)(%rcx), %ymm2;
vmovdqu (6 * 16)(%rcx), %ymm3;
vpxor %ymm4, %ymm0, %ymm0;
vpxor %ymm4, %ymm1, %ymm1;
vpxor %ymm4, %ymm2, %ymm2;
vpxor %ymm4, %ymm3, %ymm3;
vbroadcasti128 (1 * 16)(%rdi), %ymm4;
vinserti128 $1, (0 * 16)(%rcx), %ymm15, %ymm10;
vmovdqu (1 * 16)(%rcx), %ymm11;
vmovdqu (3 * 16)(%rcx), %ymm12;
vmovdqu (5 * 16)(%rcx), %ymm13;
vmovdqu (7 * 16)(%rcx), %xmm15;
leaq (8 * 16)(%rcx), %rcx;
/* AES rounds */
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (2 * 16)(%rdi), %ymm4;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (3 * 16)(%rdi), %ymm4;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (4 * 16)(%rdi), %ymm4;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (5 * 16)(%rdi), %ymm4;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (6 * 16)(%rdi), %ymm4;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (7 * 16)(%rdi), %ymm4;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (8 * 16)(%rdi), %ymm4;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (9 * 16)(%rdi), %ymm4;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (10 * 16)(%rdi), %ymm4;
cmpl $12, %r9d;
jb .Lcbc_dec_blk8_last;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (11 * 16)(%rdi), %ymm4;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (12 * 16)(%rdi), %ymm4;
jz .Lcbc_dec_blk8_last;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (13 * 16)(%rdi), %ymm4;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (14 * 16)(%rdi), %ymm4;
/* Last round and output handling. */
.Lcbc_dec_blk8_last:
vpxor %ymm4, %ymm10, %ymm10;
vpxor %ymm4, %ymm11, %ymm11;
vpxor %ymm4, %ymm12, %ymm12;
vpxor %ymm4, %ymm13, %ymm13;
vaesdeclast %ymm10, %ymm0, %ymm0;
vaesdeclast %ymm11, %ymm1, %ymm1;
vaesdeclast %ymm12, %ymm2, %ymm2;
vaesdeclast %ymm13, %ymm3, %ymm3;
vmovdqu %ymm0, (0 * 16)(%rdx);
vmovdqu %ymm1, (2 * 16)(%rdx);
vmovdqu %ymm2, (4 * 16)(%rdx);
vmovdqu %ymm3, (6 * 16)(%rdx);
leaq (8 * 16)(%rdx), %rdx;
/* Handle trailing four blocks. */
.align 8
.Lcbc_dec_blk4:
cmpq $4, %r8;
jb .Lcbc_dec_blk1;
leaq -4(%r8), %r8;
/* Load input and xor first key. Update IV. */
vbroadcasti128 (0 * 16)(%rdi), %ymm4;
vmovdqu (0 * 16)(%rcx), %ymm0;
vmovdqu (2 * 16)(%rcx), %ymm1;
vpxor %ymm4, %ymm0, %ymm0;
vpxor %ymm4, %ymm1, %ymm1;
vbroadcasti128 (1 * 16)(%rdi), %ymm4;
vinserti128 $1, (0 * 16)(%rcx), %ymm15, %ymm10;
vmovdqu (1 * 16)(%rcx), %ymm11;
vmovdqu (3 * 16)(%rcx), %xmm15;
leaq (4 * 16)(%rcx), %rcx;
/* AES rounds */
VAESDEC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (2 * 16)(%rdi), %ymm4;
VAESDEC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (3 * 16)(%rdi), %ymm4;
VAESDEC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (4 * 16)(%rdi), %ymm4;
VAESDEC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (5 * 16)(%rdi), %ymm4;
VAESDEC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (6 * 16)(%rdi), %ymm4;
VAESDEC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (7 * 16)(%rdi), %ymm4;
VAESDEC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (8 * 16)(%rdi), %ymm4;
VAESDEC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (9 * 16)(%rdi), %ymm4;
VAESDEC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (10 * 16)(%rdi), %ymm4;
cmpl $12, %r9d;
jb .Lcbc_dec_blk4_last;
VAESDEC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (11 * 16)(%rdi), %ymm4;
VAESDEC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (12 * 16)(%rdi), %ymm4;
jz .Lcbc_dec_blk4_last;
VAESDEC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (13 * 16)(%rdi), %ymm4;
VAESDEC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (14 * 16)(%rdi), %ymm4;
/* Last round and output handling. */
.Lcbc_dec_blk4_last:
vpxor %ymm4, %ymm10, %ymm10;
vpxor %ymm4, %ymm11, %ymm11;
vaesdeclast %ymm10, %ymm0, %ymm0;
vaesdeclast %ymm11, %ymm1, %ymm1;
vmovdqu %ymm0, (0 * 16)(%rdx);
vmovdqu %ymm1, (2 * 16)(%rdx);
leaq (4 * 16)(%rdx), %rdx;
/* Process trailing one to three blocks, one per loop. */
.align 8
.Lcbc_dec_blk1:
cmpq $1, %r8;
jb .Ldone_cbc_dec;
leaq -1(%r8), %r8;
/* Load input. */
vmovdqu (%rcx), %xmm2;
leaq 16(%rcx), %rcx;
/* Xor first key. */
vpxor (0 * 16)(%rdi), %xmm2, %xmm0;
/* AES rounds. */
vaesdec (1 * 16)(%rdi), %xmm0, %xmm0;
vaesdec (2 * 16)(%rdi), %xmm0, %xmm0;
vaesdec (3 * 16)(%rdi), %xmm0, %xmm0;
vaesdec (4 * 16)(%rdi), %xmm0, %xmm0;
vaesdec (5 * 16)(%rdi), %xmm0, %xmm0;
vaesdec (6 * 16)(%rdi), %xmm0, %xmm0;
vaesdec (7 * 16)(%rdi), %xmm0, %xmm0;
vaesdec (8 * 16)(%rdi), %xmm0, %xmm0;
vaesdec (9 * 16)(%rdi), %xmm0, %xmm0;
vmovdqa (10 * 16)(%rdi), %xmm1;
cmpl $12, %r9d;
jb .Lcbc_dec_blk1_last;
vaesdec %xmm1, %xmm0, %xmm0;
vaesdec (11 * 16)(%rdi), %xmm0, %xmm0;
vmovdqa (12 * 16)(%rdi), %xmm1;
jz .Lcbc_dec_blk1_last;
vaesdec %xmm1, %xmm0, %xmm0;
vaesdec (13 * 16)(%rdi), %xmm0, %xmm0;
vmovdqa (14 * 16)(%rdi), %xmm1;
/* Last round and output handling. */
.Lcbc_dec_blk1_last:
vpxor %xmm1, %xmm15, %xmm15;
vaesdeclast %xmm15, %xmm0, %xmm0;
vmovdqa %xmm2, %xmm15;
vmovdqu %xmm0, (%rdx);
leaq 16(%rdx), %rdx;
jmp .Lcbc_dec_blk1;
.align 8
.Ldone_cbc_dec:
/* Store IV. */
vmovdqu %xmm15, (%rsi);
vzeroall;
ret_spec_stop
CFI_ENDPROC();
ELF(.size _gcry_vaes_avx2_cbc_dec_amd64,.-_gcry_vaes_avx2_cbc_dec_amd64)
/**********************************************************************
CFB-mode decryption
**********************************************************************/
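/* Note: CFB decryption runs the block cipher in the forward direction:
 * P[i] = AESENC_K(C[i-1]) ^ C[i], with C[-1] being the IV. Hence
 * vaesenc/vaesenclast are used below even though this is the decrypt path. */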
ELF(.type _gcry_vaes_avx2_cfb_dec_amd64,@function)
.globl _gcry_vaes_avx2_cfb_dec_amd64
.align 16
_gcry_vaes_avx2_cfb_dec_amd64:
/* input:
* %rdi: round keys
* %rsi: iv
* %rdx: dst
* %rcx: src
* %r8: nblocks
* %r9: nrounds
*/
CFI_STARTPROC();
/* Load IV. */
vmovdqu (%rsi), %xmm15;
/* Process 16 blocks per loop. */
.align 8
.Lcfb_dec_blk16:
cmpq $16, %r8;
jb .Lcfb_dec_blk8;
leaq -16(%r8), %r8;
/* Load input and xor first key. Update IV. */
vbroadcasti128 (0 * 16)(%rdi), %ymm8;
vinserti128 $1, (0 * 16)(%rcx), %ymm15, %ymm0;
vmovdqu (1 * 16)(%rcx), %ymm1;
vmovdqu (3 * 16)(%rcx), %ymm2;
vmovdqu (5 * 16)(%rcx), %ymm3;
vmovdqu (7 * 16)(%rcx), %ymm4;
vmovdqu (9 * 16)(%rcx), %ymm5;
vmovdqu (11 * 16)(%rcx), %ymm6;
vmovdqu (13 * 16)(%rcx), %ymm7;
vmovdqu (15 * 16)(%rcx), %xmm15;
vpxor %ymm8, %ymm0, %ymm0;
vpxor %ymm8, %ymm1, %ymm1;
vpxor %ymm8, %ymm2, %ymm2;
vpxor %ymm8, %ymm3, %ymm3;
vpxor %ymm8, %ymm4, %ymm4;
vpxor %ymm8, %ymm5, %ymm5;
vpxor %ymm8, %ymm6, %ymm6;
vpxor %ymm8, %ymm7, %ymm7;
vbroadcasti128 (1 * 16)(%rdi), %ymm8;
vmovdqu (0 * 16)(%rcx), %ymm9;
vmovdqu (2 * 16)(%rcx), %ymm10;
vmovdqu (4 * 16)(%rcx), %ymm11;
vmovdqu (6 * 16)(%rcx), %ymm12;
vmovdqu (8 * 16)(%rcx), %ymm13;
vmovdqu (10 * 16)(%rcx), %ymm14;
leaq (16 * 16)(%rcx), %rcx;
/* AES rounds */
VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (2 * 16)(%rdi), %ymm8;
VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (3 * 16)(%rdi), %ymm8;
VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (4 * 16)(%rdi), %ymm8;
VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (5 * 16)(%rdi), %ymm8;
VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (6 * 16)(%rdi), %ymm8;
VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (7 * 16)(%rdi), %ymm8;
VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (8 * 16)(%rdi), %ymm8;
VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (9 * 16)(%rdi), %ymm8;
VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (10 * 16)(%rdi), %ymm8;
cmpl $12, %r9d;
jb .Lcfb_dec_blk16_last;
VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (11 * 16)(%rdi), %ymm8;
VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (12 * 16)(%rdi), %ymm8;
jz .Lcfb_dec_blk16_last;
VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (13 * 16)(%rdi), %ymm8;
VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (14 * 16)(%rdi), %ymm8;
/* Last round and output handling. */
.Lcfb_dec_blk16_last:
vpxor %ymm8, %ymm9, %ymm9;
vpxor %ymm8, %ymm10, %ymm10;
vpxor %ymm8, %ymm11, %ymm11;
vpxor %ymm8, %ymm12, %ymm12;
vpxor %ymm8, %ymm13, %ymm13;
vpxor %ymm8, %ymm14, %ymm14;
vaesenclast %ymm9, %ymm0, %ymm0;
vaesenclast %ymm10, %ymm1, %ymm1;
vpxor (-4 * 16)(%rcx), %ymm8, %ymm9;
vpxor (-2 * 16)(%rcx), %ymm8, %ymm10;
vaesenclast %ymm11, %ymm2, %ymm2;
vaesenclast %ymm12, %ymm3, %ymm3;
vaesenclast %ymm13, %ymm4, %ymm4;
vaesenclast %ymm14, %ymm5, %ymm5;
vaesenclast %ymm9, %ymm6, %ymm6;
vaesenclast %ymm10, %ymm7, %ymm7;
vmovdqu %ymm0, (0 * 16)(%rdx);
vmovdqu %ymm1, (2 * 16)(%rdx);
vmovdqu %ymm2, (4 * 16)(%rdx);
vmovdqu %ymm3, (6 * 16)(%rdx);
vmovdqu %ymm4, (8 * 16)(%rdx);
vmovdqu %ymm5, (10 * 16)(%rdx);
vmovdqu %ymm6, (12 * 16)(%rdx);
vmovdqu %ymm7, (14 * 16)(%rdx);
leaq (16 * 16)(%rdx), %rdx;
jmp .Lcfb_dec_blk16;
/* Handle trailing eight blocks. */
.align 8
.Lcfb_dec_blk8:
cmpq $8, %r8;
jb .Lcfb_dec_blk4;
leaq -8(%r8), %r8;
/* Load input and xor first key. Update IV. */
vbroadcasti128 (0 * 16)(%rdi), %ymm4;
vinserti128 $1, (0 * 16)(%rcx), %ymm15, %ymm0;
vmovdqu (1 * 16)(%rcx), %ymm1;
vmovdqu (3 * 16)(%rcx), %ymm2;
vmovdqu (5 * 16)(%rcx), %ymm3;
vmovdqu (7 * 16)(%rcx), %xmm15;
vpxor %ymm4, %ymm0, %ymm0;
vpxor %ymm4, %ymm1, %ymm1;
vpxor %ymm4, %ymm2, %ymm2;
vpxor %ymm4, %ymm3, %ymm3;
vbroadcasti128 (1 * 16)(%rdi), %ymm4;
vmovdqu (0 * 16)(%rcx), %ymm10;
vmovdqu (2 * 16)(%rcx), %ymm11;
vmovdqu (4 * 16)(%rcx), %ymm12;
vmovdqu (6 * 16)(%rcx), %ymm13;
leaq (8 * 16)(%rcx), %rcx;
/* AES rounds */
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (2 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (3 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (4 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (5 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (6 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (7 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (8 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (9 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (10 * 16)(%rdi), %ymm4;
cmpl $12, %r9d;
jb .Lcfb_dec_blk8_last;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (11 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (12 * 16)(%rdi), %ymm4;
jz .Lcfb_dec_blk8_last;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (13 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (14 * 16)(%rdi), %ymm4;
/* Last round and output handling. */
.Lcfb_dec_blk8_last:
vpxor %ymm4, %ymm10, %ymm10;
vpxor %ymm4, %ymm11, %ymm11;
vpxor %ymm4, %ymm12, %ymm12;
vpxor %ymm4, %ymm13, %ymm13;
vaesenclast %ymm10, %ymm0, %ymm0;
vaesenclast %ymm11, %ymm1, %ymm1;
vaesenclast %ymm12, %ymm2, %ymm2;
vaesenclast %ymm13, %ymm3, %ymm3;
vmovdqu %ymm0, (0 * 16)(%rdx);
vmovdqu %ymm1, (2 * 16)(%rdx);
vmovdqu %ymm2, (4 * 16)(%rdx);
vmovdqu %ymm3, (6 * 16)(%rdx);
leaq (8 * 16)(%rdx), %rdx;
/* Handle trailing four blocks. */
.align 8
.Lcfb_dec_blk4:
cmpq $4, %r8;
jb .Lcfb_dec_blk1;
leaq -4(%r8), %r8;
/* Load input and xor first key. Update IV. */
vbroadcasti128 (0 * 16)(%rdi), %ymm4;
vinserti128 $1, (0 * 16)(%rcx), %ymm15, %ymm0;
vmovdqu (1 * 16)(%rcx), %ymm1;
vmovdqu (3 * 16)(%rcx), %xmm15;
vpxor %ymm4, %ymm0, %ymm0;
vpxor %ymm4, %ymm1, %ymm1;
vbroadcasti128 (1 * 16)(%rdi), %ymm4;
vmovdqu (0 * 16)(%rcx), %ymm10;
vmovdqu (2 * 16)(%rcx), %ymm11;
leaq (4 * 16)(%rcx), %rcx;
/* AES rounds */
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (2 * 16)(%rdi), %ymm4;
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (3 * 16)(%rdi), %ymm4;
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (4 * 16)(%rdi), %ymm4;
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (5 * 16)(%rdi), %ymm4;
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (6 * 16)(%rdi), %ymm4;
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (7 * 16)(%rdi), %ymm4;
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (8 * 16)(%rdi), %ymm4;
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (9 * 16)(%rdi), %ymm4;
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (10 * 16)(%rdi), %ymm4;
cmpl $12, %r9d;
jb .Lcfb_dec_blk4_last;
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (11 * 16)(%rdi), %ymm4;
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (12 * 16)(%rdi), %ymm4;
jz .Lcfb_dec_blk4_last;
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (13 * 16)(%rdi), %ymm4;
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (14 * 16)(%rdi), %ymm4;
/* Last round and output handling. */
.Lcfb_dec_blk4_last:
vpxor %ymm4, %ymm10, %ymm10;
vpxor %ymm4, %ymm11, %ymm11;
vaesenclast %ymm10, %ymm0, %ymm0;
vaesenclast %ymm11, %ymm1, %ymm1;
vmovdqu %ymm0, (0 * 16)(%rdx);
vmovdqu %ymm1, (2 * 16)(%rdx);
leaq (4 * 16)(%rdx), %rdx;
/* Process trailing one to three blocks, one per loop. */
.align 8
.Lcfb_dec_blk1:
cmpq $1, %r8;
jb .Ldone_cfb_dec;
leaq -1(%r8), %r8;
/* Xor first key. */
vpxor (0 * 16)(%rdi), %xmm15, %xmm0;
/* Load input as next IV. */
vmovdqu (%rcx), %xmm15;
leaq 16(%rcx), %rcx;
/* AES rounds. */
vaesenc (1 * 16)(%rdi), %xmm0, %xmm0;
vaesenc (2 * 16)(%rdi), %xmm0, %xmm0;
vaesenc (3 * 16)(%rdi), %xmm0, %xmm0;
vaesenc (4 * 16)(%rdi), %xmm0, %xmm0;
vaesenc (5 * 16)(%rdi), %xmm0, %xmm0;
vaesenc (6 * 16)(%rdi), %xmm0, %xmm0;
vaesenc (7 * 16)(%rdi), %xmm0, %xmm0;
vaesenc (8 * 16)(%rdi), %xmm0, %xmm0;
vaesenc (9 * 16)(%rdi), %xmm0, %xmm0;
vmovdqa (10 * 16)(%rdi), %xmm1;
cmpl $12, %r9d;
jb .Lcfb_dec_blk1_last;
vaesenc %xmm1, %xmm0, %xmm0;
vaesenc (11 * 16)(%rdi), %xmm0, %xmm0;
vmovdqa (12 * 16)(%rdi), %xmm1;
jz .Lcfb_dec_blk1_last;
vaesenc %xmm1, %xmm0, %xmm0;
vaesenc (13 * 16)(%rdi), %xmm0, %xmm0;
vmovdqa (14 * 16)(%rdi), %xmm1;
/* Last round and output handling. */
.Lcfb_dec_blk1_last:
vpxor %xmm15, %xmm1, %xmm1;
vaesenclast %xmm1, %xmm0, %xmm0;
vmovdqu %xmm0, (%rdx);
leaq 16(%rdx), %rdx;
jmp .Lcfb_dec_blk1;
.align 8
.Ldone_cfb_dec:
/* Store IV. */
vmovdqu %xmm15, (%rsi);
vzeroall;
ret_spec_stop
CFI_ENDPROC();
ELF(.size _gcry_vaes_avx2_cfb_dec_amd64,.-_gcry_vaes_avx2_cfb_dec_amd64)
/**********************************************************************
CTR-mode encryption
**********************************************************************/
ELF(.type _gcry_vaes_avx2_ctr_enc_amd64,@function)
.globl _gcry_vaes_avx2_ctr_enc_amd64
.align 16
_gcry_vaes_avx2_ctr_enc_amd64:
/* input:
* %rdi: round keys
* %rsi: counter
* %rdx: dst
* %rcx: src
* %r8: nblocks
* %r9: nrounds
*/
CFI_STARTPROC();
movq 8(%rsi), %r10;
movq 0(%rsi), %r11;
bswapq %r10;
bswapq %r11;
vpcmpeqd %ymm15, %ymm15, %ymm15;
vpsrldq $8, %ymm15, %ymm15; // 0:-1
vpaddq %ymm15, %ymm15, %ymm14; // 0:-2
vbroadcasti128 .Lbswap128_mask rRIP, %ymm13;
#define inc_le128(x, minus_one, tmp) \
vpcmpeqq minus_one, x, tmp; \
vpsubq minus_one, x, x; \
vpslldq $8, tmp, tmp; \
vpsubq tmp, x, x;
#define add2_le128(x, minus_one, minus_two, tmp1, tmp2) \
vpcmpeqq minus_one, x, tmp1; \
vpcmpeqq minus_two, x, tmp2; \
vpor tmp1, tmp2, tmp2; \
vpsubq minus_two, x, x; \
vpslldq $8, tmp2, tmp2; \
vpsubq tmp2, x, x;
#define handle_ctr_128bit_add(nblks) \
addq $(nblks), %r10; \
adcq $0, %r11; \
bswapq %r10; \
bswapq %r11; \
movq %r10, 8(%rsi); \
movq %r11, 0(%rsi); \
bswapq %r10; \
bswapq %r11;
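/* Note on the macros above: the counter is kept little-endian inside the
 * vector registers. vpcmpeqq against the 0:-1 (or 0:-2) constant produces an
 * all-ones mask in the low qword exactly when adding 1 (or 2) will overflow
 * it; vpslldq moves that mask into the high-qword position and the final
 * vpsubq subtracts -1 from the high qword, i.e. propagates the carry.
 * handle_ctr_128bit_add() performs the same addition on the native-order
 * copy kept in %r11:%r10 and writes the updated big-endian counter back to
 * the counter buffer. */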
/* Process 16 blocks per loop. */
.align 8
.Lctr_enc_blk16:
cmpq $16, %r8;
jb .Lctr_enc_blk8;
leaq -16(%r8), %r8;
vbroadcasti128 (%rsi), %ymm7;
vbroadcasti128 (0 * 16)(%rdi), %ymm8;
/* detect if carry handling is needed */
addb $16, 15(%rsi);
jc .Lctr_enc_blk16_handle_carry;
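/* Note: 15(%rsi) is the least-significant byte of the big-endian counter.
 * If adding the block count to that byte does not set the carry flag, none
 * of the higher counter bytes change, so the cheap vpaddb path below can
 * derive all counter values with byte additions; the following leaq keeps
 * the native-order low counter half in %r10 in sync. Otherwise the jc above
 * branches to the full 128-bit add in .Lctr_enc_blk16_handle_carry. */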
leaq 16(%r10), %r10;
.Lctr_enc_blk16_byte_bige_add:
/* Increment counters. */
vpaddb .Lbige_addb_0 rRIP, %ymm7, %ymm0;
vpaddb .Lbige_addb_2 rRIP, %ymm7, %ymm1;
vpaddb .Lbige_addb_4 rRIP, %ymm7, %ymm2;
vpaddb .Lbige_addb_6 rRIP, %ymm7, %ymm3;
vpaddb .Lbige_addb_8 rRIP, %ymm7, %ymm4;
vpaddb .Lbige_addb_10 rRIP, %ymm7, %ymm5;
vpaddb .Lbige_addb_12 rRIP, %ymm7, %ymm6;
vpaddb .Lbige_addb_14 rRIP, %ymm7, %ymm7;
.Lctr_enc_blk16_rounds:
/* AES rounds */
XOR8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (1 * 16)(%rdi), %ymm8;
VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (2 * 16)(%rdi), %ymm8;
VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (3 * 16)(%rdi), %ymm8;
VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (4 * 16)(%rdi), %ymm8;
VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (5 * 16)(%rdi), %ymm8;
VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (6 * 16)(%rdi), %ymm8;
VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (7 * 16)(%rdi), %ymm8;
VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (8 * 16)(%rdi), %ymm8;
VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (9 * 16)(%rdi), %ymm8;
VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (10 * 16)(%rdi), %ymm8;
cmpl $12, %r9d;
jb .Lctr_enc_blk16_last;
VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (11 * 16)(%rdi), %ymm8;
VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (12 * 16)(%rdi), %ymm8;
jz .Lctr_enc_blk16_last;
VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (13 * 16)(%rdi), %ymm8;
VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (14 * 16)(%rdi), %ymm8;
/* Last round and output handling. */
.Lctr_enc_blk16_last:
vpxor (0 * 16)(%rcx), %ymm8, %ymm9; /* Xor src to last round key. */
vpxor (2 * 16)(%rcx), %ymm8, %ymm10;
vpxor (4 * 16)(%rcx), %ymm8, %ymm11;
vpxor (6 * 16)(%rcx), %ymm8, %ymm12;
vaesenclast %ymm9, %ymm0, %ymm0;
vaesenclast %ymm10, %ymm1, %ymm1;
vaesenclast %ymm11, %ymm2, %ymm2;
vaesenclast %ymm12, %ymm3, %ymm3;
vpxor (8 * 16)(%rcx), %ymm8, %ymm9;
vpxor (10 * 16)(%rcx), %ymm8, %ymm10;
vpxor (12 * 16)(%rcx), %ymm8, %ymm11;
vpxor (14 * 16)(%rcx), %ymm8, %ymm8;
leaq (16 * 16)(%rcx), %rcx;
vaesenclast %ymm9, %ymm4, %ymm4;
vaesenclast %ymm10, %ymm5, %ymm5;
vaesenclast %ymm11, %ymm6, %ymm6;
vaesenclast %ymm8, %ymm7, %ymm7;
vmovdqu %ymm0, (0 * 16)(%rdx);
vmovdqu %ymm1, (2 * 16)(%rdx);
vmovdqu %ymm2, (4 * 16)(%rdx);
vmovdqu %ymm3, (6 * 16)(%rdx);
vmovdqu %ymm4, (8 * 16)(%rdx);
vmovdqu %ymm5, (10 * 16)(%rdx);
vmovdqu %ymm6, (12 * 16)(%rdx);
vmovdqu %ymm7, (14 * 16)(%rdx);
leaq (16 * 16)(%rdx), %rdx;
jmp .Lctr_enc_blk16;
.align 8
.Lctr_enc_blk16_handle_only_ctr_carry:
handle_ctr_128bit_add(16);
jmp .Lctr_enc_blk16_byte_bige_add;
.align 8
.Lctr_enc_blk16_handle_carry:
jz .Lctr_enc_blk16_handle_only_ctr_carry;
/* Increment counters (handle carry). */
vpshufb %xmm13, %xmm7, %xmm1; /* be => le */
vmovdqa %xmm1, %xmm0;
inc_le128(%xmm1, %xmm15, %xmm5);
vinserti128 $1, %xmm1, %ymm0, %ymm7; /* ctr: +1:+0 */
vpshufb %ymm13, %ymm7, %ymm0;
handle_ctr_128bit_add(16);
add2_le128(%ymm7, %ymm15, %ymm14, %ymm9, %ymm10); /* ctr: +3:+2 */
vpshufb %ymm13, %ymm7, %ymm1;
add2_le128(%ymm7, %ymm15, %ymm14, %ymm9, %ymm10); /* ctr: +5:+4 */
vpshufb %ymm13, %ymm7, %ymm2;
add2_le128(%ymm7, %ymm15, %ymm14, %ymm9, %ymm10); /* ctr: +7:+6 */
vpshufb %ymm13, %ymm7, %ymm3;
add2_le128(%ymm7, %ymm15, %ymm14, %ymm9, %ymm10); /* ctr: +9:+8 */
vpshufb %ymm13, %ymm7, %ymm4;
add2_le128(%ymm7, %ymm15, %ymm14, %ymm9, %ymm10); /* ctr: +11:+10 */
vpshufb %ymm13, %ymm7, %ymm5;
add2_le128(%ymm7, %ymm15, %ymm14, %ymm9, %ymm10); /* ctr: +13:+12 */
vpshufb %ymm13, %ymm7, %ymm6;
add2_le128(%ymm7, %ymm15, %ymm14, %ymm9, %ymm10); /* ctr: +15:+14 */
vpshufb %ymm13, %ymm7, %ymm7;
jmp .Lctr_enc_blk16_rounds;
/* Handle trailing eight blocks. */
.align 8
.Lctr_enc_blk8:
cmpq $8, %r8;
jb .Lctr_enc_blk4;
leaq -8(%r8), %r8;
vbroadcasti128 (%rsi), %ymm3;
vbroadcasti128 (0 * 16)(%rdi), %ymm4;
/* detect if carry handling is needed */
addb $8, 15(%rsi);
jc .Lctr_enc_blk8_handle_carry;
leaq 8(%r10), %r10;
.Lctr_enc_blk8_byte_bige_add:
/* Increment counters. */
vpaddb .Lbige_addb_0 rRIP, %ymm3, %ymm0;
vpaddb .Lbige_addb_2 rRIP, %ymm3, %ymm1;
vpaddb .Lbige_addb_4 rRIP, %ymm3, %ymm2;
vpaddb .Lbige_addb_6 rRIP, %ymm3, %ymm3;
.Lctr_enc_blk8_rounds:
/* AES rounds */
XOR4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (1 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (2 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (3 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (4 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (5 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (6 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (7 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (8 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (9 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (10 * 16)(%rdi), %ymm4;
cmpl $12, %r9d;
jb .Lctr_enc_blk8_last;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (11 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (12 * 16)(%rdi), %ymm4;
jz .Lctr_enc_blk8_last;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (13 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (14 * 16)(%rdi), %ymm4;
/* Last round and output handling. */
.Lctr_enc_blk8_last:
vpxor (0 * 16)(%rcx), %ymm4, %ymm5; /* Xor src to last round key. */
vpxor (2 * 16)(%rcx), %ymm4, %ymm6;
vpxor (4 * 16)(%rcx), %ymm4, %ymm7;
vpxor (6 * 16)(%rcx), %ymm4, %ymm4;
leaq (8 * 16)(%rcx), %rcx;
vaesenclast %ymm5, %ymm0, %ymm0;
vaesenclast %ymm6, %ymm1, %ymm1;
vaesenclast %ymm7, %ymm2, %ymm2;
vaesenclast %ymm4, %ymm3, %ymm3;
vmovdqu %ymm0, (0 * 16)(%rdx);
vmovdqu %ymm1, (2 * 16)(%rdx);
vmovdqu %ymm2, (4 * 16)(%rdx);
vmovdqu %ymm3, (6 * 16)(%rdx);
leaq (8 * 16)(%rdx), %rdx;
jmp .Lctr_enc_blk4;
.align 8
.Lctr_enc_blk8_handle_only_ctr_carry:
handle_ctr_128bit_add(8);
jmp .Lctr_enc_blk8_byte_bige_add;
.align 8
.Lctr_enc_blk8_handle_carry:
jz .Lctr_enc_blk8_handle_only_ctr_carry;
/* Increment counters (handle carry). */
vpshufb %xmm13, %xmm3, %xmm1; /* be => le */
vmovdqa %xmm1, %xmm0;
inc_le128(%xmm1, %xmm15, %xmm5);
vinserti128 $1, %xmm1, %ymm0, %ymm3; /* ctr: +1:+0 */
vpshufb %ymm13, %ymm3, %ymm0;
handle_ctr_128bit_add(8);
add2_le128(%ymm3, %ymm15, %ymm14, %ymm5, %ymm6); /* ctr: +3:+2 */
vpshufb %ymm13, %ymm3, %ymm1;
add2_le128(%ymm3, %ymm15, %ymm14, %ymm5, %ymm6); /* ctr: +5:+4 */
vpshufb %ymm13, %ymm3, %ymm2;
add2_le128(%ymm3, %ymm15, %ymm14, %ymm5, %ymm6); /* ctr: +7:+6 */
vpshufb %ymm13, %ymm3, %ymm3;
jmp .Lctr_enc_blk8_rounds;
/* Handle trailing four blocks. */
.align 8
.Lctr_enc_blk4:
cmpq $4, %r8;
jb .Lctr_enc_blk1;
leaq -4(%r8), %r8;
vbroadcasti128 (%rsi), %ymm3;
vbroadcasti128 (0 * 16)(%rdi), %ymm4;
/* detect if carry handling is needed */
addb $4, 15(%rsi);
jc .Lctr_enc_blk4_handle_carry;
leaq 4(%r10), %r10;
.Lctr_enc_blk4_byte_bige_add:
/* Increment counters. */
vpaddb .Lbige_addb_0 rRIP, %ymm3, %ymm0;
vpaddb .Lbige_addb_2 rRIP, %ymm3, %ymm1;
.Lctr_enc_blk4_rounds:
/* AES rounds */
XOR2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (1 * 16)(%rdi), %ymm4;
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (2 * 16)(%rdi), %ymm4;
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (3 * 16)(%rdi), %ymm4;
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (4 * 16)(%rdi), %ymm4;
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (5 * 16)(%rdi), %ymm4;
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (6 * 16)(%rdi), %ymm4;
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (7 * 16)(%rdi), %ymm4;
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (8 * 16)(%rdi), %ymm4;
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (9 * 16)(%rdi), %ymm4;
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (10 * 16)(%rdi), %ymm4;
cmpl $12, %r9d;
jb .Lctr_enc_blk4_last;
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (11 * 16)(%rdi), %ymm4;
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (12 * 16)(%rdi), %ymm4;
jz .Lctr_enc_blk4_last;
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (13 * 16)(%rdi), %ymm4;
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (14 * 16)(%rdi), %ymm4;
/* Last round and output handling. */
.Lctr_enc_blk4_last:
vpxor (0 * 16)(%rcx), %ymm4, %ymm5; /* Xor src to last round key. */
vpxor (2 * 16)(%rcx), %ymm4, %ymm6;
leaq (4 * 16)(%rcx), %rcx;
vaesenclast %ymm5, %ymm0, %ymm0;
vaesenclast %ymm6, %ymm1, %ymm1;
vmovdqu %ymm0, (0 * 16)(%rdx);
vmovdqu %ymm1, (2 * 16)(%rdx);
leaq (4 * 16)(%rdx), %rdx;
jmp .Lctr_enc_blk1;
.align 8
.Lctr_enc_blk4_handle_only_ctr_carry:
handle_ctr_128bit_add(4);
jmp .Lctr_enc_blk4_byte_bige_add;
.align 8
.Lctr_enc_blk4_handle_carry:
jz .Lctr_enc_blk4_handle_only_ctr_carry;
/* Increment counters (handle carry). */
vpshufb %xmm13, %xmm3, %xmm1; /* be => le */
vmovdqa %xmm1, %xmm0;
inc_le128(%xmm1, %xmm15, %xmm5);
vinserti128 $1, %xmm1, %ymm0, %ymm3; /* ctr: +1:+0 */
vpshufb %ymm13, %ymm3, %ymm0;
handle_ctr_128bit_add(4);
add2_le128(%ymm3, %ymm15, %ymm14, %ymm5, %ymm6); /* ctr: +3:+2 */
vpshufb %ymm13, %ymm3, %ymm1;
jmp .Lctr_enc_blk4_rounds;
/* Process trailing one to three blocks, one per loop. */
.align 8
.Lctr_enc_blk1:
cmpq $1, %r8;
jb .Ldone_ctr_enc;
leaq -1(%r8), %r8;
/* Load and increment counter. */
vmovdqu (%rsi), %xmm0;
handle_ctr_128bit_add(1);
/* AES rounds. */
vpxor (0 * 16)(%rdi), %xmm0, %xmm0;
vaesenc (1 * 16)(%rdi), %xmm0, %xmm0;
vaesenc (2 * 16)(%rdi), %xmm0, %xmm0;
vaesenc (3 * 16)(%rdi), %xmm0, %xmm0;
vaesenc (4 * 16)(%rdi), %xmm0, %xmm0;
vaesenc (5 * 16)(%rdi), %xmm0, %xmm0;
vaesenc (6 * 16)(%rdi), %xmm0, %xmm0;
vaesenc (7 * 16)(%rdi), %xmm0, %xmm0;
vaesenc (8 * 16)(%rdi), %xmm0, %xmm0;
vaesenc (9 * 16)(%rdi), %xmm0, %xmm0;
vmovdqa (10 * 16)(%rdi), %xmm1;
cmpl $12, %r9d;
jb .Lctr_enc_blk1_last;
vaesenc %xmm1, %xmm0, %xmm0;
vaesenc (11 * 16)(%rdi), %xmm0, %xmm0;
vmovdqa (12 * 16)(%rdi), %xmm1;
jz .Lctr_enc_blk1_last;
vaesenc %xmm1, %xmm0, %xmm0;
vaesenc (13 * 16)(%rdi), %xmm0, %xmm0;
vmovdqa (14 * 16)(%rdi), %xmm1;
/* Last round and output handling. */
.Lctr_enc_blk1_last:
vpxor (%rcx), %xmm1, %xmm1; /* Xor src to last round key. */
leaq 16(%rcx), %rcx;
vaesenclast %xmm1, %xmm0, %xmm0; /* Last round and xor with xmm1. */
vmovdqu %xmm0, (%rdx);
leaq 16(%rdx), %rdx;
jmp .Lctr_enc_blk1;
.align 8
.Ldone_ctr_enc:
vzeroall;
xorl %r10d, %r10d;
xorl %r11d, %r11d;
ret_spec_stop
CFI_ENDPROC();
ELF(.size _gcry_vaes_avx2_ctr_enc_amd64,.-_gcry_vaes_avx2_ctr_enc_amd64)
/**********************************************************************
Little-endian 32-bit CTR-mode encryption (GCM-SIV)
**********************************************************************/
ELF(.type _gcry_vaes_avx2_ctr32le_enc_amd64,@function)
.globl _gcry_vaes_avx2_ctr32le_enc_amd64
.align 16
_gcry_vaes_avx2_ctr32le_enc_amd64:
/* input:
* %rdi: round keys
* %rsi: counter
* %rdx: dst
* %rcx: src
* %r8: nblocks
* %r9: nrounds
*/
CFI_STARTPROC();
vbroadcasti128 (%rsi), %ymm15; // CTR
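/* Note: for GCM-SIV (RFC 8452) only the first 32 bits of the counter block
 * are incremented, as a little-endian value with no carry into the remaining
 * 96 bits; the .Lle_addd_* tables used below presumably hold the per-lane
 * increments in that low dword for vpaddd. */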
/* Process 16 blocks per loop. */
.align 8
.Lctr32le_enc_blk16:
cmpq $16, %r8;
jb .Lctr32le_enc_blk8;
leaq -16(%r8), %r8;
vbroadcasti128 (0 * 16)(%rdi), %ymm8;
/* Increment counters. */
vpaddd .Lle_addd_0 rRIP, %ymm15, %ymm0;
vpaddd .Lle_addd_2 rRIP, %ymm15, %ymm1;
vpaddd .Lle_addd_4 rRIP, %ymm15, %ymm2;
vpaddd .Lle_addd_6 rRIP, %ymm15, %ymm3;
vpaddd .Lle_addd_8 rRIP, %ymm15, %ymm4;
vpaddd .Lle_addd_10 rRIP, %ymm15, %ymm5;
vpaddd .Lle_addd_12 rRIP, %ymm15, %ymm6;
vpaddd .Lle_addd_14 rRIP, %ymm15, %ymm7;
vpaddd .Lle_addd_16_2 rRIP, %ymm15, %ymm15;
/* AES rounds */
XOR8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (1 * 16)(%rdi), %ymm8;
VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (2 * 16)(%rdi), %ymm8;
VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (3 * 16)(%rdi), %ymm8;
VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (4 * 16)(%rdi), %ymm8;
VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (5 * 16)(%rdi), %ymm8;
VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (6 * 16)(%rdi), %ymm8;
VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (7 * 16)(%rdi), %ymm8;
VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (8 * 16)(%rdi), %ymm8;
VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (9 * 16)(%rdi), %ymm8;
VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (10 * 16)(%rdi), %ymm8;
cmpl $12, %r9d;
jb .Lctr32le_enc_blk16_last;
VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (11 * 16)(%rdi), %ymm8;
VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (12 * 16)(%rdi), %ymm8;
jz .Lctr32le_enc_blk16_last;
VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (13 * 16)(%rdi), %ymm8;
VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (14 * 16)(%rdi), %ymm8;
/* Last round and output handling. */
.Lctr32le_enc_blk16_last:
vpxor (0 * 16)(%rcx), %ymm8, %ymm9; /* Xor src to last round key. */
vpxor (2 * 16)(%rcx), %ymm8, %ymm10;
vpxor (4 * 16)(%rcx), %ymm8, %ymm11;
vpxor (6 * 16)(%rcx), %ymm8, %ymm12;
vaesenclast %ymm9, %ymm0, %ymm0;
vaesenclast %ymm10, %ymm1, %ymm1;
vaesenclast %ymm11, %ymm2, %ymm2;
vaesenclast %ymm12, %ymm3, %ymm3;
vpxor (8 * 16)(%rcx), %ymm8, %ymm9;
vpxor (10 * 16)(%rcx), %ymm8, %ymm10;
vpxor (12 * 16)(%rcx), %ymm8, %ymm11;
vpxor (14 * 16)(%rcx), %ymm8, %ymm8;
leaq (16 * 16)(%rcx), %rcx;
vaesenclast %ymm9, %ymm4, %ymm4;
vaesenclast %ymm10, %ymm5, %ymm5;
vaesenclast %ymm11, %ymm6, %ymm6;
vaesenclast %ymm8, %ymm7, %ymm7;
vmovdqu %ymm0, (0 * 16)(%rdx);
vmovdqu %ymm1, (2 * 16)(%rdx);
vmovdqu %ymm2, (4 * 16)(%rdx);
vmovdqu %ymm3, (6 * 16)(%rdx);
vmovdqu %ymm4, (8 * 16)(%rdx);
vmovdqu %ymm5, (10 * 16)(%rdx);
vmovdqu %ymm6, (12 * 16)(%rdx);
vmovdqu %ymm7, (14 * 16)(%rdx);
leaq (16 * 16)(%rdx), %rdx;
jmp .Lctr32le_enc_blk16;
/* Handle trailing eight blocks. */
.align 8
.Lctr32le_enc_blk8:
cmpq $8, %r8;
jb .Lctr32le_enc_blk4;
leaq -8(%r8), %r8;
vbroadcasti128 (0 * 16)(%rdi), %ymm4;
/* Increment counters. */
vpaddd .Lle_addd_0 rRIP, %ymm15, %ymm0;
vpaddd .Lle_addd_2 rRIP, %ymm15, %ymm1;
vpaddd .Lle_addd_4 rRIP, %ymm15, %ymm2;
vpaddd .Lle_addd_6 rRIP, %ymm15, %ymm3;
vpaddd .Lle_addd_8_2 rRIP, %ymm15, %ymm15;
/* AES rounds */
XOR4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (1 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (2 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (3 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (4 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (5 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (6 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (7 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (8 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (9 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (10 * 16)(%rdi), %ymm4;
cmpl $12, %r9d;
jb .Lctr32le_enc_blk8_last;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (11 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (12 * 16)(%rdi), %ymm4;
jz .Lctr32le_enc_blk8_last;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (13 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (14 * 16)(%rdi), %ymm4;
/* Last round and output handling. */
.Lctr32le_enc_blk8_last:
vpxor (0 * 16)(%rcx), %ymm4, %ymm5; /* Xor src to last round key. */
vpxor (2 * 16)(%rcx), %ymm4, %ymm6;
vpxor (4 * 16)(%rcx), %ymm4, %ymm7;
vpxor (6 * 16)(%rcx), %ymm4, %ymm4;
leaq (8 * 16)(%rcx), %rcx;
vaesenclast %ymm5, %ymm0, %ymm0;
vaesenclast %ymm6, %ymm1, %ymm1;
vaesenclast %ymm7, %ymm2, %ymm2;
vaesenclast %ymm4, %ymm3, %ymm3;
vmovdqu %ymm0, (0 * 16)(%rdx);
vmovdqu %ymm1, (2 * 16)(%rdx);
vmovdqu %ymm2, (4 * 16)(%rdx);
vmovdqu %ymm3, (6 * 16)(%rdx);
leaq (8 * 16)(%rdx), %rdx;
/* Handle trailing four blocks. */
.align 8
.Lctr32le_enc_blk4:
cmpq $4, %r8;
jb .Lctr32le_enc_blk1;
leaq -4(%r8), %r8;
vbroadcasti128 (0 * 16)(%rdi), %ymm4;
/* Increment counters. */
vpaddd .Lle_addd_0 rRIP, %ymm15, %ymm0;
vpaddd .Lle_addd_2 rRIP, %ymm15, %ymm1;
vpaddd .Lle_addd_4_2 rRIP, %ymm15, %ymm15;
/* AES rounds */
XOR2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (1 * 16)(%rdi), %ymm4;
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (2 * 16)(%rdi), %ymm4;
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (3 * 16)(%rdi), %ymm4;
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (4 * 16)(%rdi), %ymm4;
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (5 * 16)(%rdi), %ymm4;
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (6 * 16)(%rdi), %ymm4;
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (7 * 16)(%rdi), %ymm4;
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (8 * 16)(%rdi), %ymm4;
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (9 * 16)(%rdi), %ymm4;
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (10 * 16)(%rdi), %ymm4;
cmpl $12, %r9d;
jb .Lctr32le_enc_blk4_last;
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (11 * 16)(%rdi), %ymm4;
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (12 * 16)(%rdi), %ymm4;
jz .Lctr32le_enc_blk4_last;
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (13 * 16)(%rdi), %ymm4;
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (14 * 16)(%rdi), %ymm4;
/* Last round and output handling. */
.Lctr32le_enc_blk4_last:
vpxor (0 * 16)(%rcx), %ymm4, %ymm5; /* Xor src to last round key. */
vpxor (2 * 16)(%rcx), %ymm4, %ymm6;
leaq (4 * 16)(%rcx), %rcx;
vaesenclast %ymm5, %ymm0, %ymm0;
vaesenclast %ymm6, %ymm1, %ymm1;
vmovdqu %ymm0, (0 * 16)(%rdx);
vmovdqu %ymm1, (2 * 16)(%rdx);
leaq (4 * 16)(%rdx), %rdx;
/* Process trailing one to three blocks, one per loop. */
.align 8
.Lctr32le_enc_blk1:
cmpq $1, %r8;
jb .Ldone_ctr32le_enc;
leaq -1(%r8), %r8;
/* Load and increment counter. */
vmovdqu %xmm15, %xmm0;
vpaddd .Lle_addd_1 rRIP, %xmm15, %xmm15;
/* AES rounds. */
vpxor (0 * 16)(%rdi), %xmm0, %xmm0;
vaesenc (1 * 16)(%rdi), %xmm0, %xmm0;
vaesenc (2 * 16)(%rdi), %xmm0, %xmm0;
vaesenc (3 * 16)(%rdi), %xmm0, %xmm0;
vaesenc (4 * 16)(%rdi), %xmm0, %xmm0;
vaesenc (5 * 16)(%rdi), %xmm0, %xmm0;
vaesenc (6 * 16)(%rdi), %xmm0, %xmm0;
vaesenc (7 * 16)(%rdi), %xmm0, %xmm0;
vaesenc (8 * 16)(%rdi), %xmm0, %xmm0;
vaesenc (9 * 16)(%rdi), %xmm0, %xmm0;
vmovdqa (10 * 16)(%rdi), %xmm1;
cmpl $12, %r9d;
jb .Lctr32le_enc_blk1_last;
vaesenc %xmm1, %xmm0, %xmm0;
vaesenc (11 * 16)(%rdi), %xmm0, %xmm0;
vmovdqa (12 * 16)(%rdi), %xmm1;
jz .Lctr32le_enc_blk1_last;
vaesenc %xmm1, %xmm0, %xmm0;
vaesenc (13 * 16)(%rdi), %xmm0, %xmm0;
vmovdqa (14 * 16)(%rdi), %xmm1;
/* Last round and output handling. */
.Lctr32le_enc_blk1_last:
vpxor (%rcx), %xmm1, %xmm1; /* Xor src to last round key. */
leaq 16(%rcx), %rcx;
vaesenclast %xmm1, %xmm0, %xmm0; /* Last round and xor with xmm1. */
vmovdqu %xmm0, (%rdx);
leaq 16(%rdx), %rdx;
jmp .Lctr32le_enc_blk1;
.align 8
.Ldone_ctr32le_enc:
vmovdqu %xmm15, (%rsi);
vzeroall;
ret_spec_stop
CFI_ENDPROC();
ELF(.size _gcry_vaes_avx2_ctr32le_enc_amd64,.-_gcry_vaes_avx2_ctr32le_enc_amd64)
/**********************************************************************
OCB-mode encryption/decryption
**********************************************************************/
ELF(.type _gcry_vaes_avx2_ocb_crypt_amd64,@function)
.globl _gcry_vaes_avx2_ocb_crypt_amd64
.align 16
_gcry_vaes_avx2_ocb_crypt_amd64:
/* input:
* %rdi: round keys
* %esi: nblk
* %rdx: dst
* %rcx: src
* %r8: nblocks
* %r9: nrounds
* 16(%rbp): offset
* 24(%rbp): checksum
* 32(%rbp): L-array
* 40(%rbp): encrypt (%r15d)
*/
CFI_STARTPROC();
#define STACK_REGS_POS (16 * 16 + 4 * 16 + 2 * 16)
#define STACK_ALLOC (STACK_REGS_POS + 5 * 8)
#define OFFSET_PTR_Q 16(%rbp)
#define CHECKSUM_PTR_Q 24(%rbp)
#define L_ARRAY_PTR_L 32(%rbp)
#define ENCRYPT_FLAG_L 40(%rbp)
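/* Note: only six integer arguments fit in registers under the SysV AMD64
 * ABI, so the offset, checksum, L-array and encrypt-flag arguments arrive on
 * the caller's stack and are reached through %rbp after the prologue below. */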
pushq %rbp;
CFI_PUSH(%rbp);
movq %rsp, %rbp;
CFI_DEF_CFA_REGISTER(%rbp);
subq $STACK_ALLOC, %rsp;
andq $~63, %rsp;
movq %r12, (STACK_REGS_POS + 0 * 8)(%rsp);
CFI_REG_ON_STACK(r12, STACK_REGS_POS + 0 * 8);
movq %r13, (STACK_REGS_POS + 1 * 8)(%rsp);
CFI_REG_ON_STACK(r13, STACK_REGS_POS + 1 * 8);
movq %r14, (STACK_REGS_POS + 2 * 8)(%rsp);
CFI_REG_ON_STACK(r14, STACK_REGS_POS + 2 * 8);
movq %r15, (STACK_REGS_POS + 3 * 8)(%rsp);
CFI_REG_ON_STACK(r15, STACK_REGS_POS + 3 * 8);
movq %rbx, (STACK_REGS_POS + 4 * 8)(%rsp);
CFI_REG_ON_STACK(rbx, STACK_REGS_POS + 4 * 8);
movl ENCRYPT_FLAG_L, %r15d; /* encrypt-flag. */
movq OFFSET_PTR_Q, %r14; /* offset ptr. */
movq CHECKSUM_PTR_Q, %rbx; /* checksum ptr. */
leal (, %r9d, 4), %eax;
vmovdqu (%r14), %xmm15; /* Load offset. */
movq L_ARRAY_PTR_L, %r14; /* L-array ptr. */
vmovdqa (0 * 16)(%rdi), %xmm0; /* first key */
vpxor %xmm14, %xmm14, %xmm14;
vpxor %xmm13, %xmm13, %xmm13;
vpxor (%rdi, %rax, 4), %xmm0, %xmm0; /* first key ^ last key */
vpxor (0 * 16)(%rdi), %xmm15, %xmm15; /* offset ^ first key */
vmovdqa %xmm0, (14 * 16)(%rsp);
vmovdqa %xmm0, (15 * 16)(%rsp);
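/* Note: OCB computes C[i] = Offset_i ^ E_K(P[i] ^ Offset_i). Keeping the
 * running offset pre-xored with the first round key, and stashing
 * "first key ^ last key" on the stack (duplicated so it can be loaded as a
 * 256-bit value), folds the pre- and post-whitening with the offset into the
 * first and last round-key additions that happen anyway. */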
.align 8
.Lhandle_unaligned_ocb:
/* Get number of blocks needed to align nblk to 16 (for the L-array optimization). */
movl %esi, %r10d;
negl %r10d;
andl $15, %r10d;
cmpq %r8, %r10;
cmovaq %r8, %r10;
cmpq $1, %r10;
jb .Lunaligned_ocb_done;
/* Number of blocks after alignment. */
movq %r8, %r11;
subq %r10, %r11;
/* If the number of blocks remaining after alignment is less than 16, skip
* aligned handling completely. */
cmp $16, %r11;
cmovbq %r8, %r10;
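/* At this point %r10 = min(nblocks, (-nblk) mod 16), i.e. the number of
 * blocks to process the slow way until nblk is a multiple of 16; if fewer
 * than 16 aligned blocks would remain afterwards, %r10 is bumped to cover
 * everything so the aligned path is skipped entirely. */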
/* Unaligned: Process eight blocks per loop. */
.align 8
.Locb_unaligned_blk8:
cmpq $8, %r10;
jb .Locb_unaligned_blk4;
leaq -8(%r8), %r8;
leaq -8(%r10), %r10;
leal 1(%esi), %r11d;
leal 2(%esi), %r12d;
leal 3(%esi), %r13d;
leal 4(%esi), %eax;
tzcntl %r11d, %r11d;
tzcntl %r12d, %r12d;
tzcntl %r13d, %r13d;
tzcntl %eax, %eax;
shll $4, %r11d;
shll $4, %r12d;
shll $4, %r13d;
shll $4, %eax;
vpxor (%r14, %r11), %xmm15, %xmm5;
vpxor (%r14, %r12), %xmm5, %xmm6;
vpxor (%r14, %r13), %xmm6, %xmm7;
vpxor (%r14, %rax), %xmm7, %xmm8;
leal 5(%esi), %r11d;
leal 6(%esi), %r12d;
leal 7(%esi), %r13d;
leal 8(%esi), %esi;
tzcntl %r11d, %r11d;
tzcntl %r12d, %r12d;
tzcntl %r13d, %r13d;
tzcntl %esi, %eax;
shll $4, %r11d;
shll $4, %r12d;
shll $4, %r13d;
shll $4, %eax;
vpxor (%r14, %r11), %xmm8, %xmm9;
vpxor (%r14, %r12), %xmm9, %xmm10;
vpxor (%r14, %r13), %xmm10, %xmm11;
vpxor (%r14, %rax), %xmm11, %xmm15;
vinserti128 $1, %xmm6, %ymm5, %ymm5;
vinserti128 $1, %xmm8, %ymm7, %ymm6;
vinserti128 $1, %xmm10, %ymm9, %ymm7;
vinserti128 $1, %xmm15, %ymm11, %ymm8;
testl %r15d, %r15d;
jz .Locb_unaligned_blk8_dec;
vmovdqu (0 * 16)(%rcx), %ymm0;
vmovdqu (2 * 16)(%rcx), %ymm1;
vmovdqu (4 * 16)(%rcx), %ymm2;
vmovdqu (6 * 16)(%rcx), %ymm3;
leaq (8 * 16)(%rcx), %rcx;
vpxor %ymm0, %ymm14, %ymm14;
vpxor %ymm1, %ymm13, %ymm13;
vpxor %ymm2, %ymm14, %ymm14;
vpxor %ymm3, %ymm13, %ymm13;
vpxor %ymm5, %ymm0, %ymm0;
vpxor %ymm6, %ymm1, %ymm1;
vpxor %ymm7, %ymm2, %ymm2;
vpxor %ymm8, %ymm3, %ymm3;
vmovdqa (14 * 16)(%rsp), %ymm9;
/* AES rounds */
vbroadcasti128 (1 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (2 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (3 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (4 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (5 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (6 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (7 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (8 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (9 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
cmpl $12, %r9d;
jb .Locb_unaligned_blk8_enc_last;
vbroadcasti128 (10 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (11 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
jz .Locb_unaligned_blk8_enc_last;
vbroadcasti128 (12 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (13 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
/* Last round and output handling. */
.Locb_unaligned_blk8_enc_last:
vpxor %ymm5, %ymm9, %ymm5; /* Xor src to last round key. */
vpxor %ymm6, %ymm9, %ymm6;
vpxor %ymm7, %ymm9, %ymm7;
vpxor %ymm8, %ymm9, %ymm4;
vaesenclast %ymm5, %ymm0, %ymm0;
vaesenclast %ymm6, %ymm1, %ymm1;
vaesenclast %ymm7, %ymm2, %ymm2;
vaesenclast %ymm4, %ymm3, %ymm3;
vmovdqu %ymm0, (0 * 16)(%rdx);
vmovdqu %ymm1, (2 * 16)(%rdx);
vmovdqu %ymm2, (4 * 16)(%rdx);
vmovdqu %ymm3, (6 * 16)(%rdx);
leaq (8 * 16)(%rdx), %rdx;
jmp .Locb_unaligned_blk8;
.align 8
.Locb_unaligned_blk8_dec:
vpxor (0 * 16)(%rcx), %ymm5, %ymm0;
vpxor (2 * 16)(%rcx), %ymm6, %ymm1;
vpxor (4 * 16)(%rcx), %ymm7, %ymm2;
vpxor (6 * 16)(%rcx), %ymm8, %ymm3;
leaq (8 * 16)(%rcx), %rcx;
vmovdqa (14 * 16)(%rsp), %ymm9;
/* AES rounds */
vbroadcasti128 (1 * 16)(%rdi), %ymm4;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (2 * 16)(%rdi), %ymm4;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (3 * 16)(%rdi), %ymm4;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (4 * 16)(%rdi), %ymm4;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (5 * 16)(%rdi), %ymm4;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (6 * 16)(%rdi), %ymm4;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (7 * 16)(%rdi), %ymm4;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (8 * 16)(%rdi), %ymm4;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (9 * 16)(%rdi), %ymm4;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
cmpl $12, %r9d;
jb .Locb_unaligned_blk8_dec_last;
vbroadcasti128 (10 * 16)(%rdi), %ymm4;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (11 * 16)(%rdi), %ymm4;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
jz .Locb_unaligned_blk8_dec_last;
vbroadcasti128 (12 * 16)(%rdi), %ymm4;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (13 * 16)(%rdi), %ymm4;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
/* Last round and output handling. */
.Locb_unaligned_blk8_dec_last:
vpxor %ymm5, %ymm9, %ymm5; /* Xor src to last round key. */
vpxor %ymm6, %ymm9, %ymm6;
vpxor %ymm7, %ymm9, %ymm7;
vpxor %ymm8, %ymm9, %ymm4;
vaesdeclast %ymm5, %ymm0, %ymm0;
vaesdeclast %ymm6, %ymm1, %ymm1;
vaesdeclast %ymm7, %ymm2, %ymm2;
vaesdeclast %ymm4, %ymm3, %ymm3;
vpxor %ymm0, %ymm14, %ymm14;
vpxor %ymm1, %ymm13, %ymm13;
vpxor %ymm2, %ymm14, %ymm14;
vpxor %ymm3, %ymm13, %ymm13;
vmovdqu %ymm0, (0 * 16)(%rdx);
vmovdqu %ymm1, (2 * 16)(%rdx);
vmovdqu %ymm2, (4 * 16)(%rdx);
vmovdqu %ymm3, (6 * 16)(%rdx);
leaq (8 * 16)(%rdx), %rdx;
jmp .Locb_unaligned_blk8;
/* Unaligned: Process four blocks. */
.align 8
.Locb_unaligned_blk4:
cmpq $4, %r10;
jb .Locb_unaligned_blk1;
leaq -4(%r8), %r8;
leaq -4(%r10), %r10;
leal 1(%esi), %r11d;
leal 2(%esi), %r12d;
leal 3(%esi), %r13d;
leal 4(%esi), %esi;
tzcntl %r11d, %r11d;
tzcntl %r12d, %r12d;
tzcntl %r13d, %r13d;
tzcntl %esi, %eax;
shll $4, %r11d;
shll $4, %r12d;
shll $4, %r13d;
shll $4, %eax;
vpxor (%r14, %r11), %xmm15, %xmm5;
vpxor (%r14, %r12), %xmm5, %xmm6;
vinserti128 $1, %xmm6, %ymm5, %ymm5;
vpxor (%r14, %r13), %xmm6, %xmm7;
vpxor (%r14, %rax), %xmm7, %xmm15;
vinserti128 $1, %xmm15, %ymm7, %ymm6;
testl %r15d, %r15d;
jz .Locb_unaligned_blk4_dec;
vmovdqu (0 * 16)(%rcx), %ymm0;
vmovdqu (2 * 16)(%rcx), %ymm1;
leaq (4 * 16)(%rcx), %rcx;
vpxor %ymm0, %ymm14, %ymm14;
vpxor %ymm1, %ymm13, %ymm13;
vpxor %ymm5, %ymm0, %ymm0;
vpxor %ymm6, %ymm1, %ymm1;
/* AES rounds */
vbroadcasti128 (1 * 16)(%rdi), %ymm4;
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (2 * 16)(%rdi), %ymm4;
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (3 * 16)(%rdi), %ymm4;
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (4 * 16)(%rdi), %ymm4;
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (5 * 16)(%rdi), %ymm4;
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (6 * 16)(%rdi), %ymm4;
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (7 * 16)(%rdi), %ymm4;
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (8 * 16)(%rdi), %ymm4;
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (9 * 16)(%rdi), %ymm4;
VAESENC2(%ymm4, %ymm0, %ymm1);
cmpl $12, %r9d;
jb .Locb_unaligned_blk4_enc_last;
vbroadcasti128 (10 * 16)(%rdi), %ymm4;
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (11 * 16)(%rdi), %ymm4;
VAESENC2(%ymm4, %ymm0, %ymm1);
jz .Locb_unaligned_blk4_enc_last;
vbroadcasti128 (12 * 16)(%rdi), %ymm4;
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (13 * 16)(%rdi), %ymm4;
VAESENC2(%ymm4, %ymm0, %ymm1);
/* Last round and output handling. */
.Locb_unaligned_blk4_enc_last:
vmovdqa (14 * 16)(%rsp), %ymm8;
vpxor %ymm5, %ymm8, %ymm5; /* Xor src to last round key. */
vpxor %ymm6, %ymm8, %ymm6;
vaesenclast %ymm5, %ymm0, %ymm0;
vaesenclast %ymm6, %ymm1, %ymm1;
vmovdqu %ymm0, (0 * 16)(%rdx);
vmovdqu %ymm1, (2 * 16)(%rdx);
leaq (4 * 16)(%rdx), %rdx;
jmp .Locb_unaligned_blk1;
.align 8
.Locb_unaligned_blk4_dec:
vpxor (0 * 16)(%rcx), %ymm5, %ymm0;
vpxor (2 * 16)(%rcx), %ymm6, %ymm1;
leaq (4 * 16)(%rcx), %rcx;
/* AES rounds */
vbroadcasti128 (1 * 16)(%rdi), %ymm4;
VAESDEC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (2 * 16)(%rdi), %ymm4;
VAESDEC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (3 * 16)(%rdi), %ymm4;
VAESDEC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (4 * 16)(%rdi), %ymm4;
VAESDEC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (5 * 16)(%rdi), %ymm4;
VAESDEC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (6 * 16)(%rdi), %ymm4;
VAESDEC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (7 * 16)(%rdi), %ymm4;
VAESDEC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (8 * 16)(%rdi), %ymm4;
VAESDEC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (9 * 16)(%rdi), %ymm4;
VAESDEC2(%ymm4, %ymm0, %ymm1);
cmpl $12, %r9d;
jb .Locb_unaligned_blk4_dec_last;
vbroadcasti128 (10 * 16)(%rdi), %ymm4;
VAESDEC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (11 * 16)(%rdi), %ymm4;
VAESDEC2(%ymm4, %ymm0, %ymm1);
jz .Locb_unaligned_blk4_dec_last;
vbroadcasti128 (12 * 16)(%rdi), %ymm4;
VAESDEC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (13 * 16)(%rdi), %ymm4;
VAESDEC2(%ymm4, %ymm0, %ymm1);
/* Last round and output handling. */
.Locb_unaligned_blk4_dec_last:
vmovdqa (14 * 16)(%rsp), %ymm8;
vpxor %ymm5, %ymm8, %ymm5; /* Xor src to last round key. */
vpxor %ymm6, %ymm8, %ymm6;
vaesdeclast %ymm5, %ymm0, %ymm0;
vaesdeclast %ymm6, %ymm1, %ymm1;
vpxor %ymm0, %ymm14, %ymm14;
vpxor %ymm1, %ymm13, %ymm13;
vmovdqu %ymm0, (0 * 16)(%rdx);
vmovdqu %ymm1, (2 * 16)(%rdx);
leaq (4 * 16)(%rdx), %rdx;
/* Unaligned: Process one block per loop. */
.align 8
.Locb_unaligned_blk1:
cmpq $1, %r10;
jb .Lunaligned_ocb_done;
leaq -1(%r8), %r8;
leaq -1(%r10), %r10;
leal 1(%esi), %esi;
tzcntl %esi, %r11d;
shll $4, %r11d;
vpxor (%r14, %r11), %xmm15, %xmm15;
testl %r15d, %r15d;
jz .Locb_unaligned_blk1_dec;
vmovdqu (%rcx), %xmm0;
vpxor %ymm0, %ymm14, %ymm14;
vpxor %xmm15, %xmm0, %xmm0;
leaq 16(%rcx), %rcx;
/* AES rounds. */
vaesenc (1 * 16)(%rdi), %xmm0, %xmm0;
vaesenc (2 * 16)(%rdi), %xmm0, %xmm0;
vaesenc (3 * 16)(%rdi), %xmm0, %xmm0;
vaesenc (4 * 16)(%rdi), %xmm0, %xmm0;
vaesenc (5 * 16)(%rdi), %xmm0, %xmm0;
vaesenc (6 * 16)(%rdi), %xmm0, %xmm0;
vaesenc (7 * 16)(%rdi), %xmm0, %xmm0;
vaesenc (8 * 16)(%rdi), %xmm0, %xmm0;
vaesenc (9 * 16)(%rdi), %xmm0, %xmm0;
cmpl $12, %r9d;
jb .Locb_unaligned_blk1_enc_last;
vaesenc (10 * 16)(%rdi), %xmm0, %xmm0;
vaesenc (11 * 16)(%rdi), %xmm0, %xmm0;
jz .Locb_unaligned_blk1_enc_last;
vaesenc (12 * 16)(%rdi), %xmm0, %xmm0;
vaesenc (13 * 16)(%rdi), %xmm0, %xmm0;
/* Last round and output handling. */
.Locb_unaligned_blk1_enc_last:
vpxor (14 * 16)(%rsp), %xmm15, %xmm1;
vaesenclast %xmm1, %xmm0, %xmm0;
vmovdqu %xmm0, (%rdx);
leaq 16(%rdx), %rdx;
jmp .Locb_unaligned_blk1;
.align 8
.Locb_unaligned_blk1_dec:
vpxor (%rcx), %xmm15, %xmm0;
leaq 16(%rcx), %rcx;
/* AES rounds. */
vaesdec (1 * 16)(%rdi), %xmm0, %xmm0;
vaesdec (2 * 16)(%rdi), %xmm0, %xmm0;
vaesdec (3 * 16)(%rdi), %xmm0, %xmm0;
vaesdec (4 * 16)(%rdi), %xmm0, %xmm0;
vaesdec (5 * 16)(%rdi), %xmm0, %xmm0;
vaesdec (6 * 16)(%rdi), %xmm0, %xmm0;
vaesdec (7 * 16)(%rdi), %xmm0, %xmm0;
vaesdec (8 * 16)(%rdi), %xmm0, %xmm0;
vaesdec (9 * 16)(%rdi), %xmm0, %xmm0;
cmpl $12, %r9d;
jb .Locb_unaligned_blk1_dec_last;
vaesdec (10 * 16)(%rdi), %xmm0, %xmm0;
vaesdec (11 * 16)(%rdi), %xmm0, %xmm0;
jz .Locb_unaligned_blk1_dec_last;
vaesdec (12 * 16)(%rdi), %xmm0, %xmm0;
vaesdec (13 * 16)(%rdi), %xmm0, %xmm0;
/* Last round and output handling. */
.Locb_unaligned_blk1_dec_last:
vpxor (14 * 16)(%rsp), %xmm15, %xmm1;
vaesdeclast %xmm1, %xmm0, %xmm0;
vpxor %ymm0, %ymm14, %ymm14;
vmovdqu %xmm0, (%rdx);
leaq 16(%rdx), %rdx;
jmp .Locb_unaligned_blk1;
.align 8
.Lunaligned_ocb_done:
cmpq $1, %r8;
jb .Ldone_ocb;
/* Short buffers do not benefit from the L-array optimization. */
movq %r8, %r10;
cmpq $16, %r8;
jb .Locb_unaligned_blk8;
vinserti128 $1, %xmm15, %ymm15, %ymm15;
/* Prepare L-array optimization.
* Since nblk is aligned to 16, offsets will have following
* construction:
* - block1 = ntz{0} = offset ^ L[0]
* - block2 = ntz{1} = offset ^ L[0] ^ L[1]
* - block3 = ntz{0} = offset ^ L[1]
* - block4 = ntz{2} = offset ^ L[1] ^ L[2]
* - block5 = ntz{0} = offset ^ L[0] ^ L[1] ^ L[2]
* - block6 = ntz{1} = offset ^ L[0] ^ L[2]
* - block7 = ntz{0} = offset ^ L[2]
* - block8 = ntz{3} = offset ^ L[2] ^ L[3]
* - block9 = ntz{0} = offset ^ L[0] ^ L[2] ^ L[3]
* - block10 = ntz{1} = offset ^ L[0] ^ L[1] ^ L[2] ^ L[3]
* - block11 = ntz{0} = offset ^ L[1] ^ L[2] ^ L[3]
* - block12 = ntz{2} = offset ^ L[1] ^ L[3]
* - block13 = ntz{0} = offset ^ L[0] ^ L[1] ^ L[3]
* - block14 = ntz{1} = offset ^ L[0] ^ L[3]
* - block15 = ntz{0} = offset ^ L[3]
* - block16 = ntz{x} = offset ^ L[3] ^ L[ntz{x}]
*/
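/* For reference, the table above is simply the per-block OCB update
* Offset_i = Offset_{i-1} xor L[ntz(i)] accumulated over one 16-block
* chunk. A minimal C sketch that reproduces the listed combinations when
* nblk is a multiple of 16 (block_t and block_xor() are hypothetical
* helpers, not part of this file):
*
*   typedef struct { unsigned char b[16]; } block_t;    // hypothetical
*   void block_xor(block_t *dst, const block_t *src);   // hypothetical
*
*   block_t mask = { { 0 } };
*   for (unsigned long j = 1; j <= 16; j++)
*     {
*       unsigned int ntz = (unsigned int)__builtin_ctzl(nblk + j);
*       block_xor(&mask, &L[ntz]);  // mask == the "block<j>" entry above
*     }
*/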
vmovdqu (0 * 16)(%r14), %xmm0;
vmovdqu (1 * 16)(%r14), %xmm1;
vmovdqu (2 * 16)(%r14), %xmm2;
vmovdqu (3 * 16)(%r14), %xmm3;
vpxor %ymm13, %ymm14, %ymm14;
vmovdqa %ymm14, (20 * 16)(%rsp);
vpxor %xmm0, %xmm1, %xmm4; /* L[0] ^ L[1] */
vpxor %xmm0, %xmm2, %xmm5; /* L[0] ^ L[2] */
vpxor %xmm0, %xmm3, %xmm6; /* L[0] ^ L[3] */
vpxor %xmm1, %xmm2, %xmm7; /* L[1] ^ L[2] */
vpxor %xmm1, %xmm3, %xmm8; /* L[1] ^ L[3] */
vpxor %xmm2, %xmm3, %xmm9; /* L[2] ^ L[3] */
vpxor %xmm4, %xmm2, %xmm10; /* L[0] ^ L[1] ^ L[2] */
vpxor %xmm5, %xmm3, %xmm11; /* L[0] ^ L[2] ^ L[3] */
vpxor %xmm7, %xmm3, %xmm12; /* L[1] ^ L[2] ^ L[3] */
vpxor %xmm0, %xmm8, %xmm13; /* L[0] ^ L[1] ^ L[3] */
vpxor %xmm4, %xmm9, %xmm14; /* L[0] ^ L[1] ^ L[2] ^ L[3] */
vinserti128 $1, %xmm4, %ymm0, %ymm0;
vinserti128 $1, %xmm7, %ymm1, %ymm1;
vinserti128 $1, %xmm5, %ymm10, %ymm10;
vinserti128 $1, %xmm9, %ymm2, %ymm2;
vinserti128 $1, %xmm14, %ymm11, %ymm11;
vinserti128 $1, %xmm8, %ymm12, %ymm12;
vinserti128 $1, %xmm6, %ymm13, %ymm13;
vmovdqa %ymm0, (0 * 16)(%rsp);
vmovdqa %ymm1, (2 * 16)(%rsp);
vmovdqa %ymm10, (4 * 16)(%rsp);
vmovdqa %ymm2, (6 * 16)(%rsp);
vmovdqa %ymm11, (8 * 16)(%rsp);
vmovdqa %ymm12, (10 * 16)(%rsp);
vmovdqa %ymm13, (12 * 16)(%rsp);
/* Aligned: Process 16 blocks per loop. */
.align 8
.Locb_aligned_blk16:
cmpq $16, %r8;
jb .Locb_aligned_blk8;
leaq -16(%r8), %r8;
leal 16(%esi), %esi;
tzcntl %esi, %eax;
shll $4, %eax;
vpxor (0 * 16)(%rsp), %ymm15, %ymm8;
vpxor (2 * 16)(%rsp), %ymm15, %ymm9;
vpxor (4 * 16)(%rsp), %ymm15, %ymm10;
vpxor (6 * 16)(%rsp), %ymm15, %ymm11;
vpxor (8 * 16)(%rsp), %ymm15, %ymm12;
vpxor (3 * 16)(%r14), %xmm15, %xmm13; /* offset ^ first key ^ L[3] */
vpxor (%r14, %rax), %xmm13, %xmm14; /* offset ^ first key ^ L[3] ^ L[ntz{nblk+16}] */
vinserti128 $1, %xmm14, %ymm13, %ymm14;
testl %r15d, %r15d;
jz .Locb_aligned_blk16_dec;
vmovdqu (0 * 16)(%rcx), %ymm0;
vmovdqu (2 * 16)(%rcx), %ymm1;
vmovdqu (4 * 16)(%rcx), %ymm2;
vmovdqu (6 * 16)(%rcx), %ymm3;
vpxor (8 * 16)(%rcx), %ymm0, %ymm4;
vpxor (10 * 16)(%rcx), %ymm1, %ymm5;
vpxor (12 * 16)(%rcx), %ymm2, %ymm6;
vpxor (14 * 16)(%rcx), %ymm3, %ymm7;
vpxor %ymm4, %ymm5, %ymm5;
vpxor %ymm6, %ymm7, %ymm7;
vpxor %ymm5, %ymm7, %ymm7;
vpxor (20 * 16)(%rsp), %ymm7, %ymm7;
vmovdqa %ymm7, (20 * 16)(%rsp);
vpxor (10 * 16)(%rsp), %ymm15, %ymm13;
vpxor (14 * 16)(%rcx), %ymm14, %ymm7;
vpxor %ymm8, %ymm0, %ymm0;
vpxor %ymm9, %ymm1, %ymm1;
vpxor %ymm10, %ymm2, %ymm2;
vpxor %ymm11, %ymm3, %ymm3;
vpxor (8 * 16)(%rcx), %ymm12, %ymm4;
vpxor (10 * 16)(%rcx), %ymm13, %ymm5;
vmovdqa %ymm13, (16 * 16)(%rsp);
vpxor (12 * 16)(%rsp), %ymm15, %ymm13;
vpxor (12 * 16)(%rcx), %ymm13, %ymm6;
vmovdqa %ymm13, (18 * 16)(%rsp);
leaq (16 * 16)(%rcx), %rcx;
vperm2i128 $0x11, %ymm14, %ymm14, %ymm15;
/* AES rounds */
vbroadcasti128 (1 * 16)(%rdi), %ymm13;
VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (2 * 16)(%rdi), %ymm13;
VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (3 * 16)(%rdi), %ymm13;
VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (4 * 16)(%rdi), %ymm13;
VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (5 * 16)(%rdi), %ymm13;
VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (6 * 16)(%rdi), %ymm13;
VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (7 * 16)(%rdi), %ymm13;
VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (8 * 16)(%rdi), %ymm13;
VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (9 * 16)(%rdi), %ymm13;
VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
cmpl $12, %r9d;
jb .Locb_aligned_blk16_enc_last;
vbroadcasti128 (10 * 16)(%rdi), %ymm13;
VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (11 * 16)(%rdi), %ymm13;
VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
jz .Locb_aligned_blk16_enc_last;
vbroadcasti128 (12 * 16)(%rdi), %ymm13;
VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (13 * 16)(%rdi), %ymm13;
VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
/* Last round and output handling. */
.Locb_aligned_blk16_enc_last:
vmovdqa (14 * 16)(%rsp), %ymm13;
vpxor %ymm8, %ymm13, %ymm8;
vpxor %ymm9, %ymm13, %ymm9;
vpxor %ymm10, %ymm13, %ymm10;
vpxor %ymm11, %ymm13, %ymm11;
vaesenclast %ymm8, %ymm0, %ymm0;
vaesenclast %ymm9, %ymm1, %ymm1;
vaesenclast %ymm10, %ymm2, %ymm2;
vaesenclast %ymm11, %ymm3, %ymm3;
vpxor %ymm12, %ymm13, %ymm12;
vpxor (16 * 16)(%rsp), %ymm13, %ymm8;
vpxor (18 * 16)(%rsp), %ymm13, %ymm9;
vpxor %ymm14, %ymm13, %ymm13;
vaesenclast %ymm12, %ymm4, %ymm4;
vaesenclast %ymm8, %ymm5, %ymm5;
vaesenclast %ymm9, %ymm6, %ymm6;
vaesenclast %ymm13, %ymm7, %ymm7;
vmovdqu %ymm0, (0 * 16)(%rdx);
vmovdqu %ymm1, (2 * 16)(%rdx);
vmovdqu %ymm2, (4 * 16)(%rdx);
vmovdqu %ymm3, (6 * 16)(%rdx);
vmovdqu %ymm4, (8 * 16)(%rdx);
vmovdqu %ymm5, (10 * 16)(%rdx);
vmovdqu %ymm6, (12 * 16)(%rdx);
vmovdqu %ymm7, (14 * 16)(%rdx);
leaq (16 * 16)(%rdx), %rdx;
jmp .Locb_aligned_blk16;
.align 8
.Locb_aligned_blk16_dec:
vpxor (10 * 16)(%rsp), %ymm15, %ymm13;
vpxor (14 * 16)(%rcx), %ymm14, %ymm7;
vpxor (0 * 16)(%rcx), %ymm8, %ymm0;
vpxor (2 * 16)(%rcx), %ymm9, %ymm1;
vpxor (4 * 16)(%rcx), %ymm10, %ymm2;
vpxor (6 * 16)(%rcx), %ymm11, %ymm3;
vpxor (8 * 16)(%rcx), %ymm12, %ymm4;
vpxor (10 * 16)(%rcx), %ymm13, %ymm5;
vmovdqa %ymm13, (16 * 16)(%rsp);
vpxor (12 * 16)(%rsp), %ymm15, %ymm13;
vpxor (12 * 16)(%rcx), %ymm13, %ymm6;
vmovdqa %ymm13, (18 * 16)(%rsp);
leaq (16 * 16)(%rcx), %rcx;
vperm2i128 $0x11, %ymm14, %ymm14, %ymm15;
/* AES rounds */
vbroadcasti128 (1 * 16)(%rdi), %ymm13;
VAESDEC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (2 * 16)(%rdi), %ymm13;
VAESDEC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (3 * 16)(%rdi), %ymm13;
VAESDEC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (4 * 16)(%rdi), %ymm13;
VAESDEC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (5 * 16)(%rdi), %ymm13;
VAESDEC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (6 * 16)(%rdi), %ymm13;
VAESDEC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (7 * 16)(%rdi), %ymm13;
VAESDEC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (8 * 16)(%rdi), %ymm13;
VAESDEC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (9 * 16)(%rdi), %ymm13;
VAESDEC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
cmpl $12, %r9d;
jb .Locb_aligned_blk16_dec_last;
vbroadcasti128 (10 * 16)(%rdi), %ymm13;
VAESDEC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (11 * 16)(%rdi), %ymm13;
VAESDEC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
jz .Locb_aligned_blk16_dec_last;
vbroadcasti128 (12 * 16)(%rdi), %ymm13;
VAESDEC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (13 * 16)(%rdi), %ymm13;
VAESDEC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
/* Last round and output handling. */
.Locb_aligned_blk16_dec_last:
vmovdqa (14 * 16)(%rsp), %ymm13;
vpxor %ymm8, %ymm13, %ymm8;
vpxor %ymm9, %ymm13, %ymm9;
vpxor %ymm10, %ymm13, %ymm10;
vpxor %ymm11, %ymm13, %ymm11;
vaesdeclast %ymm8, %ymm0, %ymm0;
vaesdeclast %ymm9, %ymm1, %ymm1;
vaesdeclast %ymm10, %ymm2, %ymm2;
vaesdeclast %ymm11, %ymm3, %ymm3;
vpxor %ymm12, %ymm13, %ymm12;
vpxor (16 * 16)(%rsp), %ymm13, %ymm8;
vpxor (18 * 16)(%rsp), %ymm13, %ymm9;
vpxor %ymm14, %ymm13, %ymm13;
vaesdeclast %ymm12, %ymm4, %ymm4;
vaesdeclast %ymm8, %ymm5, %ymm5;
vaesdeclast %ymm9, %ymm6, %ymm6;
vaesdeclast %ymm13, %ymm7, %ymm7;
vmovdqu %ymm0, (0 * 16)(%rdx);
vmovdqu %ymm1, (2 * 16)(%rdx);
vmovdqu %ymm2, (4 * 16)(%rdx);
vmovdqu %ymm3, (6 * 16)(%rdx);
vpxor %ymm1, %ymm0, %ymm0;
vpxor %ymm3, %ymm2, %ymm2;
vmovdqu %ymm4, (8 * 16)(%rdx);
vmovdqu %ymm5, (10 * 16)(%rdx);
vmovdqu %ymm6, (12 * 16)(%rdx);
vmovdqu %ymm7, (14 * 16)(%rdx);
vpxor %ymm5, %ymm4, %ymm4;
vpxor %ymm7, %ymm6, %ymm6;
leaq (16 * 16)(%rdx), %rdx;
vpxor %ymm4, %ymm0, %ymm0;
vpxor %ymm6, %ymm2, %ymm2;
vpxor %ymm2, %ymm0, %ymm0;
vpxor (20 * 16)(%rsp), %ymm0, %ymm0;
vmovdqa %ymm0, (20 * 16)(%rsp);
jmp .Locb_aligned_blk16;
/* Aligned: Process trailing eight blocks. */
.align 8
.Locb_aligned_blk8:
cmpq $8, %r8;
jb .Locb_aligned_done;
leaq -8(%r8), %r8;
leal 8(%esi), %esi;
tzcntl %esi, %eax;
shll $4, %eax;
vpxor (0 * 16)(%rsp), %ymm15, %ymm5;
vpxor (2 * 16)(%rsp), %ymm15, %ymm6;
vpxor (4 * 16)(%rsp), %ymm15, %ymm7;
vpxor (2 * 16)(%r14), %xmm15, %xmm13; /* offset ^ first key ^ L[2] */
vpxor (%r14, %rax), %xmm13, %xmm14; /* offset ^ first key ^ L[2] ^ L[ntz{nblk+8}] */
vinserti128 $1, %xmm14, %ymm13, %ymm14;
testl %r15d, %r15d;
jz .Locb_aligned_blk8_dec;
vmovdqu (0 * 16)(%rcx), %ymm0;
vmovdqu (2 * 16)(%rcx), %ymm1;
vmovdqu (4 * 16)(%rcx), %ymm2;
vmovdqu (6 * 16)(%rcx), %ymm3;
vpxor %ymm2, %ymm0, %ymm10;
vpxor %ymm3, %ymm1, %ymm11;
vpxor %ymm11, %ymm10, %ymm10;
vpxor (20 * 16)(%rsp), %ymm10, %ymm10;
vmovdqa %ymm10, (20 * 16)(%rsp);
vpxor %ymm5, %ymm0, %ymm0;
vpxor %ymm6, %ymm1, %ymm1;
vpxor %ymm7, %ymm2, %ymm2;
vpxor %ymm14, %ymm3, %ymm3;
leaq (8 * 16)(%rcx), %rcx;
vperm2i128 $0x11, %ymm14, %ymm14, %ymm15;
vmovdqa (14 * 16)(%rsp), %ymm8;
/* AES rounds */
vbroadcasti128 (1 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (2 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (3 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (4 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (5 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (6 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (7 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (8 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (9 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
cmpl $12, %r9d;
jb .Locb_aligned_blk8_enc_last;
vbroadcasti128 (10 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (11 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
jz .Locb_aligned_blk8_enc_last;
vbroadcasti128 (12 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (13 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
/* Last round and output handling. */
.Locb_aligned_blk8_enc_last:
vpxor %ymm5, %ymm8, %ymm5;
vpxor %ymm6, %ymm8, %ymm6;
vpxor %ymm7, %ymm8, %ymm7;
vpxor %ymm14, %ymm8, %ymm4;
vaesenclast %ymm5, %ymm0, %ymm0;
vaesenclast %ymm6, %ymm1, %ymm1;
vaesenclast %ymm7, %ymm2, %ymm2;
vaesenclast %ymm4, %ymm3, %ymm3;
vmovdqu %ymm0, (0 * 16)(%rdx);
vmovdqu %ymm1, (2 * 16)(%rdx);
vmovdqu %ymm2, (4 * 16)(%rdx);
vmovdqu %ymm3, (6 * 16)(%rdx);
leaq (8 * 16)(%rdx), %rdx;
jmp .Locb_aligned_done;
.align 8
.Locb_aligned_blk8_dec:
vpxor (0 * 16)(%rcx), %ymm5, %ymm0;
vpxor (2 * 16)(%rcx), %ymm6, %ymm1;
vpxor (4 * 16)(%rcx), %ymm7, %ymm2;
vpxor (6 * 16)(%rcx), %ymm14, %ymm3;
leaq (8 * 16)(%rcx), %rcx;
vperm2i128 $0x11, %ymm14, %ymm14, %ymm15;
vmovdqa (14 * 16)(%rsp), %ymm8;
/* AES rounds */
vbroadcasti128 (1 * 16)(%rdi), %ymm4;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (2 * 16)(%rdi), %ymm4;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (3 * 16)(%rdi), %ymm4;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (4 * 16)(%rdi), %ymm4;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (5 * 16)(%rdi), %ymm4;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (6 * 16)(%rdi), %ymm4;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (7 * 16)(%rdi), %ymm4;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (8 * 16)(%rdi), %ymm4;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (9 * 16)(%rdi), %ymm4;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
cmpl $12, %r9d;
jb .Locb_aligned_blk8_dec_last;
vbroadcasti128 (10 * 16)(%rdi), %ymm4;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (11 * 16)(%rdi), %ymm4;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
jz .Locb_aligned_blk8_dec_last;
vbroadcasti128 (12 * 16)(%rdi), %ymm4;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (13 * 16)(%rdi), %ymm4;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (14 * 16)(%rdi), %ymm4;
/* Last round and output handling. */
.Locb_aligned_blk8_dec_last:
vpxor %ymm5, %ymm8, %ymm5;
vpxor %ymm6, %ymm8, %ymm6;
vpxor %ymm7, %ymm8, %ymm7;
vpxor %ymm14, %ymm8, %ymm4;
vaesdeclast %ymm5, %ymm0, %ymm0;
vaesdeclast %ymm6, %ymm1, %ymm1;
vaesdeclast %ymm7, %ymm2, %ymm2;
vaesdeclast %ymm4, %ymm3, %ymm3;
vmovdqu %ymm0, (0 * 16)(%rdx);
vmovdqu %ymm1, (2 * 16)(%rdx);
vmovdqu %ymm2, (4 * 16)(%rdx);
vmovdqu %ymm3, (6 * 16)(%rdx);
leaq (8 * 16)(%rdx), %rdx;
vpxor %ymm1, %ymm0, %ymm0;
vpxor %ymm3, %ymm2, %ymm2;
vpxor %ymm2, %ymm0, %ymm0;
vpxor (20 * 16)(%rsp), %ymm0, %ymm0;
vmovdqa %ymm0, (20 * 16)(%rsp);
.align 8
.Locb_aligned_done:
vmovdqa (20 * 16)(%rsp), %ymm14;
vpxor %xmm13, %xmm13, %xmm13;
/* Burn stack. */
vmovdqa %ymm13, (0 * 16)(%rsp);
vmovdqa %ymm13, (2 * 16)(%rsp);
vmovdqa %ymm13, (4 * 16)(%rsp);
vmovdqa %ymm13, (6 * 16)(%rsp);
vmovdqa %ymm13, (8 * 16)(%rsp);
vmovdqa %ymm13, (10 * 16)(%rsp);
vmovdqa %ymm13, (12 * 16)(%rsp);
vmovdqa %ymm13, (16 * 16)(%rsp);
vmovdqa %ymm13, (18 * 16)(%rsp);
vmovdqa %ymm13, (20 * 16)(%rsp);
/* Handle trailing 1…7 blocks in nblk-unaligned loop. */
movq %r8, %r10;
cmpq $1, %r8;
jnb .Locb_unaligned_blk8;
.align 8
.Ldone_ocb:
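/* Fold the 256-bit checksum accumulator down to 128 bits and merge it
* into the caller's checksum block (pointer assumed to be held in %rbx). */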
vpxor %ymm13, %ymm14, %ymm14;
vextracti128 $1, %ymm14, %xmm13;
vpxor (%rbx), %xmm14, %xmm14;
vpxor %xmm13, %xmm14, %xmm14;
vmovdqu %xmm14, (%rbx);
movq OFFSET_PTR_Q, %r14; /* offset ptr. */
vpxor (0 * 16)(%rdi), %xmm15, %xmm15; /* offset ^ first key ^ first key */
vmovdqu %xmm15, (%r14); /* Store offset. */
/* Burn stack. */
vpxor %ymm0, %ymm0, %ymm0;
vmovdqa %ymm0, (14 * 16)(%rsp);
vzeroall;
movq (STACK_REGS_POS + 0 * 8)(%rsp), %r12;
CFI_RESTORE(%r12);
movq (STACK_REGS_POS + 1 * 8)(%rsp), %r13;
CFI_RESTORE(%r13);
movq (STACK_REGS_POS + 2 * 8)(%rsp), %r14;
CFI_RESTORE(%r14);
movq (STACK_REGS_POS + 3 * 8)(%rsp), %r15;
CFI_RESTORE(%r15);
movq (STACK_REGS_POS + 4 * 8)(%rsp), %rbx;
CFI_RESTORE(%rbx);
leave;
CFI_LEAVE();
ret_spec_stop
#undef STACK_REGS_POS
#undef STACK_ALLOC
CFI_ENDPROC();
ELF(.size _gcry_vaes_avx2_ocb_crypt_amd64,.-_gcry_vaes_avx2_ocb_crypt_amd64)
/**********************************************************************
XTS-mode encryption
**********************************************************************/
ELF(.type _gcry_vaes_avx2_xts_crypt_amd64,@function)
.globl _gcry_vaes_avx2_xts_crypt_amd64
.align 16
_gcry_vaes_avx2_xts_crypt_amd64:
/* input:
* %rdi: round keys
* %rsi: tweak
* %rdx: dst
* %rcx: src
* %r8: nblocks
* %r9: nrounds
* 8(%rsp): encrypt
*/
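/* For reference, this register/stack layout corresponds to a System V
* AMD64 call of roughly the following C shape (a sketch; the
* authoritative prototype lives in the C glue code):
*
*   void _gcry_vaes_avx2_xts_crypt_amd64(const void *round_keys,
*                                        unsigned char *tweak,
*                                        void *dst, const void *src,
*                                        size_t nblocks,
*                                        unsigned int nrounds,
*                                        int encrypt);
*/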
CFI_STARTPROC();
movl 8(%rsp), %eax;
#define tweak_clmul(shift, out, tweak, hi_tweak, tmp1, tmp2) \
vpsrld $(32-(shift)), hi_tweak, tmp2; \
vpsllq $(shift), tweak, out; \
vpclmulqdq $0, .Lxts_gfmul_clmul rRIP, tmp2, tmp1; \
vpunpckhqdq tmp2, tmp1, tmp1; \
vpxor tmp1, out, out;
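/* tweak_clmul(shift, ...) in effect multiplies the current tweak by
* x^shift in GF(2^128), i.e. performs 'shift' XTS tweak doublings at once
* using a single carry-less multiply against the 0x87 reduction constant.
* One doubling, written as a plain C sketch for reference (not part of
* this file; t[0] holds the low 64 bits, t[1] the high 64 bits):
*
*   #include <stdint.h>
*
*   void xts_mul_x(uint64_t t[2])
*   {
*     uint64_t carry = t[1] >> 63;
*     t[1] = (t[1] << 1) | (t[0] >> 63);
*     t[0] = (t[0] << 1) ^ (carry ? 0x87 : 0);
*   }
*/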
/* Prepare tweak. */
vmovdqu (%rsi), %xmm15;
vpshufb .Lxts_high_bit_shuf rRIP, %xmm15, %xmm13;
tweak_clmul(1, %xmm11, %xmm15, %xmm13, %xmm0, %xmm1);
vinserti128 $1, %xmm11, %ymm15, %ymm15; /* tweak:tweak1 */
vpshufb .Lxts_high_bit_shuf rRIP, %ymm15, %ymm13;
cmpq $8, %r8;
jb .Lxts_crypt_blk4;
/* Process eight blocks per loop. */
leaq -8(%r8), %r8;
vmovdqa %ymm15, %ymm5;
tweak_clmul(2, %ymm6, %ymm15, %ymm13, %ymm0, %ymm1);
tweak_clmul(4, %ymm7, %ymm15, %ymm13, %ymm0, %ymm1);
tweak_clmul(6, %ymm8, %ymm15, %ymm13, %ymm0, %ymm1);
tweak_clmul(8, %ymm15, %ymm15, %ymm13, %ymm0, %ymm1);
vpshufb .Lxts_high_bit_shuf rRIP, %ymm15, %ymm13;
vbroadcasti128 (0 * 16)(%rdi), %ymm4;
vpxor (0 * 16)(%rcx), %ymm5, %ymm0;
vpxor (2 * 16)(%rcx), %ymm6, %ymm1;
vpxor (4 * 16)(%rcx), %ymm7, %ymm2;
vpxor (6 * 16)(%rcx), %ymm8, %ymm3;
leaq (8 * 16)(%rcx), %rcx;
.align 8
.Lxts_crypt_blk8_loop:
cmpq $8, %r8;
jb .Lxts_crypt_blk8_tail;
leaq -8(%r8), %r8;
testl %eax, %eax;
jz .Lxts_dec_blk8;
/* AES rounds */
XOR4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (1 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (2 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (3 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (4 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vmovdqa %ymm15, %ymm9;
tweak_clmul(2, %ymm10, %ymm15, %ymm13, %ymm12, %ymm14);
tweak_clmul(4, %ymm11, %ymm15, %ymm13, %ymm12, %ymm14);
vbroadcasti128 (5 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (6 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (7 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (8 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (9 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (10 * 16)(%rdi), %ymm4;
cmpl $12, %r9d;
jb .Lxts_enc_blk8_last;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (11 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (12 * 16)(%rdi), %ymm4;
jz .Lxts_enc_blk8_last;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (13 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (14 * 16)(%rdi), %ymm4;
/* Last round and output handling. */
.Lxts_enc_blk8_last:
vpxor %ymm4, %ymm5, %ymm5; /* Xor tweak to last round key. */
vpxor %ymm4, %ymm6, %ymm6;
vpxor %ymm4, %ymm7, %ymm7;
vpxor %ymm4, %ymm8, %ymm4;
tweak_clmul(6, %ymm8, %ymm15, %ymm13, %ymm12, %ymm14);
tweak_clmul(8, %ymm15, %ymm15, %ymm13, %ymm12, %ymm14);
vpshufb .Lxts_high_bit_shuf rRIP, %ymm15, %ymm13;
vaesenclast %ymm5, %ymm0, %ymm0;
vaesenclast %ymm6, %ymm1, %ymm1;
vaesenclast %ymm7, %ymm2, %ymm2;
vaesenclast %ymm4, %ymm3, %ymm3;
vmovdqu %ymm0, (0 * 16)(%rdx);
vmovdqu %ymm1, (2 * 16)(%rdx);
vmovdqu %ymm2, (4 * 16)(%rdx);
vmovdqu %ymm3, (6 * 16)(%rdx);
leaq (8 * 16)(%rdx), %rdx;
vbroadcasti128 (0 * 16)(%rdi), %ymm4;
vpxor (0 * 16)(%rcx), %ymm9, %ymm0;
vpxor (2 * 16)(%rcx), %ymm10, %ymm1;
vpxor (4 * 16)(%rcx), %ymm11, %ymm2;
vpxor (6 * 16)(%rcx), %ymm8, %ymm3;
vmovdqa %ymm9, %ymm5;
vmovdqa %ymm10, %ymm6;
vmovdqa %ymm11, %ymm7;
leaq (8 * 16)(%rcx), %rcx;
jmp .Lxts_crypt_blk8_loop;
.align 8
.Lxts_dec_blk8:
/* AES rounds */
XOR4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (1 * 16)(%rdi), %ymm4;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (2 * 16)(%rdi), %ymm4;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (3 * 16)(%rdi), %ymm4;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (4 * 16)(%rdi), %ymm4;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vmovdqa %ymm15, %ymm9;
tweak_clmul(2, %ymm10, %ymm15, %ymm13, %ymm12, %ymm14);
tweak_clmul(4, %ymm11, %ymm15, %ymm13, %ymm12, %ymm14);
vbroadcasti128 (5 * 16)(%rdi), %ymm4;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (6 * 16)(%rdi), %ymm4;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (7 * 16)(%rdi), %ymm4;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (8 * 16)(%rdi), %ymm4;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (9 * 16)(%rdi), %ymm4;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (10 * 16)(%rdi), %ymm4;
cmpl $12, %r9d;
jb .Lxts_dec_blk8_last;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (11 * 16)(%rdi), %ymm4;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (12 * 16)(%rdi), %ymm4;
jz .Lxts_dec_blk8_last;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (13 * 16)(%rdi), %ymm4;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (14 * 16)(%rdi), %ymm4;
/* Last round and output handling. */
.Lxts_dec_blk8_last:
vpxor %ymm4, %ymm5, %ymm5; /* Xor tweak to last round key. */
vpxor %ymm4, %ymm6, %ymm6;
vpxor %ymm4, %ymm7, %ymm7;
vpxor %ymm4, %ymm8, %ymm4;
tweak_clmul(6, %ymm8, %ymm15, %ymm13, %ymm12, %ymm14);
tweak_clmul(8, %ymm15, %ymm15, %ymm13, %ymm12, %ymm14);
vpshufb .Lxts_high_bit_shuf rRIP, %ymm15, %ymm13;
vaesdeclast %ymm5, %ymm0, %ymm0;
vaesdeclast %ymm6, %ymm1, %ymm1;
vaesdeclast %ymm7, %ymm2, %ymm2;
vaesdeclast %ymm4, %ymm3, %ymm3;
vmovdqu %ymm0, (0 * 16)(%rdx);
vmovdqu %ymm1, (2 * 16)(%rdx);
vmovdqu %ymm2, (4 * 16)(%rdx);
vmovdqu %ymm3, (6 * 16)(%rdx);
leaq (8 * 16)(%rdx), %rdx;
vbroadcasti128 (0 * 16)(%rdi), %ymm4;
vpxor (0 * 16)(%rcx), %ymm9, %ymm0;
vpxor (2 * 16)(%rcx), %ymm10, %ymm1;
vpxor (4 * 16)(%rcx), %ymm11, %ymm2;
vpxor (6 * 16)(%rcx), %ymm8, %ymm3;
vmovdqa %ymm9, %ymm5;
vmovdqa %ymm10, %ymm6;
vmovdqa %ymm11, %ymm7;
leaq (8 * 16)(%rcx), %rcx;
jmp .Lxts_crypt_blk8_loop;
.align 8
.Lxts_crypt_blk8_tail:
testl %eax, %eax;
jz .Lxts_dec_tail_blk8;
/* AES rounds */
XOR4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (1 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (2 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (3 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (4 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (5 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (6 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (7 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (8 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (9 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (10 * 16)(%rdi), %ymm4;
cmpl $12, %r9d;
jb .Lxts_enc_blk8_tail_last;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (11 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (12 * 16)(%rdi), %ymm4;
jz .Lxts_enc_blk8_tail_last;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (13 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (14 * 16)(%rdi), %ymm4;
/* Last round and output handling. */
.Lxts_enc_blk8_tail_last:
vpxor %ymm4, %ymm5, %ymm5; /* Xor tweak to last round key. */
vpxor %ymm4, %ymm6, %ymm6;
vpxor %ymm4, %ymm7, %ymm7;
vpxor %ymm4, %ymm8, %ymm4;
vaesenclast %ymm5, %ymm0, %ymm0;
vaesenclast %ymm6, %ymm1, %ymm1;
vaesenclast %ymm7, %ymm2, %ymm2;
vaesenclast %ymm4, %ymm3, %ymm3;
vmovdqu %ymm0, (0 * 16)(%rdx);
vmovdqu %ymm1, (2 * 16)(%rdx);
vmovdqu %ymm2, (4 * 16)(%rdx);
vmovdqu %ymm3, (6 * 16)(%rdx);
leaq (8 * 16)(%rdx), %rdx;
jmp .Lxts_crypt_blk4;
.align 8
.Lxts_dec_tail_blk8:
/* AES rounds */
XOR4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (1 * 16)(%rdi), %ymm4;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (2 * 16)(%rdi), %ymm4;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (3 * 16)(%rdi), %ymm4;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (4 * 16)(%rdi), %ymm4;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (5 * 16)(%rdi), %ymm4;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (6 * 16)(%rdi), %ymm4;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (7 * 16)(%rdi), %ymm4;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (8 * 16)(%rdi), %ymm4;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (9 * 16)(%rdi), %ymm4;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (10 * 16)(%rdi), %ymm4;
cmpl $12, %r9d;
jb .Lxts_dec_blk8_tail_last;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (11 * 16)(%rdi), %ymm4;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (12 * 16)(%rdi), %ymm4;
jz .Lxts_dec_blk8_tail_last;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (13 * 16)(%rdi), %ymm4;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (14 * 16)(%rdi), %ymm4;
/* Last round and output handling. */
.Lxts_dec_blk8_tail_last:
vpxor %ymm4, %ymm5, %ymm5; /* Xor tweak to last round key. */
vpxor %ymm4, %ymm6, %ymm6;
vpxor %ymm4, %ymm7, %ymm7;
vpxor %ymm4, %ymm8, %ymm4;
vaesdeclast %ymm5, %ymm0, %ymm0;
vaesdeclast %ymm6, %ymm1, %ymm1;
vaesdeclast %ymm7, %ymm2, %ymm2;
vaesdeclast %ymm4, %ymm3, %ymm3;
vmovdqu %ymm0, (0 * 16)(%rdx);
vmovdqu %ymm1, (2 * 16)(%rdx);
vmovdqu %ymm2, (4 * 16)(%rdx);
vmovdqu %ymm3, (6 * 16)(%rdx);
leaq (8 * 16)(%rdx), %rdx;
/* Handle trailing four blocks. */
.align 8
.Lxts_crypt_blk4:
/* Try to exit early, as the input length is typically a large power of 2. */
- cmpq $0, %r8;
+ cmpq $1, %r8;
jb .Ldone_xts_crypt;
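/* %r8 (nblocks) is unsigned, so a 'jb' after comparing against zero can
* never be taken; comparing against one makes this early exit trigger
* when no blocks remain. */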
cmpq $4, %r8;
jb .Lxts_crypt_blk1;
leaq -4(%r8), %r8;
vmovdqa %ymm15, %ymm5;
tweak_clmul(2, %ymm6, %ymm15, %ymm13, %ymm0, %ymm1);
tweak_clmul(4, %ymm15, %ymm15, %ymm13, %ymm0, %ymm1);
vpshufb .Lxts_high_bit_shuf rRIP, %ymm15, %ymm13;
vbroadcasti128 (0 * 16)(%rdi), %ymm4;
vpxor (0 * 16)(%rcx), %ymm5, %ymm0;
vpxor (2 * 16)(%rcx), %ymm6, %ymm1;
leaq (4 * 16)(%rcx), %rcx;
testl %eax, %eax;
jz .Lxts_dec_blk4;
/* AES rounds */
XOR2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (1 * 16)(%rdi), %ymm4;
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (2 * 16)(%rdi), %ymm4;
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (3 * 16)(%rdi), %ymm4;
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (4 * 16)(%rdi), %ymm4;
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (5 * 16)(%rdi), %ymm4;
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (6 * 16)(%rdi), %ymm4;
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (7 * 16)(%rdi), %ymm4;
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (8 * 16)(%rdi), %ymm4;
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (9 * 16)(%rdi), %ymm4;
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (10 * 16)(%rdi), %ymm4;
cmpl $12, %r9d;
jb .Lxts_enc_blk4_last;
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (11 * 16)(%rdi), %ymm4;
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (12 * 16)(%rdi), %ymm4;
jz .Lxts_enc_blk4_last;
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (13 * 16)(%rdi), %ymm4;
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (14 * 16)(%rdi), %ymm4;
/* Last round and output handling. */
.Lxts_enc_blk4_last:
vpxor %ymm4, %ymm5, %ymm5; /* Xor tweak to last round key. */
vpxor %ymm4, %ymm6, %ymm6;
vaesenclast %ymm5, %ymm0, %ymm0;
vaesenclast %ymm6, %ymm1, %ymm1;
vmovdqu %ymm0, (0 * 16)(%rdx);
vmovdqu %ymm1, (2 * 16)(%rdx);
leaq (4 * 16)(%rdx), %rdx;
jmp .Lxts_crypt_blk1;
.align 8
.Lxts_dec_blk4:
/* AES rounds */
XOR2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (1 * 16)(%rdi), %ymm4;
VAESDEC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (2 * 16)(%rdi), %ymm4;
VAESDEC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (3 * 16)(%rdi), %ymm4;
VAESDEC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (4 * 16)(%rdi), %ymm4;
VAESDEC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (5 * 16)(%rdi), %ymm4;
VAESDEC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (6 * 16)(%rdi), %ymm4;
VAESDEC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (7 * 16)(%rdi), %ymm4;
VAESDEC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (8 * 16)(%rdi), %ymm4;
VAESDEC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (9 * 16)(%rdi), %ymm4;
VAESDEC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (10 * 16)(%rdi), %ymm4;
cmpl $12, %r9d;
jb .Lxts_dec_blk4_last;
VAESDEC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (11 * 16)(%rdi), %ymm4;
VAESDEC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (12 * 16)(%rdi), %ymm4;
jz .Lxts_dec_blk4_last;
VAESDEC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (13 * 16)(%rdi), %ymm4;
VAESDEC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (14 * 16)(%rdi), %ymm4;
/* Last round and output handling. */
.Lxts_dec_blk4_last:
vpxor %ymm4, %ymm5, %ymm5; /* Xor tweak to last round key. */
vpxor %ymm4, %ymm6, %ymm6;
vaesdeclast %ymm5, %ymm0, %ymm0;
vaesdeclast %ymm6, %ymm1, %ymm1;
vmovdqu %ymm0, (0 * 16)(%rdx);
vmovdqu %ymm1, (2 * 16)(%rdx);
leaq (4 * 16)(%rdx), %rdx;
/* Process trailing one to three blocks, one per loop. */
.align 8
.Lxts_crypt_blk1:
cmpq $1, %r8;
jb .Ldone_xts_crypt;
leaq -1(%r8), %r8;
vpxor (%rcx), %xmm15, %xmm0;
vmovdqa %xmm15, %xmm5;
tweak_clmul(1, %xmm15, %xmm15, %xmm13, %xmm2, %xmm3);
vpshufb .Lxts_high_bit_shuf rRIP, %xmm15, %xmm13;
leaq 16(%rcx), %rcx;
testl %eax, %eax;
jz .Lxts_dec_blk1;
/* AES rounds. */
vpxor (0 * 16)(%rdi), %xmm0, %xmm0;
vaesenc (1 * 16)(%rdi), %xmm0, %xmm0;
vaesenc (2 * 16)(%rdi), %xmm0, %xmm0;
vaesenc (3 * 16)(%rdi), %xmm0, %xmm0;
vaesenc (4 * 16)(%rdi), %xmm0, %xmm0;
vaesenc (5 * 16)(%rdi), %xmm0, %xmm0;
vaesenc (6 * 16)(%rdi), %xmm0, %xmm0;
vaesenc (7 * 16)(%rdi), %xmm0, %xmm0;
vaesenc (8 * 16)(%rdi), %xmm0, %xmm0;
vaesenc (9 * 16)(%rdi), %xmm0, %xmm0;
vmovdqa (10 * 16)(%rdi), %xmm1;
cmpl $12, %r9d;
jb .Lxts_enc_blk1_last;
vaesenc %xmm1, %xmm0, %xmm0;
vaesenc (11 * 16)(%rdi), %xmm0, %xmm0;
vmovdqa (12 * 16)(%rdi), %xmm1;
jz .Lxts_enc_blk1_last;
vaesenc %xmm1, %xmm0, %xmm0;
vaesenc (13 * 16)(%rdi), %xmm0, %xmm0;
vmovdqa (14 * 16)(%rdi), %xmm1;
/* Last round and output handling. */
.Lxts_enc_blk1_last:
vpxor %xmm1, %xmm5, %xmm5; /* Xor tweak to last round key. */
vaesenclast %xmm5, %xmm0, %xmm0;
vmovdqu %xmm0, (%rdx);
leaq 16(%rdx), %rdx;
jmp .Lxts_crypt_blk1;
.align 8
.Lxts_dec_blk1:
/* AES rounds. */
vpxor (0 * 16)(%rdi), %xmm0, %xmm0;
vaesdec (1 * 16)(%rdi), %xmm0, %xmm0;
vaesdec (2 * 16)(%rdi), %xmm0, %xmm0;
vaesdec (3 * 16)(%rdi), %xmm0, %xmm0;
vaesdec (4 * 16)(%rdi), %xmm0, %xmm0;
vaesdec (5 * 16)(%rdi), %xmm0, %xmm0;
vaesdec (6 * 16)(%rdi), %xmm0, %xmm0;
vaesdec (7 * 16)(%rdi), %xmm0, %xmm0;
vaesdec (8 * 16)(%rdi), %xmm0, %xmm0;
vaesdec (9 * 16)(%rdi), %xmm0, %xmm0;
vmovdqa (10 * 16)(%rdi), %xmm1;
cmpl $12, %r9d;
jb .Lxts_dec_blk1_last;
vaesdec %xmm1, %xmm0, %xmm0;
vaesdec (11 * 16)(%rdi), %xmm0, %xmm0;
vmovdqa (12 * 16)(%rdi), %xmm1;
jz .Lxts_dec_blk1_last;
vaesdec %xmm1, %xmm0, %xmm0;
vaesdec (13 * 16)(%rdi), %xmm0, %xmm0;
vmovdqa (14 * 16)(%rdi), %xmm1;
/* Last round and output handling. */
.Lxts_dec_blk1_last:
vpxor %xmm1, %xmm5, %xmm5; /* Xor tweak to last round key. */
vaesdeclast %xmm5, %xmm0, %xmm0;
vmovdqu %xmm0, (%rdx);
leaq 16(%rdx), %rdx;
jmp .Lxts_crypt_blk1;
.align 8
.Ldone_xts_crypt:
/* Store IV. */
vmovdqu %xmm15, (%rsi);
vzeroall;
xorl %eax, %eax
ret_spec_stop
CFI_ENDPROC();
ELF(.size _gcry_vaes_avx2_xts_crypt_amd64,.-_gcry_vaes_avx2_xts_crypt_amd64)
/**********************************************************************
ECB-mode encryption
**********************************************************************/
ELF(.type _gcry_vaes_avx2_ecb_crypt_amd64,@function)
.globl _gcry_vaes_avx2_ecb_crypt_amd64
.align 16
_gcry_vaes_avx2_ecb_crypt_amd64:
/* input:
* %rdi: round keys
* %esi: encrypt
* %rdx: dst
* %rcx: src
* %r8: nblocks
* %r9: nrounds
*/
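/* As with the XTS entry point, this layout matches a C-level call of
* roughly the form (a sketch, not the authoritative prototype):
*
*   void _gcry_vaes_avx2_ecb_crypt_amd64(const void *round_keys,
*                                        int encrypt,
*                                        void *dst, const void *src,
*                                        size_t nblocks,
*                                        unsigned int nrounds);
*/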
CFI_STARTPROC();
/* Process 16 blocks per loop. */
.align 8
.Lecb_blk16:
cmpq $16, %r8;
jb .Lecb_blk8;
leaq -16(%r8), %r8;
/* Load input and xor first key. */
vbroadcasti128 (0 * 16)(%rdi), %ymm8;
vmovdqu (0 * 16)(%rcx), %ymm0;
vmovdqu (2 * 16)(%rcx), %ymm1;
vmovdqu (4 * 16)(%rcx), %ymm2;
vmovdqu (6 * 16)(%rcx), %ymm3;
vmovdqu (8 * 16)(%rcx), %ymm4;
vmovdqu (10 * 16)(%rcx), %ymm5;
vmovdqu (12 * 16)(%rcx), %ymm6;
vmovdqu (14 * 16)(%rcx), %ymm7;
vpxor %ymm8, %ymm0, %ymm0;
vpxor %ymm8, %ymm1, %ymm1;
vpxor %ymm8, %ymm2, %ymm2;
vpxor %ymm8, %ymm3, %ymm3;
vpxor %ymm8, %ymm4, %ymm4;
vpxor %ymm8, %ymm5, %ymm5;
vpxor %ymm8, %ymm6, %ymm6;
vpxor %ymm8, %ymm7, %ymm7;
vbroadcasti128 (1 * 16)(%rdi), %ymm8;
leaq (16 * 16)(%rcx), %rcx;
testl %esi, %esi;
jz .Lecb_dec_blk16;
/* AES rounds */
VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (2 * 16)(%rdi), %ymm8;
VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (3 * 16)(%rdi), %ymm8;
VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (4 * 16)(%rdi), %ymm8;
VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (5 * 16)(%rdi), %ymm8;
VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (6 * 16)(%rdi), %ymm8;
VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (7 * 16)(%rdi), %ymm8;
VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (8 * 16)(%rdi), %ymm8;
VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (9 * 16)(%rdi), %ymm8;
VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (10 * 16)(%rdi), %ymm8;
cmpl $12, %r9d;
jb .Lecb_enc_blk16_last;
VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (11 * 16)(%rdi), %ymm8;
VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (12 * 16)(%rdi), %ymm8;
jz .Lecb_enc_blk16_last;
VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (13 * 16)(%rdi), %ymm8;
VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (14 * 16)(%rdi), %ymm8;
.Lecb_enc_blk16_last:
vaesenclast %ymm8, %ymm0, %ymm0;
vaesenclast %ymm8, %ymm1, %ymm1;
vaesenclast %ymm8, %ymm2, %ymm2;
vaesenclast %ymm8, %ymm3, %ymm3;
vaesenclast %ymm8, %ymm4, %ymm4;
vaesenclast %ymm8, %ymm5, %ymm5;
vaesenclast %ymm8, %ymm6, %ymm6;
vaesenclast %ymm8, %ymm7, %ymm7;
jmp .Lecb_blk16_end;
.align 8
.Lecb_dec_blk16:
/* AES rounds */
VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (2 * 16)(%rdi), %ymm8;
VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (3 * 16)(%rdi), %ymm8;
VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (4 * 16)(%rdi), %ymm8;
VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (5 * 16)(%rdi), %ymm8;
VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (6 * 16)(%rdi), %ymm8;
VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (7 * 16)(%rdi), %ymm8;
VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (8 * 16)(%rdi), %ymm8;
VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (9 * 16)(%rdi), %ymm8;
VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (10 * 16)(%rdi), %ymm8;
cmpl $12, %r9d;
jb .Lecb_dec_blk16_last;
VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (11 * 16)(%rdi), %ymm8;
VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (12 * 16)(%rdi), %ymm8;
jz .Lecb_dec_blk16_last;
VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (13 * 16)(%rdi), %ymm8;
VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
vbroadcasti128 (14 * 16)(%rdi), %ymm8;
.Lecb_dec_blk16_last:
vaesdeclast %ymm8, %ymm0, %ymm0;
vaesdeclast %ymm8, %ymm1, %ymm1;
vaesdeclast %ymm8, %ymm2, %ymm2;
vaesdeclast %ymm8, %ymm3, %ymm3;
vaesdeclast %ymm8, %ymm4, %ymm4;
vaesdeclast %ymm8, %ymm5, %ymm5;
vaesdeclast %ymm8, %ymm6, %ymm6;
vaesdeclast %ymm8, %ymm7, %ymm7;
jmp .Lecb_blk16_end;
.align 8
.Lecb_blk16_end:
vmovdqu %ymm0, (0 * 16)(%rdx);
vmovdqu %ymm1, (2 * 16)(%rdx);
vmovdqu %ymm2, (4 * 16)(%rdx);
vmovdqu %ymm3, (6 * 16)(%rdx);
vmovdqu %ymm4, (8 * 16)(%rdx);
vmovdqu %ymm5, (10 * 16)(%rdx);
vmovdqu %ymm6, (12 * 16)(%rdx);
vmovdqu %ymm7, (14 * 16)(%rdx);
leaq (16 * 16)(%rdx), %rdx;
jmp .Lecb_blk16;
/* Handle trailing eight blocks. */
.align 8
.Lecb_blk8:
cmpq $8, %r8;
jb .Lecb_blk4;
leaq -8(%r8), %r8;
/* Load input and xor first key. */
vbroadcasti128 (0 * 16)(%rdi), %ymm4;
vmovdqu (0 * 16)(%rcx), %ymm0;
vmovdqu (2 * 16)(%rcx), %ymm1;
vmovdqu (4 * 16)(%rcx), %ymm2;
vmovdqu (6 * 16)(%rcx), %ymm3;
vpxor %ymm4, %ymm0, %ymm0;
vpxor %ymm4, %ymm1, %ymm1;
vpxor %ymm4, %ymm2, %ymm2;
vpxor %ymm4, %ymm3, %ymm3;
vbroadcasti128 (1 * 16)(%rdi), %ymm4;
leaq (8 * 16)(%rcx), %rcx;
testl %esi, %esi;
jz .Lecb_dec_blk8;
/* AES rounds */
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (2 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (3 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (4 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (5 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (6 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (7 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (8 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (9 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (10 * 16)(%rdi), %ymm4;
cmpl $12, %r9d;
jb .Lecb_enc_blk8_last;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (11 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (12 * 16)(%rdi), %ymm4;
jz .Lecb_enc_blk8_last;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (13 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (14 * 16)(%rdi), %ymm4;
.Lecb_enc_blk8_last:
vaesenclast %ymm4, %ymm0, %ymm0;
vaesenclast %ymm4, %ymm1, %ymm1;
vaesenclast %ymm4, %ymm2, %ymm2;
vaesenclast %ymm4, %ymm3, %ymm3;
vmovdqu %ymm0, (0 * 16)(%rdx);
vmovdqu %ymm1, (2 * 16)(%rdx);
vmovdqu %ymm2, (4 * 16)(%rdx);
vmovdqu %ymm3, (6 * 16)(%rdx);
leaq (8 * 16)(%rdx), %rdx;
jmp .Lecb_blk4;
.align 8
.Lecb_dec_blk8:
/* AES rounds */
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (2 * 16)(%rdi), %ymm4;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (3 * 16)(%rdi), %ymm4;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (4 * 16)(%rdi), %ymm4;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (5 * 16)(%rdi), %ymm4;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (6 * 16)(%rdi), %ymm4;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (7 * 16)(%rdi), %ymm4;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (8 * 16)(%rdi), %ymm4;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (9 * 16)(%rdi), %ymm4;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (10 * 16)(%rdi), %ymm4;
cmpl $12, %r9d;
jb .Lecb_dec_blk8_last;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (11 * 16)(%rdi), %ymm4;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (12 * 16)(%rdi), %ymm4;
jz .Lecb_dec_blk8_last;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (13 * 16)(%rdi), %ymm4;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
vbroadcasti128 (14 * 16)(%rdi), %ymm4;
.Lecb_dec_blk8_last:
vaesdeclast %ymm4, %ymm0, %ymm0;
vaesdeclast %ymm4, %ymm1, %ymm1;
vaesdeclast %ymm4, %ymm2, %ymm2;
vaesdeclast %ymm4, %ymm3, %ymm3;
vmovdqu %ymm0, (0 * 16)(%rdx);
vmovdqu %ymm1, (2 * 16)(%rdx);
vmovdqu %ymm2, (4 * 16)(%rdx);
vmovdqu %ymm3, (6 * 16)(%rdx);
leaq (8 * 16)(%rdx), %rdx;
/* Handle trailing four blocks. */
.align 8
.Lecb_blk4:
cmpq $4, %r8;
jb .Lecb_blk1;
leaq -4(%r8), %r8;
/* Load input and xor first key. */
vbroadcasti128 (0 * 16)(%rdi), %ymm4;
vmovdqu (0 * 16)(%rcx), %ymm0;
vmovdqu (2 * 16)(%rcx), %ymm1;
vpxor %ymm4, %ymm0, %ymm0;
vpxor %ymm4, %ymm1, %ymm1;
vbroadcasti128 (1 * 16)(%rdi), %ymm4;
leaq (4 * 16)(%rcx), %rcx;
testl %esi, %esi;
jz .Lecb_dec_blk4;
/* AES rounds */
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (2 * 16)(%rdi), %ymm4;
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (3 * 16)(%rdi), %ymm4;
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (4 * 16)(%rdi), %ymm4;
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (5 * 16)(%rdi), %ymm4;
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (6 * 16)(%rdi), %ymm4;
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (7 * 16)(%rdi), %ymm4;
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (8 * 16)(%rdi), %ymm4;
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (9 * 16)(%rdi), %ymm4;
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (10 * 16)(%rdi), %ymm4;
cmpl $12, %r9d;
jb .Lecb_enc_blk4_last;
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (11 * 16)(%rdi), %ymm4;
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (12 * 16)(%rdi), %ymm4;
jz .Lecb_enc_blk4_last;
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (13 * 16)(%rdi), %ymm4;
VAESENC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (14 * 16)(%rdi), %ymm4;
.Lecb_enc_blk4_last:
vaesenclast %ymm4, %ymm0, %ymm0;
vaesenclast %ymm4, %ymm1, %ymm1;
vmovdqu %ymm0, (0 * 16)(%rdx);
vmovdqu %ymm1, (2 * 16)(%rdx);
leaq (4 * 16)(%rdx), %rdx;
jmp .Lecb_blk1;
.align 8
.Lecb_dec_blk4:
/* AES rounds */
VAESDEC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (2 * 16)(%rdi), %ymm4;
VAESDEC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (3 * 16)(%rdi), %ymm4;
VAESDEC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (4 * 16)(%rdi), %ymm4;
VAESDEC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (5 * 16)(%rdi), %ymm4;
VAESDEC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (6 * 16)(%rdi), %ymm4;
VAESDEC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (7 * 16)(%rdi), %ymm4;
VAESDEC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (8 * 16)(%rdi), %ymm4;
VAESDEC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (9 * 16)(%rdi), %ymm4;
VAESDEC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (10 * 16)(%rdi), %ymm4;
cmpl $12, %r9d;
jb .Lecb_dec_blk4_last;
VAESDEC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (11 * 16)(%rdi), %ymm4;
VAESDEC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (12 * 16)(%rdi), %ymm4;
jz .Lecb_dec_blk4_last;
VAESDEC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (13 * 16)(%rdi), %ymm4;
VAESDEC2(%ymm4, %ymm0, %ymm1);
vbroadcasti128 (14 * 16)(%rdi), %ymm4;
.Lecb_dec_blk4_last:
vaesdeclast %ymm4, %ymm0, %ymm0;
vaesdeclast %ymm4, %ymm1, %ymm1;
vmovdqu %ymm0, (0 * 16)(%rdx);
vmovdqu %ymm1, (2 * 16)(%rdx);
leaq (4 * 16)(%rdx), %rdx;
/* Process trailing one to three blocks, one per loop. */
.align 8
.Lecb_blk1:
cmpq $1, %r8;
jb .Ldone_ecb;
leaq -1(%r8), %r8;
/* Load input. */
vmovdqu (%rcx), %xmm2;
leaq 16(%rcx), %rcx;
/* Xor first key. */
vpxor (0 * 16)(%rdi), %xmm2, %xmm0;
testl %esi, %esi;
jz .Lecb_dec_blk1;
/* AES rounds. */
vaesenc (1 * 16)(%rdi), %xmm0, %xmm0;
vaesenc (2 * 16)(%rdi), %xmm0, %xmm0;
vaesenc (3 * 16)(%rdi), %xmm0, %xmm0;
vaesenc (4 * 16)(%rdi), %xmm0, %xmm0;
vaesenc (5 * 16)(%rdi), %xmm0, %xmm0;
vaesenc (6 * 16)(%rdi), %xmm0, %xmm0;
vaesenc (7 * 16)(%rdi), %xmm0, %xmm0;
vaesenc (8 * 16)(%rdi), %xmm0, %xmm0;
vaesenc (9 * 16)(%rdi), %xmm0, %xmm0;
vmovdqa (10 * 16)(%rdi), %xmm1;
cmpl $12, %r9d;
jb .Lecb_enc_blk1_last;
vaesenc %xmm1, %xmm0, %xmm0;
vaesenc (11 * 16)(%rdi), %xmm0, %xmm0;
vmovdqa (12 * 16)(%rdi), %xmm1;
jz .Lecb_enc_blk1_last;
vaesenc %xmm1, %xmm0, %xmm0;
vaesenc (13 * 16)(%rdi), %xmm0, %xmm0;
vmovdqa (14 * 16)(%rdi), %xmm1;
.Lecb_enc_blk1_last:
vaesenclast %xmm1, %xmm0, %xmm0;
jmp .Lecb_blk1_end;
.align 8
.Lecb_dec_blk1:
/* AES rounds. */
vaesdec (1 * 16)(%rdi), %xmm0, %xmm0;
vaesdec (2 * 16)(%rdi), %xmm0, %xmm0;
vaesdec (3 * 16)(%rdi), %xmm0, %xmm0;
vaesdec (4 * 16)(%rdi), %xmm0, %xmm0;
vaesdec (5 * 16)(%rdi), %xmm0, %xmm0;
vaesdec (6 * 16)(%rdi), %xmm0, %xmm0;
vaesdec (7 * 16)(%rdi), %xmm0, %xmm0;
vaesdec (8 * 16)(%rdi), %xmm0, %xmm0;
vaesdec (9 * 16)(%rdi), %xmm0, %xmm0;
vmovdqa (10 * 16)(%rdi), %xmm1;
cmpl $12, %r9d;
jb .Lecb_dec_blk1_last;
vaesdec %xmm1, %xmm0, %xmm0;
vaesdec (11 * 16)(%rdi), %xmm0, %xmm0;
vmovdqa (12 * 16)(%rdi), %xmm1;
jz .Lecb_dec_blk1_last;
vaesdec %xmm1, %xmm0, %xmm0;
vaesdec (13 * 16)(%rdi), %xmm0, %xmm0;
vmovdqa (14 * 16)(%rdi), %xmm1;
.Lecb_dec_blk1_last:
vaesdeclast %xmm1, %xmm0, %xmm0;
jmp .Lecb_blk1_end;
.align 8
.Lecb_blk1_end:
vmovdqu %xmm0, (%rdx);
leaq 16(%rdx), %rdx;
jmp .Lecb_blk1;
.align 8
.Ldone_ecb:
vzeroall;
ret_spec_stop
CFI_ENDPROC();
ELF(.size _gcry_vaes_avx2_ecb_crypt_amd64,.-_gcry_vaes_avx2_ecb_crypt_amd64)
/**********************************************************************
constants
**********************************************************************/
SECTION_RODATA
ELF(.type _gcry_vaes_consts,@object)
_gcry_vaes_consts:
.align 32
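/* Counter-increment constants: .Lbige_addb_<N> adds N to the last byte of
* a big-endian counter block (vpaddb fast path), while .Lle_addd_<N> adds
* N to the first 32-bit word of a little-endian counter (vpaddd). */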
.Lbige_addb_0:
.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
.Lbige_addb_1:
.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1
.Lbige_addb_2:
.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2
.Lbige_addb_3:
.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3
.Lbige_addb_4:
.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4
.Lbige_addb_5:
.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5
.Lbige_addb_6:
.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6
.Lbige_addb_7:
.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7
.Lbige_addb_8:
.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8
.Lbige_addb_9:
.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9
.Lbige_addb_10:
.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10
.Lbige_addb_11:
.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11
.Lbige_addb_12:
.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12
.Lbige_addb_13:
.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13
.Lbige_addb_14:
.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14
.Lbige_addb_15:
.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15
.Lle_addd_0:
.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
.Lle_addd_1:
.byte 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
.Lle_addd_2:
.byte 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
.Lle_addd_3:
.byte 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
.Lle_addd_4:
.byte 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
.Lle_addd_5:
.byte 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
.Lle_addd_6:
.byte 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
.Lle_addd_7:
.byte 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
.Lle_addd_8:
.byte 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
.Lle_addd_9:
.byte 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
.Lle_addd_10:
.byte 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
.Lle_addd_11:
.byte 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
.Lle_addd_12:
.byte 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
.Lle_addd_13:
.byte 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
.Lle_addd_14:
.byte 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
.Lle_addd_15:
.byte 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
.Lle_addd_4_2:
.byte 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
.byte 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
.Lle_addd_8_2:
.byte 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
.byte 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
.Lle_addd_16_2:
.byte 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
.byte 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
.Lxts_gfmul_clmul:
.long 0x00, 0x87, 0x00, 0x00
.long 0x00, 0x87, 0x00, 0x00
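/* 0x87 encodes x^7 + x^2 + x + 1, the low terms of the GF(2^128)
* reduction polynomial x^128 + x^7 + x^2 + x + 1 used for the XTS tweak. */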
.Lxts_high_bit_shuf:
.byte -1, -1, -1, -1, 12, 13, 14, 15
.byte 4, 5, 6, 7, -1, -1, -1, -1
.byte -1, -1, -1, -1, 12, 13, 14, 15
.byte 4, 5, 6, 7, -1, -1, -1, -1
.Lbswap128_mask:
.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
ELF(.size _gcry_vaes_consts,.-_gcry_vaes_consts)
#endif /* HAVE_GCC_INLINE_ASM_VAES_VPCLMUL */
#endif /* __x86_64__ */
