diff --git a/cipher/arcfour-amd64.S b/cipher/arcfour-amd64.S
index 221dfeff..2abd90a7 100644
--- a/cipher/arcfour-amd64.S
+++ b/cipher/arcfour-amd64.S
@@ -1,108 +1,108 @@
 /*
 ** RC4 implementation optimized for AMD64.
 **
 ** Author: Marc Bevand
 ** Licence: I hereby disclaim the copyright on this code and place it
 ** in the public domain.
 **
 ** The throughput achieved by this code is about 320 MBytes/sec, on
 ** a 1.8 GHz AMD Opteron (rev C0) processor.
 **
 ** 2013/12/20 :
 **  - Integrated to libgcrypt
 **  - 4.18 cycles/byte on Intel i5-4570
 */

 #ifdef __x86_64__
 #include <config.h>
 #if defined(USE_ARCFOUR) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))

 #include "asm-common-amd64.h"

 .text
 .align 16
 .globl _gcry_arcfour_amd64
 ELF(.type _gcry_arcfour_amd64,@function)
 _gcry_arcfour_amd64:
 	CFI_STARTPROC()
 	ENTER_SYSV_FUNC_PARAMS_0_4
 	push	%rbp
 	CFI_PUSH(%rbp)
 	push	%rbx
 	CFI_PUSH(%rbx)
 	mov	%rdi,		%rbp	# key = ARG(key)
 	mov	%rsi,		%rbx	# rbx = ARG(len)
 	mov	%rdx,		%rsi	# in = ARG(in)
 	mov	%rcx,		%rdi	# out = ARG(out)
 	mov	(4*256)(%rbp),	%ecx	# x = key->x
 	mov	(4*256+4)(%rbp),%edx	# y = key->y
 	inc	%rcx			# x++
 	and	$255,		%rcx	# x &= 0xff
 	lea	-8(%rbx,%rsi),	%rbx	# rbx = in+len-8
 	mov	%rbx,		%r9	# tmp = in+len-8
 	mov	(%rbp,%rcx,4),	%eax	# tx = d[x]
 	cmp	%rsi,		%rbx	# cmp in with in+len-8
 	jl	.Lend			# jump if (in+len-8 < in)

 .Lstart:
 	add	$8,		%rsi		# increment in
 	add	$8,		%rdi		# increment out

 	# generate the next 8 bytes of the rc4 stream into %r8
 	mov	$8,		%r11		# byte counter
 1:	add	%al,		%dl		# y += tx
 	mov	(%rbp,%rdx,4),	%ebx		# ty = d[y]
 	mov	%ebx,		(%rbp,%rcx,4)	# d[x] = ty
 	add	%al,		%bl		# val = ty + tx
 	mov	%eax,		(%rbp,%rdx,4)	# d[y] = tx
 	inc	%cl				# x++ (NEXT ROUND)
 	mov	(%rbp,%rcx,4),	%eax		# tx = d[x] (NEXT ROUND)
 	shl	$8,		%r8
 	movb	(%rbp,%rbx,4),	%r8b		# val = d[val]
 	dec	%r11b
 	jnz	1b

 	# xor 8 bytes
 	bswap	%r8
 	xor	-8(%rsi),	%r8
 	cmp	%r9,		%rsi		# cmp in+len-8 with in
 	mov	%r8,		-8(%rdi)
 	jle	.Lstart				# jump if (in <= in+len-8)

 .Lend:
 	add	$8,		%r9		# tmp = in+len

 	# handle the last bytes, one by one
 1:	cmp	%rsi,		%r9		# cmp in with in+len
 	jle	.Lfinished			# jump if (in+len <= in)
 	add	%al,		%dl		# y += tx
 	mov	(%rbp,%rdx,4),	%ebx		# ty = d[y]
 	mov	%ebx,		(%rbp,%rcx,4)	# d[x] = ty
 	add	%al,		%bl		# val = ty + tx
 	mov	%eax,		(%rbp,%rdx,4)	# d[y] = tx
 	inc	%cl				# x++ (NEXT ROUND)
 	mov	(%rbp,%rcx,4),	%eax		# tx = d[x] (NEXT ROUND)
 	movb	(%rbp,%rbx,4),	%r8b		# val = d[val]
 	xor	(%rsi),		%r8b		# xor 1 byte
 	movb	%r8b,		(%rdi)
 	inc	%rsi				# in++
 	inc	%rdi				# out++
 	jmp	1b

 .Lfinished:
 	dec	%rcx			# x--
 	movb	%cl,	(4*256)(%rbp)	# key->y = y
 	movb	%dl,	(4*256+4)(%rbp)	# key->x = x

 	pop	%rbx
 	CFI_POP(%rbx)
 	pop	%rbp
 	CFI_POP(%rbp)
 	EXIT_SYSV_FUNC
-	ret
+	ret_spec_stop
 	CFI_ENDPROC()
 .L__gcry_arcfour_amd64_end:
 ELF(.size _gcry_arcfour_amd64,.L__gcry_arcfour_amd64_end-_gcry_arcfour_amd64)

 #endif
 #endif
diff --git a/cipher/asm-common-amd64.h b/cipher/asm-common-amd64.h
index 9d4a028a..97912b1b 100644
--- a/cipher/asm-common-amd64.h
+++ b/cipher/asm-common-amd64.h
@@ -1,189 +1,193 @@
 /* asm-common-amd64.h - Common macros for AMD64 assembly
  *
  * Copyright (C) 2018 Jussi Kivilinna
  *
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
  * Libgcrypt is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, see <http://www.gnu.org/licenses/>.
 */

 #ifndef GCRY_ASM_COMMON_AMD64_H
 #define GCRY_ASM_COMMON_AMD64_H

 #include <config.h>

 #ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS
 # define ELF(...) __VA_ARGS__
 #else
 # define ELF(...) /*_*/
 #endif

 #ifdef __PIC__
 # define rRIP (%rip)
 #else
 # define rRIP
 #endif

 #ifdef __PIC__
 # define RIP %rip
 #else
 # define RIP
 #endif

 #ifdef __PIC__
 # define ADD_RIP +rip
 #else
 # define ADD_RIP
 #endif

 #if defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS) || !defined(__PIC__)
 # define GET_EXTERN_POINTER(name, reg) movabsq $name, reg
 #else
 # ifdef __code_model_large__
 #  define GET_EXTERN_POINTER(name, reg) \
	pushq %r15; \
	pushq %r14; \
	1: leaq 1b(%rip), reg; \
	movabsq $_GLOBAL_OFFSET_TABLE_-1b, %r14; \
	movabsq $name@GOT, %r15; \
	addq %r14, reg; \
	popq %r14; \
	movq (reg, %r15), reg; \
	popq %r15;
 # else
 #  define GET_EXTERN_POINTER(name, reg) movq name@GOTPCREL(%rip), reg
 # endif
 #endif

 #ifdef HAVE_GCC_ASM_CFI_DIRECTIVES
 /* CFI directives to emit DWARF stack unwinding information. */
 # define CFI_STARTPROC() .cfi_startproc
 # define CFI_ENDPROC() .cfi_endproc
 # define CFI_REMEMBER_STATE() .cfi_remember_state
 # define CFI_RESTORE_STATE() .cfi_restore_state
 # define CFI_ADJUST_CFA_OFFSET(off) .cfi_adjust_cfa_offset off
 # define CFI_REL_OFFSET(reg,off) .cfi_rel_offset reg, off
 # define CFI_DEF_CFA_REGISTER(reg) .cfi_def_cfa_register reg
 # define CFI_REGISTER(ro,rn) .cfi_register ro, rn
 # define CFI_RESTORE(reg) .cfi_restore reg

 # define CFI_PUSH(reg) \
	CFI_ADJUST_CFA_OFFSET(8); CFI_REL_OFFSET(reg, 0)
 # define CFI_POP(reg) \
	CFI_ADJUST_CFA_OFFSET(-8); CFI_RESTORE(reg)
 # define CFI_POP_TMP_REG() \
	CFI_ADJUST_CFA_OFFSET(-8);
 # define CFI_LEAVE() \
	CFI_ADJUST_CFA_OFFSET(-8); CFI_DEF_CFA_REGISTER(%rsp)

 /* CFA expressions are used for pointing CFA and registers to
  * %rsp relative offsets. */
 # define DW_REGNO_rax 0
 # define DW_REGNO_rdx 1
 # define DW_REGNO_rcx 2
 # define DW_REGNO_rbx 3
 # define DW_REGNO_rsi 4
 # define DW_REGNO_rdi 5
 # define DW_REGNO_rbp 6
 # define DW_REGNO_rsp 7
 # define DW_REGNO_r8 8
 # define DW_REGNO_r9 9
 # define DW_REGNO_r10 10
 # define DW_REGNO_r11 11
 # define DW_REGNO_r12 12
 # define DW_REGNO_r13 13
 # define DW_REGNO_r14 14
 # define DW_REGNO_r15 15

 # define DW_REGNO(reg) DW_REGNO_ ## reg

 /* Fixed length encoding used for integers for now. */
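The DW_SLEB128_7BIT and DW_SLEB128_28BIT macros that follow build the byte-level LEB128-style operands used inside the hand-written .cfi_escape expressions: each byte carries seven payload bits and the high bit flags a continuation byte, with the width held fixed so that the hard-coded expression lengths stay correct. As a hedged illustration only (the helper name below is made up, not libgcrypt API), the same encoding written generically in C:

/* Illustration only: generic unsigned LEB128 encoding.  The assembler
 * macros below emit the same bit pattern, just padded to a fixed
 * number of bytes. */
#include <stddef.h>
#include <stdint.h>

static size_t uleb128_encode(uint32_t value, uint8_t *out)
{
  size_t n = 0;
  do
    {
      uint8_t byte = value & 0x7f;   /* low seven payload bits */
      value >>= 7;
      if (value)
        byte |= 0x80;                /* continuation flag */
      out[n++] = byte;
    }
  while (value);
  return n;                          /* number of bytes written */
}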
 # define DW_SLEB128_7BIT(value) \
	0x00|((value) & 0x7f)
 # define DW_SLEB128_28BIT(value) \
	0x80|((value)&0x7f), \
	0x80|(((value)>>7)&0x7f), \
	0x80|(((value)>>14)&0x7f), \
	0x00|(((value)>>21)&0x7f)

 # define CFI_CFA_ON_STACK(rsp_offs,cfa_depth) \
	.cfi_escape \
	  0x0f, /* DW_CFA_def_cfa_expression */ \
	    DW_SLEB128_7BIT(11), /* length */ \
	  0x77, /* DW_OP_breg7, rsp + constant */ \
	    DW_SLEB128_28BIT(rsp_offs), \
	  0x06, /* DW_OP_deref */ \
	  0x23, /* DW_OP_plus_constu */ \
	    DW_SLEB128_28BIT((cfa_depth)+8)

 # define CFI_REG_ON_STACK(reg,rsp_offs) \
	.cfi_escape \
	  0x10, /* DW_CFA_expression */ \
	    DW_SLEB128_7BIT(DW_REGNO(reg)), \
	    DW_SLEB128_7BIT(5), /* length */ \
	  0x77, /* DW_OP_breg7, rsp + constant */ \
	    DW_SLEB128_28BIT(rsp_offs)

 #else
 # define CFI_STARTPROC()
 # define CFI_ENDPROC()
 # define CFI_REMEMBER_STATE()
 # define CFI_RESTORE_STATE()
 # define CFI_ADJUST_CFA_OFFSET(off)
 # define CFI_REL_OFFSET(reg,off)
 # define CFI_DEF_CFA_REGISTER(reg)
 # define CFI_REGISTER(ro,rn)
 # define CFI_RESTORE(reg)

 # define CFI_PUSH(reg)
 # define CFI_POP(reg)
 # define CFI_POP_TMP_REG()
 # define CFI_LEAVE()

 # define CFI_CFA_ON_STACK(rsp_offs,cfa_depth)
 # define CFI_REG_ON_STACK(reg,rsp_offs)
 #endif

 #ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
 # define ENTER_SYSV_FUNC_PARAMS_0_4 \
	pushq %rdi; \
	CFI_PUSH(%rdi); \
	pushq %rsi; \
	CFI_PUSH(%rsi); \
	movq %rcx, %rdi; \
	movq %rdx, %rsi; \
	movq %r8, %rdx; \
	movq %r9, %rcx; \

 # define ENTER_SYSV_FUNC_PARAMS_5 \
	ENTER_SYSV_FUNC_PARAMS_0_4; \
	movq 0x38(%rsp), %r8;

 # define ENTER_SYSV_FUNC_PARAMS_6 \
	ENTER_SYSV_FUNC_PARAMS_5; \
	movq 0x40(%rsp), %r9;

 # define EXIT_SYSV_FUNC \
	popq %rsi; \
	CFI_POP(%rsi); \
	popq %rdi; \
	CFI_POP(%rdi);
 #else
 # define ENTER_SYSV_FUNC_PARAMS_0_4
 # define ENTER_SYSV_FUNC_PARAMS_5
 # define ENTER_SYSV_FUNC_PARAMS_6
 # define EXIT_SYSV_FUNC
 #endif

+/* 'ret' instruction replacement for straight-line speculation mitigation */
+#define ret_spec_stop \
+	ret; int3;
+
 #endif /* GCRY_ASM_COMMON_AMD64_H */
diff --git a/cipher/blake2b-amd64-avx2.S b/cipher/blake2b-amd64-avx2.S
index 357e8a51..3601b65f 100644
--- a/cipher/blake2b-amd64-avx2.S
+++ b/cipher/blake2b-amd64-avx2.S
@@ -1,300 +1,300 @@
 /* blake2b-amd64-avx2.S - AVX2 implementation of BLAKE2b
  *
  * Copyright (C) 2018 Jussi Kivilinna
  *
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
  * Libgcrypt is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
  * License along with this program; if not, see <http://www.gnu.org/licenses/>.
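The ret_spec_stop macro added to asm-common-amd64.h above is the core of this patch: every return in these assembly files now expands to "ret; int3". The int3 is never executed architecturally, because control always transfers to the return address, but it stops processors that speculate straight past an unconditional control transfer ("straight-line speculation") from speculatively running whatever bytes happen to follow the ret. A minimal, hedged illustration in C, assuming a GCC or clang x86-64 SysV toolchain and using a made-up symbol name (demo_identity, not part of libgcrypt):

/* Stand-alone sketch: a leaf function whose epilogue uses the same
 * "ret; int3" pair that ret_spec_stop expands to. */
__asm__ (".text\n\t"
         ".globl demo_identity\n"
         "demo_identity:\n\t"
         "movq %rdi, %rax\n\t"   /* return the first argument (SysV ABI) */
         "ret\n\t"
         "int3\n");              /* speculation stop; never reached */

extern long demo_identity(long x);

#include <stdio.h>

int main(void)
{
  printf("%ld\n", demo_identity(42));   /* prints 42 */
  return 0;
}

Compilers later grew comparable hardening switches for generated code, but hand-written assembly has to spell the barrier out itself, which is what the blanket ret to ret_spec_stop replacement in the remaining files of this patch does.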
*/ /* The code is based on public-domain/CC0 BLAKE2 reference implementation * by Samual Neves, at https://github.com/BLAKE2/BLAKE2/tree/master/sse * Copyright 2012, Samuel Neves */ #ifdef __x86_64 #include #if defined(HAVE_GCC_INLINE_ASM_AVX2) && \ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) #include "asm-common-amd64.h" .text /* register macros */ #define RSTATE %rdi #define RINBLKS %rsi #define RNBLKS %rdx #define RIV %rcx /* state structure */ #define STATE_H 0 #define STATE_T (STATE_H + 8 * 8) #define STATE_F (STATE_T + 2 * 8) /* vector registers */ #define ROW1 %ymm0 #define ROW2 %ymm1 #define ROW3 %ymm2 #define ROW4 %ymm3 #define TMP1 %ymm4 #define TMP1x %xmm4 #define R16 %ymm5 #define R24 %ymm6 #define MA1 %ymm8 #define MA2 %ymm9 #define MA3 %ymm10 #define MA4 %ymm11 #define MA1x %xmm8 #define MA2x %xmm9 #define MA3x %xmm10 #define MA4x %xmm11 #define MB1 %ymm12 #define MB2 %ymm13 #define MB3 %ymm14 #define MB4 %ymm15 #define MB1x %xmm12 #define MB2x %xmm13 #define MB3x %xmm14 #define MB4x %xmm15 /********************************************************************** blake2b/AVX2 **********************************************************************/ #define GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \ s0, s1, s2, s3, s4, s5, s6, s7, s8, \ s9, s10, s11, s12, s13, s14, s15) \ vmovq (s0)*8(RINBLKS), m1x; \ vmovq (s4)*8(RINBLKS), TMP1x; \ vpinsrq $1, (s2)*8(RINBLKS), m1x, m1x; \ vpinsrq $1, (s6)*8(RINBLKS), TMP1x, TMP1x; \ vinserti128 $1, TMP1x, m1, m1; \ vmovq (s1)*8(RINBLKS), m2x; \ vmovq (s5)*8(RINBLKS), TMP1x; \ vpinsrq $1, (s3)*8(RINBLKS), m2x, m2x; \ vpinsrq $1, (s7)*8(RINBLKS), TMP1x, TMP1x; \ vinserti128 $1, TMP1x, m2, m2; \ vmovq (s8)*8(RINBLKS), m3x; \ vmovq (s12)*8(RINBLKS), TMP1x; \ vpinsrq $1, (s10)*8(RINBLKS), m3x, m3x; \ vpinsrq $1, (s14)*8(RINBLKS), TMP1x, TMP1x; \ vinserti128 $1, TMP1x, m3, m3; \ vmovq (s9)*8(RINBLKS), m4x; \ vmovq (s13)*8(RINBLKS), TMP1x; \ vpinsrq $1, (s11)*8(RINBLKS), m4x, m4x; \ vpinsrq $1, (s15)*8(RINBLKS), TMP1x, TMP1x; \ vinserti128 $1, TMP1x, m4, m4; #define LOAD_MSG_0(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \ GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) #define LOAD_MSG_1(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \ GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3) #define LOAD_MSG_2(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \ GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4) #define LOAD_MSG_3(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \ GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8) #define LOAD_MSG_4(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \ GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13) #define LOAD_MSG_5(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \ GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9) #define LOAD_MSG_6(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \ GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11) #define LOAD_MSG_7(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \ GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10) #define LOAD_MSG_8(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \ GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5) #define 
LOAD_MSG_9(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \ GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0) #define LOAD_MSG_10(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \ LOAD_MSG_0(m1, m2, m3, m4, m1x, m2x, m3x, m4x) #define LOAD_MSG_11(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \ LOAD_MSG_1(m1, m2, m3, m4, m1x, m2x, m3x, m4x) #define LOAD_MSG(r, m1, m2, m3, m4) \ LOAD_MSG_##r(m1, m2, m3, m4, m1##x, m2##x, m3##x, m4##x) #define ROR_32(in, out) vpshufd $0xb1, in, out; #define ROR_24(in, out) vpshufb R24, in, out; #define ROR_16(in, out) vpshufb R16, in, out; #define ROR_63(in, out) \ vpsrlq $63, in, TMP1; \ vpaddq in, in, out; \ vpxor TMP1, out, out; #define G(r1, r2, r3, r4, m, ROR_A, ROR_B) \ vpaddq m, r1, r1; \ vpaddq r2, r1, r1; \ vpxor r1, r4, r4; \ ROR_A(r4, r4); \ vpaddq r4, r3, r3; \ vpxor r3, r2, r2; \ ROR_B(r2, r2); #define G1(r1, r2, r3, r4, m) \ G(r1, r2, r3, r4, m, ROR_32, ROR_24); #define G2(r1, r2, r3, r4, m) \ G(r1, r2, r3, r4, m, ROR_16, ROR_63); #define MM_SHUFFLE(z,y,x,w) \ (((z) << 6) | ((y) << 4) | ((x) << 2) | (w)) #define DIAGONALIZE(r1, r2, r3, r4) \ vpermq $MM_SHUFFLE(0,3,2,1), r2, r2; \ vpermq $MM_SHUFFLE(1,0,3,2), r3, r3; \ vpermq $MM_SHUFFLE(2,1,0,3), r4, r4; #define UNDIAGONALIZE(r1, r2, r3, r4) \ vpermq $MM_SHUFFLE(2,1,0,3), r2, r2; \ vpermq $MM_SHUFFLE(1,0,3,2), r3, r3; \ vpermq $MM_SHUFFLE(0,3,2,1), r4, r4; #define ROUND(r, m1, m2, m3, m4) \ G1(ROW1, ROW2, ROW3, ROW4, m1); \ G2(ROW1, ROW2, ROW3, ROW4, m2); \ DIAGONALIZE(ROW1, ROW2, ROW3, ROW4); \ G1(ROW1, ROW2, ROW3, ROW4, m3); \ G2(ROW1, ROW2, ROW3, ROW4, m4); \ UNDIAGONALIZE(ROW1, ROW2, ROW3, ROW4); blake2b_data: .align 32 .Liv: .quad 0x6a09e667f3bcc908, 0xbb67ae8584caa73b .quad 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1 .quad 0x510e527fade682d1, 0x9b05688c2b3e6c1f .quad 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179 .Lshuf_ror16: .byte 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9 .Lshuf_ror24: .byte 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10 .align 64 .globl _gcry_blake2b_transform_amd64_avx2 ELF(.type _gcry_blake2b_transform_amd64_avx2,@function;) _gcry_blake2b_transform_amd64_avx2: /* input: * %rdi: state * %rsi: blks * %rdx: num_blks */ CFI_STARTPROC(); vzeroupper; addq $128, (STATE_T + 0)(RSTATE); adcq $0, (STATE_T + 8)(RSTATE); vbroadcasti128 .Lshuf_ror16 rRIP, R16; vbroadcasti128 .Lshuf_ror24 rRIP, R24; vmovdqa .Liv+(0 * 8) rRIP, ROW3; vmovdqa .Liv+(4 * 8) rRIP, ROW4; vmovdqu (STATE_H + 0 * 8)(RSTATE), ROW1; vmovdqu (STATE_H + 4 * 8)(RSTATE), ROW2; vpxor (STATE_T)(RSTATE), ROW4, ROW4; LOAD_MSG(0, MA1, MA2, MA3, MA4); LOAD_MSG(1, MB1, MB2, MB3, MB4); .Loop: ROUND(0, MA1, MA2, MA3, MA4); LOAD_MSG(2, MA1, MA2, MA3, MA4); ROUND(1, MB1, MB2, MB3, MB4); LOAD_MSG(3, MB1, MB2, MB3, MB4); ROUND(2, MA1, MA2, MA3, MA4); LOAD_MSG(4, MA1, MA2, MA3, MA4); ROUND(3, MB1, MB2, MB3, MB4); LOAD_MSG(5, MB1, MB2, MB3, MB4); ROUND(4, MA1, MA2, MA3, MA4); LOAD_MSG(6, MA1, MA2, MA3, MA4); ROUND(5, MB1, MB2, MB3, MB4); LOAD_MSG(7, MB1, MB2, MB3, MB4); ROUND(6, MA1, MA2, MA3, MA4); LOAD_MSG(8, MA1, MA2, MA3, MA4); ROUND(7, MB1, MB2, MB3, MB4); LOAD_MSG(9, MB1, MB2, MB3, MB4); ROUND(8, MA1, MA2, MA3, MA4); LOAD_MSG(10, MA1, MA2, MA3, MA4); ROUND(9, MB1, MB2, MB3, MB4); LOAD_MSG(11, MB1, MB2, MB3, MB4); sub $1, RNBLKS; jz .Loop_end; lea 128(RINBLKS), RINBLKS; addq $128, (STATE_T + 0)(RSTATE); adcq $0, (STATE_T + 8)(RSTATE); ROUND(10, MA1, MA2, MA3, MA4); LOAD_MSG(0, MA1, MA2, MA3, MA4); ROUND(11, MB1, MB2, MB3, MB4); LOAD_MSG(1, MB1, MB2, MB3, MB4); vpxor ROW3, ROW1, ROW1; vpxor ROW4, 
ROW2, ROW2; vmovdqa .Liv+(0 * 8) rRIP, ROW3; vmovdqa .Liv+(4 * 8) rRIP, ROW4; vpxor (STATE_H + 0 * 8)(RSTATE), ROW1, ROW1; vpxor (STATE_H + 4 * 8)(RSTATE), ROW2, ROW2; vmovdqu ROW1, (STATE_H + 0 * 8)(RSTATE); vmovdqu ROW2, (STATE_H + 4 * 8)(RSTATE); vpxor (STATE_T)(RSTATE), ROW4, ROW4; jmp .Loop; .Loop_end: ROUND(10, MA1, MA2, MA3, MA4); ROUND(11, MB1, MB2, MB3, MB4); vpxor ROW3, ROW1, ROW1; vpxor ROW4, ROW2, ROW2; vpxor (STATE_H + 0 * 8)(RSTATE), ROW1, ROW1; vpxor (STATE_H + 4 * 8)(RSTATE), ROW2, ROW2; vmovdqu ROW1, (STATE_H + 0 * 8)(RSTATE); vmovdqu ROW2, (STATE_H + 4 * 8)(RSTATE); xor %eax, %eax; vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_blake2b_transform_amd64_avx2, .-_gcry_blake2b_transform_amd64_avx2;) #endif /*defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)*/ #endif /*__x86_64*/ diff --git a/cipher/blake2s-amd64-avx.S b/cipher/blake2s-amd64-avx.S index 5b936758..5094b4c1 100644 --- a/cipher/blake2s-amd64-avx.S +++ b/cipher/blake2s-amd64-avx.S @@ -1,278 +1,278 @@ /* blake2s-amd64-avx.S - AVX implementation of BLAKE2s * * Copyright (C) 2018 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . */ /* The code is based on public-domain/CC0 BLAKE2 reference implementation * by Samual Neves, at https://github.com/BLAKE2/BLAKE2/tree/master/sse * Copyright 2012, Samuel Neves */ #ifdef __x86_64 #include #if defined(HAVE_GCC_INLINE_ASM_AVX) && \ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) #include "asm-common-amd64.h" .text /* register macros */ #define RSTATE %rdi #define RINBLKS %rsi #define RNBLKS %rdx #define RIV %rcx /* state structure */ #define STATE_H 0 #define STATE_T (STATE_H + 8 * 4) #define STATE_F (STATE_T + 2 * 4) /* vector registers */ #define ROW1 %xmm0 #define ROW2 %xmm1 #define ROW3 %xmm2 #define ROW4 %xmm3 #define TMP1 %xmm4 #define TMP1x %xmm4 #define R16 %xmm5 #define R8 %xmm6 #define MA1 %xmm8 #define MA2 %xmm9 #define MA3 %xmm10 #define MA4 %xmm11 #define MB1 %xmm12 #define MB2 %xmm13 #define MB3 %xmm14 #define MB4 %xmm15 /********************************************************************** blake2s/AVX **********************************************************************/ #define GATHER_MSG(m1, m2, m3, m4, \ s0, s1, s2, s3, s4, s5, s6, s7, s8, \ s9, s10, s11, s12, s13, s14, s15) \ vmovd (s0)*4(RINBLKS), m1; \ vmovd (s1)*4(RINBLKS), m2; \ vmovd (s8)*4(RINBLKS), m3; \ vmovd (s9)*4(RINBLKS), m4; \ vpinsrd $1, (s2)*4(RINBLKS), m1, m1; \ vpinsrd $1, (s3)*4(RINBLKS), m2, m2; \ vpinsrd $1, (s10)*4(RINBLKS), m3, m3; \ vpinsrd $1, (s11)*4(RINBLKS), m4, m4; \ vpinsrd $2, (s4)*4(RINBLKS), m1, m1; \ vpinsrd $2, (s5)*4(RINBLKS), m2, m2; \ vpinsrd $2, (s12)*4(RINBLKS), m3, m3; \ vpinsrd $2, (s13)*4(RINBLKS), m4, m4; \ vpinsrd $3, (s6)*4(RINBLKS), m1, m1; \ vpinsrd $3, (s7)*4(RINBLKS), m2, m2; \ vpinsrd $3, (s14)*4(RINBLKS), m3, m3; \ vpinsrd $3, (s15)*4(RINBLKS), m4, m4; #define LOAD_MSG_0(m1, 
m2, m3, m4) \ GATHER_MSG(m1, m2, m3, m4, \ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) #define LOAD_MSG_1(m1, m2, m3, m4) \ GATHER_MSG(m1, m2, m3, m4, \ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3) #define LOAD_MSG_2(m1, m2, m3, m4) \ GATHER_MSG(m1, m2, m3, m4, \ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4) #define LOAD_MSG_3(m1, m2, m3, m4) \ GATHER_MSG(m1, m2, m3, m4, \ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8) #define LOAD_MSG_4(m1, m2, m3, m4) \ GATHER_MSG(m1, m2, m3, m4, \ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13) #define LOAD_MSG_5(m1, m2, m3, m4) \ GATHER_MSG(m1, m2, m3, m4, \ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9) #define LOAD_MSG_6(m1, m2, m3, m4) \ GATHER_MSG(m1, m2, m3, m4, \ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11) #define LOAD_MSG_7(m1, m2, m3, m4) \ GATHER_MSG(m1, m2, m3, m4, \ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10) #define LOAD_MSG_8(m1, m2, m3, m4) \ GATHER_MSG(m1, m2, m3, m4, \ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5) #define LOAD_MSG_9(m1, m2, m3, m4) \ GATHER_MSG(m1, m2, m3, m4, \ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0) #define LOAD_MSG(r, m1, m2, m3, m4) LOAD_MSG_##r(m1, m2, m3, m4) #define ROR_16(in, out) vpshufb R16, in, out; #define ROR_8(in, out) vpshufb R8, in, out; #define ROR_12(in, out) \ vpsrld $12, in, TMP1; \ vpslld $(32 - 12), in, out; \ vpxor TMP1, out, out; #define ROR_7(in, out) \ vpsrld $7, in, TMP1; \ vpslld $(32 - 7), in, out; \ vpxor TMP1, out, out; #define G(r1, r2, r3, r4, m, ROR_A, ROR_B) \ vpaddd m, r1, r1; \ vpaddd r2, r1, r1; \ vpxor r1, r4, r4; \ ROR_A(r4, r4); \ vpaddd r4, r3, r3; \ vpxor r3, r2, r2; \ ROR_B(r2, r2); #define G1(r1, r2, r3, r4, m) \ G(r1, r2, r3, r4, m, ROR_16, ROR_12); #define G2(r1, r2, r3, r4, m) \ G(r1, r2, r3, r4, m, ROR_8, ROR_7); #define MM_SHUFFLE(z,y,x,w) \ (((z) << 6) | ((y) << 4) | ((x) << 2) | (w)) #define DIAGONALIZE(r1, r2, r3, r4) \ vpshufd $MM_SHUFFLE(0,3,2,1), r2, r2; \ vpshufd $MM_SHUFFLE(1,0,3,2), r3, r3; \ vpshufd $MM_SHUFFLE(2,1,0,3), r4, r4; #define UNDIAGONALIZE(r1, r2, r3, r4) \ vpshufd $MM_SHUFFLE(2,1,0,3), r2, r2; \ vpshufd $MM_SHUFFLE(1,0,3,2), r3, r3; \ vpshufd $MM_SHUFFLE(0,3,2,1), r4, r4; #define ROUND(r, m1, m2, m3, m4) \ G1(ROW1, ROW2, ROW3, ROW4, m1); \ G2(ROW1, ROW2, ROW3, ROW4, m2); \ DIAGONALIZE(ROW1, ROW2, ROW3, ROW4); \ G1(ROW1, ROW2, ROW3, ROW4, m3); \ G2(ROW1, ROW2, ROW3, ROW4, m4); \ UNDIAGONALIZE(ROW1, ROW2, ROW3, ROW4); blake2s_data: .align 16 .Liv: .long 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A .long 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 .Lshuf_ror16: .byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 .Lshuf_ror8: .byte 1,2,3,0,5,6,7,4,9,10,11,8,13,14,15,12 .align 64 .globl _gcry_blake2s_transform_amd64_avx ELF(.type _gcry_blake2s_transform_amd64_avx,@function;) _gcry_blake2s_transform_amd64_avx: /* input: * %rdi: state * %rsi: blks * %rdx: num_blks */ CFI_STARTPROC(); vzeroupper; addq $64, (STATE_T + 0)(RSTATE); vmovdqa .Lshuf_ror16 rRIP, R16; vmovdqa .Lshuf_ror8 rRIP, R8; vmovdqa .Liv+(0 * 4) rRIP, ROW3; vmovdqa .Liv+(4 * 4) rRIP, ROW4; vmovdqu (STATE_H + 0 * 4)(RSTATE), ROW1; vmovdqu (STATE_H + 4 * 4)(RSTATE), ROW2; vpxor (STATE_T)(RSTATE), ROW4, ROW4; LOAD_MSG(0, MA1, MA2, MA3, MA4); LOAD_MSG(1, MB1, MB2, MB3, MB4); .Loop: ROUND(0, MA1, MA2, MA3, MA4); LOAD_MSG(2, MA1, MA2, MA3, MA4); ROUND(1, MB1, MB2, MB3, MB4); LOAD_MSG(3, MB1, MB2, MB3, MB4); ROUND(2, MA1, MA2, MA3, MA4); LOAD_MSG(4, MA1, MA2, MA3, MA4); ROUND(3, MB1, MB2, MB3, MB4); LOAD_MSG(5, 
MB1, MB2, MB3, MB4); ROUND(4, MA1, MA2, MA3, MA4); LOAD_MSG(6, MA1, MA2, MA3, MA4); ROUND(5, MB1, MB2, MB3, MB4); LOAD_MSG(7, MB1, MB2, MB3, MB4); ROUND(6, MA1, MA2, MA3, MA4); LOAD_MSG(8, MA1, MA2, MA3, MA4); ROUND(7, MB1, MB2, MB3, MB4); LOAD_MSG(9, MB1, MB2, MB3, MB4); sub $1, RNBLKS; jz .Loop_end; lea 64(RINBLKS), RINBLKS; addq $64, (STATE_T + 0)(RSTATE); ROUND(8, MA1, MA2, MA3, MA4); LOAD_MSG(0, MA1, MA2, MA3, MA4); ROUND(9, MB1, MB2, MB3, MB4); LOAD_MSG(1, MB1, MB2, MB3, MB4); vpxor ROW3, ROW1, ROW1; vpxor ROW4, ROW2, ROW2; vmovdqa .Liv+(0 * 4) rRIP, ROW3; vmovdqa .Liv+(4 * 4) rRIP, ROW4; vpxor (STATE_H + 0 * 4)(RSTATE), ROW1, ROW1; vpxor (STATE_H + 4 * 4)(RSTATE), ROW2, ROW2; vmovdqu ROW1, (STATE_H + 0 * 4)(RSTATE); vmovdqu ROW2, (STATE_H + 4 * 4)(RSTATE); vpxor (STATE_T)(RSTATE), ROW4, ROW4; jmp .Loop; .Loop_end: ROUND(8, MA1, MA2, MA3, MA4); ROUND(9, MB1, MB2, MB3, MB4); vpxor ROW3, ROW1, ROW1; vpxor ROW4, ROW2, ROW2; vpxor (STATE_H + 0 * 4)(RSTATE), ROW1, ROW1; vpxor (STATE_H + 4 * 4)(RSTATE), ROW2, ROW2; vmovdqu ROW1, (STATE_H + 0 * 4)(RSTATE); vmovdqu ROW2, (STATE_H + 4 * 4)(RSTATE); xor %eax, %eax; vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_blake2s_transform_amd64_avx, .-_gcry_blake2s_transform_amd64_avx;) #endif /*defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)*/ #endif /*__x86_64*/ diff --git a/cipher/blowfish-amd64.S b/cipher/blowfish-amd64.S index bdb361d7..2b4ffa1a 100644 --- a/cipher/blowfish-amd64.S +++ b/cipher/blowfish-amd64.S @@ -1,601 +1,601 @@ /* blowfish-amd64.S - AMD64 assembly implementation of Blowfish cipher * * Copyright (C) 2013 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . 
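The BLAKE2b and BLAKE2s kernels above vectorize the BLAKE2 mixing function G across four columns or diagonals at once: GATHER_MSG/LOAD_MSG_r pick message words according to the BLAKE2 sigma schedule, and ROR_16/ROR_12/ROR_8/ROR_7 (32/24/16/63 in the 64-bit BLAKE2b variant) are the fixed rotations of G. For reference, a scalar C sketch of one BLAKE2s G step (illustrative only, not the libgcrypt C fallback):

/* One BLAKE2s G application on the 16-word working state v. */
#include <stdint.h>

static inline uint32_t ror32(uint32_t x, unsigned r)
{
  return (x >> r) | (x << (32 - r));
}

static void g(uint32_t v[16], int a, int b, int c, int d,
              uint32_t m0, uint32_t m1)
{
  v[a] = v[a] + v[b] + m0;
  v[d] = ror32(v[d] ^ v[a], 16);   /* ROR_16 */
  v[c] = v[c] + v[d];
  v[b] = ror32(v[b] ^ v[c], 12);   /* ROR_12 */
  v[a] = v[a] + v[b] + m1;
  v[d] = ror32(v[d] ^ v[a], 8);    /* ROR_8 */
  v[c] = v[c] + v[d];
  v[b] = ror32(v[b] ^ v[c], 7);    /* ROR_7 */
}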
*/ #ifdef __x86_64 #include #if defined(USE_BLOWFISH) && \ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) #include "asm-common-amd64.h" .text /* structure of BLOWFISH_context: */ #define s0 0 #define s1 ((s0) + 256 * 4) #define s2 ((s1) + 256 * 4) #define s3 ((s2) + 256 * 4) #define p ((s3) + 256 * 4) /* register macros */ #define CTX %rdi #define RIO %rsi #define RX0 %rax #define RX1 %rbx #define RX2 %rcx #define RX3 %rdx #define RX0d %eax #define RX1d %ebx #define RX2d %ecx #define RX3d %edx #define RX0bl %al #define RX1bl %bl #define RX2bl %cl #define RX3bl %dl #define RX0bh %ah #define RX1bh %bh #define RX2bh %ch #define RX3bh %dh #define RT0 %rbp #define RT1 %rsi #define RT2 %r8 #define RT3 %r9 #define RT0d %ebp #define RT1d %esi #define RT2d %r8d #define RT3d %r9d #define RKEY %r10 /*********************************************************************** * 1-way blowfish ***********************************************************************/ #define F() \ movzbl RX0bh, RT1d; \ movzbl RX0bl, RT3d; \ rorq $16, RX0; \ movzbl RX0bh, RT0d; \ movzbl RX0bl, RT2d; \ rorq $16, RX0; \ movl s0(CTX,RT0,4), RT0d; \ addl s1(CTX,RT2,4), RT0d; \ xorl s2(CTX,RT1,4), RT0d; \ addl s3(CTX,RT3,4), RT0d; \ xorq RT0, RX0; #define load_roundkey_enc(n) \ movq p+4*(n)(CTX), RX3; #define add_roundkey_enc() \ xorq RX3, RX0; #define round_enc(n) \ add_roundkey_enc(); \ load_roundkey_enc(n); \ \ F(); \ F(); #define load_roundkey_dec(n) \ movq p+4*(n-1)(CTX), RX3; \ rorq $32, RX3; #define add_roundkey_dec() \ xorq RX3, RX0; #define round_dec(n) \ add_roundkey_dec(); \ load_roundkey_dec(n); \ \ F(); \ F(); #define read_block() \ movq (RIO), RX0; \ rorq $32, RX0; \ bswapq RX0; #define write_block() \ bswapq RX0; \ movq RX0, (RIO); .align 8 ELF(.type __blowfish_enc_blk1,@function;) __blowfish_enc_blk1: /* input: * %rdi: ctx, CTX * RX0: input plaintext block * output: * RX0: output plaintext block */ CFI_STARTPROC(); movq %rbp, %r11; CFI_REGISTER(%rbp, %r11); load_roundkey_enc(0); round_enc(2); round_enc(4); round_enc(6); round_enc(8); round_enc(10); round_enc(12); round_enc(14); round_enc(16); add_roundkey_enc(); movq %r11, %rbp; CFI_RESTORE(%rbp) - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size __blowfish_enc_blk1,.-__blowfish_enc_blk1;) .align 8 .globl _gcry_blowfish_amd64_do_encrypt ELF(.type _gcry_blowfish_amd64_do_encrypt,@function;) _gcry_blowfish_amd64_do_encrypt: /* input: * %rdi: ctx, CTX * %rsi: u32 *ret_xl * %rdx: u32 *ret_xr */ CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 movl (%rdx), RX0d; shlq $32, RX0; movl (%rsi), RT3d; movq %rdx, %r10; orq RT3, RX0; movq %rsi, RX2; call __blowfish_enc_blk1; movl RX0d, (%r10); shrq $32, RX0; movl RX0d, (RX2); EXIT_SYSV_FUNC - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_blowfish_amd64_do_encrypt,.-_gcry_blowfish_amd64_do_encrypt;) .align 8 .globl _gcry_blowfish_amd64_encrypt_block ELF(.type _gcry_blowfish_amd64_encrypt_block,@function;) _gcry_blowfish_amd64_encrypt_block: /* input: * %rdi: ctx, CTX * %rsi: dst * %rdx: src */ CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 movq %rsi, %r10; movq %rdx, RIO; read_block(); call __blowfish_enc_blk1; movq %r10, RIO; write_block(); EXIT_SYSV_FUNC - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_blowfish_amd64_encrypt_block,.-_gcry_blowfish_amd64_encrypt_block;) .align 8 .globl _gcry_blowfish_amd64_decrypt_block ELF(.type _gcry_blowfish_amd64_decrypt_block,@function;) _gcry_blowfish_amd64_decrypt_block: /* input: * %rdi: ctx, CTX * %rsi: dst * %rdx: src */ 
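The F() macro and the round_enc/round_dec macros above implement the standard Blowfish Feistel round with both 32-bit halves packed into one 64-bit register and the half swap folded into rorq. A scalar C sketch of the same computation, using a hypothetical context layout (s[4][256] s-boxes plus p[18] subkeys) purely for illustration:

#include <stdint.h>

struct bf_ctx { uint32_t s[4][256]; uint32_t p[18]; };

/* Mirrors the asm F(): two s-box additions and one xor over the four
 * bytes of the 32-bit half. */
static uint32_t blowfish_f(const struct bf_ctx *c, uint32_t x)
{
  uint32_t a = (x >> 24) & 0xff, b = (x >> 16) & 0xff;
  uint32_t d = (x >> 8) & 0xff,  e = x & 0xff;
  return ((c->s[0][a] + c->s[1][b]) ^ c->s[2][d]) + c->s[3][e];
}

static void blowfish_encrypt(const struct bf_ctx *c,
                             uint32_t *xl, uint32_t *xr)
{
  uint32_t l = *xl, r = *xr;
  for (int i = 0; i < 16; i += 2)
    {
      l ^= c->p[i];      r ^= blowfish_f(c, l);   /* round i   */
      r ^= c->p[i + 1];  l ^= blowfish_f(c, r);   /* round i+1 */
    }
  *xr = l ^ c->p[16];   /* final swap plus output whitening */
  *xl = r ^ c->p[17];
}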
CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 movq %rbp, %r11; CFI_REGISTER(%rbp, %r11); movq %rsi, %r10; movq %rdx, RIO; read_block(); load_roundkey_dec(17); round_dec(15); round_dec(13); round_dec(11); round_dec(9); round_dec(7); round_dec(5); round_dec(3); round_dec(1); add_roundkey_dec(); movq %r10, RIO; write_block(); movq %r11, %rbp; CFI_RESTORE(%rbp); EXIT_SYSV_FUNC - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_blowfish_amd64_decrypt_block,.-_gcry_blowfish_amd64_decrypt_block;) /********************************************************************** 4-way blowfish, four blocks parallel **********************************************************************/ #define F4(x) \ movzbl x ## bh, RT1d; \ movzbl x ## bl, RT3d; \ rorq $16, x; \ movzbl x ## bh, RT0d; \ movzbl x ## bl, RT2d; \ rorq $16, x; \ movl s0(CTX,RT0,4), RT0d; \ addl s1(CTX,RT2,4), RT0d; \ xorl s2(CTX,RT1,4), RT0d; \ addl s3(CTX,RT3,4), RT0d; \ xorq RT0, x; #define add_preloaded_roundkey4() \ xorq RKEY, RX0; \ xorq RKEY, RX1; \ xorq RKEY, RX2; \ xorq RKEY, RX3; #define preload_roundkey_enc(n) \ movq p+4*(n)(CTX), RKEY; #define add_roundkey_enc4(n) \ add_preloaded_roundkey4(); \ preload_roundkey_enc(n + 2); #define round_enc4(n) \ add_roundkey_enc4(n); \ \ F4(RX0); \ F4(RX1); \ F4(RX2); \ F4(RX3); \ \ F4(RX0); \ F4(RX1); \ F4(RX2); \ F4(RX3); #define preload_roundkey_dec(n) \ movq p+4*((n)-1)(CTX), RKEY; \ rorq $32, RKEY; #define add_roundkey_dec4(n) \ add_preloaded_roundkey4(); \ preload_roundkey_dec(n - 2); #define round_dec4(n) \ add_roundkey_dec4(n); \ \ F4(RX0); \ F4(RX1); \ F4(RX2); \ F4(RX3); \ \ F4(RX0); \ F4(RX1); \ F4(RX2); \ F4(RX3); #define inbswap_block4() \ rorq $32, RX0; \ bswapq RX0; \ rorq $32, RX1; \ bswapq RX1; \ rorq $32, RX2; \ bswapq RX2; \ rorq $32, RX3; \ bswapq RX3; #define inctrswap_block4() \ rorq $32, RX0; \ rorq $32, RX1; \ rorq $32, RX2; \ rorq $32, RX3; #define outbswap_block4() \ bswapq RX0; \ bswapq RX1; \ bswapq RX2; \ bswapq RX3; .align 8 ELF(.type __blowfish_enc_blk4,@function;) __blowfish_enc_blk4: /* input: * %rdi: ctx, CTX * RX0,RX1,RX2,RX3: four input inbswapped plaintext blocks * output: * RX0,RX1,RX2,RX3: four output ciphertext blocks */ CFI_STARTPROC(); preload_roundkey_enc(0); round_enc4(0); round_enc4(2); round_enc4(4); round_enc4(6); round_enc4(8); round_enc4(10); round_enc4(12); round_enc4(14); add_preloaded_roundkey4(); outbswap_block4(); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size __blowfish_enc_blk4,.-__blowfish_enc_blk4;) .align 8 ELF(.type __blowfish_dec_blk4,@function;) __blowfish_dec_blk4: /* input: * %rdi: ctx, CTX * RX0,RX1,RX2,RX3: four input ciphertext blocks * output: * RX0,RX1,RX2,RX3: four output plaintext blocks */ CFI_STARTPROC(); preload_roundkey_dec(17); inbswap_block4(); round_dec4(17); round_dec4(15); round_dec4(13); round_dec4(11); round_dec4(9); round_dec4(7); round_dec4(5); round_dec4(3); add_preloaded_roundkey4(); outbswap_block4(); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size __blowfish_dec_blk4,.-__blowfish_dec_blk4;) .align 8 .globl _gcry_blowfish_amd64_ctr_enc ELF(.type _gcry_blowfish_amd64_ctr_enc,@function;) _gcry_blowfish_amd64_ctr_enc: /* input: * %rdi: ctx, CTX * %rsi: dst (4 blocks) * %rdx: src (4 blocks) * %rcx: iv (big endian, 64bit) */ CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 pushq %rbp; CFI_PUSH(%rbp); pushq %rbx; CFI_PUSH(%rbx); pushq %r12; CFI_PUSH(%r12); pushq %r13; CFI_PUSH(%r13); /* %r11-%r13 are not used by __blowfish_enc_blk4 */ movq %rcx, %r13; /*iv*/ movq %rdx, %r12; /*src*/ movq %rsi, %r11; /*dst*/ /* load IV 
and byteswap */ movq (%r13), RT0; bswapq RT0; movq RT0, RX0; /* construct IVs */ leaq 1(RT0), RX1; leaq 2(RT0), RX2; leaq 3(RT0), RX3; leaq 4(RT0), RT0; bswapq RT0; inctrswap_block4(); /* store new IV */ movq RT0, (%r13); call __blowfish_enc_blk4; /* XOR key-stream with plaintext */ xorq 0 * 8(%r12), RX0; xorq 1 * 8(%r12), RX1; xorq 2 * 8(%r12), RX2; xorq 3 * 8(%r12), RX3; movq RX0, 0 * 8(%r11); movq RX1, 1 * 8(%r11); movq RX2, 2 * 8(%r11); movq RX3, 3 * 8(%r11); popq %r13; CFI_POP(%r13); popq %r12; CFI_POP(%r12); popq %rbx; CFI_POP(%rbx); popq %rbp; CFI_POP(%rbp); EXIT_SYSV_FUNC - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_blowfish_amd64_ctr_enc,.-_gcry_blowfish_amd64_ctr_enc;) .align 8 .globl _gcry_blowfish_amd64_cbc_dec ELF(.type _gcry_blowfish_amd64_cbc_dec,@function;) _gcry_blowfish_amd64_cbc_dec: /* input: * %rdi: ctx, CTX * %rsi: dst (4 blocks) * %rdx: src (4 blocks) * %rcx: iv (64bit) */ CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 pushq %rbp; CFI_PUSH(%rbp); pushq %rbx; CFI_PUSH(%rbx); pushq %r12; CFI_PUSH(%r12); pushq %r13; CFI_PUSH(%r13); /* %r11-%r13 are not used by __blowfish_dec_blk4 */ movq %rsi, %r11; /*dst*/ movq %rdx, %r12; /*src*/ movq %rcx, %r13; /*iv*/ /* load input */ movq 0 * 8(%r12), RX0; movq 1 * 8(%r12), RX1; movq 2 * 8(%r12), RX2; movq 3 * 8(%r12), RX3; call __blowfish_dec_blk4; movq 3 * 8(%r12), RT0; xorq (%r13), RX0; xorq 0 * 8(%r12), RX1; xorq 1 * 8(%r12), RX2; xorq 2 * 8(%r12), RX3; movq RT0, (%r13); /* store new IV */ movq RX0, 0 * 8(%r11); movq RX1, 1 * 8(%r11); movq RX2, 2 * 8(%r11); movq RX3, 3 * 8(%r11); popq %r13; CFI_POP(%r13); popq %r12; CFI_POP(%r12); popq %rbx; CFI_POP(%rbx); popq %rbp; CFI_POP(%rbp); EXIT_SYSV_FUNC - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_blowfish_amd64_cbc_dec,.-_gcry_blowfish_amd64_cbc_dec;) .align 8 .globl _gcry_blowfish_amd64_cfb_dec ELF(.type _gcry_blowfish_amd64_cfb_dec,@function;) _gcry_blowfish_amd64_cfb_dec: /* input: * %rdi: ctx, CTX * %rsi: dst (4 blocks) * %rdx: src (4 blocks) * %rcx: iv (64bit) */ CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 pushq %rbp; CFI_PUSH(%rbp); pushq %rbx; CFI_PUSH(%rbx); pushq %r12; CFI_PUSH(%r12); pushq %r13; CFI_PUSH(%r13); /* %r11-%r13 are not used by __blowfish_enc_blk4 */ movq %rcx, %r13; /*iv*/ movq %rdx, %r12; /*src*/ movq %rsi, %r11; /*dst*/ /* Load input */ movq (%r13), RX0; movq 0 * 8(%r12), RX1; movq 1 * 8(%r12), RX2; movq 2 * 8(%r12), RX3; inbswap_block4(); /* Update IV */ movq 3 * 8(%r12), RT0; movq RT0, (%r13); call __blowfish_enc_blk4; xorq 0 * 8(%r12), RX0; xorq 1 * 8(%r12), RX1; xorq 2 * 8(%r12), RX2; xorq 3 * 8(%r12), RX3; movq RX0, 0 * 8(%r11); movq RX1, 1 * 8(%r11); movq RX2, 2 * 8(%r11); movq RX3, 3 * 8(%r11); popq %r13; CFI_POP(%r13); popq %r12; CFI_POP(%r12); popq %rbx; CFI_POP(%rbx); popq %rbp; CFI_POP(%rbp); EXIT_SYSV_FUNC - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_blowfish_amd64_cfb_dec,.-_gcry_blowfish_amd64_cfb_dec;) #endif /*defined(USE_BLOWFISH)*/ #endif /*__x86_64*/ diff --git a/cipher/camellia-aesni-avx-amd64.S b/cipher/camellia-aesni-avx-amd64.S index 64cabaa5..5c304e57 100644 --- a/cipher/camellia-aesni-avx-amd64.S +++ b/cipher/camellia-aesni-avx-amd64.S @@ -1,2618 +1,2618 @@ /* camellia-avx-aesni-amd64.S - AES-NI/AVX implementation of Camellia cipher * * Copyright (C) 2013-2015,2020 Jussi Kivilinna * * This file is part of Libgcrypt. 
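The 4-way Blowfish helpers above (_gcry_blowfish_amd64_ctr_enc, _cbc_dec and _cfb_dec) rely on the fact that CTR keystream generation and CBC/CFB decryption have no block-to-block data dependency, so four blocks can go through the cipher core together before the chaining XORs are applied. A hedged C sketch of the CBC-decrypt ordering, with hypothetical helper names and the 8-byte blocks held as uint64_t:

#include <stdint.h>

typedef void (*dec4_fn)(void *ctx, uint64_t b[4]);  /* 4-block decrypt core */

static void cbc_dec_4blocks(void *ctx, dec4_fn dec4,
                            uint64_t dst[4], const uint64_t src[4],
                            uint64_t *iv)
{
  uint64_t b[4] = { src[0], src[1], src[2], src[3] };
  uint64_t next_iv = src[3];      /* last ciphertext becomes the next IV */

  dec4(ctx, b);                   /* decrypt all four blocks first */

  dst[0] = b[0] ^ *iv;            /* then chain with previous ciphertexts */
  dst[1] = b[1] ^ src[0];
  dst[2] = b[2] ^ src[1];
  dst[3] = b[3] ^ src[2];

  *iv = next_iv;
}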
* * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . */ #include #ifdef __x86_64 #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX_SUPPORT) #include "asm-common-amd64.h" #define CAMELLIA_TABLE_BYTE_LEN 272 /* struct CAMELLIA_context: */ #define key_table 0 #define key_bitlength CAMELLIA_TABLE_BYTE_LEN /* register macros */ #define CTX %rdi /********************************************************************** helper macros **********************************************************************/ #define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \ vpand x, mask4bit, tmp0; \ vpandn x, mask4bit, x; \ vpsrld $4, x, x; \ \ vpshufb tmp0, lo_t, tmp0; \ vpshufb x, hi_t, x; \ vpxor tmp0, x, x; /********************************************************************** 16-way camellia **********************************************************************/ /* * IN: * x0..x7: byte-sliced AB state * mem_cd: register pointer storing CD state * key: index for key material * OUT: * x0..x7: new byte-sliced CD state */ #define roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \ t7, mem_cd, key) \ /* \ * S-function with AES subbytes \ */ \ vmovdqa .Linv_shift_row rRIP, t4; \ vbroadcastss .L0f0f0f0f rRIP, t7; \ vmovdqa .Lpre_tf_lo_s1 rRIP, t0; \ vmovdqa .Lpre_tf_hi_s1 rRIP, t1; \ \ /* AES inverse shift rows */ \ vpshufb t4, x0, x0; \ vpshufb t4, x7, x7; \ vpshufb t4, x1, x1; \ vpshufb t4, x4, x4; \ vpshufb t4, x2, x2; \ vpshufb t4, x5, x5; \ vpshufb t4, x3, x3; \ vpshufb t4, x6, x6; \ \ /* prefilter sboxes 1, 2 and 3 */ \ vmovdqa .Lpre_tf_lo_s4 rRIP, t2; \ vmovdqa .Lpre_tf_hi_s4 rRIP, t3; \ filter_8bit(x0, t0, t1, t7, t6); \ filter_8bit(x7, t0, t1, t7, t6); \ filter_8bit(x1, t0, t1, t7, t6); \ filter_8bit(x4, t0, t1, t7, t6); \ filter_8bit(x2, t0, t1, t7, t6); \ filter_8bit(x5, t0, t1, t7, t6); \ \ /* prefilter sbox 4 */ \ vpxor t4, t4, t4; \ filter_8bit(x3, t2, t3, t7, t6); \ filter_8bit(x6, t2, t3, t7, t6); \ \ /* AES subbytes + AES shift rows */ \ vmovdqa .Lpost_tf_lo_s1 rRIP, t0; \ vmovdqa .Lpost_tf_hi_s1 rRIP, t1; \ vaesenclast t4, x0, x0; \ vaesenclast t4, x7, x7; \ vaesenclast t4, x1, x1; \ vaesenclast t4, x4, x4; \ vaesenclast t4, x2, x2; \ vaesenclast t4, x5, x5; \ vaesenclast t4, x3, x3; \ vaesenclast t4, x6, x6; \ \ /* postfilter sboxes 1 and 4 */ \ vmovdqa .Lpost_tf_lo_s3 rRIP, t2; \ vmovdqa .Lpost_tf_hi_s3 rRIP, t3; \ filter_8bit(x0, t0, t1, t7, t6); \ filter_8bit(x7, t0, t1, t7, t6); \ filter_8bit(x3, t0, t1, t7, t6); \ filter_8bit(x6, t0, t1, t7, t6); \ \ /* postfilter sbox 3 */ \ vmovdqa .Lpost_tf_lo_s2 rRIP, t4; \ vmovdqa .Lpost_tf_hi_s2 rRIP, t5; \ filter_8bit(x2, t2, t3, t7, t6); \ filter_8bit(x5, t2, t3, t7, t6); \ \ vpxor t6, t6, t6; \ vmovq key, t0; \ \ /* postfilter sbox 2 */ \ filter_8bit(x1, t4, t5, t7, t2); \ filter_8bit(x4, t4, t5, t7, t2); \ \ vpsrldq $5, t0, t5; \ vpsrldq $1, t0, t1; \ vpsrldq $2, 
t0, t2; \ vpsrldq $3, t0, t3; \ vpsrldq $4, t0, t4; \ vpshufb t6, t0, t0; \ vpshufb t6, t1, t1; \ vpshufb t6, t2, t2; \ vpshufb t6, t3, t3; \ vpshufb t6, t4, t4; \ vpsrldq $2, t5, t7; \ vpshufb t6, t7, t7; \ \ /* P-function */ \ vpxor x5, x0, x0; \ vpxor x6, x1, x1; \ vpxor x7, x2, x2; \ vpxor x4, x3, x3; \ \ vpxor x2, x4, x4; \ vpxor x3, x5, x5; \ vpxor x0, x6, x6; \ vpxor x1, x7, x7; \ \ vpxor x7, x0, x0; \ vpxor x4, x1, x1; \ vpxor x5, x2, x2; \ vpxor x6, x3, x3; \ \ vpxor x3, x4, x4; \ vpxor x0, x5, x5; \ vpxor x1, x6, x6; \ vpxor x2, x7, x7; /* note: high and low parts swapped */ \ \ /* Add key material and result to CD (x becomes new CD) */ \ \ vpxor t3, x4, x4; \ vpxor 0 * 16(mem_cd), x4, x4; \ \ vpxor t2, x5, x5; \ vpxor 1 * 16(mem_cd), x5, x5; \ \ vpsrldq $1, t5, t3; \ vpshufb t6, t5, t5; \ vpshufb t6, t3, t6; \ \ vpxor t1, x6, x6; \ vpxor 2 * 16(mem_cd), x6, x6; \ \ vpxor t0, x7, x7; \ vpxor 3 * 16(mem_cd), x7, x7; \ \ vpxor t7, x0, x0; \ vpxor 4 * 16(mem_cd), x0, x0; \ \ vpxor t6, x1, x1; \ vpxor 5 * 16(mem_cd), x1, x1; \ \ vpxor t5, x2, x2; \ vpxor 6 * 16(mem_cd), x2, x2; \ \ vpxor t4, x3, x3; \ vpxor 7 * 16(mem_cd), x3, x3; /* * IN/OUT: * x0..x7: byte-sliced AB state preloaded * mem_ab: byte-sliced AB state in memory * mem_cb: byte-sliced CD state in memory */ #define two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, mem_ab, mem_cd, i, dir, store_ab) \ roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, mem_cd, (key_table + (i) * 8)(CTX)); \ \ vmovdqu x4, 0 * 16(mem_cd); \ vmovdqu x5, 1 * 16(mem_cd); \ vmovdqu x6, 2 * 16(mem_cd); \ vmovdqu x7, 3 * 16(mem_cd); \ vmovdqu x0, 4 * 16(mem_cd); \ vmovdqu x1, 5 * 16(mem_cd); \ vmovdqu x2, 6 * 16(mem_cd); \ vmovdqu x3, 7 * 16(mem_cd); \ \ roundsm16(x4, x5, x6, x7, x0, x1, x2, x3, y0, y1, y2, y3, y4, y5, \ y6, y7, mem_ab, (key_table + ((i) + (dir)) * 8)(CTX)); \ \ store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab); #define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */ #define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \ /* Store new AB state */ \ vmovdqu x0, 0 * 16(mem_ab); \ vmovdqu x1, 1 * 16(mem_ab); \ vmovdqu x2, 2 * 16(mem_ab); \ vmovdqu x3, 3 * 16(mem_ab); \ vmovdqu x4, 4 * 16(mem_ab); \ vmovdqu x5, 5 * 16(mem_ab); \ vmovdqu x6, 6 * 16(mem_ab); \ vmovdqu x7, 7 * 16(mem_ab); #define enc_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, mem_ab, mem_cd, i) \ two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \ two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \ two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store); #define dec_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, mem_ab, mem_cd, i) \ two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \ two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \ two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store); /* * IN: * v0..3: byte-sliced 32-bit integers * OUT: * v0..3: (IN <<< 1) */ #define rol32_1_16(v0, v1, v2, v3, t0, t1, t2, zero) \ vpcmpgtb v0, zero, t0; \ vpaddb v0, v0, v0; \ vpabsb t0, t0; \ \ vpcmpgtb v1, zero, t1; \ 
vpaddb v1, v1, v1; \ vpabsb t1, t1; \ \ vpcmpgtb v2, zero, t2; \ vpaddb v2, v2, v2; \ vpabsb t2, t2; \ \ vpor t0, v1, v1; \ \ vpcmpgtb v3, zero, t0; \ vpaddb v3, v3, v3; \ vpabsb t0, t0; \ \ vpor t1, v2, v2; \ vpor t2, v3, v3; \ vpor t0, v0, v0; /* * IN: * r: byte-sliced AB state in memory * l: byte-sliced CD state in memory * OUT: * x0..x7: new byte-sliced CD state */ #define fls16(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \ tt1, tt2, tt3, kll, klr, krl, krr) \ /* \ * t0 = kll; \ * t0 &= ll; \ * lr ^= rol32(t0, 1); \ */ \ vpxor tt0, tt0, tt0; \ vmovd kll, t0; \ vpshufb tt0, t0, t3; \ vpsrldq $1, t0, t0; \ vpshufb tt0, t0, t2; \ vpsrldq $1, t0, t0; \ vpshufb tt0, t0, t1; \ vpsrldq $1, t0, t0; \ vpshufb tt0, t0, t0; \ \ vpand l0, t0, t0; \ vpand l1, t1, t1; \ vpand l2, t2, t2; \ vpand l3, t3, t3; \ \ rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \ \ vpxor l4, t0, l4; \ vmovdqu l4, 4 * 16(l); \ vpxor l5, t1, l5; \ vmovdqu l5, 5 * 16(l); \ vpxor l6, t2, l6; \ vmovdqu l6, 6 * 16(l); \ vpxor l7, t3, l7; \ vmovdqu l7, 7 * 16(l); \ \ /* \ * t2 = krr; \ * t2 |= rr; \ * rl ^= t2; \ */ \ \ vmovd krr, t0; \ vpshufb tt0, t0, t3; \ vpsrldq $1, t0, t0; \ vpshufb tt0, t0, t2; \ vpsrldq $1, t0, t0; \ vpshufb tt0, t0, t1; \ vpsrldq $1, t0, t0; \ vpshufb tt0, t0, t0; \ \ vpor 4 * 16(r), t0, t0; \ vpor 5 * 16(r), t1, t1; \ vpor 6 * 16(r), t2, t2; \ vpor 7 * 16(r), t3, t3; \ \ vpxor 0 * 16(r), t0, t0; \ vpxor 1 * 16(r), t1, t1; \ vpxor 2 * 16(r), t2, t2; \ vpxor 3 * 16(r), t3, t3; \ vmovdqu t0, 0 * 16(r); \ vmovdqu t1, 1 * 16(r); \ vmovdqu t2, 2 * 16(r); \ vmovdqu t3, 3 * 16(r); \ \ /* \ * t2 = krl; \ * t2 &= rl; \ * rr ^= rol32(t2, 1); \ */ \ vmovd krl, t0; \ vpshufb tt0, t0, t3; \ vpsrldq $1, t0, t0; \ vpshufb tt0, t0, t2; \ vpsrldq $1, t0, t0; \ vpshufb tt0, t0, t1; \ vpsrldq $1, t0, t0; \ vpshufb tt0, t0, t0; \ \ vpand 0 * 16(r), t0, t0; \ vpand 1 * 16(r), t1, t1; \ vpand 2 * 16(r), t2, t2; \ vpand 3 * 16(r), t3, t3; \ \ rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \ \ vpxor 4 * 16(r), t0, t0; \ vpxor 5 * 16(r), t1, t1; \ vpxor 6 * 16(r), t2, t2; \ vpxor 7 * 16(r), t3, t3; \ vmovdqu t0, 4 * 16(r); \ vmovdqu t1, 5 * 16(r); \ vmovdqu t2, 6 * 16(r); \ vmovdqu t3, 7 * 16(r); \ \ /* \ * t0 = klr; \ * t0 |= lr; \ * ll ^= t0; \ */ \ \ vmovd klr, t0; \ vpshufb tt0, t0, t3; \ vpsrldq $1, t0, t0; \ vpshufb tt0, t0, t2; \ vpsrldq $1, t0, t0; \ vpshufb tt0, t0, t1; \ vpsrldq $1, t0, t0; \ vpshufb tt0, t0, t0; \ \ vpor l4, t0, t0; \ vpor l5, t1, t1; \ vpor l6, t2, t2; \ vpor l7, t3, t3; \ \ vpxor l0, t0, l0; \ vmovdqu l0, 0 * 16(l); \ vpxor l1, t1, l1; \ vmovdqu l1, 1 * 16(l); \ vpxor l2, t2, l2; \ vmovdqu l2, 2 * 16(l); \ vpxor l3, t3, l3; \ vmovdqu l3, 3 * 16(l); #define transpose_4x4(x0, x1, x2, x3, t1, t2) \ vpunpckhdq x1, x0, t2; \ vpunpckldq x1, x0, x0; \ \ vpunpckldq x3, x2, t1; \ vpunpckhdq x3, x2, x2; \ \ vpunpckhqdq t1, x0, x1; \ vpunpcklqdq t1, x0, x0; \ \ vpunpckhqdq x2, t2, x3; \ vpunpcklqdq x2, t2, x2; #define byteslice_16x16b_fast(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, \ a3, b3, c3, d3, st0, st1) \ vmovdqu d2, st0; \ vmovdqu d3, st1; \ transpose_4x4(a0, a1, a2, a3, d2, d3); \ transpose_4x4(b0, b1, b2, b3, d2, d3); \ vmovdqu st0, d2; \ vmovdqu st1, d3; \ \ vmovdqu a0, st0; \ vmovdqu a1, st1; \ transpose_4x4(c0, c1, c2, c3, a0, a1); \ transpose_4x4(d0, d1, d2, d3, a0, a1); \ \ vmovdqu .Lshufb_16x16b rRIP, a0; \ vmovdqu st1, a1; \ vpshufb a0, a2, a2; \ vpshufb a0, a3, a3; \ vpshufb a0, b0, b0; \ vpshufb a0, b1, b1; \ vpshufb a0, b2, b2; \ vpshufb a0, b3, b3; \ vpshufb a0, a1, 
a1; \ vpshufb a0, c0, c0; \ vpshufb a0, c1, c1; \ vpshufb a0, c2, c2; \ vpshufb a0, c3, c3; \ vpshufb a0, d0, d0; \ vpshufb a0, d1, d1; \ vpshufb a0, d2, d2; \ vpshufb a0, d3, d3; \ vmovdqu d3, st1; \ vmovdqu st0, d3; \ vpshufb a0, d3, a0; \ vmovdqu d2, st0; \ \ transpose_4x4(a0, b0, c0, d0, d2, d3); \ transpose_4x4(a1, b1, c1, d1, d2, d3); \ vmovdqu st0, d2; \ vmovdqu st1, d3; \ \ vmovdqu b0, st0; \ vmovdqu b1, st1; \ transpose_4x4(a2, b2, c2, d2, b0, b1); \ transpose_4x4(a3, b3, c3, d3, b0, b1); \ vmovdqu st0, b0; \ vmovdqu st1, b1; \ /* does not adjust output bytes inside vectors */ #define transpose_8x8b(a, b, c, d, e, f, g, h, t0, t1, t2, t3, t4) \ vpunpcklbw a, b, t0; \ vpunpckhbw a, b, b; \ \ vpunpcklbw c, d, t1; \ vpunpckhbw c, d, d; \ \ vpunpcklbw e, f, t2; \ vpunpckhbw e, f, f; \ \ vpunpcklbw g, h, t3; \ vpunpckhbw g, h, h; \ \ vpunpcklwd t0, t1, g; \ vpunpckhwd t0, t1, t0; \ \ vpunpcklwd b, d, t1; \ vpunpckhwd b, d, e; \ \ vpunpcklwd t2, t3, c; \ vpunpckhwd t2, t3, t2; \ \ vpunpcklwd f, h, t3; \ vpunpckhwd f, h, b; \ \ vpunpcklwd e, b, t4; \ vpunpckhwd e, b, b; \ \ vpunpcklwd t1, t3, e; \ vpunpckhwd t1, t3, f; \ \ vmovdqa .Ltranspose_8x8_shuf rRIP, t3; \ \ vpunpcklwd g, c, d; \ vpunpckhwd g, c, c; \ \ vpunpcklwd t0, t2, t1; \ vpunpckhwd t0, t2, h; \ \ vpunpckhqdq b, h, a; \ vpshufb t3, a, a; \ vpunpcklqdq b, h, b; \ vpshufb t3, b, b; \ \ vpunpckhqdq e, d, g; \ vpshufb t3, g, g; \ vpunpcklqdq e, d, h; \ vpshufb t3, h, h; \ \ vpunpckhqdq f, c, e; \ vpshufb t3, e, e; \ vpunpcklqdq f, c, f; \ vpshufb t3, f, f; \ \ vpunpckhqdq t4, t1, c; \ vpshufb t3, c, c; \ vpunpcklqdq t4, t1, d; \ vpshufb t3, d, d; /* load blocks to registers and apply pre-whitening */ #define inpack16_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, rio, key) \ vmovq key, x0; \ vpshufb .Lpack_bswap rRIP, x0, x0; \ \ vpxor 0 * 16(rio), x0, y7; \ vpxor 1 * 16(rio), x0, y6; \ vpxor 2 * 16(rio), x0, y5; \ vpxor 3 * 16(rio), x0, y4; \ vpxor 4 * 16(rio), x0, y3; \ vpxor 5 * 16(rio), x0, y2; \ vpxor 6 * 16(rio), x0, y1; \ vpxor 7 * 16(rio), x0, y0; \ vpxor 8 * 16(rio), x0, x7; \ vpxor 9 * 16(rio), x0, x6; \ vpxor 10 * 16(rio), x0, x5; \ vpxor 11 * 16(rio), x0, x4; \ vpxor 12 * 16(rio), x0, x3; \ vpxor 13 * 16(rio), x0, x2; \ vpxor 14 * 16(rio), x0, x1; \ vpxor 15 * 16(rio), x0, x0; /* byteslice pre-whitened blocks and store to temporary memory */ #define inpack16_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, mem_ab, mem_cd) \ byteslice_16x16b_fast(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, \ y4, y5, y6, y7, (mem_ab), (mem_cd)); \ \ vmovdqu x0, 0 * 16(mem_ab); \ vmovdqu x1, 1 * 16(mem_ab); \ vmovdqu x2, 2 * 16(mem_ab); \ vmovdqu x3, 3 * 16(mem_ab); \ vmovdqu x4, 4 * 16(mem_ab); \ vmovdqu x5, 5 * 16(mem_ab); \ vmovdqu x6, 6 * 16(mem_ab); \ vmovdqu x7, 7 * 16(mem_ab); \ vmovdqu y0, 0 * 16(mem_cd); \ vmovdqu y1, 1 * 16(mem_cd); \ vmovdqu y2, 2 * 16(mem_cd); \ vmovdqu y3, 3 * 16(mem_cd); \ vmovdqu y4, 4 * 16(mem_cd); \ vmovdqu y5, 5 * 16(mem_cd); \ vmovdqu y6, 6 * 16(mem_cd); \ vmovdqu y7, 7 * 16(mem_cd); /* de-byteslice, apply post-whitening and store blocks */ #define outunpack16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \ y5, y6, y7, key, stack_tmp0, stack_tmp1) \ byteslice_16x16b_fast(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, \ y3, y7, x3, x7, stack_tmp0, stack_tmp1); \ \ vmovdqu x0, stack_tmp0; \ \ vmovq key, x0; \ vpshufb .Lpack_bswap rRIP, x0, x0; \ \ vpxor x0, y7, y7; \ vpxor x0, y6, y6; \ vpxor x0, y5, y5; \ vpxor x0, y4, y4; \ vpxor x0, y3, y3; \ 
vpxor x0, y2, y2; \ vpxor x0, y1, y1; \ vpxor x0, y0, y0; \ vpxor x0, x7, x7; \ vpxor x0, x6, x6; \ vpxor x0, x5, x5; \ vpxor x0, x4, x4; \ vpxor x0, x3, x3; \ vpxor x0, x2, x2; \ vpxor x0, x1, x1; \ vpxor stack_tmp0, x0, x0; #define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, rio) \ vmovdqu x0, 0 * 16(rio); \ vmovdqu x1, 1 * 16(rio); \ vmovdqu x2, 2 * 16(rio); \ vmovdqu x3, 3 * 16(rio); \ vmovdqu x4, 4 * 16(rio); \ vmovdqu x5, 5 * 16(rio); \ vmovdqu x6, 6 * 16(rio); \ vmovdqu x7, 7 * 16(rio); \ vmovdqu y0, 8 * 16(rio); \ vmovdqu y1, 9 * 16(rio); \ vmovdqu y2, 10 * 16(rio); \ vmovdqu y3, 11 * 16(rio); \ vmovdqu y4, 12 * 16(rio); \ vmovdqu y5, 13 * 16(rio); \ vmovdqu y6, 14 * 16(rio); \ vmovdqu y7, 15 * 16(rio); .text .align 16 #define SHUFB_BYTES(idx) \ 0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx) .Lshufb_16x16b: .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3); .Lpack_bswap: .long 0x00010203 .long 0x04050607 .long 0x80808080 .long 0x80808080 /* For CTR-mode IV byteswap */ .Lbswap128_mask: .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 /* * pre-SubByte transform * * pre-lookup for sbox1, sbox2, sbox3: * swap_bitendianness( * isom_map_camellia_to_aes( * camellia_f( * swap_bitendianess(in) * ) * ) * ) * * (note: '⊕ 0xc5' inside camellia_f()) */ .Lpre_tf_lo_s1: .byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86 .byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88 .Lpre_tf_hi_s1: .byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a .byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23 /* * pre-SubByte transform * * pre-lookup for sbox4: * swap_bitendianness( * isom_map_camellia_to_aes( * camellia_f( * swap_bitendianess(in <<< 1) * ) * ) * ) * * (note: '⊕ 0xc5' inside camellia_f()) */ .Lpre_tf_lo_s4: .byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25 .byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74 .Lpre_tf_hi_s4: .byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72 .byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf /* * post-SubByte transform * * post-lookup for sbox1, sbox4: * swap_bitendianness( * camellia_h( * isom_map_aes_to_camellia( * swap_bitendianness( * aes_inverse_affine_transform(in) * ) * ) * ) * ) * * (note: '⊕ 0x6e' inside camellia_h()) */ .Lpost_tf_lo_s1: .byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31 .byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1 .Lpost_tf_hi_s1: .byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8 .byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c /* * post-SubByte transform * * post-lookup for sbox2: * swap_bitendianness( * camellia_h( * isom_map_aes_to_camellia( * swap_bitendianness( * aes_inverse_affine_transform(in) * ) * ) * ) * ) <<< 1 * * (note: '⊕ 0x6e' inside camellia_h()) */ .Lpost_tf_lo_s2: .byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62 .byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3 .Lpost_tf_hi_s2: .byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51 .byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18 /* * post-SubByte transform * * post-lookup for sbox3: * swap_bitendianness( * camellia_h( * isom_map_aes_to_camellia( * swap_bitendianness( * aes_inverse_affine_transform(in) * ) * ) * ) * ) >>> 1 * * (note: '⊕ 0x6e' inside camellia_h()) */ .Lpost_tf_lo_s3: .byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98 .byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8 .Lpost_tf_hi_s3: .byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54 .byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06 /* For isolating SubBytes from AESENCLAST, inverse shift row */ 
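The .Linv_shift_row mask right below this comment is what makes the trick work: AESENCLAST applies ShiftRows and SubBytes (plus a round-key XOR), so shuffling the state through the inverse ShiftRows permutation first and supplying an all-zero round key leaves the bare AES SubBytes layer, which the pre/post filter tables then map to and from Camellia's s-boxes. An illustrative intrinsics sketch (not libgcrypt code; build with -maes -mssse3):

#include <immintrin.h>

static __m128i aes_subbytes_only(__m128i x)
{
  /* Same permutation as the .Linv_shift_row table below. */
  const __m128i inv_shift_row =
    _mm_setr_epi8(0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b,
                  0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03);

  x = _mm_shuffle_epi8(x, inv_shift_row);              /* undo ShiftRows ahead of time */
  return _mm_aesenclast_si128(x, _mm_setzero_si128()); /* ShiftRows + SubBytes + xor 0 */
}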
.Linv_shift_row: .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03 /* shuffle mask for 8x8 byte transpose */ .Ltranspose_8x8_shuf: .byte 0, 1, 4, 5, 2, 3, 6, 7, 8+0, 8+1, 8+4, 8+5, 8+2, 8+3, 8+6, 8+7 .align 4 /* 4-bit mask */ .L0f0f0f0f: .long 0x0f0f0f0f .align 8 ELF(.type __camellia_enc_blk16,@function;) __camellia_enc_blk16: /* input: * %rdi: ctx, CTX * %rax: temporary storage, 256 bytes * %r8d: 24 for 16 byte key, 32 for larger * %xmm0..%xmm15: 16 plaintext blocks * output: * %xmm0..%xmm15: 16 encrypted blocks, order swapped: * 7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 */ CFI_STARTPROC(); leaq 8 * 16(%rax), %rcx; leaq (-8 * 8)(CTX, %r8, 8), %r8; inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %rax, %rcx); .align 8 .Lenc_loop: enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %rax, %rcx, 0); cmpq %r8, CTX; je .Lenc_done; leaq (8 * 8)(CTX), CTX; fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, ((key_table) + 0)(CTX), ((key_table) + 4)(CTX), ((key_table) + 8)(CTX), ((key_table) + 12)(CTX)); jmp .Lenc_loop; .align 8 .Lenc_done: /* load CD for output */ vmovdqu 0 * 16(%rcx), %xmm8; vmovdqu 1 * 16(%rcx), %xmm9; vmovdqu 2 * 16(%rcx), %xmm10; vmovdqu 3 * 16(%rcx), %xmm11; vmovdqu 4 * 16(%rcx), %xmm12; vmovdqu 5 * 16(%rcx), %xmm13; vmovdqu 6 * 16(%rcx), %xmm14; vmovdqu 7 * 16(%rcx), %xmm15; outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, ((key_table) + 8 * 8)(%r8), (%rax), 1 * 16(%rax)); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size __camellia_enc_blk16,.-__camellia_enc_blk16;) .align 8 ELF(.type __camellia_dec_blk16,@function;) __camellia_dec_blk16: /* input: * %rdi: ctx, CTX * %rax: temporary storage, 256 bytes * %r8d: 24 for 16 byte key, 32 for larger * %xmm0..%xmm15: 16 encrypted blocks * output: * %xmm0..%xmm15: 16 plaintext blocks, order swapped: * 7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 */ CFI_STARTPROC(); movq %r8, %rcx; movq CTX, %r8 leaq (-8 * 8)(CTX, %rcx, 8), CTX; leaq 8 * 16(%rax), %rcx; inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %rax, %rcx); .align 8 .Ldec_loop: dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %rax, %rcx, 0); cmpq %r8, CTX; je .Ldec_done; fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, ((key_table) + 8)(CTX), ((key_table) + 12)(CTX), ((key_table) + 0)(CTX), ((key_table) + 4)(CTX)); leaq (-8 * 8)(CTX), CTX; jmp .Ldec_loop; .align 8 .Ldec_done: /* load CD for output */ vmovdqu 0 * 16(%rcx), %xmm8; vmovdqu 1 * 16(%rcx), %xmm9; vmovdqu 2 * 16(%rcx), %xmm10; vmovdqu 3 * 16(%rcx), %xmm11; vmovdqu 4 * 16(%rcx), %xmm12; vmovdqu 5 * 16(%rcx), %xmm13; vmovdqu 6 * 16(%rcx), %xmm14; vmovdqu 7 * 16(%rcx), %xmm15; outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, (key_table)(CTX), (%rax), 1 * 16(%rax)); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size __camellia_dec_blk16,.-__camellia_dec_blk16;) #define inc_le128(x, minus_one, 
tmp) \ vpcmpeqq minus_one, x, tmp; \ vpsubq minus_one, x, x; \ vpslldq $8, tmp, tmp; \ vpsubq tmp, x, x; .align 8 .globl _gcry_camellia_aesni_avx_ctr_enc ELF(.type _gcry_camellia_aesni_avx_ctr_enc,@function;) _gcry_camellia_aesni_avx_ctr_enc: /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) * %rdx: src (16 blocks) * %rcx: iv (big endian, 128bit) */ CFI_STARTPROC(); pushq %rbp; CFI_PUSH(%rbp); movq %rsp, %rbp; CFI_DEF_CFA_REGISTER(%rbp); vzeroupper; cmpl $128, key_bitlength(CTX); movl $32, %r8d; movl $24, %eax; cmovel %eax, %r8d; /* max */ subq $(16 * 16), %rsp; andq $~31, %rsp; movq %rsp, %rax; vmovdqa .Lbswap128_mask rRIP, %xmm14; /* load IV and byteswap */ vmovdqu (%rcx), %xmm15; vmovdqu %xmm15, 15 * 16(%rax); vpshufb %xmm14, %xmm15, %xmm0; /* be => le */ vpcmpeqd %xmm15, %xmm15, %xmm15; vpsrldq $8, %xmm15, %xmm15; /* low: -1, high: 0 */ /* construct IVs */ inc_le128(%xmm0, %xmm15, %xmm13); vpshufb %xmm14, %xmm0, %xmm13; vmovdqu %xmm13, 14 * 16(%rax); inc_le128(%xmm0, %xmm15, %xmm13); vpshufb %xmm14, %xmm0, %xmm13; vmovdqu %xmm13, 13 * 16(%rax); inc_le128(%xmm0, %xmm15, %xmm13); vpshufb %xmm14, %xmm0, %xmm12; inc_le128(%xmm0, %xmm15, %xmm13); vpshufb %xmm14, %xmm0, %xmm11; inc_le128(%xmm0, %xmm15, %xmm13); vpshufb %xmm14, %xmm0, %xmm10; inc_le128(%xmm0, %xmm15, %xmm13); vpshufb %xmm14, %xmm0, %xmm9; inc_le128(%xmm0, %xmm15, %xmm13); vpshufb %xmm14, %xmm0, %xmm8; inc_le128(%xmm0, %xmm15, %xmm13); vpshufb %xmm14, %xmm0, %xmm7; inc_le128(%xmm0, %xmm15, %xmm13); vpshufb %xmm14, %xmm0, %xmm6; inc_le128(%xmm0, %xmm15, %xmm13); vpshufb %xmm14, %xmm0, %xmm5; inc_le128(%xmm0, %xmm15, %xmm13); vpshufb %xmm14, %xmm0, %xmm4; inc_le128(%xmm0, %xmm15, %xmm13); vpshufb %xmm14, %xmm0, %xmm3; inc_le128(%xmm0, %xmm15, %xmm13); vpshufb %xmm14, %xmm0, %xmm2; inc_le128(%xmm0, %xmm15, %xmm13); vpshufb %xmm14, %xmm0, %xmm1; inc_le128(%xmm0, %xmm15, %xmm13); vmovdqa %xmm0, %xmm13; vpshufb %xmm14, %xmm0, %xmm0; inc_le128(%xmm13, %xmm15, %xmm14); vpshufb .Lbswap128_mask rRIP, %xmm13, %xmm13; /* le => be */ vmovdqu %xmm13, (%rcx); /* inpack16_pre: */ vmovq (key_table)(CTX), %xmm15; vpshufb .Lpack_bswap rRIP, %xmm15, %xmm15; vpxor %xmm0, %xmm15, %xmm0; vpxor %xmm1, %xmm15, %xmm1; vpxor %xmm2, %xmm15, %xmm2; vpxor %xmm3, %xmm15, %xmm3; vpxor %xmm4, %xmm15, %xmm4; vpxor %xmm5, %xmm15, %xmm5; vpxor %xmm6, %xmm15, %xmm6; vpxor %xmm7, %xmm15, %xmm7; vpxor %xmm8, %xmm15, %xmm8; vpxor %xmm9, %xmm15, %xmm9; vpxor %xmm10, %xmm15, %xmm10; vpxor %xmm11, %xmm15, %xmm11; vpxor %xmm12, %xmm15, %xmm12; vpxor 13 * 16(%rax), %xmm15, %xmm13; vpxor 14 * 16(%rax), %xmm15, %xmm14; vpxor 15 * 16(%rax), %xmm15, %xmm15; call __camellia_enc_blk16; vpxor 0 * 16(%rdx), %xmm7, %xmm7; vpxor 1 * 16(%rdx), %xmm6, %xmm6; vpxor 2 * 16(%rdx), %xmm5, %xmm5; vpxor 3 * 16(%rdx), %xmm4, %xmm4; vpxor 4 * 16(%rdx), %xmm3, %xmm3; vpxor 5 * 16(%rdx), %xmm2, %xmm2; vpxor 6 * 16(%rdx), %xmm1, %xmm1; vpxor 7 * 16(%rdx), %xmm0, %xmm0; vpxor 8 * 16(%rdx), %xmm15, %xmm15; vpxor 9 * 16(%rdx), %xmm14, %xmm14; vpxor 10 * 16(%rdx), %xmm13, %xmm13; vpxor 11 * 16(%rdx), %xmm12, %xmm12; vpxor 12 * 16(%rdx), %xmm11, %xmm11; vpxor 13 * 16(%rdx), %xmm10, %xmm10; vpxor 14 * 16(%rdx), %xmm9, %xmm9; vpxor 15 * 16(%rdx), %xmm8, %xmm8; write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0, %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9, %xmm8, %rsi); vzeroall; leave; CFI_LEAVE(); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_camellia_aesni_avx_ctr_enc,.-_gcry_camellia_aesni_avx_ctr_enc;) .align 8 .globl _gcry_camellia_aesni_avx_cbc_dec ELF(.type 
_gcry_camellia_aesni_avx_cbc_dec,@function;) _gcry_camellia_aesni_avx_cbc_dec: /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) * %rdx: src (16 blocks) * %rcx: iv */ CFI_STARTPROC(); pushq %rbp; CFI_PUSH(%rbp); movq %rsp, %rbp; CFI_DEF_CFA_REGISTER(%rbp); vzeroupper; movq %rcx, %r9; cmpl $128, key_bitlength(CTX); movl $32, %r8d; movl $24, %eax; cmovel %eax, %r8d; /* max */ inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %rdx, (key_table)(CTX, %r8, 8)); subq $(16 * 16), %rsp; andq $~31, %rsp; movq %rsp, %rax; call __camellia_dec_blk16; /* XOR output with IV */ vpxor (%r9), %xmm7, %xmm7; vpxor (0 * 16)(%rdx), %xmm6, %xmm6; vpxor (1 * 16)(%rdx), %xmm5, %xmm5; vpxor (2 * 16)(%rdx), %xmm4, %xmm4; vpxor (3 * 16)(%rdx), %xmm3, %xmm3; vpxor (4 * 16)(%rdx), %xmm2, %xmm2; vpxor (5 * 16)(%rdx), %xmm1, %xmm1; vpxor (6 * 16)(%rdx), %xmm0, %xmm0; vpxor (7 * 16)(%rdx), %xmm15, %xmm15; vpxor (8 * 16)(%rdx), %xmm14, %xmm14; vpxor (9 * 16)(%rdx), %xmm13, %xmm13; vpxor (10 * 16)(%rdx), %xmm12, %xmm12; vpxor (11 * 16)(%rdx), %xmm11, %xmm11; vpxor (12 * 16)(%rdx), %xmm10, %xmm10; vpxor (13 * 16)(%rdx), %xmm9, %xmm9; vpxor (14 * 16)(%rdx), %xmm8, %xmm8; movq (15 * 16 + 0)(%rdx), %r10; movq (15 * 16 + 8)(%rdx), %r11; write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0, %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9, %xmm8, %rsi); /* store new IV */ movq %r10, (0)(%r9); movq %r11, (8)(%r9); vzeroall; leave; CFI_LEAVE(); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_camellia_aesni_avx_cbc_dec,.-_gcry_camellia_aesni_avx_cbc_dec;) .align 8 .globl _gcry_camellia_aesni_avx_cfb_dec ELF(.type _gcry_camellia_aesni_avx_cfb_dec,@function;) _gcry_camellia_aesni_avx_cfb_dec: /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) * %rdx: src (16 blocks) * %rcx: iv */ CFI_STARTPROC(); pushq %rbp; CFI_PUSH(%rbp); movq %rsp, %rbp; CFI_DEF_CFA_REGISTER(%rbp); vzeroupper; cmpl $128, key_bitlength(CTX); movl $32, %r8d; movl $24, %eax; cmovel %eax, %r8d; /* max */ subq $(16 * 16), %rsp; andq $~31, %rsp; movq %rsp, %rax; /* inpack16_pre: */ vmovq (key_table)(CTX), %xmm0; vpshufb .Lpack_bswap rRIP, %xmm0, %xmm0; vpxor (%rcx), %xmm0, %xmm15; vmovdqu 15 * 16(%rdx), %xmm1; vmovdqu %xmm1, (%rcx); /* store new IV */ vpxor 0 * 16(%rdx), %xmm0, %xmm14; vpxor 1 * 16(%rdx), %xmm0, %xmm13; vpxor 2 * 16(%rdx), %xmm0, %xmm12; vpxor 3 * 16(%rdx), %xmm0, %xmm11; vpxor 4 * 16(%rdx), %xmm0, %xmm10; vpxor 5 * 16(%rdx), %xmm0, %xmm9; vpxor 6 * 16(%rdx), %xmm0, %xmm8; vpxor 7 * 16(%rdx), %xmm0, %xmm7; vpxor 8 * 16(%rdx), %xmm0, %xmm6; vpxor 9 * 16(%rdx), %xmm0, %xmm5; vpxor 10 * 16(%rdx), %xmm0, %xmm4; vpxor 11 * 16(%rdx), %xmm0, %xmm3; vpxor 12 * 16(%rdx), %xmm0, %xmm2; vpxor 13 * 16(%rdx), %xmm0, %xmm1; vpxor 14 * 16(%rdx), %xmm0, %xmm0; call __camellia_enc_blk16; vpxor 0 * 16(%rdx), %xmm7, %xmm7; vpxor 1 * 16(%rdx), %xmm6, %xmm6; vpxor 2 * 16(%rdx), %xmm5, %xmm5; vpxor 3 * 16(%rdx), %xmm4, %xmm4; vpxor 4 * 16(%rdx), %xmm3, %xmm3; vpxor 5 * 16(%rdx), %xmm2, %xmm2; vpxor 6 * 16(%rdx), %xmm1, %xmm1; vpxor 7 * 16(%rdx), %xmm0, %xmm0; vpxor 8 * 16(%rdx), %xmm15, %xmm15; vpxor 9 * 16(%rdx), %xmm14, %xmm14; vpxor 10 * 16(%rdx), %xmm13, %xmm13; vpxor 11 * 16(%rdx), %xmm12, %xmm12; vpxor 12 * 16(%rdx), %xmm11, %xmm11; vpxor 13 * 16(%rdx), %xmm10, %xmm10; vpxor 14 * 16(%rdx), %xmm9, %xmm9; vpxor 15 * 16(%rdx), %xmm8, %xmm8; write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0, %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9, 
%xmm8, %rsi); vzeroall; leave; CFI_LEAVE(); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_camellia_aesni_avx_cfb_dec,.-_gcry_camellia_aesni_avx_cfb_dec;) .align 8 .globl _gcry_camellia_aesni_avx_ocb_enc ELF(.type _gcry_camellia_aesni_avx_ocb_enc,@function;) _gcry_camellia_aesni_avx_ocb_enc: /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) * %rdx: src (16 blocks) * %rcx: offset * %r8 : checksum * %r9 : L pointers (void *L[16]) */ CFI_STARTPROC(); pushq %rbp; CFI_PUSH(%rbp); movq %rsp, %rbp; CFI_DEF_CFA_REGISTER(%rbp); vzeroupper; subq $(16 * 16 + 4 * 8), %rsp; andq $~31, %rsp; movq %rsp, %rax; movq %r10, (16 * 16 + 0 * 8)(%rsp); movq %r11, (16 * 16 + 1 * 8)(%rsp); movq %r12, (16 * 16 + 2 * 8)(%rsp); movq %r13, (16 * 16 + 3 * 8)(%rsp); CFI_REG_ON_STACK(r10, 16 * 16 + 0 * 8); CFI_REG_ON_STACK(r11, 16 * 16 + 1 * 8); CFI_REG_ON_STACK(r12, 16 * 16 + 2 * 8); CFI_REG_ON_STACK(r13, 16 * 16 + 3 * 8); vmovdqu (%rcx), %xmm14; vmovdqu (%r8), %xmm15; /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* Checksum_i = Checksum_{i-1} xor P_i */ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ #define OCB_INPUT(n, lreg, xreg) \ vmovdqu (n * 16)(%rdx), xreg; \ vpxor (lreg), %xmm14, %xmm14; \ vpxor xreg, %xmm15, %xmm15; \ vpxor xreg, %xmm14, xreg; \ vmovdqu %xmm14, (n * 16)(%rsi); movq (0 * 8)(%r9), %r10; movq (1 * 8)(%r9), %r11; movq (2 * 8)(%r9), %r12; movq (3 * 8)(%r9), %r13; OCB_INPUT(0, %r10, %xmm0); vmovdqu %xmm0, (15 * 16)(%rax); OCB_INPUT(1, %r11, %xmm0); vmovdqu %xmm0, (14 * 16)(%rax); OCB_INPUT(2, %r12, %xmm13); OCB_INPUT(3, %r13, %xmm12); movq (4 * 8)(%r9), %r10; movq (5 * 8)(%r9), %r11; movq (6 * 8)(%r9), %r12; movq (7 * 8)(%r9), %r13; OCB_INPUT(4, %r10, %xmm11); OCB_INPUT(5, %r11, %xmm10); OCB_INPUT(6, %r12, %xmm9); OCB_INPUT(7, %r13, %xmm8); movq (8 * 8)(%r9), %r10; movq (9 * 8)(%r9), %r11; movq (10 * 8)(%r9), %r12; movq (11 * 8)(%r9), %r13; OCB_INPUT(8, %r10, %xmm7); OCB_INPUT(9, %r11, %xmm6); OCB_INPUT(10, %r12, %xmm5); OCB_INPUT(11, %r13, %xmm4); movq (12 * 8)(%r9), %r10; movq (13 * 8)(%r9), %r11; movq (14 * 8)(%r9), %r12; movq (15 * 8)(%r9), %r13; OCB_INPUT(12, %r10, %xmm3); OCB_INPUT(13, %r11, %xmm2); OCB_INPUT(14, %r12, %xmm1); OCB_INPUT(15, %r13, %xmm0); #undef OCB_INPUT vmovdqu %xmm14, (%rcx); vmovdqu %xmm15, (%r8); cmpl $128, key_bitlength(CTX); movl $32, %r8d; movl $24, %r10d; cmovel %r10d, %r8d; /* max */ /* inpack16_pre: */ vmovq (key_table)(CTX), %xmm15; vpshufb .Lpack_bswap rRIP, %xmm15, %xmm15; vpxor %xmm0, %xmm15, %xmm0; vpxor %xmm1, %xmm15, %xmm1; vpxor %xmm2, %xmm15, %xmm2; vpxor %xmm3, %xmm15, %xmm3; vpxor %xmm4, %xmm15, %xmm4; vpxor %xmm5, %xmm15, %xmm5; vpxor %xmm6, %xmm15, %xmm6; vpxor %xmm7, %xmm15, %xmm7; vpxor %xmm8, %xmm15, %xmm8; vpxor %xmm9, %xmm15, %xmm9; vpxor %xmm10, %xmm15, %xmm10; vpxor %xmm11, %xmm15, %xmm11; vpxor %xmm12, %xmm15, %xmm12; vpxor %xmm13, %xmm15, %xmm13; vpxor 14 * 16(%rax), %xmm15, %xmm14; vpxor 15 * 16(%rax), %xmm15, %xmm15; call __camellia_enc_blk16; vpxor 0 * 16(%rsi), %xmm7, %xmm7; vpxor 1 * 16(%rsi), %xmm6, %xmm6; vpxor 2 * 16(%rsi), %xmm5, %xmm5; vpxor 3 * 16(%rsi), %xmm4, %xmm4; vpxor 4 * 16(%rsi), %xmm3, %xmm3; vpxor 5 * 16(%rsi), %xmm2, %xmm2; vpxor 6 * 16(%rsi), %xmm1, %xmm1; vpxor 7 * 16(%rsi), %xmm0, %xmm0; vpxor 8 * 16(%rsi), %xmm15, %xmm15; vpxor 9 * 16(%rsi), %xmm14, %xmm14; vpxor 10 * 16(%rsi), %xmm13, %xmm13; vpxor 11 * 16(%rsi), %xmm12, %xmm12; vpxor 12 * 16(%rsi), %xmm11, %xmm11; vpxor 13 * 16(%rsi), %xmm10, %xmm10; vpxor 14 * 16(%rsi), %xmm9, %xmm9; vpxor 15 * 16(%rsi), %xmm8, %xmm8; write_output(%xmm7, %xmm6, %xmm5, 
%xmm4, %xmm3, %xmm2, %xmm1, %xmm0, %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9, %xmm8, %rsi); vzeroall; movq (16 * 16 + 0 * 8)(%rsp), %r10; movq (16 * 16 + 1 * 8)(%rsp), %r11; movq (16 * 16 + 2 * 8)(%rsp), %r12; movq (16 * 16 + 3 * 8)(%rsp), %r13; CFI_RESTORE(%r10); CFI_RESTORE(%r11); CFI_RESTORE(%r12); CFI_RESTORE(%r13); leave; CFI_LEAVE(); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_camellia_aesni_avx_ocb_enc,.-_gcry_camellia_aesni_avx_ocb_enc;) .align 8 .globl _gcry_camellia_aesni_avx_ocb_dec ELF(.type _gcry_camellia_aesni_avx_ocb_dec,@function;) _gcry_camellia_aesni_avx_ocb_dec: /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) * %rdx: src (16 blocks) * %rcx: offset * %r8 : checksum * %r9 : L pointers (void *L[16]) */ CFI_STARTPROC(); pushq %rbp; CFI_PUSH(%rbp); movq %rsp, %rbp; CFI_DEF_CFA_REGISTER(%rbp); vzeroupper; subq $(16 * 16 + 4 * 8), %rsp; andq $~31, %rsp; movq %rsp, %rax; movq %r10, (16 * 16 + 0 * 8)(%rsp); movq %r11, (16 * 16 + 1 * 8)(%rsp); movq %r12, (16 * 16 + 2 * 8)(%rsp); movq %r13, (16 * 16 + 3 * 8)(%rsp); CFI_REG_ON_STACK(r10, 16 * 16 + 0 * 8); CFI_REG_ON_STACK(r11, 16 * 16 + 1 * 8); CFI_REG_ON_STACK(r12, 16 * 16 + 2 * 8); CFI_REG_ON_STACK(r13, 16 * 16 + 3 * 8); vmovdqu (%rcx), %xmm15; /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */ #define OCB_INPUT(n, lreg, xreg) \ vmovdqu (n * 16)(%rdx), xreg; \ vpxor (lreg), %xmm15, %xmm15; \ vpxor xreg, %xmm15, xreg; \ vmovdqu %xmm15, (n * 16)(%rsi); movq (0 * 8)(%r9), %r10; movq (1 * 8)(%r9), %r11; movq (2 * 8)(%r9), %r12; movq (3 * 8)(%r9), %r13; OCB_INPUT(0, %r10, %xmm0); vmovdqu %xmm0, (15 * 16)(%rax); OCB_INPUT(1, %r11, %xmm14); OCB_INPUT(2, %r12, %xmm13); OCB_INPUT(3, %r13, %xmm12); movq (4 * 8)(%r9), %r10; movq (5 * 8)(%r9), %r11; movq (6 * 8)(%r9), %r12; movq (7 * 8)(%r9), %r13; OCB_INPUT(4, %r10, %xmm11); OCB_INPUT(5, %r11, %xmm10); OCB_INPUT(6, %r12, %xmm9); OCB_INPUT(7, %r13, %xmm8); movq (8 * 8)(%r9), %r10; movq (9 * 8)(%r9), %r11; movq (10 * 8)(%r9), %r12; movq (11 * 8)(%r9), %r13; OCB_INPUT(8, %r10, %xmm7); OCB_INPUT(9, %r11, %xmm6); OCB_INPUT(10, %r12, %xmm5); OCB_INPUT(11, %r13, %xmm4); movq (12 * 8)(%r9), %r10; movq (13 * 8)(%r9), %r11; movq (14 * 8)(%r9), %r12; movq (15 * 8)(%r9), %r13; OCB_INPUT(12, %r10, %xmm3); OCB_INPUT(13, %r11, %xmm2); OCB_INPUT(14, %r12, %xmm1); OCB_INPUT(15, %r13, %xmm0); #undef OCB_INPUT vmovdqu %xmm15, (%rcx); movq %r8, %r10; cmpl $128, key_bitlength(CTX); movl $32, %r8d; movl $24, %r9d; cmovel %r9d, %r8d; /* max */ /* inpack16_pre: */ vmovq (key_table)(CTX, %r8, 8), %xmm15; vpshufb .Lpack_bswap rRIP, %xmm15, %xmm15; vpxor %xmm0, %xmm15, %xmm0; vpxor %xmm1, %xmm15, %xmm1; vpxor %xmm2, %xmm15, %xmm2; vpxor %xmm3, %xmm15, %xmm3; vpxor %xmm4, %xmm15, %xmm4; vpxor %xmm5, %xmm15, %xmm5; vpxor %xmm6, %xmm15, %xmm6; vpxor %xmm7, %xmm15, %xmm7; vpxor %xmm8, %xmm15, %xmm8; vpxor %xmm9, %xmm15, %xmm9; vpxor %xmm10, %xmm15, %xmm10; vpxor %xmm11, %xmm15, %xmm11; vpxor %xmm12, %xmm15, %xmm12; vpxor %xmm13, %xmm15, %xmm13; vpxor %xmm14, %xmm15, %xmm14; vpxor 15 * 16(%rax), %xmm15, %xmm15; call __camellia_dec_blk16; vpxor 0 * 16(%rsi), %xmm7, %xmm7; vpxor 1 * 16(%rsi), %xmm6, %xmm6; vpxor 2 * 16(%rsi), %xmm5, %xmm5; vpxor 3 * 16(%rsi), %xmm4, %xmm4; vpxor 4 * 16(%rsi), %xmm3, %xmm3; vpxor 5 * 16(%rsi), %xmm2, %xmm2; vpxor 6 * 16(%rsi), %xmm1, %xmm1; vpxor 7 * 16(%rsi), %xmm0, %xmm0; vmovdqu %xmm7, (7 * 16)(%rax); vpxor 8 * 16(%rsi), %xmm15, %xmm15; vpxor 9 * 16(%rsi), %xmm14, %xmm14; vpxor 10 * 16(%rsi), %xmm13, %xmm13; vpxor 
11 * 16(%rsi), %xmm12, %xmm12; vpxor 12 * 16(%rsi), %xmm11, %xmm11; vpxor 13 * 16(%rsi), %xmm10, %xmm10; vpxor 14 * 16(%rsi), %xmm9, %xmm9; vpxor 15 * 16(%rsi), %xmm8, %xmm8; /* Checksum_i = Checksum_{i-1} xor P_i */ vpxor (%r10), %xmm7, %xmm7; vpxor %xmm6, %xmm7, %xmm7; vpxor %xmm5, %xmm7, %xmm7; vpxor %xmm4, %xmm7, %xmm7; vpxor %xmm3, %xmm7, %xmm7; vpxor %xmm2, %xmm7, %xmm7; vpxor %xmm1, %xmm7, %xmm7; vpxor %xmm0, %xmm7, %xmm7; vpxor %xmm15, %xmm7, %xmm7; vpxor %xmm14, %xmm7, %xmm7; vpxor %xmm13, %xmm7, %xmm7; vpxor %xmm12, %xmm7, %xmm7; vpxor %xmm11, %xmm7, %xmm7; vpxor %xmm10, %xmm7, %xmm7; vpxor %xmm9, %xmm7, %xmm7; vpxor %xmm8, %xmm7, %xmm7; vmovdqu %xmm7, (%r10); vmovdqu (7 * 16)(%rax), %xmm7; write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0, %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9, %xmm8, %rsi); vzeroall; movq (16 * 16 + 0 * 8)(%rsp), %r10; movq (16 * 16 + 1 * 8)(%rsp), %r11; movq (16 * 16 + 2 * 8)(%rsp), %r12; movq (16 * 16 + 3 * 8)(%rsp), %r13; CFI_RESTORE(%r10); CFI_RESTORE(%r11); CFI_RESTORE(%r12); CFI_RESTORE(%r13); leave; CFI_LEAVE(); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_camellia_aesni_avx_ocb_dec,.-_gcry_camellia_aesni_avx_ocb_dec;) .align 8 .globl _gcry_camellia_aesni_avx_ocb_auth ELF(.type _gcry_camellia_aesni_avx_ocb_auth,@function;) _gcry_camellia_aesni_avx_ocb_auth: /* input: * %rdi: ctx, CTX * %rsi: abuf (16 blocks) * %rdx: offset * %rcx: checksum * %r8 : L pointers (void *L[16]) */ CFI_STARTPROC(); pushq %rbp; CFI_PUSH(%rbp); movq %rsp, %rbp; CFI_DEF_CFA_REGISTER(%rbp); vzeroupper; subq $(16 * 16 + 4 * 8), %rsp; andq $~31, %rsp; movq %rsp, %rax; movq %r10, (16 * 16 + 0 * 8)(%rsp); movq %r11, (16 * 16 + 1 * 8)(%rsp); movq %r12, (16 * 16 + 2 * 8)(%rsp); movq %r13, (16 * 16 + 3 * 8)(%rsp); CFI_REG_ON_STACK(r10, 16 * 16 + 0 * 8); CFI_REG_ON_STACK(r11, 16 * 16 + 1 * 8); CFI_REG_ON_STACK(r12, 16 * 16 + 2 * 8); CFI_REG_ON_STACK(r13, 16 * 16 + 3 * 8); vmovdqu (%rdx), %xmm15; /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ #define OCB_INPUT(n, lreg, xreg) \ vmovdqu (n * 16)(%rsi), xreg; \ vpxor (lreg), %xmm15, %xmm15; \ vpxor xreg, %xmm15, xreg; movq (0 * 8)(%r8), %r10; movq (1 * 8)(%r8), %r11; movq (2 * 8)(%r8), %r12; movq (3 * 8)(%r8), %r13; OCB_INPUT(0, %r10, %xmm0); vmovdqu %xmm0, (15 * 16)(%rax); OCB_INPUT(1, %r11, %xmm14); OCB_INPUT(2, %r12, %xmm13); OCB_INPUT(3, %r13, %xmm12); movq (4 * 8)(%r8), %r10; movq (5 * 8)(%r8), %r11; movq (6 * 8)(%r8), %r12; movq (7 * 8)(%r8), %r13; OCB_INPUT(4, %r10, %xmm11); OCB_INPUT(5, %r11, %xmm10); OCB_INPUT(6, %r12, %xmm9); OCB_INPUT(7, %r13, %xmm8); movq (8 * 8)(%r8), %r10; movq (9 * 8)(%r8), %r11; movq (10 * 8)(%r8), %r12; movq (11 * 8)(%r8), %r13; OCB_INPUT(8, %r10, %xmm7); OCB_INPUT(9, %r11, %xmm6); OCB_INPUT(10, %r12, %xmm5); OCB_INPUT(11, %r13, %xmm4); movq (12 * 8)(%r8), %r10; movq (13 * 8)(%r8), %r11; movq (14 * 8)(%r8), %r12; movq (15 * 8)(%r8), %r13; OCB_INPUT(12, %r10, %xmm3); OCB_INPUT(13, %r11, %xmm2); OCB_INPUT(14, %r12, %xmm1); OCB_INPUT(15, %r13, %xmm0); #undef OCB_INPUT cmpl $128, key_bitlength(CTX); movl $32, %r8d; movl $24, %r10d; cmovel %r10d, %r8d; /* max */ vmovdqu %xmm15, (%rdx); movq %rcx, %r10; /* inpack16_pre: */ vmovq (key_table)(CTX), %xmm15; vpshufb .Lpack_bswap rRIP, %xmm15, %xmm15; vpxor %xmm0, %xmm15, %xmm0; vpxor %xmm1, %xmm15, %xmm1; vpxor %xmm2, %xmm15, %xmm2; vpxor %xmm3, %xmm15, %xmm3; vpxor %xmm4, %xmm15, %xmm4; vpxor %xmm5, %xmm15, %xmm5; vpxor %xmm6, %xmm15, %xmm6; vpxor %xmm7, %xmm15, %xmm7; 
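	/*
	 * Sketch of this unrolled "inpack16_pre" block (the names 'prewhite'
	 * and 'block' are illustrative only): %xmm15 holds the first 64-bit
	 * entry of key_table, byte-shuffled by .Lpack_bswap with its upper
	 * half cleared, and is XORed into all 16 input blocks before
	 * __camellia_enc_blk16 is called.  In C terms, roughly:
	 *
	 *   for (int i = 0; i < 16; i++)
	 *     block[i] ^= prewhite;   // one 128-bit XOR per block
	 */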
vpxor %xmm8, %xmm15, %xmm8; vpxor %xmm9, %xmm15, %xmm9; vpxor %xmm10, %xmm15, %xmm10; vpxor %xmm11, %xmm15, %xmm11; vpxor %xmm12, %xmm15, %xmm12; vpxor %xmm13, %xmm15, %xmm13; vpxor %xmm14, %xmm15, %xmm14; vpxor 15 * 16(%rax), %xmm15, %xmm15; call __camellia_enc_blk16; vpxor %xmm7, %xmm6, %xmm6; vpxor %xmm5, %xmm4, %xmm4; vpxor %xmm3, %xmm2, %xmm2; vpxor %xmm1, %xmm0, %xmm0; vpxor %xmm15, %xmm14, %xmm14; vpxor %xmm13, %xmm12, %xmm12; vpxor %xmm11, %xmm10, %xmm10; vpxor %xmm9, %xmm8, %xmm8; vpxor %xmm6, %xmm4, %xmm4; vpxor %xmm2, %xmm0, %xmm0; vpxor %xmm14, %xmm12, %xmm12; vpxor %xmm10, %xmm8, %xmm8; vpxor %xmm4, %xmm0, %xmm0; vpxor %xmm12, %xmm8, %xmm8; vpxor %xmm0, %xmm8, %xmm0; vpxor (%r10), %xmm0, %xmm0; vmovdqu %xmm0, (%r10); vzeroall; movq (16 * 16 + 0 * 8)(%rsp), %r10; movq (16 * 16 + 1 * 8)(%rsp), %r11; movq (16 * 16 + 2 * 8)(%rsp), %r12; movq (16 * 16 + 3 * 8)(%rsp), %r13; CFI_RESTORE(%r10); CFI_RESTORE(%r11); CFI_RESTORE(%r12); CFI_RESTORE(%r13); leave; CFI_LEAVE(); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_camellia_aesni_avx_ocb_auth,.-_gcry_camellia_aesni_avx_ocb_auth;) /* * IN: * ab: 64-bit AB state * cd: 64-bit CD state */ #define camellia_f(ab, x, t0, t1, t2, t3, t4, inv_shift_row, sbox4mask, \ _0f0f0f0fmask, pre_s1lo_mask, pre_s1hi_mask, key) \ vmovq key, t0; \ vpxor x, x, t3; \ \ vpxor ab, t0, x; \ \ /* \ * S-function with AES subbytes \ */ \ \ /* input rotation for sbox4 (<<< 1) */ \ vpand x, sbox4mask, t0; \ vpandn x, sbox4mask, x; \ vpaddw t0, t0, t1; \ vpsrlw $7, t0, t0; \ vpor t0, t1, t0; \ vpand sbox4mask, t0, t0; \ vpor t0, x, x; \ \ vmovdqa .Lpost_tf_lo_s1 rRIP, t0; \ vmovdqa .Lpost_tf_hi_s1 rRIP, t1; \ \ /* prefilter sboxes */ \ filter_8bit(x, pre_s1lo_mask, pre_s1hi_mask, _0f0f0f0fmask, t2); \ \ /* AES subbytes + AES shift rows + AES inv shift rows */ \ vaesenclast t3, x, x; \ \ /* postfilter sboxes */ \ filter_8bit(x, t0, t1, _0f0f0f0fmask, t2); \ \ /* output rotation for sbox2 (<<< 1) */ \ /* output rotation for sbox3 (>>> 1) */ \ vpshufb inv_shift_row, x, t1; \ vpshufb .Lsp0044440444044404mask rRIP, x, t4; \ vpshufb .Lsp1110111010011110mask rRIP, x, x; \ vpaddb t1, t1, t2; \ vpsrlw $7, t1, t0; \ vpsllw $7, t1, t3; \ vpor t0, t2, t0; \ vpsrlw $1, t1, t1; \ vpshufb .Lsp0222022222000222mask rRIP, t0, t0; \ vpor t1, t3, t1; \ \ vpxor x, t4, t4; \ vpshufb .Lsp3033303303303033mask rRIP, t1, t1; \ vpxor t4, t0, t0; \ vpxor t1, t0, t0; \ vpsrldq $8, t0, x; \ vpxor t0, x, x; #define vec_rol128(in, out, nrol, t0) \ vpshufd $0x4e, in, out; \ vpsllq $(nrol), in, t0; \ vpsrlq $(64-(nrol)), out, out; \ vpaddd t0, out, out; #define vec_ror128(in, out, nror, t0) \ vpshufd $0x4e, in, out; \ vpsrlq $(nror), in, t0; \ vpsllq $(64-(nror)), out, out; \ vpaddd t0, out, out; .align 16 .Linv_shift_row_and_unpcklbw: .byte 0x00, 0xff, 0x0d, 0xff, 0x0a, 0xff, 0x07, 0xff .byte 0x04, 0xff, 0x01, 0xff, 0x0e, 0xff, 0x0b, 0xff .Lsp0044440444044404mask: .long 0xffff0404, 0x0404ff04; .long 0x0d0dff0d, 0x0d0dff0d; .Lsp1110111010011110mask: .long 0x000000ff, 0x000000ff; .long 0x0bffff0b, 0x0b0b0bff; .Lsp0222022222000222mask: .long 0xff060606, 0xff060606; .long 0x0c0cffff, 0xff0c0c0c; .Lsp3033303303303033mask: .long 0x04ff0404, 0x04ff0404; .long 0xff0a0aff, 0x0aff0a0a; .Lsbox4_input_mask: .byte 0x00, 0xff, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00; .Lsigma1: .long 0x3BCC908B, 0xA09E667F; .Lsigma2: .long 0x4CAA73B2, 0xB67AE858; .Lsigma3: .long 0xE94F82BE, 0xC6EF372F; .Lsigma4: .long 0xF1D36F1C, 0x54FF53A5; .Lsigma5: .long 0xDE682D1D, 0x10E527FA; .Lsigma6: .long 0xB3E6C1FD, 0xB05688C2; .align 
8 ELF(.type __camellia_avx_setup128,@function;) __camellia_avx_setup128: /* input: * %rdi: ctx, CTX; subkey storage at key_table(CTX) * %xmm0: key */ CFI_STARTPROC(); #define cmll_sub(n, ctx) (key_table+((n)*8))(ctx) #define KL128 %xmm0 #define KA128 %xmm2 vpshufb .Lbswap128_mask rRIP, KL128, KL128; vmovdqa .Linv_shift_row_and_unpcklbw rRIP, %xmm11; vmovq .Lsbox4_input_mask rRIP, %xmm12; vbroadcastss .L0f0f0f0f rRIP, %xmm13; vmovdqa .Lpre_tf_lo_s1 rRIP, %xmm14; vmovdqa .Lpre_tf_hi_s1 rRIP, %xmm15; /* * Generate KA */ vpsrldq $8, KL128, %xmm2; vmovdqa KL128, %xmm3; vpslldq $8, %xmm3, %xmm3; vpsrldq $8, %xmm3, %xmm3; camellia_f(%xmm2, %xmm4, %xmm1, %xmm5, %xmm6, %xmm7, %xmm8, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma1 rRIP); vpxor %xmm4, %xmm3, %xmm3; camellia_f(%xmm3, %xmm2, %xmm1, %xmm5, %xmm6, %xmm7, %xmm8, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma2 rRIP); camellia_f(%xmm2, %xmm3, %xmm1, %xmm5, %xmm6, %xmm7, %xmm8, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma3 rRIP); vpxor %xmm4, %xmm3, %xmm3; camellia_f(%xmm3, %xmm4, %xmm1, %xmm5, %xmm6, %xmm7, %xmm8, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma4 rRIP); vpslldq $8, %xmm3, %xmm3; vpxor %xmm4, %xmm2, %xmm2; vpsrldq $8, %xmm3, %xmm3; vpslldq $8, %xmm2, KA128; vpor %xmm3, KA128, KA128; /* * Generate subkeys */ vmovdqu KA128, cmll_sub(24, CTX); vec_rol128(KL128, %xmm3, 15, %xmm15); vec_rol128(KA128, %xmm4, 15, %xmm15); vec_rol128(KA128, %xmm5, 30, %xmm15); vec_rol128(KL128, %xmm6, 45, %xmm15); vec_rol128(KA128, %xmm7, 45, %xmm15); vec_rol128(KL128, %xmm8, 60, %xmm15); vec_rol128(KA128, %xmm9, 60, %xmm15); vec_ror128(KL128, %xmm10, 128-77, %xmm15); /* absorb kw2 to other subkeys */ vpslldq $8, KL128, %xmm15; vpsrldq $8, %xmm15, %xmm15; vpxor %xmm15, KA128, KA128; vpxor %xmm15, %xmm3, %xmm3; vpxor %xmm15, %xmm4, %xmm4; /* subl(1) ^= subr(1) & ~subr(9); */ vpandn %xmm15, %xmm5, %xmm13; vpslldq $12, %xmm13, %xmm13; vpsrldq $8, %xmm13, %xmm13; vpxor %xmm13, %xmm15, %xmm15; /* dw = subl(1) & subl(9), subr(1) ^= CAMELLIA_RL1(dw); */ vpand %xmm15, %xmm5, %xmm14; vpslld $1, %xmm14, %xmm11; vpsrld $31, %xmm14, %xmm14; vpaddd %xmm11, %xmm14, %xmm14; vpslldq $8, %xmm14, %xmm14; vpsrldq $12, %xmm14, %xmm14; vpxor %xmm14, %xmm15, %xmm15; vpxor %xmm15, %xmm6, %xmm6; vpxor %xmm15, %xmm8, %xmm8; vpxor %xmm15, %xmm9, %xmm9; /* subl(1) ^= subr(1) & ~subr(17); */ vpandn %xmm15, %xmm10, %xmm13; vpslldq $12, %xmm13, %xmm13; vpsrldq $8, %xmm13, %xmm13; vpxor %xmm13, %xmm15, %xmm15; /* dw = subl(1) & subl(17), subr(1) ^= CAMELLIA_RL1(dw); */ vpand %xmm15, %xmm10, %xmm14; vpslld $1, %xmm14, %xmm11; vpsrld $31, %xmm14, %xmm14; vpaddd %xmm11, %xmm14, %xmm14; vpslldq $8, %xmm14, %xmm14; vpsrldq $12, %xmm14, %xmm14; vpxor %xmm14, %xmm15, %xmm15; vpshufd $0x1b, KL128, KL128; vpshufd $0x1b, KA128, KA128; vpshufd $0x1b, %xmm3, %xmm3; vpshufd $0x1b, %xmm4, %xmm4; vpshufd $0x1b, %xmm5, %xmm5; vpshufd $0x1b, %xmm6, %xmm6; vpshufd $0x1b, %xmm7, %xmm7; vpshufd $0x1b, %xmm8, %xmm8; vpshufd $0x1b, %xmm9, %xmm9; vpshufd $0x1b, %xmm10, %xmm10; vmovdqu KL128, cmll_sub(0, CTX); vpshufd $0x1b, KL128, KL128; vmovdqu KA128, cmll_sub(2, CTX); vmovdqu %xmm3, cmll_sub(4, CTX); vmovdqu %xmm4, cmll_sub(6, CTX); vmovdqu %xmm5, cmll_sub(8, CTX); vmovdqu %xmm6, cmll_sub(10, CTX); vpsrldq $8, %xmm8, %xmm8; vmovq %xmm7, cmll_sub(12, CTX); vmovq %xmm8, cmll_sub(13, CTX); vmovdqu %xmm9, cmll_sub(14, CTX); vmovdqu %xmm10, cmll_sub(16, CTX); vmovdqu cmll_sub(24, CTX), KA128; vec_ror128(KL128, %xmm3, 128 - 94, %xmm7); vec_ror128(KA128, %xmm4, 128 - 94, %xmm7); vec_ror128(KL128, %xmm5, 
128 - 111, %xmm7); vec_ror128(KA128, %xmm6, 128 - 111, %xmm7); vpxor %xmm15, %xmm3, %xmm3; vpxor %xmm15, %xmm4, %xmm4; vpxor %xmm15, %xmm5, %xmm5; vpslldq $8, %xmm15, %xmm15; vpxor %xmm15, %xmm6, %xmm6; /* absorb kw4 to other subkeys */ vpslldq $8, %xmm6, %xmm15; vpxor %xmm15, %xmm5, %xmm5; vpxor %xmm15, %xmm4, %xmm4; vpxor %xmm15, %xmm3, %xmm3; /* subl(25) ^= subr(25) & ~subr(16); */ vpshufd $0x1b, cmll_sub(16, CTX), %xmm10; vpandn %xmm15, %xmm10, %xmm13; vpslldq $4, %xmm13, %xmm13; vpxor %xmm13, %xmm15, %xmm15; /* dw = subl(25) & subl(16), subr(25) ^= CAMELLIA_RL1(dw); */ vpand %xmm15, %xmm10, %xmm14; vpslld $1, %xmm14, %xmm11; vpsrld $31, %xmm14, %xmm14; vpaddd %xmm11, %xmm14, %xmm14; vpsrldq $12, %xmm14, %xmm14; vpslldq $8, %xmm14, %xmm14; vpxor %xmm14, %xmm15, %xmm15; vpshufd $0x1b, %xmm3, %xmm3; vpshufd $0x1b, %xmm4, %xmm4; vpshufd $0x1b, %xmm5, %xmm5; vpshufd $0x1b, %xmm6, %xmm6; vmovdqu %xmm3, cmll_sub(18, CTX); vmovdqu %xmm4, cmll_sub(20, CTX); vmovdqu %xmm5, cmll_sub(22, CTX); vmovdqu %xmm6, cmll_sub(24, CTX); vpshufd $0x1b, cmll_sub(14, CTX), %xmm3; vpshufd $0x1b, cmll_sub(12, CTX), %xmm4; vpshufd $0x1b, cmll_sub(10, CTX), %xmm5; vpshufd $0x1b, cmll_sub(8, CTX), %xmm6; vpxor %xmm15, %xmm3, %xmm3; vpxor %xmm15, %xmm4, %xmm4; vpxor %xmm15, %xmm5, %xmm5; /* subl(25) ^= subr(25) & ~subr(8); */ vpandn %xmm15, %xmm6, %xmm13; vpslldq $4, %xmm13, %xmm13; vpxor %xmm13, %xmm15, %xmm15; /* dw = subl(25) & subl(8), subr(25) ^= CAMELLIA_RL1(dw); */ vpand %xmm15, %xmm6, %xmm14; vpslld $1, %xmm14, %xmm11; vpsrld $31, %xmm14, %xmm14; vpaddd %xmm11, %xmm14, %xmm14; vpsrldq $12, %xmm14, %xmm14; vpslldq $8, %xmm14, %xmm14; vpxor %xmm14, %xmm15, %xmm15; vpshufd $0x1b, %xmm3, %xmm3; vpshufd $0x1b, %xmm4, %xmm4; vpshufd $0x1b, %xmm5, %xmm5; vmovdqu %xmm3, cmll_sub(14, CTX); vmovdqu %xmm4, cmll_sub(12, CTX); vmovdqu %xmm5, cmll_sub(10, CTX); vpshufd $0x1b, cmll_sub(6, CTX), %xmm6; vpshufd $0x1b, cmll_sub(4, CTX), %xmm4; vpshufd $0x1b, cmll_sub(2, CTX), %xmm2; vpshufd $0x1b, cmll_sub(0, CTX), %xmm0; vpxor %xmm15, %xmm6, %xmm6; vpxor %xmm15, %xmm4, %xmm4; vpxor %xmm15, %xmm2, %xmm2; vpxor %xmm15, %xmm0, %xmm0; vpshufd $0x1b, %xmm6, %xmm6; vpshufd $0x1b, %xmm4, %xmm4; vpshufd $0x1b, %xmm2, %xmm2; vpshufd $0x1b, %xmm0, %xmm0; vpsrldq $8, %xmm2, %xmm3; vpsrldq $8, %xmm4, %xmm5; vpsrldq $8, %xmm6, %xmm7; /* * key XOR is end of F-function. 
*/ vpxor %xmm2, %xmm0, %xmm0; vpxor %xmm4, %xmm2, %xmm2; vmovq %xmm0, cmll_sub(0, CTX); vmovq %xmm3, cmll_sub(2, CTX); vpxor %xmm5, %xmm3, %xmm3; vpxor %xmm6, %xmm4, %xmm4; vpxor %xmm7, %xmm5, %xmm5; vmovq %xmm2, cmll_sub(3, CTX); vmovq %xmm3, cmll_sub(4, CTX); vmovq %xmm4, cmll_sub(5, CTX); vmovq %xmm5, cmll_sub(6, CTX); vmovq cmll_sub(7, CTX), %xmm7; vmovq cmll_sub(8, CTX), %xmm8; vmovq cmll_sub(9, CTX), %xmm9; vmovq cmll_sub(10, CTX), %xmm10; /* tl = subl(10) ^ (subr(10) & ~subr(8)); */ vpandn %xmm10, %xmm8, %xmm15; vpsrldq $4, %xmm15, %xmm15; vpxor %xmm15, %xmm10, %xmm0; /* dw = tl & subl(8), tr = subr(10) ^ CAMELLIA_RL1(dw); */ vpand %xmm8, %xmm0, %xmm15; vpslld $1, %xmm15, %xmm14; vpsrld $31, %xmm15, %xmm15; vpaddd %xmm14, %xmm15, %xmm15; vpslldq $12, %xmm15, %xmm15; vpsrldq $8, %xmm15, %xmm15; vpxor %xmm15, %xmm0, %xmm0; vpxor %xmm0, %xmm6, %xmm6; vmovq %xmm6, cmll_sub(7, CTX); vmovq cmll_sub(11, CTX), %xmm11; vmovq cmll_sub(12, CTX), %xmm12; vmovq cmll_sub(13, CTX), %xmm13; vmovq cmll_sub(14, CTX), %xmm14; vmovq cmll_sub(15, CTX), %xmm15; /* tl = subl(7) ^ (subr(7) & ~subr(9)); */ vpandn %xmm7, %xmm9, %xmm1; vpsrldq $4, %xmm1, %xmm1; vpxor %xmm1, %xmm7, %xmm0; /* dw = tl & subl(9), tr = subr(7) ^ CAMELLIA_RL1(dw); */ vpand %xmm9, %xmm0, %xmm1; vpslld $1, %xmm1, %xmm2; vpsrld $31, %xmm1, %xmm1; vpaddd %xmm2, %xmm1, %xmm1; vpslldq $12, %xmm1, %xmm1; vpsrldq $8, %xmm1, %xmm1; vpxor %xmm1, %xmm0, %xmm0; vpxor %xmm11, %xmm0, %xmm0; vpxor %xmm12, %xmm10, %xmm10; vpxor %xmm13, %xmm11, %xmm11; vpxor %xmm14, %xmm12, %xmm12; vpxor %xmm15, %xmm13, %xmm13; vmovq %xmm0, cmll_sub(10, CTX); vmovq %xmm10, cmll_sub(11, CTX); vmovq %xmm11, cmll_sub(12, CTX); vmovq %xmm12, cmll_sub(13, CTX); vmovq %xmm13, cmll_sub(14, CTX); vmovq cmll_sub(16, CTX), %xmm6; vmovq cmll_sub(17, CTX), %xmm7; vmovq cmll_sub(18, CTX), %xmm8; vmovq cmll_sub(19, CTX), %xmm9; vmovq cmll_sub(20, CTX), %xmm10; /* tl = subl(18) ^ (subr(18) & ~subr(16)); */ vpandn %xmm8, %xmm6, %xmm1; vpsrldq $4, %xmm1, %xmm1; vpxor %xmm1, %xmm8, %xmm0; /* dw = tl & subl(16), tr = subr(18) ^ CAMELLIA_RL1(dw); */ vpand %xmm6, %xmm0, %xmm1; vpslld $1, %xmm1, %xmm2; vpsrld $31, %xmm1, %xmm1; vpaddd %xmm2, %xmm1, %xmm1; vpslldq $12, %xmm1, %xmm1; vpsrldq $8, %xmm1, %xmm1; vpxor %xmm1, %xmm0, %xmm0; vpxor %xmm14, %xmm0, %xmm0; vmovq %xmm0, cmll_sub(15, CTX); /* tl = subl(15) ^ (subr(15) & ~subr(17)); */ vpandn %xmm15, %xmm7, %xmm1; vpsrldq $4, %xmm1, %xmm1; vpxor %xmm1, %xmm15, %xmm0; /* dw = tl & subl(17), tr = subr(15) ^ CAMELLIA_RL1(dw); */ vpand %xmm7, %xmm0, %xmm1; vpslld $1, %xmm1, %xmm2; vpsrld $31, %xmm1, %xmm1; vpaddd %xmm2, %xmm1, %xmm1; vpslldq $12, %xmm1, %xmm1; vpsrldq $8, %xmm1, %xmm1; vpxor %xmm1, %xmm0, %xmm0; vmovq cmll_sub(21, CTX), %xmm1; vmovq cmll_sub(22, CTX), %xmm2; vmovq cmll_sub(23, CTX), %xmm3; vmovq cmll_sub(24, CTX), %xmm4; vpxor %xmm9, %xmm0, %xmm0; vpxor %xmm10, %xmm8, %xmm8; vpxor %xmm1, %xmm9, %xmm9; vpxor %xmm2, %xmm10, %xmm10; vpxor %xmm3, %xmm1, %xmm1; vpxor %xmm4, %xmm3, %xmm3; vmovq %xmm0, cmll_sub(18, CTX); vmovq %xmm8, cmll_sub(19, CTX); vmovq %xmm9, cmll_sub(20, CTX); vmovq %xmm10, cmll_sub(21, CTX); vmovq %xmm1, cmll_sub(22, CTX); vmovq %xmm2, cmll_sub(23, CTX); vmovq %xmm3, cmll_sub(24, CTX); /* kw2 and kw4 are unused now. 
*/ movq $0, cmll_sub(1, CTX); movq $0, cmll_sub(25, CTX); vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size __camellia_avx_setup128,.-__camellia_avx_setup128;) .align 8 ELF(.type __camellia_avx_setup256,@function;) __camellia_avx_setup256: /* input: * %rdi: ctx, CTX; subkey storage at key_table(CTX) * %xmm0 & %xmm1: key */ CFI_STARTPROC(); #define KL128 %xmm0 #define KR128 %xmm1 #define KA128 %xmm2 #define KB128 %xmm3 vpshufb .Lbswap128_mask rRIP, KL128, KL128; vpshufb .Lbswap128_mask rRIP, KR128, KR128; vmovdqa .Linv_shift_row_and_unpcklbw rRIP, %xmm11; vmovq .Lsbox4_input_mask rRIP, %xmm12; vbroadcastss .L0f0f0f0f rRIP, %xmm13; vmovdqa .Lpre_tf_lo_s1 rRIP, %xmm14; vmovdqa .Lpre_tf_hi_s1 rRIP, %xmm15; /* * Generate KA */ vpxor KL128, KR128, %xmm3; vpsrldq $8, KR128, %xmm6; vpsrldq $8, %xmm3, %xmm2; vpslldq $8, %xmm3, %xmm3; vpsrldq $8, %xmm3, %xmm3; camellia_f(%xmm2, %xmm4, %xmm5, %xmm7, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma1 rRIP); vpxor %xmm4, %xmm3, %xmm3; camellia_f(%xmm3, %xmm2, %xmm5, %xmm7, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma2 rRIP); vpxor %xmm6, %xmm2, %xmm2; camellia_f(%xmm2, %xmm3, %xmm5, %xmm7, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma3 rRIP); vpxor %xmm4, %xmm3, %xmm3; vpxor KR128, %xmm3, %xmm3; camellia_f(%xmm3, %xmm4, %xmm5, %xmm7, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma4 rRIP); vpslldq $8, %xmm3, %xmm3; vpxor %xmm4, %xmm2, %xmm2; vpsrldq $8, %xmm3, %xmm3; vpslldq $8, %xmm2, KA128; vpor %xmm3, KA128, KA128; /* * Generate KB */ vpxor KA128, KR128, %xmm3; vpsrldq $8, %xmm3, %xmm4; vpslldq $8, %xmm3, %xmm3; vpsrldq $8, %xmm3, %xmm3; camellia_f(%xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma5 rRIP); vpxor %xmm5, %xmm3, %xmm3; camellia_f(%xmm3, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma6 rRIP); vpslldq $8, %xmm3, %xmm3; vpxor %xmm5, %xmm4, %xmm4; vpsrldq $8, %xmm3, %xmm3; vpslldq $8, %xmm4, %xmm4; vpor %xmm3, %xmm4, KB128; /* * Generate subkeys */ vmovdqu KB128, cmll_sub(32, CTX); vec_rol128(KR128, %xmm4, 15, %xmm15); vec_rol128(KA128, %xmm5, 15, %xmm15); vec_rol128(KR128, %xmm6, 30, %xmm15); vec_rol128(KB128, %xmm7, 30, %xmm15); vec_rol128(KL128, %xmm8, 45, %xmm15); vec_rol128(KA128, %xmm9, 45, %xmm15); vec_rol128(KL128, %xmm10, 60, %xmm15); vec_rol128(KR128, %xmm11, 60, %xmm15); vec_rol128(KB128, %xmm12, 60, %xmm15); /* absorb kw2 to other subkeys */ vpslldq $8, KL128, %xmm15; vpsrldq $8, %xmm15, %xmm15; vpxor %xmm15, KB128, KB128; vpxor %xmm15, %xmm4, %xmm4; vpxor %xmm15, %xmm5, %xmm5; /* subl(1) ^= subr(1) & ~subr(9); */ vpandn %xmm15, %xmm6, %xmm13; vpslldq $12, %xmm13, %xmm13; vpsrldq $8, %xmm13, %xmm13; vpxor %xmm13, %xmm15, %xmm15; /* dw = subl(1) & subl(9), subr(1) ^= CAMELLIA_RL1(dw); */ vpand %xmm15, %xmm6, %xmm14; vpslld $1, %xmm14, %xmm13; vpsrld $31, %xmm14, %xmm14; vpaddd %xmm13, %xmm14, %xmm14; vpslldq $8, %xmm14, %xmm14; vpsrldq $12, %xmm14, %xmm14; vpxor %xmm14, %xmm15, %xmm15; vpxor %xmm15, %xmm7, %xmm7; vpxor %xmm15, %xmm8, %xmm8; vpxor %xmm15, %xmm9, %xmm9; vpshufd $0x1b, KL128, KL128; vpshufd $0x1b, KB128, KB128; vpshufd $0x1b, %xmm4, %xmm4; vpshufd $0x1b, %xmm5, %xmm5; vpshufd $0x1b, %xmm6, %xmm6; vpshufd $0x1b, %xmm7, %xmm7; vpshufd $0x1b, %xmm8, %xmm8; vpshufd $0x1b, %xmm9, %xmm9; vmovdqu KL128, cmll_sub(0, CTX); vpshufd $0x1b, KL128, KL128; vmovdqu KB128, cmll_sub(2, CTX); vmovdqu %xmm4, cmll_sub(4, CTX); vmovdqu %xmm5, 
cmll_sub(6, CTX); vmovdqu %xmm6, cmll_sub(8, CTX); vmovdqu %xmm7, cmll_sub(10, CTX); vmovdqu %xmm8, cmll_sub(12, CTX); vmovdqu %xmm9, cmll_sub(14, CTX); vmovdqu cmll_sub(32, CTX), KB128; /* subl(1) ^= subr(1) & ~subr(17); */ vpandn %xmm15, %xmm10, %xmm13; vpslldq $12, %xmm13, %xmm13; vpsrldq $8, %xmm13, %xmm13; vpxor %xmm13, %xmm15, %xmm15; /* dw = subl(1) & subl(17), subr(1) ^= CAMELLIA_RL1(dw); */ vpand %xmm15, %xmm10, %xmm14; vpslld $1, %xmm14, %xmm13; vpsrld $31, %xmm14, %xmm14; vpaddd %xmm13, %xmm14, %xmm14; vpslldq $8, %xmm14, %xmm14; vpsrldq $12, %xmm14, %xmm14; vpxor %xmm14, %xmm15, %xmm15; vpxor %xmm15, %xmm11, %xmm11; vpxor %xmm15, %xmm12, %xmm12; vec_ror128(KL128, %xmm4, 128-77, %xmm14); vec_ror128(KA128, %xmm5, 128-77, %xmm14); vec_ror128(KR128, %xmm6, 128-94, %xmm14); vec_ror128(KA128, %xmm7, 128-94, %xmm14); vec_ror128(KL128, %xmm8, 128-111, %xmm14); vec_ror128(KB128, %xmm9, 128-111, %xmm14); vpxor %xmm15, %xmm4, %xmm4; vpshufd $0x1b, %xmm10, %xmm10; vpshufd $0x1b, %xmm11, %xmm11; vpshufd $0x1b, %xmm12, %xmm12; vpshufd $0x1b, %xmm4, %xmm4; vmovdqu %xmm10, cmll_sub(16, CTX); vmovdqu %xmm11, cmll_sub(18, CTX); vmovdqu %xmm12, cmll_sub(20, CTX); vmovdqu %xmm4, cmll_sub(22, CTX); /* subl(1) ^= subr(1) & ~subr(25); */ vpandn %xmm15, %xmm5, %xmm13; vpslldq $12, %xmm13, %xmm13; vpsrldq $8, %xmm13, %xmm13; vpxor %xmm13, %xmm15, %xmm15; /* dw = subl(1) & subl(25), subr(1) ^= CAMELLIA_RL1(dw); */ vpand %xmm15, %xmm5, %xmm14; vpslld $1, %xmm14, %xmm13; vpsrld $31, %xmm14, %xmm14; vpaddd %xmm13, %xmm14, %xmm14; vpslldq $8, %xmm14, %xmm14; vpsrldq $12, %xmm14, %xmm14; vpxor %xmm14, %xmm15, %xmm15; vpxor %xmm15, %xmm6, %xmm6; vpxor %xmm15, %xmm7, %xmm7; vpxor %xmm15, %xmm8, %xmm8; vpslldq $8, %xmm15, %xmm15; vpxor %xmm15, %xmm9, %xmm9; /* absorb kw4 to other subkeys */ vpslldq $8, %xmm9, %xmm15; vpxor %xmm15, %xmm8, %xmm8; vpxor %xmm15, %xmm7, %xmm7; vpxor %xmm15, %xmm6, %xmm6; /* subl(33) ^= subr(33) & ~subr(24); */ vpandn %xmm15, %xmm5, %xmm14; vpslldq $4, %xmm14, %xmm14; vpxor %xmm14, %xmm15, %xmm15; /* dw = subl(33) & subl(24), subr(33) ^= CAMELLIA_RL1(dw); */ vpand %xmm15, %xmm5, %xmm14; vpslld $1, %xmm14, %xmm13; vpsrld $31, %xmm14, %xmm14; vpaddd %xmm13, %xmm14, %xmm14; vpsrldq $12, %xmm14, %xmm14; vpslldq $8, %xmm14, %xmm14; vpxor %xmm14, %xmm15, %xmm15; vpshufd $0x1b, %xmm5, %xmm5; vpshufd $0x1b, %xmm6, %xmm6; vpshufd $0x1b, %xmm7, %xmm7; vpshufd $0x1b, %xmm8, %xmm8; vpshufd $0x1b, %xmm9, %xmm9; vmovdqu %xmm5, cmll_sub(24, CTX); vmovdqu %xmm6, cmll_sub(26, CTX); vmovdqu %xmm7, cmll_sub(28, CTX); vmovdqu %xmm8, cmll_sub(30, CTX); vmovdqu %xmm9, cmll_sub(32, CTX); vpshufd $0x1b, cmll_sub(22, CTX), %xmm0; vpshufd $0x1b, cmll_sub(20, CTX), %xmm1; vpshufd $0x1b, cmll_sub(18, CTX), %xmm2; vpshufd $0x1b, cmll_sub(16, CTX), %xmm3; vpshufd $0x1b, cmll_sub(14, CTX), %xmm4; vpshufd $0x1b, cmll_sub(12, CTX), %xmm5; vpshufd $0x1b, cmll_sub(10, CTX), %xmm6; vpshufd $0x1b, cmll_sub(8, CTX), %xmm7; vpxor %xmm15, %xmm0, %xmm0; vpxor %xmm15, %xmm1, %xmm1; vpxor %xmm15, %xmm2, %xmm2; /* subl(33) ^= subr(33) & ~subr(24); */ vpandn %xmm15, %xmm3, %xmm14; vpslldq $4, %xmm14, %xmm14; vpxor %xmm14, %xmm15, %xmm15; /* dw = subl(33) & subl(24), subr(33) ^= CAMELLIA_RL1(dw); */ vpand %xmm15, %xmm3, %xmm14; vpslld $1, %xmm14, %xmm13; vpsrld $31, %xmm14, %xmm14; vpaddd %xmm13, %xmm14, %xmm14; vpsrldq $12, %xmm14, %xmm14; vpslldq $8, %xmm14, %xmm14; vpxor %xmm14, %xmm15, %xmm15; vpxor %xmm15, %xmm4, %xmm4; vpxor %xmm15, %xmm5, %xmm5; vpxor %xmm15, %xmm6, %xmm6; vpshufd $0x1b, %xmm0, %xmm0; vpshufd $0x1b, 
%xmm1, %xmm1; vpshufd $0x1b, %xmm2, %xmm2; vpshufd $0x1b, %xmm4, %xmm4; vpshufd $0x1b, %xmm5, %xmm5; vpshufd $0x1b, %xmm6, %xmm6; vmovdqu %xmm0, cmll_sub(22, CTX); vmovdqu %xmm1, cmll_sub(20, CTX); vmovdqu %xmm2, cmll_sub(18, CTX); vmovdqu %xmm4, cmll_sub(14, CTX); vmovdqu %xmm5, cmll_sub(12, CTX); vmovdqu %xmm6, cmll_sub(10, CTX); vpshufd $0x1b, cmll_sub(6, CTX), %xmm6; vpshufd $0x1b, cmll_sub(4, CTX), %xmm4; vpshufd $0x1b, cmll_sub(2, CTX), %xmm2; vpshufd $0x1b, cmll_sub(0, CTX), %xmm0; /* subl(33) ^= subr(33) & ~subr(24); */ vpandn %xmm15, %xmm7, %xmm14; vpslldq $4, %xmm14, %xmm14; vpxor %xmm14, %xmm15, %xmm15; /* dw = subl(33) & subl(24), subr(33) ^= CAMELLIA_RL1(dw); */ vpand %xmm15, %xmm7, %xmm14; vpslld $1, %xmm14, %xmm13; vpsrld $31, %xmm14, %xmm14; vpaddd %xmm13, %xmm14, %xmm14; vpsrldq $12, %xmm14, %xmm14; vpslldq $8, %xmm14, %xmm14; vpxor %xmm14, %xmm15, %xmm15; vpxor %xmm15, %xmm6, %xmm6; vpxor %xmm15, %xmm4, %xmm4; vpxor %xmm15, %xmm2, %xmm2; vpxor %xmm15, %xmm0, %xmm0; vpshufd $0x1b, %xmm6, %xmm6; vpshufd $0x1b, %xmm4, %xmm4; vpshufd $0x1b, %xmm2, %xmm2; vpshufd $0x1b, %xmm0, %xmm0; vpsrldq $8, %xmm2, %xmm3; vpsrldq $8, %xmm4, %xmm5; vpsrldq $8, %xmm6, %xmm7; /* * key XOR is end of F-function. */ vpxor %xmm2, %xmm0, %xmm0; vpxor %xmm4, %xmm2, %xmm2; vmovq %xmm0, cmll_sub(0, CTX); vmovq %xmm3, cmll_sub(2, CTX); vpxor %xmm5, %xmm3, %xmm3; vpxor %xmm6, %xmm4, %xmm4; vpxor %xmm7, %xmm5, %xmm5; vmovq %xmm2, cmll_sub(3, CTX); vmovq %xmm3, cmll_sub(4, CTX); vmovq %xmm4, cmll_sub(5, CTX); vmovq %xmm5, cmll_sub(6, CTX); vmovq cmll_sub(7, CTX), %xmm7; vmovq cmll_sub(8, CTX), %xmm8; vmovq cmll_sub(9, CTX), %xmm9; vmovq cmll_sub(10, CTX), %xmm10; /* tl = subl(10) ^ (subr(10) & ~subr(8)); */ vpandn %xmm10, %xmm8, %xmm15; vpsrldq $4, %xmm15, %xmm15; vpxor %xmm15, %xmm10, %xmm0; /* dw = tl & subl(8), tr = subr(10) ^ CAMELLIA_RL1(dw); */ vpand %xmm8, %xmm0, %xmm15; vpslld $1, %xmm15, %xmm14; vpsrld $31, %xmm15, %xmm15; vpaddd %xmm14, %xmm15, %xmm15; vpslldq $12, %xmm15, %xmm15; vpsrldq $8, %xmm15, %xmm15; vpxor %xmm15, %xmm0, %xmm0; vpxor %xmm0, %xmm6, %xmm6; vmovq %xmm6, cmll_sub(7, CTX); vmovq cmll_sub(11, CTX), %xmm11; vmovq cmll_sub(12, CTX), %xmm12; vmovq cmll_sub(13, CTX), %xmm13; vmovq cmll_sub(14, CTX), %xmm14; vmovq cmll_sub(15, CTX), %xmm15; /* tl = subl(7) ^ (subr(7) & ~subr(9)); */ vpandn %xmm7, %xmm9, %xmm1; vpsrldq $4, %xmm1, %xmm1; vpxor %xmm1, %xmm7, %xmm0; /* dw = tl & subl(9), tr = subr(7) ^ CAMELLIA_RL1(dw); */ vpand %xmm9, %xmm0, %xmm1; vpslld $1, %xmm1, %xmm2; vpsrld $31, %xmm1, %xmm1; vpaddd %xmm2, %xmm1, %xmm1; vpslldq $12, %xmm1, %xmm1; vpsrldq $8, %xmm1, %xmm1; vpxor %xmm1, %xmm0, %xmm0; vpxor %xmm11, %xmm0, %xmm0; vpxor %xmm12, %xmm10, %xmm10; vpxor %xmm13, %xmm11, %xmm11; vpxor %xmm14, %xmm12, %xmm12; vpxor %xmm15, %xmm13, %xmm13; vmovq %xmm0, cmll_sub(10, CTX); vmovq %xmm10, cmll_sub(11, CTX); vmovq %xmm11, cmll_sub(12, CTX); vmovq %xmm12, cmll_sub(13, CTX); vmovq %xmm13, cmll_sub(14, CTX); vmovq cmll_sub(16, CTX), %xmm6; vmovq cmll_sub(17, CTX), %xmm7; vmovq cmll_sub(18, CTX), %xmm8; vmovq cmll_sub(19, CTX), %xmm9; vmovq cmll_sub(20, CTX), %xmm10; /* tl = subl(18) ^ (subr(18) & ~subr(16)); */ vpandn %xmm8, %xmm6, %xmm1; vpsrldq $4, %xmm1, %xmm1; vpxor %xmm1, %xmm8, %xmm0; /* dw = tl & subl(16), tr = subr(18) ^ CAMELLIA_RL1(dw); */ vpand %xmm6, %xmm0, %xmm1; vpslld $1, %xmm1, %xmm2; vpsrld $31, %xmm1, %xmm1; vpaddd %xmm2, %xmm1, %xmm1; vpslldq $12, %xmm1, %xmm1; vpsrldq $8, %xmm1, %xmm1; vpxor %xmm1, %xmm0, %xmm0; vpxor %xmm14, %xmm0, %xmm0; vmovq %xmm0, 
cmll_sub(15, CTX); /* tl = subl(15) ^ (subr(15) & ~subr(17)); */ vpandn %xmm15, %xmm7, %xmm1; vpsrldq $4, %xmm1, %xmm1; vpxor %xmm1, %xmm15, %xmm0; /* dw = tl & subl(17), tr = subr(15) ^ CAMELLIA_RL1(dw); */ vpand %xmm7, %xmm0, %xmm1; vpslld $1, %xmm1, %xmm2; vpsrld $31, %xmm1, %xmm1; vpaddd %xmm2, %xmm1, %xmm1; vpslldq $12, %xmm1, %xmm1; vpsrldq $8, %xmm1, %xmm1; vpxor %xmm1, %xmm0, %xmm0; vmovq cmll_sub(21, CTX), %xmm1; vmovq cmll_sub(22, CTX), %xmm2; vmovq cmll_sub(23, CTX), %xmm3; vmovq cmll_sub(24, CTX), %xmm4; vpxor %xmm9, %xmm0, %xmm0; vpxor %xmm10, %xmm8, %xmm8; vpxor %xmm1, %xmm9, %xmm9; vpxor %xmm2, %xmm10, %xmm10; vpxor %xmm3, %xmm1, %xmm1; vmovq %xmm0, cmll_sub(18, CTX); vmovq %xmm8, cmll_sub(19, CTX); vmovq %xmm9, cmll_sub(20, CTX); vmovq %xmm10, cmll_sub(21, CTX); vmovq %xmm1, cmll_sub(22, CTX); vmovq cmll_sub(25, CTX), %xmm5; vmovq cmll_sub(26, CTX), %xmm6; vmovq cmll_sub(27, CTX), %xmm7; vmovq cmll_sub(28, CTX), %xmm8; vmovq cmll_sub(29, CTX), %xmm9; vmovq cmll_sub(30, CTX), %xmm10; vmovq cmll_sub(31, CTX), %xmm11; vmovq cmll_sub(32, CTX), %xmm12; /* tl = subl(26) ^ (subr(26) & ~subr(24)); */ vpandn %xmm6, %xmm4, %xmm15; vpsrldq $4, %xmm15, %xmm15; vpxor %xmm15, %xmm6, %xmm0; /* dw = tl & subl(26), tr = subr(24) ^ CAMELLIA_RL1(dw); */ vpand %xmm4, %xmm0, %xmm15; vpslld $1, %xmm15, %xmm14; vpsrld $31, %xmm15, %xmm15; vpaddd %xmm14, %xmm15, %xmm15; vpslldq $12, %xmm15, %xmm15; vpsrldq $8, %xmm15, %xmm15; vpxor %xmm15, %xmm0, %xmm0; vpxor %xmm0, %xmm2, %xmm2; vmovq %xmm2, cmll_sub(23, CTX); /* tl = subl(23) ^ (subr(23) & ~subr(25)); */ vpandn %xmm3, %xmm5, %xmm15; vpsrldq $4, %xmm15, %xmm15; vpxor %xmm15, %xmm3, %xmm0; /* dw = tl & subl(26), tr = subr(24) ^ CAMELLIA_RL1(dw); */ vpand %xmm5, %xmm0, %xmm15; vpslld $1, %xmm15, %xmm14; vpsrld $31, %xmm15, %xmm15; vpaddd %xmm14, %xmm15, %xmm15; vpslldq $12, %xmm15, %xmm15; vpsrldq $8, %xmm15, %xmm15; vpxor %xmm15, %xmm0, %xmm0; vpxor %xmm7, %xmm0, %xmm0; vpxor %xmm8, %xmm6, %xmm6; vpxor %xmm9, %xmm7, %xmm7; vpxor %xmm10, %xmm8, %xmm8; vpxor %xmm11, %xmm9, %xmm9; vpxor %xmm12, %xmm11, %xmm11; vmovq %xmm0, cmll_sub(26, CTX); vmovq %xmm6, cmll_sub(27, CTX); vmovq %xmm7, cmll_sub(28, CTX); vmovq %xmm8, cmll_sub(29, CTX); vmovq %xmm9, cmll_sub(30, CTX); vmovq %xmm10, cmll_sub(31, CTX); vmovq %xmm11, cmll_sub(32, CTX); /* kw2 and kw4 are unused now. 
*/ movq $0, cmll_sub(1, CTX); movq $0, cmll_sub(33, CTX); vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size __camellia_avx_setup256,.-__camellia_avx_setup256;) .align 8 .globl _gcry_camellia_aesni_avx_keygen ELF(.type _gcry_camellia_aesni_avx_keygen,@function;) _gcry_camellia_aesni_avx_keygen: /* input: * %rdi: ctx, CTX * %rsi: key * %rdx: keylen */ CFI_STARTPROC(); vzeroupper; vmovdqu (%rsi), %xmm0; cmpl $24, %edx; jb __camellia_avx_setup128; je .Lprepare_key192; vmovdqu 16(%rsi), %xmm1; jmp __camellia_avx_setup256; .Lprepare_key192: vpcmpeqd %xmm2, %xmm2, %xmm2; vmovq 16(%rsi), %xmm1; vpxor %xmm1, %xmm2, %xmm2; vpslldq $8, %xmm2, %xmm2; vpor %xmm2, %xmm1, %xmm1; jmp __camellia_avx_setup256; CFI_ENDPROC(); ELF(.size _gcry_camellia_aesni_avx_keygen,.-_gcry_camellia_aesni_avx_keygen;) #endif /*defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX_SUPPORT)*/ #endif /*__x86_64*/ diff --git a/cipher/camellia-aesni-avx2-amd64.h b/cipher/camellia-aesni-avx2-amd64.h index be7bb0aa..e93c40b8 100644 --- a/cipher/camellia-aesni-avx2-amd64.h +++ b/cipher/camellia-aesni-avx2-amd64.h @@ -1,1794 +1,1794 @@ /* camellia-aesni-avx2-amd64.h - AES-NI/VAES/AVX2 implementation of Camellia * * Copyright (C) 2013-2015,2020-2021 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . */ #ifndef GCRY_CAMELLIA_AESNI_AVX2_AMD64_H #define GCRY_CAMELLIA_AESNI_AVX2_AMD64_H #include "asm-common-amd64.h" #define CAMELLIA_TABLE_BYTE_LEN 272 /* struct CAMELLIA_context: */ #define key_table 0 #define key_bitlength CAMELLIA_TABLE_BYTE_LEN /* register macros */ #define CTX %rdi #define RIO %r8 /********************************************************************** helper macros **********************************************************************/ #define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \ vpand x, mask4bit, tmp0; \ vpandn x, mask4bit, x; \ vpsrld $4, x, x; \ \ vpshufb tmp0, lo_t, tmp0; \ vpshufb x, hi_t, x; \ vpxor tmp0, x, x; #define ymm0_x xmm0 #define ymm1_x xmm1 #define ymm2_x xmm2 #define ymm3_x xmm3 #define ymm4_x xmm4 #define ymm5_x xmm5 #define ymm6_x xmm6 #define ymm7_x xmm7 #define ymm8_x xmm8 #define ymm9_x xmm9 #define ymm10_x xmm10 #define ymm11_x xmm11 #define ymm12_x xmm12 #define ymm13_x xmm13 #define ymm14_x xmm14 #define ymm15_x xmm15 #ifdef CAMELLIA_VAES_BUILD # define IF_AESNI(...) # define IF_VAES(...) __VA_ARGS__ #else # define IF_AESNI(...) __VA_ARGS__ # define IF_VAES(...) 
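/*
 * Rough picture of how these selectors are used further down (the two sample
 * lines are taken from roundsm32 below): in the VAES build IF_VAES() emits
 * its arguments and IF_AESNI() drops them, so vaesenclast operates on full
 * 256-bit ymm registers; in the plain AES-NI build the roles are reversed and
 * the code splits each ymm into 128-bit halves with vextracti128/vinserti128.
 *
 *   IF_AESNI(vextracti128 $1, x0, t0##_x);    <- AES-NI build only
 *   IF_VAES(vaesenclast t4, x0, x0);          <- VAES build only
 */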
#endif /********************************************************************** 32-way camellia **********************************************************************/ /* * IN: * x0..x7: byte-sliced AB state * mem_cd: register pointer storing CD state * key: index for key material * OUT: * x0..x7: new byte-sliced CD state */ #define roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, \ t6, t7, mem_cd, key) \ /* \ * S-function with AES subbytes \ */ \ vbroadcasti128 .Linv_shift_row rRIP, t4; \ vpbroadcastd .L0f0f0f0f rRIP, t7; \ vbroadcasti128 .Lpre_tf_lo_s1 rRIP, t5; \ vbroadcasti128 .Lpre_tf_hi_s1 rRIP, t6; \ vbroadcasti128 .Lpre_tf_lo_s4 rRIP, t2; \ vbroadcasti128 .Lpre_tf_hi_s4 rRIP, t3; \ \ /* AES inverse shift rows */ \ vpshufb t4, x0, x0; \ vpshufb t4, x7, x7; \ vpshufb t4, x3, x3; \ vpshufb t4, x6, x6; \ vpshufb t4, x2, x2; \ vpshufb t4, x5, x5; \ vpshufb t4, x1, x1; \ vpshufb t4, x4, x4; \ \ /* prefilter sboxes 1, 2 and 3 */ \ /* prefilter sbox 4 */ \ filter_8bit(x0, t5, t6, t7, t4); \ filter_8bit(x7, t5, t6, t7, t4); \ IF_AESNI(vextracti128 $1, x0, t0##_x); \ IF_AESNI(vextracti128 $1, x7, t1##_x); \ filter_8bit(x3, t2, t3, t7, t4); \ filter_8bit(x6, t2, t3, t7, t4); \ IF_AESNI(vextracti128 $1, x3, t3##_x); \ IF_AESNI(vextracti128 $1, x6, t2##_x); \ filter_8bit(x2, t5, t6, t7, t4); \ filter_8bit(x5, t5, t6, t7, t4); \ filter_8bit(x1, t5, t6, t7, t4); \ filter_8bit(x4, t5, t6, t7, t4); \ \ vpxor t4##_x, t4##_x, t4##_x; \ \ /* AES subbytes + AES shift rows */ \ IF_AESNI(vextracti128 $1, x2, t6##_x; \ vextracti128 $1, x5, t5##_x; \ vaesenclast t4##_x, x0##_x, x0##_x; \ vaesenclast t4##_x, t0##_x, t0##_x; \ vaesenclast t4##_x, x7##_x, x7##_x; \ vaesenclast t4##_x, t1##_x, t1##_x; \ vaesenclast t4##_x, x3##_x, x3##_x; \ vaesenclast t4##_x, t3##_x, t3##_x; \ vaesenclast t4##_x, x6##_x, x6##_x; \ vaesenclast t4##_x, t2##_x, t2##_x; \ vinserti128 $1, t0##_x, x0, x0; \ vinserti128 $1, t1##_x, x7, x7; \ vinserti128 $1, t3##_x, x3, x3; \ vinserti128 $1, t2##_x, x6, x6; \ vextracti128 $1, x1, t3##_x; \ vextracti128 $1, x4, t2##_x); \ vbroadcasti128 .Lpost_tf_lo_s1 rRIP, t0; \ vbroadcasti128 .Lpost_tf_hi_s1 rRIP, t1; \ IF_AESNI(vaesenclast t4##_x, x2##_x, x2##_x; \ vaesenclast t4##_x, t6##_x, t6##_x; \ vaesenclast t4##_x, x5##_x, x5##_x; \ vaesenclast t4##_x, t5##_x, t5##_x; \ vaesenclast t4##_x, x1##_x, x1##_x; \ vaesenclast t4##_x, t3##_x, t3##_x; \ vaesenclast t4##_x, x4##_x, x4##_x; \ vaesenclast t4##_x, t2##_x, t2##_x; \ vinserti128 $1, t6##_x, x2, x2; \ vinserti128 $1, t5##_x, x5, x5; \ vinserti128 $1, t3##_x, x1, x1; \ vinserti128 $1, t2##_x, x4, x4); \ IF_VAES(vaesenclast t4, x0, x0; \ vaesenclast t4, x7, x7; \ vaesenclast t4, x3, x3; \ vaesenclast t4, x6, x6; \ vaesenclast t4, x2, x2; \ vaesenclast t4, x5, x5; \ vaesenclast t4, x1, x1; \ vaesenclast t4, x4, x4); \ \ /* postfilter sboxes 1 and 4 */ \ vbroadcasti128 .Lpost_tf_lo_s3 rRIP, t2; \ vbroadcasti128 .Lpost_tf_hi_s3 rRIP, t3; \ filter_8bit(x0, t0, t1, t7, t4); \ filter_8bit(x7, t0, t1, t7, t4); \ filter_8bit(x3, t0, t1, t7, t6); \ filter_8bit(x6, t0, t1, t7, t6); \ \ /* postfilter sbox 3 */ \ vbroadcasti128 .Lpost_tf_lo_s2 rRIP, t4; \ vbroadcasti128 .Lpost_tf_hi_s2 rRIP, t5; \ filter_8bit(x2, t2, t3, t7, t6); \ filter_8bit(x5, t2, t3, t7, t6); \ \ vpbroadcastq key, t0; /* higher 64-bit duplicate ignored */ \ \ /* postfilter sbox 2 */ \ filter_8bit(x1, t4, t5, t7, t2); \ filter_8bit(x4, t4, t5, t7, t2); \ vpxor t7, t7, t7; \ \ vpsrldq $1, t0, t1; \ vpsrldq $2, t0, t2; \ vpshufb t7, t1, t1; \ vpsrldq $3, t0, t3; \ \ /* 
P-function */ \ vpxor x5, x0, x0; \ vpxor x6, x1, x1; \ vpxor x7, x2, x2; \ vpxor x4, x3, x3; \ \ vpshufb t7, t2, t2; \ vpsrldq $4, t0, t4; \ vpshufb t7, t3, t3; \ vpsrldq $5, t0, t5; \ vpshufb t7, t4, t4; \ \ vpxor x2, x4, x4; \ vpxor x3, x5, x5; \ vpxor x0, x6, x6; \ vpxor x1, x7, x7; \ \ vpsrldq $6, t0, t6; \ vpshufb t7, t5, t5; \ vpshufb t7, t6, t6; \ \ vpxor x7, x0, x0; \ vpxor x4, x1, x1; \ vpxor x5, x2, x2; \ vpxor x6, x3, x3; \ \ vpxor x3, x4, x4; \ vpxor x0, x5, x5; \ vpxor x1, x6, x6; \ vpxor x2, x7, x7; /* note: high and low parts swapped */ \ \ /* Add key material and result to CD (x becomes new CD) */ \ \ vpxor t6, x1, x1; \ vpxor 5 * 32(mem_cd), x1, x1; \ \ vpsrldq $7, t0, t6; \ vpshufb t7, t0, t0; \ vpshufb t7, t6, t7; \ \ vpxor t7, x0, x0; \ vpxor 4 * 32(mem_cd), x0, x0; \ \ vpxor t5, x2, x2; \ vpxor 6 * 32(mem_cd), x2, x2; \ \ vpxor t4, x3, x3; \ vpxor 7 * 32(mem_cd), x3, x3; \ \ vpxor t3, x4, x4; \ vpxor 0 * 32(mem_cd), x4, x4; \ \ vpxor t2, x5, x5; \ vpxor 1 * 32(mem_cd), x5, x5; \ \ vpxor t1, x6, x6; \ vpxor 2 * 32(mem_cd), x6, x6; \ \ vpxor t0, x7, x7; \ vpxor 3 * 32(mem_cd), x7, x7; /* * IN/OUT: * x0..x7: byte-sliced AB state preloaded * mem_ab: byte-sliced AB state in memory * mem_cb: byte-sliced CD state in memory */ #define two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, mem_ab, mem_cd, i, dir, store_ab) \ roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, mem_cd, (key_table + (i) * 8)(CTX)); \ \ vmovdqu x0, 4 * 32(mem_cd); \ vmovdqu x1, 5 * 32(mem_cd); \ vmovdqu x2, 6 * 32(mem_cd); \ vmovdqu x3, 7 * 32(mem_cd); \ vmovdqu x4, 0 * 32(mem_cd); \ vmovdqu x5, 1 * 32(mem_cd); \ vmovdqu x6, 2 * 32(mem_cd); \ vmovdqu x7, 3 * 32(mem_cd); \ \ roundsm32(x4, x5, x6, x7, x0, x1, x2, x3, y0, y1, y2, y3, y4, y5, \ y6, y7, mem_ab, (key_table + ((i) + (dir)) * 8)(CTX)); \ \ store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab); #define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */ #define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \ /* Store new AB state */ \ vmovdqu x4, 4 * 32(mem_ab); \ vmovdqu x5, 5 * 32(mem_ab); \ vmovdqu x6, 6 * 32(mem_ab); \ vmovdqu x7, 7 * 32(mem_ab); \ vmovdqu x0, 0 * 32(mem_ab); \ vmovdqu x1, 1 * 32(mem_ab); \ vmovdqu x2, 2 * 32(mem_ab); \ vmovdqu x3, 3 * 32(mem_ab); #define enc_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, mem_ab, mem_cd, i) \ two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \ two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \ two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store); #define dec_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, mem_ab, mem_cd, i) \ two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \ two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \ two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store); /* * IN: * v0..3: byte-sliced 32-bit integers * OUT: * v0..3: (IN <<< 1) */ #define rol32_1_32(v0, v1, v2, v3, t0, t1, t2, zero) \ vpcmpgtb v0, zero, t0; \ vpaddb v0, v0, v0; \ vpabsb t0, t0; \ \ vpcmpgtb v1, zero, t1; \ vpaddb v1, v1, v1; \ vpabsb t1, 
t1; \ \ vpcmpgtb v2, zero, t2; \ vpaddb v2, v2, v2; \ vpabsb t2, t2; \ \ vpor t0, v1, v1; \ \ vpcmpgtb v3, zero, t0; \ vpaddb v3, v3, v3; \ vpabsb t0, t0; \ \ vpor t1, v2, v2; \ vpor t2, v3, v3; \ vpor t0, v0, v0; /* * IN: * r: byte-sliced AB state in memory * l: byte-sliced CD state in memory * OUT: * x0..x7: new byte-sliced CD state */ #define fls32(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \ tt1, tt2, tt3, kll, klr, krl, krr) \ /* \ * t0 = kll; \ * t0 &= ll; \ * lr ^= rol32(t0, 1); \ */ \ vpbroadcastd kll, t0; /* only lowest 32-bit used */ \ vpxor tt0, tt0, tt0; \ vpshufb tt0, t0, t3; \ vpsrldq $1, t0, t0; \ vpshufb tt0, t0, t2; \ vpsrldq $1, t0, t0; \ vpshufb tt0, t0, t1; \ vpsrldq $1, t0, t0; \ vpshufb tt0, t0, t0; \ \ vpand l0, t0, t0; \ vpand l1, t1, t1; \ vpand l2, t2, t2; \ vpand l3, t3, t3; \ \ rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \ \ vpxor l4, t0, l4; \ vpbroadcastd krr, t0; /* only lowest 32-bit used */ \ vmovdqu l4, 4 * 32(l); \ vpxor l5, t1, l5; \ vmovdqu l5, 5 * 32(l); \ vpxor l6, t2, l6; \ vmovdqu l6, 6 * 32(l); \ vpxor l7, t3, l7; \ vmovdqu l7, 7 * 32(l); \ \ /* \ * t2 = krr; \ * t2 |= rr; \ * rl ^= t2; \ */ \ \ vpshufb tt0, t0, t3; \ vpsrldq $1, t0, t0; \ vpshufb tt0, t0, t2; \ vpsrldq $1, t0, t0; \ vpshufb tt0, t0, t1; \ vpsrldq $1, t0, t0; \ vpshufb tt0, t0, t0; \ \ vpor 4 * 32(r), t0, t0; \ vpor 5 * 32(r), t1, t1; \ vpor 6 * 32(r), t2, t2; \ vpor 7 * 32(r), t3, t3; \ \ vpxor 0 * 32(r), t0, t0; \ vpxor 1 * 32(r), t1, t1; \ vpxor 2 * 32(r), t2, t2; \ vpxor 3 * 32(r), t3, t3; \ vmovdqu t0, 0 * 32(r); \ vpbroadcastd krl, t0; /* only lowest 32-bit used */ \ vmovdqu t1, 1 * 32(r); \ vmovdqu t2, 2 * 32(r); \ vmovdqu t3, 3 * 32(r); \ \ /* \ * t2 = krl; \ * t2 &= rl; \ * rr ^= rol32(t2, 1); \ */ \ vpshufb tt0, t0, t3; \ vpsrldq $1, t0, t0; \ vpshufb tt0, t0, t2; \ vpsrldq $1, t0, t0; \ vpshufb tt0, t0, t1; \ vpsrldq $1, t0, t0; \ vpshufb tt0, t0, t0; \ \ vpand 0 * 32(r), t0, t0; \ vpand 1 * 32(r), t1, t1; \ vpand 2 * 32(r), t2, t2; \ vpand 3 * 32(r), t3, t3; \ \ rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \ \ vpxor 4 * 32(r), t0, t0; \ vpxor 5 * 32(r), t1, t1; \ vpxor 6 * 32(r), t2, t2; \ vpxor 7 * 32(r), t3, t3; \ vmovdqu t0, 4 * 32(r); \ vpbroadcastd klr, t0; /* only lowest 32-bit used */ \ vmovdqu t1, 5 * 32(r); \ vmovdqu t2, 6 * 32(r); \ vmovdqu t3, 7 * 32(r); \ \ /* \ * t0 = klr; \ * t0 |= lr; \ * ll ^= t0; \ */ \ \ vpshufb tt0, t0, t3; \ vpsrldq $1, t0, t0; \ vpshufb tt0, t0, t2; \ vpsrldq $1, t0, t0; \ vpshufb tt0, t0, t1; \ vpsrldq $1, t0, t0; \ vpshufb tt0, t0, t0; \ \ vpor l4, t0, t0; \ vpor l5, t1, t1; \ vpor l6, t2, t2; \ vpor l7, t3, t3; \ \ vpxor l0, t0, l0; \ vmovdqu l0, 0 * 32(l); \ vpxor l1, t1, l1; \ vmovdqu l1, 1 * 32(l); \ vpxor l2, t2, l2; \ vmovdqu l2, 2 * 32(l); \ vpxor l3, t3, l3; \ vmovdqu l3, 3 * 32(l); #define transpose_4x4(x0, x1, x2, x3, t1, t2) \ vpunpckhdq x1, x0, t2; \ vpunpckldq x1, x0, x0; \ \ vpunpckldq x3, x2, t1; \ vpunpckhdq x3, x2, x2; \ \ vpunpckhqdq t1, x0, x1; \ vpunpcklqdq t1, x0, x0; \ \ vpunpckhqdq x2, t2, x3; \ vpunpcklqdq x2, t2, x2; #define byteslice_16x16b_fast(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, \ a3, b3, c3, d3, st0, st1) \ vmovdqu d2, st0; \ vmovdqu d3, st1; \ transpose_4x4(a0, a1, a2, a3, d2, d3); \ transpose_4x4(b0, b1, b2, b3, d2, d3); \ vmovdqu st0, d2; \ vmovdqu st1, d3; \ \ vmovdqu a0, st0; \ vmovdqu a1, st1; \ transpose_4x4(c0, c1, c2, c3, a0, a1); \ transpose_4x4(d0, d1, d2, d3, a0, a1); \ \ vbroadcasti128 .Lshufb_16x16b rRIP, a0; \ vmovdqu st1, a1; \ vpshufb a0, a2, a2; \ 
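	/* (Sketch: together with the transpose_4x4 calls, this shuffle with \
	 * .Lshufb_16x16b - which gathers bytes 0,4,8,12 / 1,5,9,13 /        \
	 * 2,6,10,14 / 3,7,11,15 of each 128-bit lane - completes the 16x16  \
	 * byte transpose that byte-slices 16 blocks per lane.) */           \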
vpshufb a0, a3, a3; \ vpshufb a0, b0, b0; \ vpshufb a0, b1, b1; \ vpshufb a0, b2, b2; \ vpshufb a0, b3, b3; \ vpshufb a0, a1, a1; \ vpshufb a0, c0, c0; \ vpshufb a0, c1, c1; \ vpshufb a0, c2, c2; \ vpshufb a0, c3, c3; \ vpshufb a0, d0, d0; \ vpshufb a0, d1, d1; \ vpshufb a0, d2, d2; \ vpshufb a0, d3, d3; \ vmovdqu d3, st1; \ vmovdqu st0, d3; \ vpshufb a0, d3, a0; \ vmovdqu d2, st0; \ \ transpose_4x4(a0, b0, c0, d0, d2, d3); \ transpose_4x4(a1, b1, c1, d1, d2, d3); \ vmovdqu st0, d2; \ vmovdqu st1, d3; \ \ vmovdqu b0, st0; \ vmovdqu b1, st1; \ transpose_4x4(a2, b2, c2, d2, b0, b1); \ transpose_4x4(a3, b3, c3, d3, b0, b1); \ vmovdqu st0, b0; \ vmovdqu st1, b1; \ /* does not adjust output bytes inside vectors */ /* load blocks to registers and apply pre-whitening */ #define inpack32_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, rio, key) \ vpbroadcastq key, x0; \ vpshufb .Lpack_bswap rRIP, x0, x0; \ \ vpxor 0 * 32(rio), x0, y7; \ vpxor 1 * 32(rio), x0, y6; \ vpxor 2 * 32(rio), x0, y5; \ vpxor 3 * 32(rio), x0, y4; \ vpxor 4 * 32(rio), x0, y3; \ vpxor 5 * 32(rio), x0, y2; \ vpxor 6 * 32(rio), x0, y1; \ vpxor 7 * 32(rio), x0, y0; \ vpxor 8 * 32(rio), x0, x7; \ vpxor 9 * 32(rio), x0, x6; \ vpxor 10 * 32(rio), x0, x5; \ vpxor 11 * 32(rio), x0, x4; \ vpxor 12 * 32(rio), x0, x3; \ vpxor 13 * 32(rio), x0, x2; \ vpxor 14 * 32(rio), x0, x1; \ vpxor 15 * 32(rio), x0, x0; /* byteslice pre-whitened blocks and store to temporary memory */ #define inpack32_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, mem_ab, mem_cd) \ byteslice_16x16b_fast(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, \ y4, y5, y6, y7, (mem_ab), (mem_cd)); \ \ vmovdqu x0, 0 * 32(mem_ab); \ vmovdqu x1, 1 * 32(mem_ab); \ vmovdqu x2, 2 * 32(mem_ab); \ vmovdqu x3, 3 * 32(mem_ab); \ vmovdqu x4, 4 * 32(mem_ab); \ vmovdqu x5, 5 * 32(mem_ab); \ vmovdqu x6, 6 * 32(mem_ab); \ vmovdqu x7, 7 * 32(mem_ab); \ vmovdqu y0, 0 * 32(mem_cd); \ vmovdqu y1, 1 * 32(mem_cd); \ vmovdqu y2, 2 * 32(mem_cd); \ vmovdqu y3, 3 * 32(mem_cd); \ vmovdqu y4, 4 * 32(mem_cd); \ vmovdqu y5, 5 * 32(mem_cd); \ vmovdqu y6, 6 * 32(mem_cd); \ vmovdqu y7, 7 * 32(mem_cd); /* de-byteslice, apply post-whitening and store blocks */ #define outunpack32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \ y5, y6, y7, key, stack_tmp0, stack_tmp1) \ byteslice_16x16b_fast(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, \ y3, y7, x3, x7, stack_tmp0, stack_tmp1); \ \ vmovdqu x0, stack_tmp0; \ \ vpbroadcastq key, x0; \ vpshufb .Lpack_bswap rRIP, x0, x0; \ \ vpxor x0, y7, y7; \ vpxor x0, y6, y6; \ vpxor x0, y5, y5; \ vpxor x0, y4, y4; \ vpxor x0, y3, y3; \ vpxor x0, y2, y2; \ vpxor x0, y1, y1; \ vpxor x0, y0, y0; \ vpxor x0, x7, x7; \ vpxor x0, x6, x6; \ vpxor x0, x5, x5; \ vpxor x0, x4, x4; \ vpxor x0, x3, x3; \ vpxor x0, x2, x2; \ vpxor x0, x1, x1; \ vpxor stack_tmp0, x0, x0; #define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, rio) \ vmovdqu x0, 0 * 32(rio); \ vmovdqu x1, 1 * 32(rio); \ vmovdqu x2, 2 * 32(rio); \ vmovdqu x3, 3 * 32(rio); \ vmovdqu x4, 4 * 32(rio); \ vmovdqu x5, 5 * 32(rio); \ vmovdqu x6, 6 * 32(rio); \ vmovdqu x7, 7 * 32(rio); \ vmovdqu y0, 8 * 32(rio); \ vmovdqu y1, 9 * 32(rio); \ vmovdqu y2, 10 * 32(rio); \ vmovdqu y3, 11 * 32(rio); \ vmovdqu y4, 12 * 32(rio); \ vmovdqu y5, 13 * 32(rio); \ vmovdqu y6, 14 * 32(rio); \ vmovdqu y7, 15 * 32(rio); .text .align 32 #define SHUFB_BYTES(idx) \ 0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx) .Lshufb_16x16b: .byte SHUFB_BYTES(0), SHUFB_BYTES(1), 
SHUFB_BYTES(2), SHUFB_BYTES(3) .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3) .Lpack_bswap: .long 0x00010203, 0x04050607, 0x80808080, 0x80808080 .long 0x00010203, 0x04050607, 0x80808080, 0x80808080 /* For CTR-mode IV byteswap */ .Lbswap128_mask: .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 /* * pre-SubByte transform * * pre-lookup for sbox1, sbox2, sbox3: * swap_bitendianness( * isom_map_camellia_to_aes( * camellia_f( * swap_bitendianess(in) * ) * ) * ) * * (note: '⊕ 0xc5' inside camellia_f()) */ .Lpre_tf_lo_s1: .byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86 .byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88 .Lpre_tf_hi_s1: .byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a .byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23 /* * pre-SubByte transform * * pre-lookup for sbox4: * swap_bitendianness( * isom_map_camellia_to_aes( * camellia_f( * swap_bitendianess(in <<< 1) * ) * ) * ) * * (note: '⊕ 0xc5' inside camellia_f()) */ .Lpre_tf_lo_s4: .byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25 .byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74 .Lpre_tf_hi_s4: .byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72 .byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf /* * post-SubByte transform * * post-lookup for sbox1, sbox4: * swap_bitendianness( * camellia_h( * isom_map_aes_to_camellia( * swap_bitendianness( * aes_inverse_affine_transform(in) * ) * ) * ) * ) * * (note: '⊕ 0x6e' inside camellia_h()) */ .Lpost_tf_lo_s1: .byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31 .byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1 .Lpost_tf_hi_s1: .byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8 .byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c /* * post-SubByte transform * * post-lookup for sbox2: * swap_bitendianness( * camellia_h( * isom_map_aes_to_camellia( * swap_bitendianness( * aes_inverse_affine_transform(in) * ) * ) * ) * ) <<< 1 * * (note: '⊕ 0x6e' inside camellia_h()) */ .Lpost_tf_lo_s2: .byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62 .byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3 .Lpost_tf_hi_s2: .byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51 .byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18 /* * post-SubByte transform * * post-lookup for sbox3: * swap_bitendianness( * camellia_h( * isom_map_aes_to_camellia( * swap_bitendianness( * aes_inverse_affine_transform(in) * ) * ) * ) * ) >>> 1 * * (note: '⊕ 0x6e' inside camellia_h()) */ .Lpost_tf_lo_s3: .byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98 .byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8 .Lpost_tf_hi_s3: .byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54 .byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06 /* For isolating SubBytes from AESENCLAST, inverse shift row */ .Linv_shift_row: .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03 .align 4 /* 4-bit mask */ .L0f0f0f0f: .long 0x0f0f0f0f .align 8 ELF(.type __camellia_enc_blk32,@function;) __camellia_enc_blk32: /* input: * %rdi: ctx, CTX * %rax: temporary storage, 512 bytes * %r8d: 24 for 16 byte key, 32 for larger * %ymm0..%ymm15: 32 plaintext blocks * output: * %ymm0..%ymm15: 32 encrypted blocks, order swapped: * 7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 */ CFI_STARTPROC(); leaq 8 * 32(%rax), %rcx; leaq (-8 * 8)(CTX, %r8, 8), %r8; inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, %rax, %rcx); .align 8 .Lenc_loop: enc_rounds32(%ymm0, %ymm1, %ymm2, 
%ymm3, %ymm4, %ymm5, %ymm6, %ymm7, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, %rax, %rcx, 0); cmpq %r8, CTX; je .Lenc_done; leaq (8 * 8)(CTX), CTX; fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, ((key_table) + 0)(CTX), ((key_table) + 4)(CTX), ((key_table) + 8)(CTX), ((key_table) + 12)(CTX)); jmp .Lenc_loop; .align 8 .Lenc_done: /* load CD for output */ vmovdqu 0 * 32(%rcx), %ymm8; vmovdqu 1 * 32(%rcx), %ymm9; vmovdqu 2 * 32(%rcx), %ymm10; vmovdqu 3 * 32(%rcx), %ymm11; vmovdqu 4 * 32(%rcx), %ymm12; vmovdqu 5 * 32(%rcx), %ymm13; vmovdqu 6 * 32(%rcx), %ymm14; vmovdqu 7 * 32(%rcx), %ymm15; outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, ((key_table) + 8 * 8)(%r8), (%rax), 1 * 32(%rax)); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size __camellia_enc_blk32,.-__camellia_enc_blk32;) .align 8 ELF(.type __camellia_dec_blk32,@function;) __camellia_dec_blk32: /* input: * %rdi: ctx, CTX * %rax: temporary storage, 512 bytes * %r8d: 24 for 16 byte key, 32 for larger * %ymm0..%ymm15: 16 encrypted blocks * output: * %ymm0..%ymm15: 16 plaintext blocks, order swapped: * 7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 */ CFI_STARTPROC(); movq %r8, %rcx; movq CTX, %r8 leaq (-8 * 8)(CTX, %rcx, 8), CTX; leaq 8 * 32(%rax), %rcx; inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, %rax, %rcx); .align 8 .Ldec_loop: dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, %rax, %rcx, 0); cmpq %r8, CTX; je .Ldec_done; fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, ((key_table) + 8)(CTX), ((key_table) + 12)(CTX), ((key_table) + 0)(CTX), ((key_table) + 4)(CTX)); leaq (-8 * 8)(CTX), CTX; jmp .Ldec_loop; .align 8 .Ldec_done: /* load CD for output */ vmovdqu 0 * 32(%rcx), %ymm8; vmovdqu 1 * 32(%rcx), %ymm9; vmovdqu 2 * 32(%rcx), %ymm10; vmovdqu 3 * 32(%rcx), %ymm11; vmovdqu 4 * 32(%rcx), %ymm12; vmovdqu 5 * 32(%rcx), %ymm13; vmovdqu 6 * 32(%rcx), %ymm14; vmovdqu 7 * 32(%rcx), %ymm15; outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, (key_table)(CTX), (%rax), 1 * 32(%rax)); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size __camellia_dec_blk32,.-__camellia_dec_blk32;) #define inc_le128(x, minus_one, tmp) \ vpcmpeqq minus_one, x, tmp; \ vpsubq minus_one, x, x; \ vpslldq $8, tmp, tmp; \ vpsubq tmp, x, x; .align 8 .globl FUNC_NAME(ctr_enc) ELF(.type FUNC_NAME(ctr_enc),@function;) FUNC_NAME(ctr_enc): /* input: * %rdi: ctx, CTX * %rsi: dst (32 blocks) * %rdx: src (32 blocks) * %rcx: iv (big endian, 128bit) */ CFI_STARTPROC(); pushq %rbp; CFI_PUSH(%rbp); movq %rsp, %rbp; CFI_DEF_CFA_REGISTER(%rbp); movq 8(%rcx), %r11; bswapq %r11; vzeroupper; cmpl $128, key_bitlength(CTX); movl $32, %r8d; movl $24, %eax; cmovel %eax, %r8d; /* max */ subq $(16 * 32), %rsp; andq $~63, %rsp; movq %rsp, %rax; vpcmpeqd %ymm15, %ymm15, %ymm15; vpsrldq $8, %ymm15, %ymm15; /* ab: -1:0 ; cd: -1:0 */ /* load IV and byteswap */ vmovdqu (%rcx), %xmm0; vpshufb .Lbswap128_mask rRIP, %xmm0, %xmm0; vmovdqa %xmm0, %xmm1; inc_le128(%xmm0, %xmm15, %xmm14); vbroadcasti128 .Lbswap128_mask rRIP, %ymm14; vinserti128 $1, %xmm0, %ymm1, %ymm0; vpshufb %ymm14, 
%ymm0, %ymm13; vmovdqu %ymm13, 15 * 32(%rax); /* check need for handling 64-bit overflow and carry */ cmpq $(0xffffffffffffffff - 32), %r11; ja .Lload_ctr_carry; /* construct IVs */ vpaddq %ymm15, %ymm15, %ymm15; /* ab: -2:0 ; cd: -2:0 */ vpsubq %ymm15, %ymm0, %ymm0; vpshufb %ymm14, %ymm0, %ymm13; vmovdqu %ymm13, 14 * 32(%rax); vpsubq %ymm15, %ymm0, %ymm0; vpshufb %ymm14, %ymm0, %ymm13; vmovdqu %ymm13, 13 * 32(%rax); vpsubq %ymm15, %ymm0, %ymm0; vpshufb %ymm14, %ymm0, %ymm12; vpsubq %ymm15, %ymm0, %ymm0; vpshufb %ymm14, %ymm0, %ymm11; vpsubq %ymm15, %ymm0, %ymm0; vpshufb %ymm14, %ymm0, %ymm10; vpsubq %ymm15, %ymm0, %ymm0; vpshufb %ymm14, %ymm0, %ymm9; vpsubq %ymm15, %ymm0, %ymm0; vpshufb %ymm14, %ymm0, %ymm8; vpsubq %ymm15, %ymm0, %ymm0; vpshufb %ymm14, %ymm0, %ymm7; vpsubq %ymm15, %ymm0, %ymm0; vpshufb %ymm14, %ymm0, %ymm6; vpsubq %ymm15, %ymm0, %ymm0; vpshufb %ymm14, %ymm0, %ymm5; vpsubq %ymm15, %ymm0, %ymm0; vpshufb %ymm14, %ymm0, %ymm4; vpsubq %ymm15, %ymm0, %ymm0; vpshufb %ymm14, %ymm0, %ymm3; vpsubq %ymm15, %ymm0, %ymm0; vpshufb %ymm14, %ymm0, %ymm2; vpsubq %ymm15, %ymm0, %ymm0; vpshufb %ymm14, %ymm0, %ymm1; vpsubq %ymm15, %ymm0, %ymm0; /* +30 ; +31 */ vpsubq %xmm15, %xmm0, %xmm13; /* +32 */ vpshufb %ymm14, %ymm0, %ymm0; vpshufb %xmm14, %xmm13, %xmm13; vmovdqu %xmm13, (%rcx); jmp .Lload_ctr_done; .align 4 .Lload_ctr_carry: /* construct IVs */ inc_le128(%ymm0, %ymm15, %ymm13); /* ab: le1 ; cd: le2 */ inc_le128(%ymm0, %ymm15, %ymm13); /* ab: le2 ; cd: le3 */ vpshufb %ymm14, %ymm0, %ymm13; vmovdqu %ymm13, 14 * 32(%rax); inc_le128(%ymm0, %ymm15, %ymm13); inc_le128(%ymm0, %ymm15, %ymm13); vpshufb %ymm14, %ymm0, %ymm13; vmovdqu %ymm13, 13 * 32(%rax); inc_le128(%ymm0, %ymm15, %ymm13); inc_le128(%ymm0, %ymm15, %ymm13); vpshufb %ymm14, %ymm0, %ymm12; inc_le128(%ymm0, %ymm15, %ymm13); inc_le128(%ymm0, %ymm15, %ymm13); vpshufb %ymm14, %ymm0, %ymm11; inc_le128(%ymm0, %ymm15, %ymm13); inc_le128(%ymm0, %ymm15, %ymm13); vpshufb %ymm14, %ymm0, %ymm10; inc_le128(%ymm0, %ymm15, %ymm13); inc_le128(%ymm0, %ymm15, %ymm13); vpshufb %ymm14, %ymm0, %ymm9; inc_le128(%ymm0, %ymm15, %ymm13); inc_le128(%ymm0, %ymm15, %ymm13); vpshufb %ymm14, %ymm0, %ymm8; inc_le128(%ymm0, %ymm15, %ymm13); inc_le128(%ymm0, %ymm15, %ymm13); vpshufb %ymm14, %ymm0, %ymm7; inc_le128(%ymm0, %ymm15, %ymm13); inc_le128(%ymm0, %ymm15, %ymm13); vpshufb %ymm14, %ymm0, %ymm6; inc_le128(%ymm0, %ymm15, %ymm13); inc_le128(%ymm0, %ymm15, %ymm13); vpshufb %ymm14, %ymm0, %ymm5; inc_le128(%ymm0, %ymm15, %ymm13); inc_le128(%ymm0, %ymm15, %ymm13); vpshufb %ymm14, %ymm0, %ymm4; inc_le128(%ymm0, %ymm15, %ymm13); inc_le128(%ymm0, %ymm15, %ymm13); vpshufb %ymm14, %ymm0, %ymm3; inc_le128(%ymm0, %ymm15, %ymm13); inc_le128(%ymm0, %ymm15, %ymm13); vpshufb %ymm14, %ymm0, %ymm2; inc_le128(%ymm0, %ymm15, %ymm13); inc_le128(%ymm0, %ymm15, %ymm13); vpshufb %ymm14, %ymm0, %ymm1; inc_le128(%ymm0, %ymm15, %ymm13); inc_le128(%ymm0, %ymm15, %ymm13); vextracti128 $1, %ymm0, %xmm13; vpshufb %ymm14, %ymm0, %ymm0; inc_le128(%xmm13, %xmm15, %xmm14); vpshufb .Lbswap128_mask rRIP, %xmm13, %xmm13; vmovdqu %xmm13, (%rcx); .align 4 .Lload_ctr_done: /* inpack16_pre: */ vpbroadcastq (key_table)(CTX), %ymm15; vpshufb .Lpack_bswap rRIP, %ymm15, %ymm15; vpxor %ymm0, %ymm15, %ymm0; vpxor %ymm1, %ymm15, %ymm1; vpxor %ymm2, %ymm15, %ymm2; vpxor %ymm3, %ymm15, %ymm3; vpxor %ymm4, %ymm15, %ymm4; vpxor %ymm5, %ymm15, %ymm5; vpxor %ymm6, %ymm15, %ymm6; vpxor %ymm7, %ymm15, %ymm7; vpxor %ymm8, %ymm15, %ymm8; vpxor %ymm9, %ymm15, %ymm9; vpxor %ymm10, %ymm15, %ymm10; vpxor %ymm11, %ymm15, 
%ymm11; vpxor %ymm12, %ymm15, %ymm12; vpxor 13 * 32(%rax), %ymm15, %ymm13; vpxor 14 * 32(%rax), %ymm15, %ymm14; vpxor 15 * 32(%rax), %ymm15, %ymm15; call __camellia_enc_blk32; vpxor 0 * 32(%rdx), %ymm7, %ymm7; vpxor 1 * 32(%rdx), %ymm6, %ymm6; vpxor 2 * 32(%rdx), %ymm5, %ymm5; vpxor 3 * 32(%rdx), %ymm4, %ymm4; vpxor 4 * 32(%rdx), %ymm3, %ymm3; vpxor 5 * 32(%rdx), %ymm2, %ymm2; vpxor 6 * 32(%rdx), %ymm1, %ymm1; vpxor 7 * 32(%rdx), %ymm0, %ymm0; vpxor 8 * 32(%rdx), %ymm15, %ymm15; vpxor 9 * 32(%rdx), %ymm14, %ymm14; vpxor 10 * 32(%rdx), %ymm13, %ymm13; vpxor 11 * 32(%rdx), %ymm12, %ymm12; vpxor 12 * 32(%rdx), %ymm11, %ymm11; vpxor 13 * 32(%rdx), %ymm10, %ymm10; vpxor 14 * 32(%rdx), %ymm9, %ymm9; vpxor 15 * 32(%rdx), %ymm8, %ymm8; leaq 32 * 16(%rdx), %rdx; write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0, %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9, %ymm8, %rsi); vzeroall; leave; CFI_LEAVE(); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size FUNC_NAME(ctr_enc),.-FUNC_NAME(ctr_enc);) .align 8 .globl FUNC_NAME(cbc_dec) ELF(.type FUNC_NAME(cbc_dec),@function;) FUNC_NAME(cbc_dec): /* input: * %rdi: ctx, CTX * %rsi: dst (32 blocks) * %rdx: src (32 blocks) * %rcx: iv */ CFI_STARTPROC(); pushq %rbp; CFI_PUSH(%rbp); movq %rsp, %rbp; CFI_DEF_CFA_REGISTER(%rbp); vzeroupper; movq %rcx, %r9; cmpl $128, key_bitlength(CTX); movl $32, %r8d; movl $24, %eax; cmovel %eax, %r8d; /* max */ subq $(16 * 32), %rsp; andq $~63, %rsp; movq %rsp, %rax; inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, %rdx, (key_table)(CTX, %r8, 8)); call __camellia_dec_blk32; /* XOR output with IV */ vmovdqu %ymm8, (%rax); vmovdqu (%r9), %xmm8; vinserti128 $1, (%rdx), %ymm8, %ymm8; vpxor %ymm8, %ymm7, %ymm7; vmovdqu (%rax), %ymm8; vpxor (0 * 32 + 16)(%rdx), %ymm6, %ymm6; vpxor (1 * 32 + 16)(%rdx), %ymm5, %ymm5; vpxor (2 * 32 + 16)(%rdx), %ymm4, %ymm4; vpxor (3 * 32 + 16)(%rdx), %ymm3, %ymm3; vpxor (4 * 32 + 16)(%rdx), %ymm2, %ymm2; vpxor (5 * 32 + 16)(%rdx), %ymm1, %ymm1; vpxor (6 * 32 + 16)(%rdx), %ymm0, %ymm0; vpxor (7 * 32 + 16)(%rdx), %ymm15, %ymm15; vpxor (8 * 32 + 16)(%rdx), %ymm14, %ymm14; vpxor (9 * 32 + 16)(%rdx), %ymm13, %ymm13; vpxor (10 * 32 + 16)(%rdx), %ymm12, %ymm12; vpxor (11 * 32 + 16)(%rdx), %ymm11, %ymm11; vpxor (12 * 32 + 16)(%rdx), %ymm10, %ymm10; vpxor (13 * 32 + 16)(%rdx), %ymm9, %ymm9; vpxor (14 * 32 + 16)(%rdx), %ymm8, %ymm8; movq (15 * 32 + 16 + 0)(%rdx), %rax; movq (15 * 32 + 16 + 8)(%rdx), %rcx; write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0, %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9, %ymm8, %rsi); /* store new IV */ movq %rax, (0)(%r9); movq %rcx, (8)(%r9); vzeroall; leave; CFI_LEAVE(); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size FUNC_NAME(cbc_dec),.-FUNC_NAME(cbc_dec);) .align 8 .globl FUNC_NAME(cfb_dec) ELF(.type FUNC_NAME(cfb_dec),@function;) FUNC_NAME(cfb_dec): /* input: * %rdi: ctx, CTX * %rsi: dst (32 blocks) * %rdx: src (32 blocks) * %rcx: iv */ CFI_STARTPROC(); pushq %rbp; CFI_PUSH(%rbp); movq %rsp, %rbp; CFI_DEF_CFA_REGISTER(%rbp); vzeroupper; cmpl $128, key_bitlength(CTX); movl $32, %r8d; movl $24, %eax; cmovel %eax, %r8d; /* max */ subq $(16 * 32), %rsp; andq $~63, %rsp; movq %rsp, %rax; /* inpack16_pre: */ vpbroadcastq (key_table)(CTX), %ymm0; vpshufb .Lpack_bswap rRIP, %ymm0, %ymm0; vmovdqu (%rcx), %xmm15; vinserti128 $1, (%rdx), %ymm15, %ymm15; vpxor %ymm15, %ymm0, %ymm15; vmovdqu (15 * 32 + 16)(%rdx), %xmm1; vmovdqu %xmm1, (%rcx); /* store new IV 
*/ vpxor (0 * 32 + 16)(%rdx), %ymm0, %ymm14; vpxor (1 * 32 + 16)(%rdx), %ymm0, %ymm13; vpxor (2 * 32 + 16)(%rdx), %ymm0, %ymm12; vpxor (3 * 32 + 16)(%rdx), %ymm0, %ymm11; vpxor (4 * 32 + 16)(%rdx), %ymm0, %ymm10; vpxor (5 * 32 + 16)(%rdx), %ymm0, %ymm9; vpxor (6 * 32 + 16)(%rdx), %ymm0, %ymm8; vpxor (7 * 32 + 16)(%rdx), %ymm0, %ymm7; vpxor (8 * 32 + 16)(%rdx), %ymm0, %ymm6; vpxor (9 * 32 + 16)(%rdx), %ymm0, %ymm5; vpxor (10 * 32 + 16)(%rdx), %ymm0, %ymm4; vpxor (11 * 32 + 16)(%rdx), %ymm0, %ymm3; vpxor (12 * 32 + 16)(%rdx), %ymm0, %ymm2; vpxor (13 * 32 + 16)(%rdx), %ymm0, %ymm1; vpxor (14 * 32 + 16)(%rdx), %ymm0, %ymm0; call __camellia_enc_blk32; vpxor 0 * 32(%rdx), %ymm7, %ymm7; vpxor 1 * 32(%rdx), %ymm6, %ymm6; vpxor 2 * 32(%rdx), %ymm5, %ymm5; vpxor 3 * 32(%rdx), %ymm4, %ymm4; vpxor 4 * 32(%rdx), %ymm3, %ymm3; vpxor 5 * 32(%rdx), %ymm2, %ymm2; vpxor 6 * 32(%rdx), %ymm1, %ymm1; vpxor 7 * 32(%rdx), %ymm0, %ymm0; vpxor 8 * 32(%rdx), %ymm15, %ymm15; vpxor 9 * 32(%rdx), %ymm14, %ymm14; vpxor 10 * 32(%rdx), %ymm13, %ymm13; vpxor 11 * 32(%rdx), %ymm12, %ymm12; vpxor 12 * 32(%rdx), %ymm11, %ymm11; vpxor 13 * 32(%rdx), %ymm10, %ymm10; vpxor 14 * 32(%rdx), %ymm9, %ymm9; vpxor 15 * 32(%rdx), %ymm8, %ymm8; write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0, %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9, %ymm8, %rsi); vzeroall; leave; CFI_LEAVE(); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size FUNC_NAME(cfb_dec),.-FUNC_NAME(cfb_dec);) .align 8 .globl FUNC_NAME(ocb_enc) ELF(.type FUNC_NAME(ocb_enc),@function;) FUNC_NAME(ocb_enc): /* input: * %rdi: ctx, CTX * %rsi: dst (32 blocks) * %rdx: src (32 blocks) * %rcx: offset * %r8 : checksum * %r9 : L pointers (void *L[32]) */ CFI_STARTPROC(); pushq %rbp; CFI_PUSH(%rbp); movq %rsp, %rbp; CFI_DEF_CFA_REGISTER(%rbp); vzeroupper; subq $(16 * 32 + 4 * 8), %rsp; andq $~63, %rsp; movq %rsp, %rax; movq %r10, (16 * 32 + 0 * 8)(%rsp); movq %r11, (16 * 32 + 1 * 8)(%rsp); movq %r12, (16 * 32 + 2 * 8)(%rsp); movq %r13, (16 * 32 + 3 * 8)(%rsp); CFI_REG_ON_STACK(r10, 16 * 32 + 0 * 8); CFI_REG_ON_STACK(r11, 16 * 32 + 1 * 8); CFI_REG_ON_STACK(r12, 16 * 32 + 2 * 8); CFI_REG_ON_STACK(r13, 16 * 32 + 3 * 8); vmovdqu (%rcx), %xmm14; vmovdqu (%r8), %xmm13; /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* Checksum_i = Checksum_{i-1} xor P_i */ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ #define OCB_INPUT(n, l0reg, l1reg, yreg) \ vmovdqu (n * 32)(%rdx), yreg; \ vpxor (l0reg), %xmm14, %xmm15; \ vpxor (l1reg), %xmm15, %xmm14; \ vinserti128 $1, %xmm14, %ymm15, %ymm15; \ vpxor yreg, %ymm13, %ymm13; \ vpxor yreg, %ymm15, yreg; \ vmovdqu %ymm15, (n * 32)(%rsi); movq (0 * 8)(%r9), %r10; movq (1 * 8)(%r9), %r11; movq (2 * 8)(%r9), %r12; movq (3 * 8)(%r9), %r13; OCB_INPUT(0, %r10, %r11, %ymm0); vmovdqu %ymm0, (15 * 32)(%rax); OCB_INPUT(1, %r12, %r13, %ymm0); vmovdqu %ymm0, (14 * 32)(%rax); movq (4 * 8)(%r9), %r10; movq (5 * 8)(%r9), %r11; movq (6 * 8)(%r9), %r12; movq (7 * 8)(%r9), %r13; OCB_INPUT(2, %r10, %r11, %ymm0); vmovdqu %ymm0, (13 * 32)(%rax); OCB_INPUT(3, %r12, %r13, %ymm12); movq (8 * 8)(%r9), %r10; movq (9 * 8)(%r9), %r11; movq (10 * 8)(%r9), %r12; movq (11 * 8)(%r9), %r13; OCB_INPUT(4, %r10, %r11, %ymm11); OCB_INPUT(5, %r12, %r13, %ymm10); movq (12 * 8)(%r9), %r10; movq (13 * 8)(%r9), %r11; movq (14 * 8)(%r9), %r12; movq (15 * 8)(%r9), %r13; OCB_INPUT(6, %r10, %r11, %ymm9); OCB_INPUT(7, %r12, %r13, %ymm8); movq (16 * 8)(%r9), %r10; movq (17 * 8)(%r9), %r11; movq (18 * 8)(%r9), %r12; movq (19 * 8)(%r9), %r13; OCB_INPUT(8, %r10, %r11, %ymm7); 
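The OCB_INPUT macro applied throughout this sequence is a direct translation of the three comment equations above; the block cipher itself runs later, in one batched __camellia_enc_blk32 call, and Offset_i (stashed in the output buffer) is xored back into the cipher output afterwards. A scalar sketch of the per-block bookkeeping, with hypothetical helper names and plain byte arrays (illustration only, not part of the patch):

#include <stdint.h>

typedef struct { uint8_t b[16]; } ocb_block;

static void ocb_xor (ocb_block *r, const ocb_block *a)
{
  for (int i = 0; i < 16; i++)
    r->b[i] ^= a->b[i];
}

/* One plaintext block P_i entering the OCB encryption pass:
 *   Offset_i   = Offset_{i-1} xor L_{ntz(i)}
 *   Checksum_i = Checksum_{i-1} xor P_i
 * The batched cipher later encrypts (P_i xor Offset_i), and Offset_i is
 * xored back into that result to form C_i. */
static void ocb_enc_input (ocb_block *offset, ocb_block *checksum,
                           ocb_block *blk /* in: P_i, out: P_i xor Offset_i */,
                           const ocb_block *l_ntz_i)
{
  ocb_xor (offset, l_ntz_i);
  ocb_xor (checksum, blk);
  ocb_xor (blk, offset);
}
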
OCB_INPUT(9, %r12, %r13, %ymm6); movq (20 * 8)(%r9), %r10; movq (21 * 8)(%r9), %r11; movq (22 * 8)(%r9), %r12; movq (23 * 8)(%r9), %r13; OCB_INPUT(10, %r10, %r11, %ymm5); OCB_INPUT(11, %r12, %r13, %ymm4); movq (24 * 8)(%r9), %r10; movq (25 * 8)(%r9), %r11; movq (26 * 8)(%r9), %r12; movq (27 * 8)(%r9), %r13; OCB_INPUT(12, %r10, %r11, %ymm3); OCB_INPUT(13, %r12, %r13, %ymm2); movq (28 * 8)(%r9), %r10; movq (29 * 8)(%r9), %r11; movq (30 * 8)(%r9), %r12; movq (31 * 8)(%r9), %r13; OCB_INPUT(14, %r10, %r11, %ymm1); OCB_INPUT(15, %r12, %r13, %ymm0); #undef OCB_INPUT vextracti128 $1, %ymm13, %xmm15; vmovdqu %xmm14, (%rcx); vpxor %xmm13, %xmm15, %xmm15; vmovdqu %xmm15, (%r8); cmpl $128, key_bitlength(CTX); movl $32, %r8d; movl $24, %r10d; cmovel %r10d, %r8d; /* max */ /* inpack16_pre: */ vpbroadcastq (key_table)(CTX), %ymm15; vpshufb .Lpack_bswap rRIP, %ymm15, %ymm15; vpxor %ymm0, %ymm15, %ymm0; vpxor %ymm1, %ymm15, %ymm1; vpxor %ymm2, %ymm15, %ymm2; vpxor %ymm3, %ymm15, %ymm3; vpxor %ymm4, %ymm15, %ymm4; vpxor %ymm5, %ymm15, %ymm5; vpxor %ymm6, %ymm15, %ymm6; vpxor %ymm7, %ymm15, %ymm7; vpxor %ymm8, %ymm15, %ymm8; vpxor %ymm9, %ymm15, %ymm9; vpxor %ymm10, %ymm15, %ymm10; vpxor %ymm11, %ymm15, %ymm11; vpxor %ymm12, %ymm15, %ymm12; vpxor 13 * 32(%rax), %ymm15, %ymm13; vpxor 14 * 32(%rax), %ymm15, %ymm14; vpxor 15 * 32(%rax), %ymm15, %ymm15; call __camellia_enc_blk32; vpxor 0 * 32(%rsi), %ymm7, %ymm7; vpxor 1 * 32(%rsi), %ymm6, %ymm6; vpxor 2 * 32(%rsi), %ymm5, %ymm5; vpxor 3 * 32(%rsi), %ymm4, %ymm4; vpxor 4 * 32(%rsi), %ymm3, %ymm3; vpxor 5 * 32(%rsi), %ymm2, %ymm2; vpxor 6 * 32(%rsi), %ymm1, %ymm1; vpxor 7 * 32(%rsi), %ymm0, %ymm0; vpxor 8 * 32(%rsi), %ymm15, %ymm15; vpxor 9 * 32(%rsi), %ymm14, %ymm14; vpxor 10 * 32(%rsi), %ymm13, %ymm13; vpxor 11 * 32(%rsi), %ymm12, %ymm12; vpxor 12 * 32(%rsi), %ymm11, %ymm11; vpxor 13 * 32(%rsi), %ymm10, %ymm10; vpxor 14 * 32(%rsi), %ymm9, %ymm9; vpxor 15 * 32(%rsi), %ymm8, %ymm8; write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0, %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9, %ymm8, %rsi); vzeroall; movq (16 * 32 + 0 * 8)(%rsp), %r10; movq (16 * 32 + 1 * 8)(%rsp), %r11; movq (16 * 32 + 2 * 8)(%rsp), %r12; movq (16 * 32 + 3 * 8)(%rsp), %r13; CFI_RESTORE(%r10); CFI_RESTORE(%r11); CFI_RESTORE(%r12); CFI_RESTORE(%r13); leave; CFI_LEAVE(); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size FUNC_NAME(ocb_enc),.-FUNC_NAME(ocb_enc);) .align 8 .globl FUNC_NAME(ocb_dec) ELF(.type FUNC_NAME(ocb_dec),@function;) FUNC_NAME(ocb_dec): /* input: * %rdi: ctx, CTX * %rsi: dst (32 blocks) * %rdx: src (32 blocks) * %rcx: offset * %r8 : checksum * %r9 : L pointers (void *L[32]) */ CFI_STARTPROC(); pushq %rbp; CFI_PUSH(%rbp); movq %rsp, %rbp; CFI_DEF_CFA_REGISTER(%rbp); vzeroupper; subq $(16 * 32 + 4 * 8), %rsp; andq $~63, %rsp; movq %rsp, %rax; movq %r10, (16 * 32 + 0 * 8)(%rsp); movq %r11, (16 * 32 + 1 * 8)(%rsp); movq %r12, (16 * 32 + 2 * 8)(%rsp); movq %r13, (16 * 32 + 3 * 8)(%rsp); CFI_REG_ON_STACK(r10, 16 * 32 + 0 * 8); CFI_REG_ON_STACK(r11, 16 * 32 + 1 * 8); CFI_REG_ON_STACK(r12, 16 * 32 + 2 * 8); CFI_REG_ON_STACK(r13, 16 * 32 + 3 * 8); vmovdqu (%rcx), %xmm14; /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */ #define OCB_INPUT(n, l0reg, l1reg, yreg) \ vmovdqu (n * 32)(%rdx), yreg; \ vpxor (l0reg), %xmm14, %xmm15; \ vpxor (l1reg), %xmm15, %xmm14; \ vinserti128 $1, %xmm14, %ymm15, %ymm15; \ vpxor yreg, %ymm15, yreg; \ vmovdqu %ymm15, (n * 32)(%rsi); movq (0 * 8)(%r9), %r10; movq (1 * 8)(%r9), %r11; movq 
(2 * 8)(%r9), %r12; movq (3 * 8)(%r9), %r13; OCB_INPUT(0, %r10, %r11, %ymm0); vmovdqu %ymm0, (15 * 32)(%rax); OCB_INPUT(1, %r12, %r13, %ymm0); vmovdqu %ymm0, (14 * 32)(%rax); movq (4 * 8)(%r9), %r10; movq (5 * 8)(%r9), %r11; movq (6 * 8)(%r9), %r12; movq (7 * 8)(%r9), %r13; OCB_INPUT(2, %r10, %r11, %ymm13); OCB_INPUT(3, %r12, %r13, %ymm12); movq (8 * 8)(%r9), %r10; movq (9 * 8)(%r9), %r11; movq (10 * 8)(%r9), %r12; movq (11 * 8)(%r9), %r13; OCB_INPUT(4, %r10, %r11, %ymm11); OCB_INPUT(5, %r12, %r13, %ymm10); movq (12 * 8)(%r9), %r10; movq (13 * 8)(%r9), %r11; movq (14 * 8)(%r9), %r12; movq (15 * 8)(%r9), %r13; OCB_INPUT(6, %r10, %r11, %ymm9); OCB_INPUT(7, %r12, %r13, %ymm8); movq (16 * 8)(%r9), %r10; movq (17 * 8)(%r9), %r11; movq (18 * 8)(%r9), %r12; movq (19 * 8)(%r9), %r13; OCB_INPUT(8, %r10, %r11, %ymm7); OCB_INPUT(9, %r12, %r13, %ymm6); movq (20 * 8)(%r9), %r10; movq (21 * 8)(%r9), %r11; movq (22 * 8)(%r9), %r12; movq (23 * 8)(%r9), %r13; OCB_INPUT(10, %r10, %r11, %ymm5); OCB_INPUT(11, %r12, %r13, %ymm4); movq (24 * 8)(%r9), %r10; movq (25 * 8)(%r9), %r11; movq (26 * 8)(%r9), %r12; movq (27 * 8)(%r9), %r13; OCB_INPUT(12, %r10, %r11, %ymm3); OCB_INPUT(13, %r12, %r13, %ymm2); movq (28 * 8)(%r9), %r10; movq (29 * 8)(%r9), %r11; movq (30 * 8)(%r9), %r12; movq (31 * 8)(%r9), %r13; OCB_INPUT(14, %r10, %r11, %ymm1); OCB_INPUT(15, %r12, %r13, %ymm0); #undef OCB_INPUT vmovdqu %xmm14, (%rcx); movq %r8, %r10; cmpl $128, key_bitlength(CTX); movl $32, %r8d; movl $24, %r9d; cmovel %r9d, %r8d; /* max */ /* inpack16_pre: */ vpbroadcastq (key_table)(CTX, %r8, 8), %ymm15; vpshufb .Lpack_bswap rRIP, %ymm15, %ymm15; vpxor %ymm0, %ymm15, %ymm0; vpxor %ymm1, %ymm15, %ymm1; vpxor %ymm2, %ymm15, %ymm2; vpxor %ymm3, %ymm15, %ymm3; vpxor %ymm4, %ymm15, %ymm4; vpxor %ymm5, %ymm15, %ymm5; vpxor %ymm6, %ymm15, %ymm6; vpxor %ymm7, %ymm15, %ymm7; vpxor %ymm8, %ymm15, %ymm8; vpxor %ymm9, %ymm15, %ymm9; vpxor %ymm10, %ymm15, %ymm10; vpxor %ymm11, %ymm15, %ymm11; vpxor %ymm12, %ymm15, %ymm12; vpxor %ymm13, %ymm15, %ymm13; vpxor 14 * 32(%rax), %ymm15, %ymm14; vpxor 15 * 32(%rax), %ymm15, %ymm15; call __camellia_dec_blk32; vpxor 0 * 32(%rsi), %ymm7, %ymm7; vpxor 1 * 32(%rsi), %ymm6, %ymm6; vpxor 2 * 32(%rsi), %ymm5, %ymm5; vpxor 3 * 32(%rsi), %ymm4, %ymm4; vpxor 4 * 32(%rsi), %ymm3, %ymm3; vpxor 5 * 32(%rsi), %ymm2, %ymm2; vpxor 6 * 32(%rsi), %ymm1, %ymm1; vpxor 7 * 32(%rsi), %ymm0, %ymm0; vmovdqu %ymm7, (7 * 32)(%rax); vmovdqu %ymm6, (6 * 32)(%rax); vpxor 8 * 32(%rsi), %ymm15, %ymm15; vpxor 9 * 32(%rsi), %ymm14, %ymm14; vpxor 10 * 32(%rsi), %ymm13, %ymm13; vpxor 11 * 32(%rsi), %ymm12, %ymm12; vpxor 12 * 32(%rsi), %ymm11, %ymm11; vpxor 13 * 32(%rsi), %ymm10, %ymm10; vpxor 14 * 32(%rsi), %ymm9, %ymm9; vpxor 15 * 32(%rsi), %ymm8, %ymm8; /* Checksum_i = Checksum_{i-1} xor P_i */ vpxor %ymm5, %ymm7, %ymm7; vpxor %ymm4, %ymm6, %ymm6; vpxor %ymm3, %ymm7, %ymm7; vpxor %ymm2, %ymm6, %ymm6; vpxor %ymm1, %ymm7, %ymm7; vpxor %ymm0, %ymm6, %ymm6; vpxor %ymm15, %ymm7, %ymm7; vpxor %ymm14, %ymm6, %ymm6; vpxor %ymm13, %ymm7, %ymm7; vpxor %ymm12, %ymm6, %ymm6; vpxor %ymm11, %ymm7, %ymm7; vpxor %ymm10, %ymm6, %ymm6; vpxor %ymm9, %ymm7, %ymm7; vpxor %ymm8, %ymm6, %ymm6; vpxor %ymm7, %ymm6, %ymm7; vextracti128 $1, %ymm7, %xmm6; vpxor %xmm6, %xmm7, %xmm7; vpxor (%r10), %xmm7, %xmm7; vmovdqu %xmm7, (%r10); vmovdqu 7 * 32(%rax), %ymm7; vmovdqu 6 * 32(%rax), %ymm6; write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0, %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9, %ymm8, %rsi); vzeroall; movq (16 * 32 + 0 * 8)(%rsp), 
%r10; movq (16 * 32 + 1 * 8)(%rsp), %r11; movq (16 * 32 + 2 * 8)(%rsp), %r12; movq (16 * 32 + 3 * 8)(%rsp), %r13; CFI_RESTORE(%r10); CFI_RESTORE(%r11); CFI_RESTORE(%r12); CFI_RESTORE(%r13); leave; CFI_LEAVE(); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size FUNC_NAME(ocb_dec),.-FUNC_NAME(ocb_dec);) .align 8 .globl FUNC_NAME(ocb_auth) ELF(.type FUNC_NAME(ocb_auth),@function;) FUNC_NAME(ocb_auth): /* input: * %rdi: ctx, CTX * %rsi: abuf (16 blocks) * %rdx: offset * %rcx: checksum * %r8 : L pointers (void *L[16]) */ CFI_STARTPROC(); pushq %rbp; CFI_PUSH(%rbp); movq %rsp, %rbp; CFI_DEF_CFA_REGISTER(%rbp); vzeroupper; subq $(16 * 32 + 4 * 8), %rsp; andq $~63, %rsp; movq %rsp, %rax; movq %r10, (16 * 32 + 0 * 8)(%rsp); movq %r11, (16 * 32 + 1 * 8)(%rsp); movq %r12, (16 * 32 + 2 * 8)(%rsp); movq %r13, (16 * 32 + 3 * 8)(%rsp); CFI_REG_ON_STACK(r10, 16 * 32 + 0 * 8); CFI_REG_ON_STACK(r11, 16 * 32 + 1 * 8); CFI_REG_ON_STACK(r12, 16 * 32 + 2 * 8); CFI_REG_ON_STACK(r13, 16 * 32 + 3 * 8); vmovdqu (%rdx), %xmm14; /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* Checksum_i = Checksum_{i-1} xor P_i */ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ #define OCB_INPUT(n, l0reg, l1reg, yreg) \ vmovdqu (n * 32)(%rsi), yreg; \ vpxor (l0reg), %xmm14, %xmm15; \ vpxor (l1reg), %xmm15, %xmm14; \ vinserti128 $1, %xmm14, %ymm15, %ymm15; \ vpxor yreg, %ymm15, yreg; movq (0 * 8)(%r8), %r10; movq (1 * 8)(%r8), %r11; movq (2 * 8)(%r8), %r12; movq (3 * 8)(%r8), %r13; OCB_INPUT(0, %r10, %r11, %ymm0); vmovdqu %ymm0, (15 * 32)(%rax); OCB_INPUT(1, %r12, %r13, %ymm0); vmovdqu %ymm0, (14 * 32)(%rax); movq (4 * 8)(%r8), %r10; movq (5 * 8)(%r8), %r11; movq (6 * 8)(%r8), %r12; movq (7 * 8)(%r8), %r13; OCB_INPUT(2, %r10, %r11, %ymm13); OCB_INPUT(3, %r12, %r13, %ymm12); movq (8 * 8)(%r8), %r10; movq (9 * 8)(%r8), %r11; movq (10 * 8)(%r8), %r12; movq (11 * 8)(%r8), %r13; OCB_INPUT(4, %r10, %r11, %ymm11); OCB_INPUT(5, %r12, %r13, %ymm10); movq (12 * 8)(%r8), %r10; movq (13 * 8)(%r8), %r11; movq (14 * 8)(%r8), %r12; movq (15 * 8)(%r8), %r13; OCB_INPUT(6, %r10, %r11, %ymm9); OCB_INPUT(7, %r12, %r13, %ymm8); movq (16 * 8)(%r8), %r10; movq (17 * 8)(%r8), %r11; movq (18 * 8)(%r8), %r12; movq (19 * 8)(%r8), %r13; OCB_INPUT(8, %r10, %r11, %ymm7); OCB_INPUT(9, %r12, %r13, %ymm6); movq (20 * 8)(%r8), %r10; movq (21 * 8)(%r8), %r11; movq (22 * 8)(%r8), %r12; movq (23 * 8)(%r8), %r13; OCB_INPUT(10, %r10, %r11, %ymm5); OCB_INPUT(11, %r12, %r13, %ymm4); movq (24 * 8)(%r8), %r10; movq (25 * 8)(%r8), %r11; movq (26 * 8)(%r8), %r12; movq (27 * 8)(%r8), %r13; OCB_INPUT(12, %r10, %r11, %ymm3); OCB_INPUT(13, %r12, %r13, %ymm2); movq (28 * 8)(%r8), %r10; movq (29 * 8)(%r8), %r11; movq (30 * 8)(%r8), %r12; movq (31 * 8)(%r8), %r13; OCB_INPUT(14, %r10, %r11, %ymm1); OCB_INPUT(15, %r12, %r13, %ymm0); #undef OCB_INPUT vmovdqu %xmm14, (%rdx); cmpl $128, key_bitlength(CTX); movl $32, %r8d; movl $24, %r10d; cmovel %r10d, %r8d; /* max */ movq %rcx, %r10; /* inpack16_pre: */ vpbroadcastq (key_table)(CTX), %ymm15; vpshufb .Lpack_bswap rRIP, %ymm15, %ymm15; vpxor %ymm0, %ymm15, %ymm0; vpxor %ymm1, %ymm15, %ymm1; vpxor %ymm2, %ymm15, %ymm2; vpxor %ymm3, %ymm15, %ymm3; vpxor %ymm4, %ymm15, %ymm4; vpxor %ymm5, %ymm15, %ymm5; vpxor %ymm6, %ymm15, %ymm6; vpxor %ymm7, %ymm15, %ymm7; vpxor %ymm8, %ymm15, %ymm8; vpxor %ymm9, %ymm15, %ymm9; vpxor %ymm10, %ymm15, %ymm10; vpxor %ymm11, %ymm15, %ymm11; vpxor %ymm12, %ymm15, %ymm12; vpxor %ymm13, %ymm15, %ymm13; vpxor 14 * 32(%rax), %ymm15, %ymm14; vpxor 15 * 32(%rax), %ymm15, %ymm15; call __camellia_enc_blk32; vpxor 
%ymm7, %ymm6, %ymm6; vpxor %ymm5, %ymm4, %ymm4; vpxor %ymm3, %ymm2, %ymm2; vpxor %ymm1, %ymm0, %ymm0; vpxor %ymm15, %ymm14, %ymm14; vpxor %ymm13, %ymm12, %ymm12; vpxor %ymm11, %ymm10, %ymm10; vpxor %ymm9, %ymm8, %ymm8; vpxor %ymm6, %ymm4, %ymm4; vpxor %ymm2, %ymm0, %ymm0; vpxor %ymm14, %ymm12, %ymm12; vpxor %ymm10, %ymm8, %ymm8; vpxor %ymm4, %ymm0, %ymm0; vpxor %ymm12, %ymm8, %ymm8; vpxor %ymm0, %ymm8, %ymm0; vextracti128 $1, %ymm0, %xmm1; vpxor (%r10), %xmm0, %xmm0; vpxor %xmm0, %xmm1, %xmm0; vmovdqu %xmm0, (%r10); vzeroall; movq (16 * 32 + 0 * 8)(%rsp), %r10; movq (16 * 32 + 1 * 8)(%rsp), %r11; movq (16 * 32 + 2 * 8)(%rsp), %r12; movq (16 * 32 + 3 * 8)(%rsp), %r13; CFI_RESTORE(%r10); CFI_RESTORE(%r11); CFI_RESTORE(%r12); CFI_RESTORE(%r13); leave; CFI_LEAVE(); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size FUNC_NAME(ocb_auth),.-FUNC_NAME(ocb_auth);) #endif /* GCRY_CAMELLIA_AESNI_AVX2_AMD64_H */ diff --git a/cipher/cast5-amd64.S b/cipher/cast5-amd64.S index 82f67890..a804654c 100644 --- a/cipher/cast5-amd64.S +++ b/cipher/cast5-amd64.S @@ -1,663 +1,663 @@ /* cast5-amd64.S - AMD64 assembly implementation of CAST5 cipher * * Copyright (C) 2013 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . 
*/ #ifdef __x86_64 #include #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && defined(USE_CAST5) #include "asm-common-amd64.h" .text .extern _gcry_cast5_s1to4; #define s1 0 #define s2 (s1 + (4 * 256)) #define s3 (s2 + (4 * 256)) #define s4 (s3 + (4 * 256)) /* structure of CAST5_context: */ #define Km 0 #define Kr (Km + (16 * 4)) /* register macros */ #define CTX %rdi #define RIO %rsi #define RTAB %r8 #define RLR0 %r9 #define RLR1 %r10 #define RLR2 %r11 #define RLR3 %r12 #define RLR0d %r9d #define RLR1d %r10d #define RLR2d %r11d #define RLR3d %r12d #define RX0 %rax #define RX1 %rbx #define RX2 %rdx #define RX0d %eax #define RX1d %ebx #define RX2d %edx #define RX0bl %al #define RX1bl %bl #define RX2bl %dl #define RX0bh %ah #define RX1bh %bh #define RX2bh %dh #define RKR %rcx #define RKRd %ecx #define RKRbl %cl #define RT0 %rbp #define RT1 %rsi #define RT0d %ebp #define RT1d %esi #define RKM0d %r13d #define RKM1d %r14d /*********************************************************************** * 1-way cast5 ***********************************************************************/ #define dummy(x) #define shr_kr(none) \ shrq $8, RKR; #define F(km, load_next_kr, op0, op1, op2, op3) \ op0 ## l RLR0d, km ## d; \ roll RKRbl, km ## d; \ rorq $32, RLR0; \ movzbl km ## bh, RT0d; \ movzbl km ## bl, RT1d; \ roll $16, km ## d; \ movl s1(RTAB,RT0,4), RT0d; \ op1 ## l s2(RTAB,RT1,4), RT0d; \ load_next_kr(kr_next); \ movzbl km ## bh, RT1d; \ movzbl km ## bl, km ## d; \ op2 ## l s3(RTAB,RT1,4), RT0d; \ op3 ## l s4(RTAB,km,4), RT0d; \ xorq RT0, RLR0; #define F1(km, load_next_kr) \ F(##km, load_next_kr, add, xor, sub, add) #define F2(km, load_next_kr) \ F(##km, load_next_kr, xor, sub, add, xor) #define F3(km, load_next_kr) \ F(##km, load_next_kr, sub, add, xor, sub) #define get_round_km(n, km) \ movl Km+4*(n)(CTX), km; #define get_round_kr_enc(n) \ movq $0x1010101010101010, RKR; \ \ /* merge rorl rk and rorl $16 */ \ xorq Kr+(n)(CTX), RKR; #define get_round_kr_dec(n) \ movq $0x1010101010101010, RKR; \ \ /* merge rorl rk and rorl $16 */ \ xorq Kr+(n - 7)(CTX), RKR; \ bswapq RKR; #define round_enc(n, FA, FB, fn1, fn2) \ get_round_km(n + 1, RX2d); \ FA(RX0, fn1); \ get_round_km(n + 2, RX0d); \ FB(RX2, fn2); #define round_enc_last(n, FXA, FXB) \ get_round_km(n + 1, RX2d); \ \ FXA(RX0, shr_kr); \ FXB(RX2, dummy); #define round_enc_1(n, FA, FB) \ round_enc(n, FA, FB, shr_kr, shr_kr) #define round_enc_2(n, FA, FB) \ round_enc(n, FA, FB, shr_kr, dummy) #define round_dec(n, FA, FB, fn1, fn2) \ get_round_km(n - 1, RX2d); \ FA(RX0, fn1); \ get_round_km(n - 2, RX0d); \ FB(RX2, fn2); #define round_dec_last(n, FXA, FXB) \ get_round_km(n - 1, RX2d); \ FXA(RX0, shr_kr); \ FXB(RX2, dummy); #define round_dec_1(n, FA, FB) \ round_dec(n, FA, FB, shr_kr, shr_kr) #define round_dec_2(n, FA, FB) \ round_dec(n, FA, FB, shr_kr, dummy) #define read_block() \ movq (RIO), RLR0; \ bswapq RLR0; #define write_block() \ bswapq RLR0; \ rorq $32, RLR0; \ movq RLR0, (RIO); .align 8 .globl _gcry_cast5_amd64_encrypt_block ELF(.type _gcry_cast5_amd64_encrypt_block,@function;) _gcry_cast5_amd64_encrypt_block: /* input: * %rdi: ctx, CTX * %rsi: dst * %rdx: src */ CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 pushq %rbp; CFI_PUSH(%rbp); pushq %rbx; CFI_PUSH(%rbx); movq %rsi, %r10; GET_EXTERN_POINTER(_gcry_cast5_s1to4, RTAB); movq %rdx, RIO; read_block(); get_round_km(0, RX0d); get_round_kr_enc(0); round_enc_1(0, F1, F2); round_enc_1(2, F3, F1); round_enc_1(4, F2, F3); round_enc_2(6, F1, F2); 
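The F1/F2/F3 macros driving these rounds are the three CAST5 round-function types of RFC 2144; they differ only in how the key/data mix and the four S-box lookups are combined, which is exactly what the (op0, op1, op2, op3) parameters of the F macro select. A C sketch, assuming the S-box tables (the ones the assembly reaches through _gcry_cast5_s1to4) are provided elsewhere; illustration only, not part of the patch:

#include <stdint.h>

extern const uint32_t s1[256], s2[256], s3[256], s4[256]; /* CAST5 S-boxes */

/* Rotate left; the 5-bit rotation keys make the masking a no-op for n=0. */
static inline uint32_t rotl32 (uint32_t x, unsigned n)
{
  n &= 31;
  return (x << n) | (x >> ((32 - n) & 31));
}

/* Type 1 round (asm F1: add, xor, sub, add). */
static uint32_t cast5_f1 (uint32_t d, uint32_t km, uint8_t kr)
{
  uint32_t i = rotl32 (km + d, kr);
  return ((s1[i >> 24] ^ s2[(i >> 16) & 0xff]) - s3[(i >> 8) & 0xff])
         + s4[i & 0xff];
}

/* Type 2 round (asm F2: xor, sub, add, xor). */
static uint32_t cast5_f2 (uint32_t d, uint32_t km, uint8_t kr)
{
  uint32_t i = rotl32 (km ^ d, kr);
  return ((s1[i >> 24] - s2[(i >> 16) & 0xff]) + s3[(i >> 8) & 0xff])
         ^ s4[i & 0xff];
}

/* Type 3 round (asm F3: sub, add, xor, sub). */
static uint32_t cast5_f3 (uint32_t d, uint32_t km, uint8_t kr)
{
  uint32_t i = rotl32 (km - d, kr);
  return ((s1[i >> 24] + s2[(i >> 16) & 0xff]) ^ s3[(i >> 8) & 0xff])
         - s4[i & 0xff];
}
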
get_round_kr_enc(8); round_enc_1(8, F3, F1); round_enc_1(10, F2, F3); round_enc_1(12, F1, F2); round_enc_last(14, F3, F1); movq %r10, RIO; write_block(); popq %rbx; CFI_POP(%rbx); popq %rbp; CFI_POP(%rbp); EXIT_SYSV_FUNC - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_cast5_amd64_encrypt_block,.-_gcry_cast5_amd64_encrypt_block;) .align 8 .globl _gcry_cast5_amd64_decrypt_block ELF(.type _gcry_cast5_amd64_decrypt_block,@function;) _gcry_cast5_amd64_decrypt_block: /* input: * %rdi: ctx, CTX * %rsi: dst * %rdx: src */ CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 pushq %rbp; CFI_PUSH(%rbp); pushq %rbx; CFI_PUSH(%rbx); movq %rsi, %r10; GET_EXTERN_POINTER(_gcry_cast5_s1to4, RTAB); movq %rdx, RIO; read_block(); get_round_km(15, RX0d); get_round_kr_dec(15); round_dec_1(15, F1, F3); round_dec_1(13, F2, F1); round_dec_1(11, F3, F2); round_dec_2(9, F1, F3); get_round_kr_dec(7); round_dec_1(7, F2, F1); round_dec_1(5, F3, F2); round_dec_1(3, F1, F3); round_dec_last(1, F2, F1); movq %r10, RIO; write_block(); popq %rbx; CFI_POP(%rbx); popq %rbp; CFI_POP(%rbp); EXIT_SYSV_FUNC - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_cast5_amd64_decrypt_block,.-_gcry_cast5_amd64_decrypt_block;) /********************************************************************** 4-way cast5, four blocks parallel **********************************************************************/ #define F_tail(rlr, rx, op1, op2, op3) \ movzbl rx ## bh, RT0d; \ movzbl rx ## bl, RT1d; \ roll $16, rx ## d; \ movl s1(RTAB,RT0,4), RT0d; \ op1 ## l s2(RTAB,RT1,4), RT0d; \ movzbl rx ## bh, RT1d; \ movzbl rx ## bl, rx ## d; \ op2 ## l s3(RTAB,RT1,4), RT0d; \ op3 ## l s4(RTAB,rx,4), RT0d; \ xorq RT0, rlr; #define F4(km, load_next_kr, op0, op1, op2, op3) \ movl km, RX0d; \ op0 ## l RLR0d, RX0d; \ roll RKRbl, RX0d; \ rorq $32, RLR0; \ \ movl km, RX1d; \ op0 ## l RLR1d, RX1d; \ roll RKRbl, RX1d; \ rorq $32, RLR1; \ \ movl km, RX2d; \ op0 ## l RLR2d, RX2d; \ roll RKRbl, RX2d; \ rorq $32, RLR2; \ \ F_tail(RLR0, RX0, op1, op2, op3); \ F_tail(RLR1, RX1, op1, op2, op3); \ F_tail(RLR2, RX2, op1, op2, op3); \ \ movl km, RX0d; \ op0 ## l RLR3d, RX0d; \ roll RKRbl, RX0d; \ load_next_kr(); \ rorq $32, RLR3; \ \ F_tail(RLR3, RX0, op1, op2, op3); #define F4_1(km, load_next_kr) \ F4(km, load_next_kr, add, xor, sub, add) #define F4_2(km, load_next_kr) \ F4(km, load_next_kr, xor, sub, add, xor) #define F4_3(km, load_next_kr) \ F4(km, load_next_kr, sub, add, xor, sub) #define round_enc4(n, FA, FB, fn1, fn2) \ get_round_km(n + 1, RKM1d); \ FA(RKM0d, fn1); \ get_round_km(n + 2, RKM0d); \ FB(RKM1d, fn2); #define round_enc_last4(n, FXA, FXB) \ get_round_km(n + 1, RKM1d); \ FXA(RKM0d, shr_kr); \ FXB(RKM1d, dummy); #define round_enc4_1(n, FA, FB) \ round_enc4(n, FA, FB, shr_kr, shr_kr); #define round_enc4_2(n, FA, FB) \ round_enc4(n, FA, FB, shr_kr, dummy); #define round_dec4(n, FA, FB, fn1, fn2) \ get_round_km(n - 1, RKM1d); \ FA(RKM0d, fn1); \ get_round_km(n - 2, RKM0d); \ FB(RKM1d, fn2); #define round_dec_last4(n, FXA, FXB) \ get_round_km(n - 1, RKM1d); \ FXA(RKM0d, shr_kr); \ FXB(RKM1d, dummy); #define round_dec4_1(n, FA, FB) \ round_dec4(n, FA, FB, shr_kr, shr_kr); #define round_dec4_2(n, FA, FB) \ round_dec4(n, FA, FB, shr_kr, dummy); #define inbswap_block4(a, b, c, d) \ bswapq a; \ bswapq b; \ bswapq c; \ bswapq d; #define outbswap_block4(a, b, c, d) \ bswapq a; \ bswapq b; \ bswapq c; \ bswapq d; \ rorq $32, a; \ rorq $32, b; \ rorq $32, c; \ rorq $32, d; .align 8 ELF(.type __cast5_enc_blk4,@function;) __cast5_enc_blk4: /* input: * %rdi: ctx, CTX * 
RLR0,RLR1,RLR2,RLR3: four input plaintext blocks * output: * RLR0,RLR1,RLR2,RLR3: four output ciphertext blocks */ CFI_STARTPROC(); GET_EXTERN_POINTER(_gcry_cast5_s1to4, RTAB); get_round_km(0, RKM0d); get_round_kr_enc(0); round_enc4_1(0, F4_1, F4_2); round_enc4_1(2, F4_3, F4_1); round_enc4_1(4, F4_2, F4_3); round_enc4_2(6, F4_1, F4_2); get_round_kr_enc(8); round_enc4_1(8, F4_3, F4_1); round_enc4_1(10, F4_2, F4_3); round_enc4_1(12, F4_1, F4_2); round_enc_last4(14, F4_3, F4_1); outbswap_block4(RLR0, RLR1, RLR2, RLR3); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size __cast5_enc_blk4,.-__cast5_enc_blk4;) .align 8 ELF(.type __cast5_dec_blk4,@function;) __cast5_dec_blk4: /* input: * %rdi: ctx, CTX * RLR0,RLR1,RLR2,RLR3: four input ciphertext blocks * output: * RLR0,RLR1,RLR2,RLR3: four output plaintext blocks */ CFI_STARTPROC(); GET_EXTERN_POINTER(_gcry_cast5_s1to4, RTAB); inbswap_block4(RLR0, RLR1, RLR2, RLR3); get_round_km(15, RKM0d); get_round_kr_dec(15); round_dec4_1(15, F4_1, F4_3); round_dec4_1(13, F4_2, F4_1); round_dec4_1(11, F4_3, F4_2); round_dec4_2(9, F4_1, F4_3); get_round_kr_dec(7); round_dec4_1(7, F4_2, F4_1); round_dec4_1(5, F4_3, F4_2); round_dec4_1(3, F4_1, F4_3); round_dec_last4(1, F4_2, F4_1); outbswap_block4(RLR0, RLR1, RLR2, RLR3); CFI_ENDPROC(); - ret; + ret_spec_stop; ELF(.size __cast5_dec_blk4,.-__cast5_dec_blk4;) .align 8 .globl _gcry_cast5_amd64_ctr_enc ELF(.type _gcry_cast5_amd64_ctr_enc,@function;) _gcry_cast5_amd64_ctr_enc: /* input: * %rdi: ctx, CTX * %rsi: dst (4 blocks) * %rdx: src (4 blocks) * %rcx: iv (big endian, 64bit) */ CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 pushq %rbp; CFI_PUSH(%rbp); pushq %rbx; CFI_PUSH(%rbx); pushq %r12; CFI_PUSH(%r12); pushq %r13; CFI_PUSH(%r13); pushq %r14; CFI_PUSH(%r14); pushq %rsi; CFI_PUSH(%rsi); pushq %rdx; CFI_PUSH(%rdx); /* load IV and byteswap */ movq (%rcx), RX0; bswapq RX0; movq RX0, RLR0; /* construct IVs */ leaq 1(RX0), RLR1; leaq 2(RX0), RLR2; leaq 3(RX0), RLR3; leaq 4(RX0), RX0; bswapq RX0; /* store new IV */ movq RX0, (%rcx); call __cast5_enc_blk4; popq %r14; /*src*/ CFI_POP_TMP_REG(); popq %r13; /*dst*/ CFI_POP_TMP_REG(); /* XOR key-stream with plaintext */ xorq 0 * 8(%r14), RLR0; xorq 1 * 8(%r14), RLR1; xorq 2 * 8(%r14), RLR2; xorq 3 * 8(%r14), RLR3; movq RLR0, 0 * 8(%r13); movq RLR1, 1 * 8(%r13); movq RLR2, 2 * 8(%r13); movq RLR3, 3 * 8(%r13); popq %r14; CFI_POP(%r14); popq %r13; CFI_POP(%r13); popq %r12; CFI_POP(%r12); popq %rbx; CFI_POP(%rbx); popq %rbp; CFI_POP(%rbp); EXIT_SYSV_FUNC - ret + ret_spec_stop CFI_ENDPROC(); ELF(.size _gcry_cast5_amd64_ctr_enc,.-_gcry_cast5_amd64_ctr_enc;) .align 8 .globl _gcry_cast5_amd64_cbc_dec ELF(.type _gcry_cast5_amd64_cbc_dec,@function;) _gcry_cast5_amd64_cbc_dec: /* input: * %rdi: ctx, CTX * %rsi: dst (4 blocks) * %rdx: src (4 blocks) * %rcx: iv (64bit) */ CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 pushq %rbp; CFI_PUSH(%rbp); pushq %rbx; CFI_PUSH(%rbx); pushq %r12; CFI_PUSH(%r12); pushq %r13; CFI_PUSH(%r13); pushq %r14; CFI_PUSH(%r14); pushq %rcx; CFI_PUSH(%rcx); pushq %rsi; CFI_PUSH(%rsi); pushq %rdx; CFI_PUSH(%rdx); /* load input */ movq 0 * 8(%rdx), RLR0; movq 1 * 8(%rdx), RLR1; movq 2 * 8(%rdx), RLR2; movq 3 * 8(%rdx), RLR3; call __cast5_dec_blk4; popq RX0; /*src*/ CFI_POP_TMP_REG(); popq RX1; /*dst*/ CFI_POP_TMP_REG(); popq RX2; /*iv*/ CFI_POP_TMP_REG(); movq 3 * 8(RX0), %r14; xorq (RX2), RLR0; xorq 0 * 8(RX0), RLR1; xorq 1 * 8(RX0), RLR2; xorq 2 * 8(RX0), RLR3; movq %r14, (RX2); /* store new IV */ movq RLR0, 0 * 8(RX1); movq RLR1, 1 * 8(RX1); movq RLR2, 2 * 8(RX1); 
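These stores finish the CBC-decryption dataflow: each plaintext block is the decrypted block xored with the previous ciphertext block (the IV for the first one), and the IV is advanced to the last ciphertext block, which is read before the output is written so that in-place operation still works. A scalar sketch with a hypothetical single-block cast5_decrypt_block() standing in for the 4-way __cast5_dec_blk4 core (illustration only, not part of the patch):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

typedef struct cast5_context cast5_context;   /* opaque key schedule */
/* Hypothetical one-block primitive; CAST5 uses 8-byte blocks. */
extern void cast5_decrypt_block (const cast5_context *ctx,
                                 uint8_t out[8], const uint8_t in[8]);

static void cast5_cbc_dec (const cast5_context *ctx, uint8_t *dst,
                           const uint8_t *src, size_t nblocks, uint8_t iv[8])
{
  uint8_t prev[8], cblk[8], tmp[8];

  memcpy (prev, iv, 8);
  for (size_t i = 0; i < nblocks; i++)
    {
      memcpy (cblk, src + 8 * i, 8);          /* keep C_i; dst may alias src */
      cast5_decrypt_block (ctx, tmp, cblk);
      for (int j = 0; j < 8; j++)
        dst[8 * i + j] = tmp[j] ^ prev[j];    /* P_i = D_K(C_i) ^ C_{i-1} */
      memcpy (prev, cblk, 8);
    }
  memcpy (iv, prev, 8);                       /* new IV = last ciphertext */
}
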
movq RLR3, 3 * 8(RX1); popq %r14; CFI_POP(%r14); popq %r13; CFI_POP(%r13); popq %r12; CFI_POP(%r12); popq %rbx; CFI_POP(%rbx); popq %rbp; CFI_POP(%rbp); EXIT_SYSV_FUNC - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_cast5_amd64_cbc_dec,.-_gcry_cast5_amd64_cbc_dec;) .align 8 .globl _gcry_cast5_amd64_cfb_dec ELF(.type _gcry_cast5_amd64_cfb_dec,@function;) _gcry_cast5_amd64_cfb_dec: /* input: * %rdi: ctx, CTX * %rsi: dst (4 blocks) * %rdx: src (4 blocks) * %rcx: iv (64bit) */ CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 pushq %rbp; CFI_PUSH(%rbp); pushq %rbx; CFI_PUSH(%rbx); pushq %r12; CFI_PUSH(%r12); pushq %r13; CFI_PUSH(%r13); pushq %r14; CFI_PUSH(%r14); pushq %rsi; CFI_PUSH(%rsi); pushq %rdx; CFI_PUSH(%rdx); /* Load input */ movq (%rcx), RLR0; movq 0 * 8(%rdx), RLR1; movq 1 * 8(%rdx), RLR2; movq 2 * 8(%rdx), RLR3; inbswap_block4(RLR0, RLR1, RLR2, RLR3); /* Update IV */ movq 3 * 8(%rdx), %rdx; movq %rdx, (%rcx); call __cast5_enc_blk4; popq %rdx; /*src*/ CFI_POP_TMP_REG(); popq %rcx; /*dst*/ CFI_POP_TMP_REG(); xorq 0 * 8(%rdx), RLR0; xorq 1 * 8(%rdx), RLR1; xorq 2 * 8(%rdx), RLR2; xorq 3 * 8(%rdx), RLR3; movq RLR0, 0 * 8(%rcx); movq RLR1, 1 * 8(%rcx); movq RLR2, 2 * 8(%rcx); movq RLR3, 3 * 8(%rcx); popq %r14; CFI_POP(%r14); popq %r13; CFI_POP(%r13); popq %r12; CFI_POP(%r12); popq %rbx; CFI_POP(%rbx); popq %rbp; CFI_POP(%rbp); EXIT_SYSV_FUNC - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_cast5_amd64_cfb_dec,.-_gcry_cast5_amd64_cfb_dec;) #endif /*defined(USE_CAST5)*/ #endif /*__x86_64*/ diff --git a/cipher/chacha20-amd64-avx2.S b/cipher/chacha20-amd64-avx2.S index 51e107be..9f2a036a 100644 --- a/cipher/chacha20-amd64-avx2.S +++ b/cipher/chacha20-amd64-avx2.S @@ -1,601 +1,601 @@ /* chacha20-amd64-avx2.S - AVX2 implementation of ChaCha20 cipher * * Copyright (C) 2017-2019 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . */ /* * Based on D. J. Bernstein reference implementation at * http://cr.yp.to/chacha.html: * * chacha-regs.c version 20080118 * D. J. Bernstein * Public domain. 
*/ #ifdef __x86_64 #include #if defined(HAVE_GCC_INLINE_ASM_AVX2) && \ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) .text #include "asm-common-amd64.h" #include "asm-poly1305-amd64.h" /* register macros */ #define INPUT %rdi #define DST %rsi #define SRC %rdx #define NBLKS %rcx #define ROUND %eax /* stack structure */ #define STACK_VEC_X12 (32) #define STACK_VEC_X13 (32 + STACK_VEC_X12) #define STACK_TMP (32 + STACK_VEC_X13) #define STACK_TMP1 (32 + STACK_TMP) #define STACK_MAX (32 + STACK_TMP1) /* vector registers */ #define X0 %ymm0 #define X1 %ymm1 #define X2 %ymm2 #define X3 %ymm3 #define X4 %ymm4 #define X5 %ymm5 #define X6 %ymm6 #define X7 %ymm7 #define X8 %ymm8 #define X9 %ymm9 #define X10 %ymm10 #define X11 %ymm11 #define X12 %ymm12 #define X13 %ymm13 #define X14 %ymm14 #define X15 %ymm15 #define X0h %xmm0 #define X1h %xmm1 #define X2h %xmm2 #define X3h %xmm3 #define X4h %xmm4 #define X5h %xmm5 #define X6h %xmm6 #define X7h %xmm7 #define X8h %xmm8 #define X9h %xmm9 #define X10h %xmm10 #define X11h %xmm11 #define X12h %xmm12 #define X13h %xmm13 #define X14h %xmm14 #define X15h %xmm15 /********************************************************************** helper macros **********************************************************************/ /* 4x4 32-bit integer matrix transpose */ #define transpose_4x4(x0,x1,x2,x3,t1,t2) \ vpunpckhdq x1, x0, t2; \ vpunpckldq x1, x0, x0; \ \ vpunpckldq x3, x2, t1; \ vpunpckhdq x3, x2, x2; \ \ vpunpckhqdq t1, x0, x1; \ vpunpcklqdq t1, x0, x0; \ \ vpunpckhqdq x2, t2, x3; \ vpunpcklqdq x2, t2, x2; /* 2x2 128-bit matrix transpose */ #define transpose_16byte_2x2(x0,x1,t1) \ vmovdqa x0, t1; \ vperm2i128 $0x20, x1, x0, x0; \ vperm2i128 $0x31, x1, t1, x1; /* xor register with unaligned src and save to unaligned dst */ #define xor_src_dst(dst, src, offset, xreg) \ vpxor offset(src), xreg, xreg; \ vmovdqu xreg, offset(dst); /********************************************************************** 8-way chacha20 **********************************************************************/ #define ROTATE2(v1,v2,c,tmp) \ vpsrld $(32 - (c)), v1, tmp; \ vpslld $(c), v1, v1; \ vpaddb tmp, v1, v1; \ vpsrld $(32 - (c)), v2, tmp; \ vpslld $(c), v2, v2; \ vpaddb tmp, v2, v2; #define ROTATE_SHUF_2(v1,v2,shuf) \ vpshufb shuf, v1, v1; \ vpshufb shuf, v2, v2; #define XOR(ds,s) \ vpxor s, ds, ds; #define PLUS(ds,s) \ vpaddd s, ds, ds; #define QUARTERROUND2(a1,b1,c1,d1,a2,b2,c2,d2,ign,tmp1,\ interleave_op1,interleave_op2,\ interleave_op3,interleave_op4) \ vbroadcasti128 .Lshuf_rol16 rRIP, tmp1; \ interleave_op1; \ PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2); \ ROTATE_SHUF_2(d1, d2, tmp1); \ interleave_op2; \ PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \ ROTATE2(b1, b2, 12, tmp1); \ vbroadcasti128 .Lshuf_rol8 rRIP, tmp1; \ interleave_op3; \ PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2); \ ROTATE_SHUF_2(d1, d2, tmp1); \ interleave_op4; \ PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \ ROTATE2(b1, b2, 7, tmp1); .align 32 chacha20_data: .Lshuf_rol16: .byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 .Lshuf_rol8: .byte 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14 .Linc_counter: .byte 0,1,2,3,4,5,6,7 .Lunsigned_cmp: .long 0x80000000 .align 8 .globl _gcry_chacha20_amd64_avx2_blocks8 ELF(.type _gcry_chacha20_amd64_avx2_blocks8,@function;) _gcry_chacha20_amd64_avx2_blocks8: /* input: * %rdi: input * %rsi: dst * %rdx: src * %rcx: nblks (multiple of 8) */ CFI_STARTPROC(); vzeroupper; pushq %rbp; CFI_PUSH(%rbp); movq %rsp, %rbp; 
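For reference, the scalar quarter round that QUARTERROUND2 above spreads across eight blocks at a time: the 16- and 8-bit rotations are done with vpshufb against .Lshuf_rol16/.Lshuf_rol8, and the 12- and 7-bit ones with a shift pair whose halves have disjoint bits, so the byte-wise vpaddb behaves as an OR. Sketch only, following the chacha-regs.c reference credited in the header:

#include <stdint.h>

static inline uint32_t rol32 (uint32_t x, unsigned c)
{
  return (x << c) | (x >> (32 - c));
}

/* One ChaCha20 quarter round on four 32-bit state words. */
static void chacha_quarterround (uint32_t *a, uint32_t *b,
                                 uint32_t *c, uint32_t *d)
{
  *a += *b; *d ^= *a; *d = rol32 (*d, 16);
  *c += *d; *b ^= *c; *b = rol32 (*b, 12);
  *a += *b; *d ^= *a; *d = rol32 (*d, 8);
  *c += *d; *b ^= *c; *b = rol32 (*b, 7);
}
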
CFI_DEF_CFA_REGISTER(%rbp); subq $STACK_MAX, %rsp; andq $~31, %rsp; .Loop8: mov $20, ROUND; /* Construct counter vectors X12 and X13 */ vpmovzxbd .Linc_counter rRIP, X0; vpbroadcastd .Lunsigned_cmp rRIP, X2; vpbroadcastd (12 * 4)(INPUT), X12; vpbroadcastd (13 * 4)(INPUT), X13; vpaddd X0, X12, X12; vpxor X2, X0, X0; vpxor X2, X12, X1; vpcmpgtd X1, X0, X0; vpsubd X0, X13, X13; vmovdqa X12, (STACK_VEC_X12)(%rsp); vmovdqa X13, (STACK_VEC_X13)(%rsp); /* Load vectors */ vpbroadcastd (0 * 4)(INPUT), X0; vpbroadcastd (1 * 4)(INPUT), X1; vpbroadcastd (2 * 4)(INPUT), X2; vpbroadcastd (3 * 4)(INPUT), X3; vpbroadcastd (4 * 4)(INPUT), X4; vpbroadcastd (5 * 4)(INPUT), X5; vpbroadcastd (6 * 4)(INPUT), X6; vpbroadcastd (7 * 4)(INPUT), X7; vpbroadcastd (8 * 4)(INPUT), X8; vpbroadcastd (9 * 4)(INPUT), X9; vpbroadcastd (10 * 4)(INPUT), X10; vpbroadcastd (11 * 4)(INPUT), X11; vpbroadcastd (14 * 4)(INPUT), X14; vpbroadcastd (15 * 4)(INPUT), X15; vmovdqa X15, (STACK_TMP)(%rsp); .Lround2: QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X15,,,,) vmovdqa (STACK_TMP)(%rsp), X15; vmovdqa X8, (STACK_TMP)(%rsp); QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,,,,) QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,,,,) vmovdqa (STACK_TMP)(%rsp), X8; vmovdqa X15, (STACK_TMP)(%rsp); QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X15,,,,) sub $2, ROUND; jnz .Lround2; vmovdqa X8, (STACK_TMP1)(%rsp); /* tmp := X15 */ vpbroadcastd (0 * 4)(INPUT), X15; PLUS(X0, X15); vpbroadcastd (1 * 4)(INPUT), X15; PLUS(X1, X15); vpbroadcastd (2 * 4)(INPUT), X15; PLUS(X2, X15); vpbroadcastd (3 * 4)(INPUT), X15; PLUS(X3, X15); vpbroadcastd (4 * 4)(INPUT), X15; PLUS(X4, X15); vpbroadcastd (5 * 4)(INPUT), X15; PLUS(X5, X15); vpbroadcastd (6 * 4)(INPUT), X15; PLUS(X6, X15); vpbroadcastd (7 * 4)(INPUT), X15; PLUS(X7, X15); transpose_4x4(X0, X1, X2, X3, X8, X15); transpose_4x4(X4, X5, X6, X7, X8, X15); vmovdqa (STACK_TMP1)(%rsp), X8; transpose_16byte_2x2(X0, X4, X15); transpose_16byte_2x2(X1, X5, X15); transpose_16byte_2x2(X2, X6, X15); transpose_16byte_2x2(X3, X7, X15); vmovdqa (STACK_TMP)(%rsp), X15; xor_src_dst(DST, SRC, (64 * 0 + 16 * 0), X0); xor_src_dst(DST, SRC, (64 * 1 + 16 * 0), X1); vpbroadcastd (8 * 4)(INPUT), X0; PLUS(X8, X0); vpbroadcastd (9 * 4)(INPUT), X0; PLUS(X9, X0); vpbroadcastd (10 * 4)(INPUT), X0; PLUS(X10, X0); vpbroadcastd (11 * 4)(INPUT), X0; PLUS(X11, X0); vmovdqa (STACK_VEC_X12)(%rsp), X0; PLUS(X12, X0); vmovdqa (STACK_VEC_X13)(%rsp), X0; PLUS(X13, X0); vpbroadcastd (14 * 4)(INPUT), X0; PLUS(X14, X0); vpbroadcastd (15 * 4)(INPUT), X0; PLUS(X15, X0); xor_src_dst(DST, SRC, (64 * 2 + 16 * 0), X2); xor_src_dst(DST, SRC, (64 * 3 + 16 * 0), X3); /* Update counter */ addq $8, (12 * 4)(INPUT); transpose_4x4(X8, X9, X10, X11, X0, X1); transpose_4x4(X12, X13, X14, X15, X0, X1); xor_src_dst(DST, SRC, (64 * 4 + 16 * 0), X4); xor_src_dst(DST, SRC, (64 * 5 + 16 * 0), X5); transpose_16byte_2x2(X8, X12, X0); transpose_16byte_2x2(X9, X13, X0); transpose_16byte_2x2(X10, X14, X0); transpose_16byte_2x2(X11, X15, X0); xor_src_dst(DST, SRC, (64 * 6 + 16 * 0), X6); xor_src_dst(DST, SRC, (64 * 7 + 16 * 0), X7); xor_src_dst(DST, SRC, (64 * 0 + 16 * 2), X8); xor_src_dst(DST, SRC, (64 * 1 + 16 * 2), X9); xor_src_dst(DST, SRC, (64 * 2 + 16 * 2), X10); xor_src_dst(DST, SRC, (64 * 3 + 16 * 2), X11); xor_src_dst(DST, SRC, (64 * 4 + 16 * 2), X12); xor_src_dst(DST, SRC, (64 * 5 + 16 * 2), X13); xor_src_dst(DST, SRC, (64 * 6 + 16 * 2), X14); xor_src_dst(DST, SRC, (64 * 7 + 16 * 2), X15); sub $8, NBLKS; lea (8 * 64)(DST), 
DST; lea (8 * 64)(SRC), SRC; jnz .Loop8; /* clear the used vector registers and stack */ vpxor X0, X0, X0; vmovdqa X0, (STACK_VEC_X12)(%rsp); vmovdqa X0, (STACK_VEC_X13)(%rsp); vmovdqa X0, (STACK_TMP)(%rsp); vmovdqa X0, (STACK_TMP1)(%rsp); vzeroall; /* eax zeroed by round loop. */ leave; CFI_LEAVE(); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_chacha20_amd64_avx2_blocks8, .-_gcry_chacha20_amd64_avx2_blocks8;) /********************************************************************** 8-way stitched chacha20-poly1305 **********************************************************************/ #define _ /*_*/ .align 8 .globl _gcry_chacha20_poly1305_amd64_avx2_blocks8 ELF(.type _gcry_chacha20_poly1305_amd64_avx2_blocks8,@function;) _gcry_chacha20_poly1305_amd64_avx2_blocks8: /* input: * %rdi: input * %rsi: dst * %rdx: src * %rcx: nblks (multiple of 8) * %r9: poly1305-state * %r8: poly1305-src */ CFI_STARTPROC(); pushq %rbp; CFI_PUSH(%rbp); movq %rsp, %rbp; CFI_DEF_CFA_REGISTER(%rbp); vzeroupper; subq $(9 * 8) + STACK_MAX + 32, %rsp; andq $~31, %rsp; movq %rbx, (STACK_MAX + 0 * 8)(%rsp); movq %r12, (STACK_MAX + 1 * 8)(%rsp); movq %r13, (STACK_MAX + 2 * 8)(%rsp); movq %r14, (STACK_MAX + 3 * 8)(%rsp); movq %r15, (STACK_MAX + 4 * 8)(%rsp); CFI_REG_ON_STACK(rbx, STACK_MAX + 0 * 8); CFI_REG_ON_STACK(r12, STACK_MAX + 1 * 8); CFI_REG_ON_STACK(r13, STACK_MAX + 2 * 8); CFI_REG_ON_STACK(r14, STACK_MAX + 3 * 8); CFI_REG_ON_STACK(r15, STACK_MAX + 4 * 8); movq %rdx, (STACK_MAX + 5 * 8)(%rsp); # SRC movq %rsi, (STACK_MAX + 6 * 8)(%rsp); # DST movq %rcx, (STACK_MAX + 7 * 8)(%rsp); # NBLKS /* Load state */ POLY1305_LOAD_STATE(); .Loop_poly8: /* Construct counter vectors X12 and X13 */ vpmovzxbd .Linc_counter rRIP, X0; vpbroadcastd .Lunsigned_cmp rRIP, X2; vpbroadcastd (12 * 4)(INPUT), X12; vpbroadcastd (13 * 4)(INPUT), X13; vpaddd X0, X12, X12; vpxor X2, X0, X0; vpxor X2, X12, X1; vpcmpgtd X1, X0, X0; vpsubd X0, X13, X13; vmovdqa X12, (STACK_VEC_X12)(%rsp); vmovdqa X13, (STACK_VEC_X13)(%rsp); /* Load vectors */ vpbroadcastd (0 * 4)(INPUT), X0; vpbroadcastd (1 * 4)(INPUT), X1; vpbroadcastd (2 * 4)(INPUT), X2; vpbroadcastd (3 * 4)(INPUT), X3; vpbroadcastd (4 * 4)(INPUT), X4; vpbroadcastd (5 * 4)(INPUT), X5; vpbroadcastd (6 * 4)(INPUT), X6; vpbroadcastd (7 * 4)(INPUT), X7; vpbroadcastd (8 * 4)(INPUT), X8; vpbroadcastd (9 * 4)(INPUT), X9; vpbroadcastd (10 * 4)(INPUT), X10; vpbroadcastd (11 * 4)(INPUT), X11; vpbroadcastd (14 * 4)(INPUT), X14; vpbroadcastd (15 * 4)(INPUT), X15; vmovdqa X15, (STACK_TMP)(%rsp); /* Process eight ChaCha20 blocks and 32 Poly1305 blocks. 
*/ movl $20, (STACK_MAX + 8 * 8 + 4)(%rsp); .Lround8_with_poly1305_outer: movl $6, (STACK_MAX + 8 * 8)(%rsp); .Lround8_with_poly1305_inner1: /* rounds 0-5 & 10-15 */ POLY1305_BLOCK_PART1(0 * 16) QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X15, POLY1305_BLOCK_PART2(), POLY1305_BLOCK_PART3(), POLY1305_BLOCK_PART4(), POLY1305_BLOCK_PART5()) vmovdqa (STACK_TMP)(%rsp), X15; vmovdqa X8, (STACK_TMP)(%rsp); POLY1305_BLOCK_PART1(1 * 16) QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8, POLY1305_BLOCK_PART2(), POLY1305_BLOCK_PART3(), POLY1305_BLOCK_PART4(), POLY1305_BLOCK_PART5()) POLY1305_BLOCK_PART1(2 * 16) QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8, POLY1305_BLOCK_PART2(), POLY1305_BLOCK_PART3(), POLY1305_BLOCK_PART4(), POLY1305_BLOCK_PART5()) vmovdqa (STACK_TMP)(%rsp), X8; vmovdqa X15, (STACK_TMP)(%rsp); POLY1305_BLOCK_PART1(3 * 16) lea (4 * 16)(POLY_RSRC), POLY_RSRC; QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X15, POLY1305_BLOCK_PART2(), POLY1305_BLOCK_PART3(), POLY1305_BLOCK_PART4(), POLY1305_BLOCK_PART5()) subl $2, (STACK_MAX + 8 * 8)(%rsp); jnz .Lround8_with_poly1305_inner1; movl $4, (STACK_MAX + 8 * 8)(%rsp); .Lround8_with_poly1305_inner2: /* rounds 6-9 & 16-19 */ POLY1305_BLOCK_PART1(0 * 16) QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X15, POLY1305_BLOCK_PART2(), _, POLY1305_BLOCK_PART3(), _) vmovdqa (STACK_TMP)(%rsp), X15; vmovdqa X8, (STACK_TMP)(%rsp); QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8, _, POLY1305_BLOCK_PART4(), _, POLY1305_BLOCK_PART5()) POLY1305_BLOCK_PART1(1 * 16); lea (2 * 16)(POLY_RSRC), POLY_RSRC; QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8, _, POLY1305_BLOCK_PART2(), _, POLY1305_BLOCK_PART3()) vmovdqa (STACK_TMP)(%rsp), X8; vmovdqa X15, (STACK_TMP)(%rsp); QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X15, POLY1305_BLOCK_PART4(), _, POLY1305_BLOCK_PART5(), _) subl $2, (STACK_MAX + 8 * 8)(%rsp); jnz .Lround8_with_poly1305_inner2; subl $10, (STACK_MAX + 8 * 8 + 4)(%rsp); jnz .Lround8_with_poly1305_outer; movq (STACK_MAX + 5 * 8)(%rsp), SRC; movq (STACK_MAX + 6 * 8)(%rsp), DST; vmovdqa X8, (STACK_TMP1)(%rsp); /* tmp := X15 */ vpbroadcastd (0 * 4)(INPUT), X15; PLUS(X0, X15); vpbroadcastd (1 * 4)(INPUT), X15; PLUS(X1, X15); vpbroadcastd (2 * 4)(INPUT), X15; PLUS(X2, X15); vpbroadcastd (3 * 4)(INPUT), X15; PLUS(X3, X15); vpbroadcastd (4 * 4)(INPUT), X15; PLUS(X4, X15); vpbroadcastd (5 * 4)(INPUT), X15; PLUS(X5, X15); vpbroadcastd (6 * 4)(INPUT), X15; PLUS(X6, X15); vpbroadcastd (7 * 4)(INPUT), X15; PLUS(X7, X15); transpose_4x4(X0, X1, X2, X3, X8, X15); transpose_4x4(X4, X5, X6, X7, X8, X15); vmovdqa (STACK_TMP1)(%rsp), X8; transpose_16byte_2x2(X0, X4, X15); transpose_16byte_2x2(X1, X5, X15); transpose_16byte_2x2(X2, X6, X15); transpose_16byte_2x2(X3, X7, X15); vmovdqa (STACK_TMP)(%rsp), X15; xor_src_dst(DST, SRC, (64 * 0 + 16 * 0), X0); xor_src_dst(DST, SRC, (64 * 1 + 16 * 0), X1); vpbroadcastd (8 * 4)(INPUT), X0; PLUS(X8, X0); vpbroadcastd (9 * 4)(INPUT), X0; PLUS(X9, X0); vpbroadcastd (10 * 4)(INPUT), X0; PLUS(X10, X0); vpbroadcastd (11 * 4)(INPUT), X0; PLUS(X11, X0); vmovdqa (STACK_VEC_X12)(%rsp), X0; PLUS(X12, X0); vmovdqa (STACK_VEC_X13)(%rsp), X0; PLUS(X13, X0); vpbroadcastd (14 * 4)(INPUT), X0; PLUS(X14, X0); vpbroadcastd (15 * 4)(INPUT), X0; PLUS(X15, X0); xor_src_dst(DST, SRC, (64 * 2 + 16 * 0), X2); xor_src_dst(DST, SRC, (64 * 3 + 16 * 0), X3); /* Update counter */ addq $8, (12 * 4)(INPUT); transpose_4x4(X8, X9, X10, X11, X0, X1); transpose_4x4(X12, X13, X14, X15, X0, X1); 
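The X12/X13 counter vectors being folded back in here were built at the top of .Loop_poly8: lane i gets counter+i in the low word, and since AVX2 has no unsigned 32-bit compare, the carry out of that addition is detected by xoring both sides with .Lunsigned_cmp (0x80000000) and doing a signed vpcmpgtd, whose all-ones mask is then subtracted from the high word. Scalar equivalent (illustration only, not part of the patch):

#include <stdint.h>

/* input[] is the 16-word ChaCha20 state; words 12 and 13 hold the
 * 64-bit block counter, low word first. */
static void build_counter_lanes (const uint32_t input[16],
                                 uint32_t x12[8], uint32_t x13[8])
{
  for (unsigned i = 0; i < 8; i++)
    {
      uint32_t lo = input[12] + i;
      uint32_t carry = (lo < i);   /* the asm gets this bit via a signed
                                      compare after biasing both operands
                                      with 0x80000000 */
      x12[i] = lo;
      x13[i] = input[13] + carry;
    }
}
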
xor_src_dst(DST, SRC, (64 * 4 + 16 * 0), X4); xor_src_dst(DST, SRC, (64 * 5 + 16 * 0), X5); transpose_16byte_2x2(X8, X12, X0); transpose_16byte_2x2(X9, X13, X0); transpose_16byte_2x2(X10, X14, X0); transpose_16byte_2x2(X11, X15, X0); xor_src_dst(DST, SRC, (64 * 6 + 16 * 0), X6); xor_src_dst(DST, SRC, (64 * 7 + 16 * 0), X7); xor_src_dst(DST, SRC, (64 * 0 + 16 * 2), X8); xor_src_dst(DST, SRC, (64 * 1 + 16 * 2), X9); xor_src_dst(DST, SRC, (64 * 2 + 16 * 2), X10); xor_src_dst(DST, SRC, (64 * 3 + 16 * 2), X11); xor_src_dst(DST, SRC, (64 * 4 + 16 * 2), X12); xor_src_dst(DST, SRC, (64 * 5 + 16 * 2), X13); xor_src_dst(DST, SRC, (64 * 6 + 16 * 2), X14); xor_src_dst(DST, SRC, (64 * 7 + 16 * 2), X15); subq $8, (STACK_MAX + 7 * 8)(%rsp); # NBLKS lea (8 * 64)(DST), DST; lea (8 * 64)(SRC), SRC; movq SRC, (STACK_MAX + 5 * 8)(%rsp); movq DST, (STACK_MAX + 6 * 8)(%rsp); jnz .Loop_poly8; /* Store state */ POLY1305_STORE_STATE(); /* clear the used vector registers and stack */ vpxor X0, X0, X0; vmovdqa X0, (STACK_VEC_X12)(%rsp); vmovdqa X0, (STACK_VEC_X13)(%rsp); vmovdqa X0, (STACK_TMP)(%rsp); vmovdqa X0, (STACK_TMP1)(%rsp); vzeroall; movq (STACK_MAX + 0 * 8)(%rsp), %rbx; movq (STACK_MAX + 1 * 8)(%rsp), %r12; movq (STACK_MAX + 2 * 8)(%rsp), %r13; movq (STACK_MAX + 3 * 8)(%rsp), %r14; movq (STACK_MAX + 4 * 8)(%rsp), %r15; CFI_RESTORE(%rbx); CFI_RESTORE(%r12); CFI_RESTORE(%r13); CFI_RESTORE(%r14); CFI_RESTORE(%r15); xorl %eax, %eax; leave; CFI_LEAVE(); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_chacha20_poly1305_amd64_avx2_blocks8, .-_gcry_chacha20_poly1305_amd64_avx2_blocks8;) #endif /*defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)*/ #endif /*__x86_64*/ diff --git a/cipher/chacha20-amd64-ssse3.S b/cipher/chacha20-amd64-ssse3.S index 9cdb69ae..6c737978 100644 --- a/cipher/chacha20-amd64-ssse3.S +++ b/cipher/chacha20-amd64-ssse3.S @@ -1,1012 +1,1012 @@ /* chacha20-amd64-ssse3.S - SSSE3 implementation of ChaCha20 cipher * * Copyright (C) 2017-2019 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . */ /* * Based on D. J. Bernstein reference implementation at * http://cr.yp.to/chacha.html: * * chacha-regs.c version 20080118 * D. J. Bernstein * Public domain. 
*/ #ifdef __x86_64 #include #if defined(HAVE_GCC_INLINE_ASM_SSSE3) && \ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) .text #include "asm-common-amd64.h" #include "asm-poly1305-amd64.h" /* register macros */ #define INPUT %rdi #define DST %rsi #define SRC %rdx #define NBLKS %rcx #define ROUND %eax /* stack structure */ #define STACK_VEC_X12 (16) #define STACK_VEC_X13 (16 + STACK_VEC_X12) #define STACK_TMP (16 + STACK_VEC_X13) #define STACK_TMP1 (16 + STACK_TMP) #define STACK_TMP2 (16 + STACK_TMP1) #define STACK_MAX (16 + STACK_TMP2) /* vector registers */ #define X0 %xmm0 #define X1 %xmm1 #define X2 %xmm2 #define X3 %xmm3 #define X4 %xmm4 #define X5 %xmm5 #define X6 %xmm6 #define X7 %xmm7 #define X8 %xmm8 #define X9 %xmm9 #define X10 %xmm10 #define X11 %xmm11 #define X12 %xmm12 #define X13 %xmm13 #define X14 %xmm14 #define X15 %xmm15 /********************************************************************** helper macros **********************************************************************/ /* 4x4 32-bit integer matrix transpose */ #define transpose_4x4(x0, x1, x2, x3, t1, t2, t3) \ movdqa x0, t2; \ punpckhdq x1, t2; \ punpckldq x1, x0; \ \ movdqa x2, t1; \ punpckldq x3, t1; \ punpckhdq x3, x2; \ \ movdqa x0, x1; \ punpckhqdq t1, x1; \ punpcklqdq t1, x0; \ \ movdqa t2, x3; \ punpckhqdq x2, x3; \ punpcklqdq x2, t2; \ movdqa t2, x2; /* fill xmm register with 32-bit value from memory */ #define pbroadcastd(mem32, xreg) \ movd mem32, xreg; \ pshufd $0, xreg, xreg; /* xor with unaligned memory operand */ #define pxor_u(umem128, xreg, t) \ movdqu umem128, t; \ pxor t, xreg; /* xor register with unaligned src and save to unaligned dst */ #define xor_src_dst(dst, src, offset, xreg, t) \ pxor_u(offset(src), xreg, t); \ movdqu xreg, offset(dst); #define clear(x) pxor x,x; /********************************************************************** 4-way chacha20 **********************************************************************/ #define ROTATE2(v1,v2,c,tmp1,tmp2) \ movdqa v1, tmp1; \ movdqa v2, tmp2; \ psrld $(32 - (c)), v1; \ pslld $(c), tmp1; \ paddb tmp1, v1; \ psrld $(32 - (c)), v2; \ pslld $(c), tmp2; \ paddb tmp2, v2; #define ROTATE_SHUF_2(v1,v2,shuf) \ pshufb shuf, v1; \ pshufb shuf, v2; #define XOR(ds,s) \ pxor s, ds; #define PLUS(ds,s) \ paddd s, ds; #define QUARTERROUND2(a1,b1,c1,d1,a2,b2,c2,d2,ign,tmp1,tmp2,\ interleave_op1,interleave_op2) \ movdqa .Lshuf_rol16 rRIP, tmp1; \ interleave_op1; \ PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2); \ ROTATE_SHUF_2(d1, d2, tmp1); \ PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \ ROTATE2(b1, b2, 12, tmp1, tmp2); \ movdqa .Lshuf_rol8 rRIP, tmp1; \ interleave_op2; \ PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2); \ ROTATE_SHUF_2(d1, d2, tmp1); \ PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \ ROTATE2(b1, b2, 7, tmp1, tmp2); chacha20_data: .align 16 .Lshuf_rol16: .byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 .Lshuf_rol8: .byte 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14 .Lcounter1: .long 1,0,0,0 .Linc_counter: .long 0,1,2,3 .Lunsigned_cmp: .long 0x80000000,0x80000000,0x80000000,0x80000000 .align 8 .globl _gcry_chacha20_amd64_ssse3_blocks4 ELF(.type _gcry_chacha20_amd64_ssse3_blocks4,@function;) _gcry_chacha20_amd64_ssse3_blocks4: /* input: * %rdi: input * %rsi: dst * %rdx: src * %rcx: nblks (multiple of 4) */ CFI_STARTPROC(); pushq %rbp; CFI_PUSH(%rbp); movq %rsp, %rbp; CFI_DEF_CFA_REGISTER(%rbp); subq $STACK_MAX, %rsp; andq $~15, %rsp; .Loop4: mov $20, ROUND; /* Construct counter vectors X12 and 
X13 */ movdqa .Linc_counter rRIP, X0; movdqa .Lunsigned_cmp rRIP, X2; pbroadcastd((12 * 4)(INPUT), X12); pbroadcastd((13 * 4)(INPUT), X13); paddd X0, X12; movdqa X12, X1; pxor X2, X0; pxor X2, X1; pcmpgtd X1, X0; psubd X0, X13; movdqa X12, (STACK_VEC_X12)(%rsp); movdqa X13, (STACK_VEC_X13)(%rsp); /* Load vectors */ pbroadcastd((0 * 4)(INPUT), X0); pbroadcastd((1 * 4)(INPUT), X1); pbroadcastd((2 * 4)(INPUT), X2); pbroadcastd((3 * 4)(INPUT), X3); pbroadcastd((4 * 4)(INPUT), X4); pbroadcastd((5 * 4)(INPUT), X5); pbroadcastd((6 * 4)(INPUT), X6); pbroadcastd((7 * 4)(INPUT), X7); pbroadcastd((8 * 4)(INPUT), X8); pbroadcastd((9 * 4)(INPUT), X9); pbroadcastd((10 * 4)(INPUT), X10); pbroadcastd((11 * 4)(INPUT), X11); pbroadcastd((14 * 4)(INPUT), X14); pbroadcastd((15 * 4)(INPUT), X15); movdqa X11, (STACK_TMP)(%rsp); movdqa X15, (STACK_TMP1)(%rsp); .Lround2_4: QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15,,) movdqa (STACK_TMP)(%rsp), X11; movdqa (STACK_TMP1)(%rsp), X15; movdqa X8, (STACK_TMP)(%rsp); movdqa X9, (STACK_TMP1)(%rsp); QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9,,) QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9,,) movdqa (STACK_TMP)(%rsp), X8; movdqa (STACK_TMP1)(%rsp), X9; movdqa X11, (STACK_TMP)(%rsp); movdqa X15, (STACK_TMP1)(%rsp); QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15,,) sub $2, ROUND; jnz .Lround2_4; /* tmp := X15 */ movdqa (STACK_TMP)(%rsp), X11; pbroadcastd((0 * 4)(INPUT), X15); PLUS(X0, X15); pbroadcastd((1 * 4)(INPUT), X15); PLUS(X1, X15); pbroadcastd((2 * 4)(INPUT), X15); PLUS(X2, X15); pbroadcastd((3 * 4)(INPUT), X15); PLUS(X3, X15); pbroadcastd((4 * 4)(INPUT), X15); PLUS(X4, X15); pbroadcastd((5 * 4)(INPUT), X15); PLUS(X5, X15); pbroadcastd((6 * 4)(INPUT), X15); PLUS(X6, X15); pbroadcastd((7 * 4)(INPUT), X15); PLUS(X7, X15); pbroadcastd((8 * 4)(INPUT), X15); PLUS(X8, X15); pbroadcastd((9 * 4)(INPUT), X15); PLUS(X9, X15); pbroadcastd((10 * 4)(INPUT), X15); PLUS(X10, X15); pbroadcastd((11 * 4)(INPUT), X15); PLUS(X11, X15); movdqa (STACK_VEC_X12)(%rsp), X15; PLUS(X12, X15); movdqa (STACK_VEC_X13)(%rsp), X15; PLUS(X13, X15); movdqa X13, (STACK_TMP)(%rsp); pbroadcastd((14 * 4)(INPUT), X15); PLUS(X14, X15); movdqa (STACK_TMP1)(%rsp), X15; movdqa X14, (STACK_TMP1)(%rsp); pbroadcastd((15 * 4)(INPUT), X13); PLUS(X15, X13); movdqa X15, (STACK_TMP2)(%rsp); /* Update counter */ addq $4, (12 * 4)(INPUT); transpose_4x4(X0, X1, X2, X3, X13, X14, X15); xor_src_dst(DST, SRC, (64 * 0 + 16 * 0), X0, X15); xor_src_dst(DST, SRC, (64 * 1 + 16 * 0), X1, X15); xor_src_dst(DST, SRC, (64 * 2 + 16 * 0), X2, X15); xor_src_dst(DST, SRC, (64 * 3 + 16 * 0), X3, X15); transpose_4x4(X4, X5, X6, X7, X0, X1, X2); movdqa (STACK_TMP)(%rsp), X13; movdqa (STACK_TMP1)(%rsp), X14; movdqa (STACK_TMP2)(%rsp), X15; xor_src_dst(DST, SRC, (64 * 0 + 16 * 1), X4, X0); xor_src_dst(DST, SRC, (64 * 1 + 16 * 1), X5, X0); xor_src_dst(DST, SRC, (64 * 2 + 16 * 1), X6, X0); xor_src_dst(DST, SRC, (64 * 3 + 16 * 1), X7, X0); transpose_4x4(X8, X9, X10, X11, X0, X1, X2); xor_src_dst(DST, SRC, (64 * 0 + 16 * 2), X8, X0); xor_src_dst(DST, SRC, (64 * 1 + 16 * 2), X9, X0); xor_src_dst(DST, SRC, (64 * 2 + 16 * 2), X10, X0); xor_src_dst(DST, SRC, (64 * 3 + 16 * 2), X11, X0); transpose_4x4(X12, X13, X14, X15, X0, X1, X2); xor_src_dst(DST, SRC, (64 * 0 + 16 * 3), X12, X0); xor_src_dst(DST, SRC, (64 * 1 + 16 * 3), X13, X0); xor_src_dst(DST, SRC, (64 * 2 + 16 * 3), X14, X0); xor_src_dst(DST, SRC, (64 * 3 + 16 * 3), X15, X0); sub $4, NBLKS; lea (4 * 64)(DST), DST; lea (4 * 
64)(SRC), SRC; jnz .Loop4; /* clear the used vector registers and stack */ clear(X0); movdqa X0, (STACK_VEC_X12)(%rsp); movdqa X0, (STACK_VEC_X13)(%rsp); movdqa X0, (STACK_TMP)(%rsp); movdqa X0, (STACK_TMP1)(%rsp); movdqa X0, (STACK_TMP2)(%rsp); clear(X1); clear(X2); clear(X3); clear(X4); clear(X5); clear(X6); clear(X7); clear(X8); clear(X9); clear(X10); clear(X11); clear(X12); clear(X13); clear(X14); clear(X15); /* eax zeroed by round loop. */ leave; CFI_LEAVE(); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_chacha20_amd64_ssse3_blocks4, .-_gcry_chacha20_amd64_ssse3_blocks4;) /********************************************************************** 2-way && 1-way chacha20 **********************************************************************/ #define ROTATE_SHUF(v1,shuf) \ pshufb shuf, v1; #define ROTATE(v1,c,tmp1) \ movdqa v1, tmp1; \ psrld $(32 - (c)), v1; \ pslld $(c), tmp1; \ paddb tmp1, v1; #define WORD_SHUF(v1,shuf) \ pshufd $shuf, v1, v1; #define QUARTERROUND4(x0,x1,x2,x3,shuf_rol8,shuf_rol16,tmp1,shuf_x1,\ shuf_x2,shuf_x3) \ PLUS(x0, x1); XOR(x3, x0); ROTATE_SHUF(x3, shuf_rol16); \ PLUS(x2, x3); XOR(x1, x2); ROTATE(x1, 12, tmp1); \ PLUS(x0, x1); XOR(x3, x0); ROTATE_SHUF(x3, shuf_rol8); \ PLUS(x2, x3); \ WORD_SHUF(x3, shuf_x3); \ XOR(x1, x2); \ WORD_SHUF(x2, shuf_x2); \ ROTATE(x1, 7, tmp1); \ WORD_SHUF(x1, shuf_x1); .align 8 .globl _gcry_chacha20_amd64_ssse3_blocks1 ELF(.type _gcry_chacha20_amd64_ssse3_blocks1,@function;) _gcry_chacha20_amd64_ssse3_blocks1: /* input: * %rdi: input * %rsi: dst * %rdx: src * %rcx: nblks */ CFI_STARTPROC(); /* Load constants */ movdqa .Lcounter1 rRIP, X4; movdqa .Lshuf_rol8 rRIP, X5; movdqa .Lshuf_rol16 rRIP, X6; /* Load state */ movdqu (0 * 4)(INPUT), X10; movdqu (4 * 4)(INPUT), X11; movdqu (8 * 4)(INPUT), X12; movdqu (12 * 4)(INPUT), X13; cmp $2, NBLKS; jb .Loop1; mov $20, ROUND; movdqa X10, X0; movdqa X11, X1; movdqa X12, X2; movdqa X13, X3; movdqa X10, X8; movdqa X11, X9; movdqa X12, X14; movdqa X13, X15; paddq X4, X15; .Lround2_2: QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93); QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x39, 0x4e, 0x93); QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39); QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x93, 0x4e, 0x39); sub $2, ROUND; jnz .Lround2_2; PLUS(X0, X10); PLUS(X1, X11); PLUS(X2, X12); PLUS(X3, X13); /* Update counter */ paddq X4, X13; PLUS(X8, X10); PLUS(X9, X11); PLUS(X14, X12); PLUS(X15, X13); /* Update counter */ paddq X4, X13; xor_src_dst(DST, SRC, 0 * 4, X0, X7); xor_src_dst(DST, SRC, 4 * 4, X1, X7); xor_src_dst(DST, SRC, 8 * 4, X2, X7); xor_src_dst(DST, SRC, 12 * 4, X3, X7); xor_src_dst(DST, SRC, 16 * 4, X8, X7); xor_src_dst(DST, SRC, 20 * 4, X9, X7); xor_src_dst(DST, SRC, 24 * 4, X14, X7); xor_src_dst(DST, SRC, 28 * 4, X15, X7); lea (2 * 64)(DST), DST; lea (2 * 64)(SRC), SRC; clear(X8); clear(X9); clear(X14); clear(X15); sub $2, NBLKS; jz .Ldone1; .Loop1: mov $20, ROUND; movdqa X10, X0; movdqa X11, X1; movdqa X12, X2; movdqa X13, X3; .Lround2_1: QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93); QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39); sub $2, ROUND; jnz .Lround2_1; PLUS(X0, X10); PLUS(X1, X11); PLUS(X2, X12); PLUS(X3, X13); /* Update counter */ paddq X4, X13; xor_src_dst(DST, SRC, 0 * 4, X0, X7); xor_src_dst(DST, SRC, 4 * 4, X1, X7); xor_src_dst(DST, SRC, 8 * 4, X2, X7); xor_src_dst(DST, SRC, 12 * 4, X3, X7); lea (64)(DST), DST; lea (64)(SRC), SRC; sub $1, NBLKS; jnz .Loop1; .Ldone1: /* Store counter */ movdqu X13, (12 * 4)(INPUT); /* 
clear the used vector registers */ clear(X0); clear(X1); clear(X2); clear(X3); clear(X4); clear(X5); clear(X6); clear(X7); clear(X10); clear(X11); clear(X12); clear(X13); /* eax zeroed by round loop. */ - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_chacha20_amd64_ssse3_blocks1, .-_gcry_chacha20_amd64_ssse3_blocks1;) /********************************************************************** 4-way stitched chacha20-poly1305 **********************************************************************/ #define _ /*_*/ .align 8 .globl _gcry_chacha20_poly1305_amd64_ssse3_blocks4 ELF(.type _gcry_chacha20_poly1305_amd64_ssse3_blocks4,@function;) _gcry_chacha20_poly1305_amd64_ssse3_blocks4: /* input: * %rdi: input * %rsi: dst * %rdx: src * %rcx: nblks (multiple of 4) * %r9: poly1305-state * %r8: poly1305-src */ CFI_STARTPROC(); pushq %rbp; CFI_PUSH(%rbp); movq %rsp, %rbp; CFI_DEF_CFA_REGISTER(%rbp); subq $(9 * 8) + STACK_MAX + 16, %rsp; andq $~15, %rsp; movq %rbx, (STACK_MAX + 0 * 8)(%rsp); movq %r12, (STACK_MAX + 1 * 8)(%rsp); movq %r13, (STACK_MAX + 2 * 8)(%rsp); movq %r14, (STACK_MAX + 3 * 8)(%rsp); movq %r15, (STACK_MAX + 4 * 8)(%rsp); CFI_REG_ON_STACK(rbx, STACK_MAX + 0 * 8); CFI_REG_ON_STACK(r12, STACK_MAX + 1 * 8); CFI_REG_ON_STACK(r13, STACK_MAX + 2 * 8); CFI_REG_ON_STACK(r14, STACK_MAX + 3 * 8); CFI_REG_ON_STACK(r15, STACK_MAX + 4 * 8); movq %rdx, (STACK_MAX + 5 * 8)(%rsp); # SRC movq %rsi, (STACK_MAX + 6 * 8)(%rsp); # DST movq %rcx, (STACK_MAX + 7 * 8)(%rsp); # NBLKS /* Load state */ POLY1305_LOAD_STATE(); .Loop_poly4: /* Construct counter vectors X12 and X13 */ movdqa .Linc_counter rRIP, X0; movdqa .Lunsigned_cmp rRIP, X2; pbroadcastd((12 * 4)(INPUT), X12); pbroadcastd((13 * 4)(INPUT), X13); paddd X0, X12; movdqa X12, X1; pxor X2, X0; pxor X2, X1; pcmpgtd X1, X0; psubd X0, X13; movdqa X12, (STACK_VEC_X12)(%rsp); movdqa X13, (STACK_VEC_X13)(%rsp); /* Load vectors */ pbroadcastd((0 * 4)(INPUT), X0); pbroadcastd((1 * 4)(INPUT), X1); pbroadcastd((2 * 4)(INPUT), X2); pbroadcastd((3 * 4)(INPUT), X3); pbroadcastd((4 * 4)(INPUT), X4); pbroadcastd((5 * 4)(INPUT), X5); pbroadcastd((6 * 4)(INPUT), X6); pbroadcastd((7 * 4)(INPUT), X7); pbroadcastd((8 * 4)(INPUT), X8); pbroadcastd((9 * 4)(INPUT), X9); pbroadcastd((10 * 4)(INPUT), X10); pbroadcastd((11 * 4)(INPUT), X11); pbroadcastd((14 * 4)(INPUT), X14); pbroadcastd((15 * 4)(INPUT), X15); movdqa X11, (STACK_TMP)(%rsp); movdqa X15, (STACK_TMP1)(%rsp); /* Process four ChaCha20 blocks and sixteen Poly1305 blocks. 
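The Poly1305 computation is split into the POLY1305_BLOCK_PART1..PART5 pieces and interleaved ("stitched") between the ChaCha20 quarter-rounds below, so the scalar multiply chain and the SIMD rounds can make progress in parallel.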
*/ movl $20, (STACK_MAX + 8 * 8 + 4)(%rsp); .Lround4_with_poly1305_outer: movl $6, (STACK_MAX + 8 * 8)(%rsp); .Lround4_with_poly1305_inner1: /* rounds 0-5 & 10-15 */ POLY1305_BLOCK_PART1(0 * 16) QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15, POLY1305_BLOCK_PART2(), POLY1305_BLOCK_PART3()) movdqa (STACK_TMP)(%rsp), X11; movdqa (STACK_TMP1)(%rsp), X15; movdqa X8, (STACK_TMP)(%rsp); movdqa X9, (STACK_TMP1)(%rsp); QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9, POLY1305_BLOCK_PART4(), POLY1305_BLOCK_PART5()) POLY1305_BLOCK_PART1(1 * 16) lea (2 * 16)(POLY_RSRC), POLY_RSRC; QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9, POLY1305_BLOCK_PART2(), POLY1305_BLOCK_PART3()) movdqa (STACK_TMP)(%rsp), X8; movdqa (STACK_TMP1)(%rsp), X9; movdqa X11, (STACK_TMP)(%rsp); movdqa X15, (STACK_TMP1)(%rsp); QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15, POLY1305_BLOCK_PART4(), POLY1305_BLOCK_PART5()) subl $2, (STACK_MAX + 8 * 8)(%rsp); jnz .Lround4_with_poly1305_inner1; movl $4, (STACK_MAX + 8 * 8)(%rsp); .Lround4_with_poly1305_inner2: /* rounds 6-9 & 16-19 */ POLY1305_BLOCK_PART1(0 * 16) lea (1 * 16)(POLY_RSRC), POLY_RSRC; QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15, POLY1305_BLOCK_PART2(), _) movdqa (STACK_TMP)(%rsp), X11; movdqa (STACK_TMP1)(%rsp), X15; movdqa X8, (STACK_TMP)(%rsp); movdqa X9, (STACK_TMP1)(%rsp); QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9, POLY1305_BLOCK_PART3(), _) QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9, POLY1305_BLOCK_PART4(), _) movdqa (STACK_TMP)(%rsp), X8; movdqa (STACK_TMP1)(%rsp), X9; movdqa X11, (STACK_TMP)(%rsp); movdqa X15, (STACK_TMP1)(%rsp); QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15, POLY1305_BLOCK_PART5(), _) subl $2, (STACK_MAX + 8 * 8)(%rsp); jnz .Lround4_with_poly1305_inner2; subl $10, (STACK_MAX + 8 * 8 + 4)(%rsp); jnz .Lround4_with_poly1305_outer; /* tmp := X15 */ movdqa (STACK_TMP)(%rsp), X11; pbroadcastd((0 * 4)(INPUT), X15); PLUS(X0, X15); pbroadcastd((1 * 4)(INPUT), X15); PLUS(X1, X15); pbroadcastd((2 * 4)(INPUT), X15); PLUS(X2, X15); pbroadcastd((3 * 4)(INPUT), X15); PLUS(X3, X15); pbroadcastd((4 * 4)(INPUT), X15); PLUS(X4, X15); pbroadcastd((5 * 4)(INPUT), X15); PLUS(X5, X15); pbroadcastd((6 * 4)(INPUT), X15); PLUS(X6, X15); pbroadcastd((7 * 4)(INPUT), X15); PLUS(X7, X15); pbroadcastd((8 * 4)(INPUT), X15); PLUS(X8, X15); pbroadcastd((9 * 4)(INPUT), X15); PLUS(X9, X15); pbroadcastd((10 * 4)(INPUT), X15); PLUS(X10, X15); pbroadcastd((11 * 4)(INPUT), X15); PLUS(X11, X15); movdqa (STACK_VEC_X12)(%rsp), X15; PLUS(X12, X15); movdqa (STACK_VEC_X13)(%rsp), X15; PLUS(X13, X15); movdqa X13, (STACK_TMP)(%rsp); pbroadcastd((14 * 4)(INPUT), X15); PLUS(X14, X15); movdqa (STACK_TMP1)(%rsp), X15; movdqa X14, (STACK_TMP1)(%rsp); pbroadcastd((15 * 4)(INPUT), X13); PLUS(X15, X13); movdqa X15, (STACK_TMP2)(%rsp); /* Update counter */ addq $4, (12 * 4)(INPUT); movq (STACK_MAX + 5 * 8)(%rsp), SRC; movq (STACK_MAX + 6 * 8)(%rsp), DST; transpose_4x4(X0, X1, X2, X3, X13, X14, X15); xor_src_dst(DST, SRC, (64 * 0 + 16 * 0), X0, X15); xor_src_dst(DST, SRC, (64 * 1 + 16 * 0), X1, X15); xor_src_dst(DST, SRC, (64 * 2 + 16 * 0), X2, X15); xor_src_dst(DST, SRC, (64 * 3 + 16 * 0), X3, X15); transpose_4x4(X4, X5, X6, X7, X0, X1, X2); movdqa (STACK_TMP)(%rsp), X13; movdqa (STACK_TMP1)(%rsp), X14; movdqa (STACK_TMP2)(%rsp), X15; xor_src_dst(DST, SRC, (64 * 0 + 16 * 1), X4, X0); xor_src_dst(DST, SRC, (64 * 1 + 16 * 1), X5, X0); xor_src_dst(DST, SRC, (64 * 2 + 16 * 1), 
X6, X0); xor_src_dst(DST, SRC, (64 * 3 + 16 * 1), X7, X0); transpose_4x4(X8, X9, X10, X11, X0, X1, X2); xor_src_dst(DST, SRC, (64 * 0 + 16 * 2), X8, X0); xor_src_dst(DST, SRC, (64 * 1 + 16 * 2), X9, X0); xor_src_dst(DST, SRC, (64 * 2 + 16 * 2), X10, X0); xor_src_dst(DST, SRC, (64 * 3 + 16 * 2), X11, X0); transpose_4x4(X12, X13, X14, X15, X0, X1, X2); xor_src_dst(DST, SRC, (64 * 0 + 16 * 3), X12, X0); xor_src_dst(DST, SRC, (64 * 1 + 16 * 3), X13, X0); xor_src_dst(DST, SRC, (64 * 2 + 16 * 3), X14, X0); xor_src_dst(DST, SRC, (64 * 3 + 16 * 3), X15, X0); subq $4, (STACK_MAX + 7 * 8)(%rsp); # NBLKS lea (4 * 64)(DST), DST; lea (4 * 64)(SRC), SRC; movq SRC, (STACK_MAX + 5 * 8)(%rsp); movq DST, (STACK_MAX + 6 * 8)(%rsp); jnz .Loop_poly4; /* Store state */ POLY1305_STORE_STATE(); /* clear the used vector registers and stack */ clear(X0); movdqa X0, (STACK_VEC_X12)(%rsp); movdqa X0, (STACK_VEC_X13)(%rsp); movdqa X0, (STACK_TMP)(%rsp); movdqa X0, (STACK_TMP1)(%rsp); movdqa X0, (STACK_TMP2)(%rsp); clear(X1); clear(X2); clear(X3); clear(X4); clear(X5); clear(X6); clear(X7); clear(X8); clear(X9); clear(X10); clear(X11); clear(X12); clear(X13); clear(X14); clear(X15); movq (STACK_MAX + 0 * 8)(%rsp), %rbx; movq (STACK_MAX + 1 * 8)(%rsp), %r12; movq (STACK_MAX + 2 * 8)(%rsp), %r13; movq (STACK_MAX + 3 * 8)(%rsp), %r14; movq (STACK_MAX + 4 * 8)(%rsp), %r15; CFI_RESTORE(%rbx); CFI_RESTORE(%r12); CFI_RESTORE(%r13); CFI_RESTORE(%r14); CFI_RESTORE(%r15); xorl %eax, %eax; leave; CFI_LEAVE(); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_chacha20_poly1305_amd64_ssse3_blocks4, .-_gcry_chacha20_poly1305_amd64_ssse3_blocks4;) /********************************************************************** 2-way && 1-way stitched chacha20-poly1305 **********************************************************************/ .align 8 .globl _gcry_chacha20_poly1305_amd64_ssse3_blocks1 ELF(.type _gcry_chacha20_poly1305_amd64_ssse3_blocks1,@function;) _gcry_chacha20_poly1305_amd64_ssse3_blocks1: /* input: * %rdi: chacha20-state * %rsi: dst * %rdx: src * %rcx: nblks * %r9: poly1305-state * %r8: poly1305-src */ CFI_STARTPROC(); pushq %rbp; CFI_PUSH(%rbp); movq %rsp, %rbp; CFI_DEF_CFA_REGISTER(%rbp); subq $(9 * 8), %rsp; movq %rbx, (0 * 8)(%rsp); movq %r12, (1 * 8)(%rsp); movq %r13, (2 * 8)(%rsp); movq %r14, (3 * 8)(%rsp); movq %r15, (4 * 8)(%rsp); CFI_REG_ON_STACK(rbx, 0 * 8); CFI_REG_ON_STACK(r12, 1 * 8); CFI_REG_ON_STACK(r13, 2 * 8); CFI_REG_ON_STACK(r14, 3 * 8); CFI_REG_ON_STACK(r15, 4 * 8); movq %rdx, (5 * 8)(%rsp); # SRC movq %rsi, (6 * 8)(%rsp); # DST movq %rcx, (7 * 8)(%rsp); # NBLKS /* Load constants */ movdqa .Lcounter1 rRIP, X4; movdqa .Lshuf_rol8 rRIP, X5; movdqa .Lshuf_rol16 rRIP, X6; /* Load state */ movdqu (0 * 4)(INPUT), X10; movdqu (4 * 4)(INPUT), X11; movdqu (8 * 4)(INPUT), X12; movdqu (12 * 4)(INPUT), X13; POLY1305_LOAD_STATE(); cmpq $2, (7 * 8)(%rsp); #NBLKS jb .Loop_poly1; movdqa X10, X0; movdqa X11, X1; movdqa X12, X2; movdqa X13, X3; movdqa X10, X8; movdqa X11, X9; movdqa X12, X14; movdqa X13, X15; paddq X4, X15; /* Process two ChaCha20 blocks and eight Poly1305 blocks. 
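As in the 4-way loop above, one Poly1305 block part is issued around each quarter-round of this two-block variant.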
*/ movl $20, (8 * 8 + 4)(%rsp); .Lround2_with_poly1305_outer: movl $8, (8 * 8)(%rsp); .Lround2_with_poly1305_inner: POLY1305_BLOCK_PART1(0 * 16); QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93); lea (1 * 16)(POLY_RSRC), POLY_RSRC; POLY1305_BLOCK_PART2(); QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x39, 0x4e, 0x93); POLY1305_BLOCK_PART3(); QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39); POLY1305_BLOCK_PART4(); QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x93, 0x4e, 0x39); POLY1305_BLOCK_PART5(); subl $2, (8 * 8)(%rsp); jnz .Lround2_with_poly1305_inner; QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93); QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x39, 0x4e, 0x93); QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39); QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x93, 0x4e, 0x39); subl $10, (8 * 8 + 4)(%rsp); jnz .Lround2_with_poly1305_outer; movq (5 * 8)(%rsp), SRC; movq (6 * 8)(%rsp), DST; PLUS(X0, X10); PLUS(X1, X11); PLUS(X2, X12); PLUS(X3, X13); /* Update counter */ paddq X4, X13; PLUS(X8, X10); PLUS(X9, X11); PLUS(X14, X12); PLUS(X15, X13); /* Update counter */ paddq X4, X13; xor_src_dst(DST, SRC, 0 * 4, X0, X7); xor_src_dst(DST, SRC, 4 * 4, X1, X7); xor_src_dst(DST, SRC, 8 * 4, X2, X7); xor_src_dst(DST, SRC, 12 * 4, X3, X7); xor_src_dst(DST, SRC, 16 * 4, X8, X7); xor_src_dst(DST, SRC, 20 * 4, X9, X7); xor_src_dst(DST, SRC, 24 * 4, X14, X7); xor_src_dst(DST, SRC, 28 * 4, X15, X7); clear(X8); clear(X9); clear(X14); clear(X15); subq $2, (7 * 8)(%rsp); # NBLKS lea (2 * 64)(SRC), SRC; lea (2 * 64)(DST), DST; movq SRC, (5 * 8)(%rsp); movq DST, (6 * 8)(%rsp); jz .Ldone_poly1; .Loop_poly1: movdqa X10, X0; movdqa X11, X1; movdqa X12, X2; movdqa X13, X3; /* Process one ChaCha20 block and four Poly1305 blocks. */ movl $20, (8 * 8 + 4)(%rsp); .Lround1_with_poly1305_outer: movl $8, (8 * 8)(%rsp); .Lround1_with_poly1305_inner: POLY1305_BLOCK_PART1(0 * 16); QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93); POLY1305_BLOCK_PART2(); QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39); lea (1 * 16)(POLY_RSRC), POLY_RSRC; POLY1305_BLOCK_PART3(); QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93); POLY1305_BLOCK_PART4(); QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39); POLY1305_BLOCK_PART5(); subl $4, (8 * 8)(%rsp); jnz .Lround1_with_poly1305_inner; QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93); QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39); subl $10, (8 * 8 + 4)(%rsp); jnz .Lround1_with_poly1305_outer; movq (5 * 8)(%rsp), SRC; movq (6 * 8)(%rsp), DST; PLUS(X0, X10); PLUS(X1, X11); PLUS(X2, X12); PLUS(X3, X13); /* Update counter */ paddq X4, X13; xor_src_dst(DST, SRC, 0 * 4, X0, X7); xor_src_dst(DST, SRC, 4 * 4, X1, X7); xor_src_dst(DST, SRC, 8 * 4, X2, X7); xor_src_dst(DST, SRC, 12 * 4, X3, X7); subq $1, (7 * 8)(%rsp); # NBLKS lea (64)(SRC), SRC; lea (64)(DST), DST; movq SRC, (5 * 8)(%rsp); movq DST, (6 * 8)(%rsp); jnz .Loop_poly1; .Ldone_poly1: /* Store state */ POLY1305_STORE_STATE(); movdqu X13, (12 * 4)(INPUT); /* clear the used vector registers */ clear(X0); clear(X1); clear(X2); clear(X3); clear(X4); clear(X5); clear(X6); clear(X7); clear(X10); clear(X11); clear(X12); clear(X13); movq (0 * 8)(%rsp), %rbx; movq (1 * 8)(%rsp), %r12; movq (2 * 8)(%rsp), %r13; movq (3 * 8)(%rsp), %r14; movq (4 * 8)(%rsp), %r15; CFI_RESTORE(%rbx); CFI_RESTORE(%r12); CFI_RESTORE(%r13); CFI_RESTORE(%r14); CFI_RESTORE(%r15); xorl %eax, %eax; leave; CFI_LEAVE(); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size 
_gcry_chacha20_poly1305_amd64_ssse3_blocks1, .-_gcry_chacha20_poly1305_amd64_ssse3_blocks1;) #endif /*defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)*/ #endif /*__x86_64*/ diff --git a/cipher/des-amd64.S b/cipher/des-amd64.S index a211dac3..c1bf9f29 100644 --- a/cipher/des-amd64.S +++ b/cipher/des-amd64.S @@ -1,1111 +1,1111 @@ /* des-amd64.S - AMD64 assembly implementation of 3DES cipher * * Copyright (C) 2014 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . */ #ifdef __x86_64 #include #if defined(USE_DES) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) #include "asm-common-amd64.h" .text #define s1 0 #define s2 ((s1) + (64*8)) #define s3 ((s2) + (64*8)) #define s4 ((s3) + (64*8)) #define s5 ((s4) + (64*8)) #define s6 ((s5) + (64*8)) #define s7 ((s6) + (64*8)) #define s8 ((s7) + (64*8)) /* register macros */ #define CTX %rdi #define SBOXES %rbp #define RL0 %r8 #define RL1 %r9 #define RL2 %r10 #define RL0d %r8d #define RL1d %r9d #define RL2d %r10d #define RR0 %r11 #define RR1 %r12 #define RR2 %r13 #define RR0d %r11d #define RR1d %r12d #define RR2d %r13d #define RW0 %rax #define RW1 %rbx #define RW2 %rcx #define RW0d %eax #define RW1d %ebx #define RW2d %ecx #define RW0bl %al #define RW1bl %bl #define RW2bl %cl #define RW0bh %ah #define RW1bh %bh #define RW2bh %ch #define RT0 %r15 #define RT1 %rsi #define RT2 %r14 #define RT3 %rdx #define RT0d %r15d #define RT1d %esi #define RT2d %r14d #define RT3d %edx /*********************************************************************** * 1-way 3DES ***********************************************************************/ #define do_permutation(a, b, offset, mask) \ movl a, RT0d; \ shrl $(offset), RT0d; \ xorl b, RT0d; \ andl $(mask), RT0d; \ xorl RT0d, b; \ shll $(offset), RT0d; \ xorl RT0d, a; #define expand_to_64bits(val, mask) \ movl val##d, RT0d; \ rorl $4, RT0d; \ shlq $32, RT0; \ orq RT0, val; \ andq mask, val; #define compress_to_64bits(val) \ movq val, RT0; \ shrq $32, RT0; \ roll $4, RT0d; \ orl RT0d, val##d; #define initial_permutation(left, right) \ do_permutation(left##d, right##d, 4, 0x0f0f0f0f); \ do_permutation(left##d, right##d, 16, 0x0000ffff); \ do_permutation(right##d, left##d, 2, 0x33333333); \ do_permutation(right##d, left##d, 8, 0x00ff00ff); \ movabs $0x3f3f3f3f3f3f3f3f, RT3; \ movl left##d, RW0d; \ roll $1, right##d; \ xorl right##d, RW0d; \ andl $0xaaaaaaaa, RW0d; \ xorl RW0d, left##d; \ xorl RW0d, right##d; \ roll $1, left##d; \ expand_to_64bits(right, RT3); \ expand_to_64bits(left, RT3); #define final_permutation(left, right) \ compress_to_64bits(right); \ compress_to_64bits(left); \ movl right##d, RW0d; \ rorl $1, left##d; \ xorl left##d, RW0d; \ andl $0xaaaaaaaa, RW0d; \ xorl RW0d, right##d; \ xorl RW0d, left##d; \ rorl $1, right##d; \ do_permutation(right##d, left##d, 8, 0x00ff00ff); \ do_permutation(right##d, left##d, 2, 0x33333333); \ do_permutation(left##d, 
right##d, 16, 0x0000ffff); \ do_permutation(left##d, right##d, 4, 0x0f0f0f0f); #define round1(n, from, to, load_next_key) \ xorq from, RW0; \ \ movzbl RW0bl, RT0d; \ movzbl RW0bh, RT1d; \ shrq $16, RW0; \ movzbl RW0bl, RT2d; \ movzbl RW0bh, RT3d; \ shrq $16, RW0; \ movq s8(SBOXES, RT0, 8), RT0; \ xorq s6(SBOXES, RT1, 8), to; \ movzbl RW0bl, RL1d; \ movzbl RW0bh, RT1d; \ shrl $16, RW0d; \ xorq s4(SBOXES, RT2, 8), RT0; \ xorq s2(SBOXES, RT3, 8), to; \ movzbl RW0bl, RT2d; \ movzbl RW0bh, RT3d; \ xorq s7(SBOXES, RL1, 8), RT0; \ xorq s5(SBOXES, RT1, 8), to; \ xorq s3(SBOXES, RT2, 8), RT0; \ load_next_key(n, RW0); \ xorq RT0, to; \ xorq s1(SBOXES, RT3, 8), to; \ #define load_next_key(n, RWx) \ movq (((n) + 1) * 8)(CTX), RWx; #define dummy2(a, b) /*_*/ #define read_block(io, left, right) \ movl (io), left##d; \ movl 4(io), right##d; \ bswapl left##d; \ bswapl right##d; #define write_block(io, left, right) \ bswapl left##d; \ bswapl right##d; \ movl left##d, (io); \ movl right##d, 4(io); .align 8 .globl _gcry_3des_amd64_crypt_block ELF(.type _gcry_3des_amd64_crypt_block,@function;) _gcry_3des_amd64_crypt_block: /* input: * %rdi: round keys, CTX * %rsi: dst * %rdx: src */ CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 pushq %rbp; CFI_PUSH(%rbp); pushq %rbx; CFI_PUSH(%rbx); pushq %r12; CFI_PUSH(%r12); pushq %r13; CFI_PUSH(%r13); pushq %r14; CFI_PUSH(%r14); pushq %r15; CFI_PUSH(%r15); pushq %rsi; /*dst*/ CFI_PUSH(%rsi); leaq .L_s1 rRIP, SBOXES; read_block(%rdx, RL0, RR0); initial_permutation(RL0, RR0); movq (CTX), RW0; round1(0, RR0, RL0, load_next_key); round1(1, RL0, RR0, load_next_key); round1(2, RR0, RL0, load_next_key); round1(3, RL0, RR0, load_next_key); round1(4, RR0, RL0, load_next_key); round1(5, RL0, RR0, load_next_key); round1(6, RR0, RL0, load_next_key); round1(7, RL0, RR0, load_next_key); round1(8, RR0, RL0, load_next_key); round1(9, RL0, RR0, load_next_key); round1(10, RR0, RL0, load_next_key); round1(11, RL0, RR0, load_next_key); round1(12, RR0, RL0, load_next_key); round1(13, RL0, RR0, load_next_key); round1(14, RR0, RL0, load_next_key); round1(15, RL0, RR0, load_next_key); round1(16+0, RL0, RR0, load_next_key); round1(16+1, RR0, RL0, load_next_key); round1(16+2, RL0, RR0, load_next_key); round1(16+3, RR0, RL0, load_next_key); round1(16+4, RL0, RR0, load_next_key); round1(16+5, RR0, RL0, load_next_key); round1(16+6, RL0, RR0, load_next_key); round1(16+7, RR0, RL0, load_next_key); round1(16+8, RL0, RR0, load_next_key); round1(16+9, RR0, RL0, load_next_key); round1(16+10, RL0, RR0, load_next_key); round1(16+11, RR0, RL0, load_next_key); round1(16+12, RL0, RR0, load_next_key); round1(16+13, RR0, RL0, load_next_key); round1(16+14, RL0, RR0, load_next_key); round1(16+15, RR0, RL0, load_next_key); round1(32+0, RR0, RL0, load_next_key); round1(32+1, RL0, RR0, load_next_key); round1(32+2, RR0, RL0, load_next_key); round1(32+3, RL0, RR0, load_next_key); round1(32+4, RR0, RL0, load_next_key); round1(32+5, RL0, RR0, load_next_key); round1(32+6, RR0, RL0, load_next_key); round1(32+7, RL0, RR0, load_next_key); round1(32+8, RR0, RL0, load_next_key); round1(32+9, RL0, RR0, load_next_key); round1(32+10, RR0, RL0, load_next_key); round1(32+11, RL0, RR0, load_next_key); round1(32+12, RR0, RL0, load_next_key); round1(32+13, RL0, RR0, load_next_key); round1(32+14, RR0, RL0, load_next_key); round1(32+15, RL0, RR0, dummy2); popq RW2; /*dst*/ CFI_POP_TMP_REG(); final_permutation(RR0, RL0); write_block(RW2, RR0, RL0); popq %r15; CFI_POP(%r15); popq %r14; CFI_POP(%r14); popq %r13; CFI_POP(%r13); popq %r12; 
CFI_POP(%r12); popq %rbx; CFI_POP(%rbx); popq %rbp; CFI_POP(%rbp); EXIT_SYSV_FUNC - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_3des_amd64_crypt_block,.-_gcry_3des_amd64_crypt_block;) /*********************************************************************** * 3-way 3DES ***********************************************************************/ #define expand_to_64bits(val, mask) \ movl val##d, RT0d; \ rorl $4, RT0d; \ shlq $32, RT0; \ orq RT0, val; \ andq mask, val; #define compress_to_64bits(val) \ movq val, RT0; \ shrq $32, RT0; \ roll $4, RT0d; \ orl RT0d, val##d; #define initial_permutation3(left, right) \ do_permutation(left##0d, right##0d, 4, 0x0f0f0f0f); \ do_permutation(left##0d, right##0d, 16, 0x0000ffff); \ do_permutation(left##1d, right##1d, 4, 0x0f0f0f0f); \ do_permutation(left##1d, right##1d, 16, 0x0000ffff); \ do_permutation(left##2d, right##2d, 4, 0x0f0f0f0f); \ do_permutation(left##2d, right##2d, 16, 0x0000ffff); \ \ do_permutation(right##0d, left##0d, 2, 0x33333333); \ do_permutation(right##0d, left##0d, 8, 0x00ff00ff); \ do_permutation(right##1d, left##1d, 2, 0x33333333); \ do_permutation(right##1d, left##1d, 8, 0x00ff00ff); \ do_permutation(right##2d, left##2d, 2, 0x33333333); \ do_permutation(right##2d, left##2d, 8, 0x00ff00ff); \ \ movabs $0x3f3f3f3f3f3f3f3f, RT3; \ \ movl left##0d, RW0d; \ roll $1, right##0d; \ xorl right##0d, RW0d; \ andl $0xaaaaaaaa, RW0d; \ xorl RW0d, left##0d; \ xorl RW0d, right##0d; \ roll $1, left##0d; \ expand_to_64bits(right##0, RT3); \ expand_to_64bits(left##0, RT3); \ movl left##1d, RW1d; \ roll $1, right##1d; \ xorl right##1d, RW1d; \ andl $0xaaaaaaaa, RW1d; \ xorl RW1d, left##1d; \ xorl RW1d, right##1d; \ roll $1, left##1d; \ expand_to_64bits(right##1, RT3); \ expand_to_64bits(left##1, RT3); \ movl left##2d, RW2d; \ roll $1, right##2d; \ xorl right##2d, RW2d; \ andl $0xaaaaaaaa, RW2d; \ xorl RW2d, left##2d; \ xorl RW2d, right##2d; \ roll $1, left##2d; \ expand_to_64bits(right##2, RT3); \ expand_to_64bits(left##2, RT3); #define final_permutation3(left, right) \ compress_to_64bits(right##0); \ compress_to_64bits(left##0); \ movl right##0d, RW0d; \ rorl $1, left##0d; \ xorl left##0d, RW0d; \ andl $0xaaaaaaaa, RW0d; \ xorl RW0d, right##0d; \ xorl RW0d, left##0d; \ rorl $1, right##0d; \ compress_to_64bits(right##1); \ compress_to_64bits(left##1); \ movl right##1d, RW1d; \ rorl $1, left##1d; \ xorl left##1d, RW1d; \ andl $0xaaaaaaaa, RW1d; \ xorl RW1d, right##1d; \ xorl RW1d, left##1d; \ rorl $1, right##1d; \ compress_to_64bits(right##2); \ compress_to_64bits(left##2); \ movl right##2d, RW2d; \ rorl $1, left##2d; \ xorl left##2d, RW2d; \ andl $0xaaaaaaaa, RW2d; \ xorl RW2d, right##2d; \ xorl RW2d, left##2d; \ rorl $1, right##2d; \ \ do_permutation(right##0d, left##0d, 8, 0x00ff00ff); \ do_permutation(right##0d, left##0d, 2, 0x33333333); \ do_permutation(right##1d, left##1d, 8, 0x00ff00ff); \ do_permutation(right##1d, left##1d, 2, 0x33333333); \ do_permutation(right##2d, left##2d, 8, 0x00ff00ff); \ do_permutation(right##2d, left##2d, 2, 0x33333333); \ \ do_permutation(left##0d, right##0d, 16, 0x0000ffff); \ do_permutation(left##0d, right##0d, 4, 0x0f0f0f0f); \ do_permutation(left##1d, right##1d, 16, 0x0000ffff); \ do_permutation(left##1d, right##1d, 4, 0x0f0f0f0f); \ do_permutation(left##2d, right##2d, 16, 0x0000ffff); \ do_permutation(left##2d, right##2d, 4, 0x0f0f0f0f); #define round3(n, from, to, load_next_key, do_movq) \ xorq from##0, RW0; \ movzbl RW0bl, RT3d; \ movzbl RW0bh, RT1d; \ shrq $16, RW0; \ xorq s8(SBOXES, RT3, 8), to##0; \ 
xorq s6(SBOXES, RT1, 8), to##0; \ movzbl RW0bl, RT3d; \ movzbl RW0bh, RT1d; \ shrq $16, RW0; \ xorq s4(SBOXES, RT3, 8), to##0; \ xorq s2(SBOXES, RT1, 8), to##0; \ movzbl RW0bl, RT3d; \ movzbl RW0bh, RT1d; \ shrl $16, RW0d; \ xorq s7(SBOXES, RT3, 8), to##0; \ xorq s5(SBOXES, RT1, 8), to##0; \ movzbl RW0bl, RT3d; \ movzbl RW0bh, RT1d; \ load_next_key(n, RW0); \ xorq s3(SBOXES, RT3, 8), to##0; \ xorq s1(SBOXES, RT1, 8), to##0; \ xorq from##1, RW1; \ movzbl RW1bl, RT3d; \ movzbl RW1bh, RT1d; \ shrq $16, RW1; \ xorq s8(SBOXES, RT3, 8), to##1; \ xorq s6(SBOXES, RT1, 8), to##1; \ movzbl RW1bl, RT3d; \ movzbl RW1bh, RT1d; \ shrq $16, RW1; \ xorq s4(SBOXES, RT3, 8), to##1; \ xorq s2(SBOXES, RT1, 8), to##1; \ movzbl RW1bl, RT3d; \ movzbl RW1bh, RT1d; \ shrl $16, RW1d; \ xorq s7(SBOXES, RT3, 8), to##1; \ xorq s5(SBOXES, RT1, 8), to##1; \ movzbl RW1bl, RT3d; \ movzbl RW1bh, RT1d; \ do_movq(RW0, RW1); \ xorq s3(SBOXES, RT3, 8), to##1; \ xorq s1(SBOXES, RT1, 8), to##1; \ xorq from##2, RW2; \ movzbl RW2bl, RT3d; \ movzbl RW2bh, RT1d; \ shrq $16, RW2; \ xorq s8(SBOXES, RT3, 8), to##2; \ xorq s6(SBOXES, RT1, 8), to##2; \ movzbl RW2bl, RT3d; \ movzbl RW2bh, RT1d; \ shrq $16, RW2; \ xorq s4(SBOXES, RT3, 8), to##2; \ xorq s2(SBOXES, RT1, 8), to##2; \ movzbl RW2bl, RT3d; \ movzbl RW2bh, RT1d; \ shrl $16, RW2d; \ xorq s7(SBOXES, RT3, 8), to##2; \ xorq s5(SBOXES, RT1, 8), to##2; \ movzbl RW2bl, RT3d; \ movzbl RW2bh, RT1d; \ do_movq(RW0, RW2); \ xorq s3(SBOXES, RT3, 8), to##2; \ xorq s1(SBOXES, RT1, 8), to##2; #define __movq(src, dst) \ movq src, dst; #define read_block(io, left, right) \ movl (io), left##d; \ movl 4(io), right##d; \ bswapl left##d; \ bswapl right##d; #define write_block(io, left, right) \ bswapl left##d; \ bswapl right##d; \ movl left##d, (io); \ movl right##d, 4(io); .align 8 ELF(.type _gcry_3des_amd64_crypt_blk3,@function;) _gcry_3des_amd64_crypt_blk3: /* input: * %rdi: round keys, CTX * RL0d, RR0d, RL1d, RR1d, RL2d, RR2d: 3 input blocks * RR0d, RL0d, RR1d, RL1d, RR2d, RL2d: 3 output blocks */ CFI_STARTPROC(); leaq .L_s1 rRIP, SBOXES; initial_permutation3(RL, RR); movq 0(CTX), RW0; movq RW0, RW1; movq RW0, RW2; round3(0, RR, RL, load_next_key, __movq); round3(1, RL, RR, load_next_key, __movq); round3(2, RR, RL, load_next_key, __movq); round3(3, RL, RR, load_next_key, __movq); round3(4, RR, RL, load_next_key, __movq); round3(5, RL, RR, load_next_key, __movq); round3(6, RR, RL, load_next_key, __movq); round3(7, RL, RR, load_next_key, __movq); round3(8, RR, RL, load_next_key, __movq); round3(9, RL, RR, load_next_key, __movq); round3(10, RR, RL, load_next_key, __movq); round3(11, RL, RR, load_next_key, __movq); round3(12, RR, RL, load_next_key, __movq); round3(13, RL, RR, load_next_key, __movq); round3(14, RR, RL, load_next_key, __movq); round3(15, RL, RR, load_next_key, __movq); round3(16+0, RL, RR, load_next_key, __movq); round3(16+1, RR, RL, load_next_key, __movq); round3(16+2, RL, RR, load_next_key, __movq); round3(16+3, RR, RL, load_next_key, __movq); round3(16+4, RL, RR, load_next_key, __movq); round3(16+5, RR, RL, load_next_key, __movq); round3(16+6, RL, RR, load_next_key, __movq); round3(16+7, RR, RL, load_next_key, __movq); round3(16+8, RL, RR, load_next_key, __movq); round3(16+9, RR, RL, load_next_key, __movq); round3(16+10, RL, RR, load_next_key, __movq); round3(16+11, RR, RL, load_next_key, __movq); round3(16+12, RL, RR, load_next_key, __movq); round3(16+13, RR, RL, load_next_key, __movq); round3(16+14, RL, RR, load_next_key, __movq); round3(16+15, RR, RL, load_next_key, __movq); 
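/* Rounds 32..47 below are the third and final 16-round DES pass; the last
 * round passes dummy2 so that no subkey is fetched past the end of the
 * key schedule. */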
round3(32+0, RR, RL, load_next_key, __movq); round3(32+1, RL, RR, load_next_key, __movq); round3(32+2, RR, RL, load_next_key, __movq); round3(32+3, RL, RR, load_next_key, __movq); round3(32+4, RR, RL, load_next_key, __movq); round3(32+5, RL, RR, load_next_key, __movq); round3(32+6, RR, RL, load_next_key, __movq); round3(32+7, RL, RR, load_next_key, __movq); round3(32+8, RR, RL, load_next_key, __movq); round3(32+9, RL, RR, load_next_key, __movq); round3(32+10, RR, RL, load_next_key, __movq); round3(32+11, RL, RR, load_next_key, __movq); round3(32+12, RR, RL, load_next_key, __movq); round3(32+13, RL, RR, load_next_key, __movq); round3(32+14, RR, RL, load_next_key, __movq); round3(32+15, RL, RR, dummy2, dummy2); final_permutation3(RR, RL); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_3des_amd64_crypt_blk3,.-_gcry_3des_amd64_crypt_blk3;) .align 8 .globl _gcry_3des_amd64_cbc_dec ELF(.type _gcry_3des_amd64_cbc_dec,@function;) _gcry_3des_amd64_cbc_dec: /* input: * %rdi: ctx, CTX * %rsi: dst (3 blocks) * %rdx: src (3 blocks) * %rcx: iv (64bit) */ CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 pushq %rbp; CFI_PUSH(%rbp); pushq %rbx; CFI_PUSH(%rbx); pushq %r12; CFI_PUSH(%r12); pushq %r13; CFI_PUSH(%r13); pushq %r14; CFI_PUSH(%r14); pushq %r15; CFI_PUSH(%r15); pushq %rsi; /*dst*/ CFI_PUSH(%rsi); pushq %rdx; /*src*/ CFI_PUSH(%rdx); pushq %rcx; /*iv*/ CFI_PUSH(%rcx); /* load input */ movl 0 * 4(%rdx), RL0d; movl 1 * 4(%rdx), RR0d; movl 2 * 4(%rdx), RL1d; movl 3 * 4(%rdx), RR1d; movl 4 * 4(%rdx), RL2d; movl 5 * 4(%rdx), RR2d; bswapl RL0d; bswapl RR0d; bswapl RL1d; bswapl RR1d; bswapl RL2d; bswapl RR2d; call _gcry_3des_amd64_crypt_blk3; popq %rcx; /*iv*/ CFI_POP_TMP_REG(); popq %rdx; /*src*/ CFI_POP_TMP_REG(); popq %rsi; /*dst*/ CFI_POP_TMP_REG(); bswapl RR0d; bswapl RL0d; bswapl RR1d; bswapl RL1d; bswapl RR2d; bswapl RL2d; movq 2 * 8(%rdx), RT0; xorl 0 * 4(%rcx), RR0d; xorl 1 * 4(%rcx), RL0d; xorl 0 * 4(%rdx), RR1d; xorl 1 * 4(%rdx), RL1d; xorl 2 * 4(%rdx), RR2d; xorl 3 * 4(%rdx), RL2d; movq RT0, (%rcx); /* store new IV */ movl RR0d, 0 * 4(%rsi); movl RL0d, 1 * 4(%rsi); movl RR1d, 2 * 4(%rsi); movl RL1d, 3 * 4(%rsi); movl RR2d, 4 * 4(%rsi); movl RL2d, 5 * 4(%rsi); popq %r15; CFI_POP(%r15); popq %r14; CFI_POP(%r14); popq %r13; CFI_POP(%r13); popq %r12; CFI_POP(%r12); popq %rbx; CFI_POP(%rbx); popq %rbp; CFI_POP(%rbp); EXIT_SYSV_FUNC - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_3des_amd64_cbc_dec,.-_gcry_3des_amd64_cbc_dec;) .align 8 .globl _gcry_3des_amd64_ctr_enc ELF(.type _gcry_3des_amd64_ctr_enc,@function;) _gcry_3des_amd64_ctr_enc: /* input: * %rdi: ctx, CTX * %rsi: dst (3 blocks) * %rdx: src (3 blocks) * %rcx: iv (64bit) */ CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 pushq %rbp; CFI_PUSH(%rbp); pushq %rbx; CFI_PUSH(%rbx); pushq %r12; CFI_PUSH(%r12); pushq %r13; CFI_PUSH(%r13); pushq %r14; CFI_PUSH(%r14); pushq %r15; CFI_PUSH(%r15); pushq %rsi; /*dst*/ CFI_PUSH(%rsi); pushq %rdx; /*src*/ CFI_PUSH(%rdx); movq %rcx, RW2; /* load IV and byteswap */ movq (RW2), RT0; bswapq RT0; movq RT0, RR0; /* construct IVs */ leaq 1(RT0), RR1; leaq 2(RT0), RR2; leaq 3(RT0), RT0; movq RR0, RL0; movq RR1, RL1; movq RR2, RL2; bswapq RT0; shrq $32, RL0; shrq $32, RL1; shrq $32, RL2; /* store new IV */ movq RT0, (RW2); call _gcry_3des_amd64_crypt_blk3; popq %rdx; /*src*/ CFI_POP_TMP_REG(); popq %rsi; /*dst*/ CFI_POP_TMP_REG(); bswapl RR0d; bswapl RL0d; bswapl RR1d; bswapl RL1d; bswapl RR2d; bswapl RL2d; xorl 0 * 4(%rdx), RR0d; xorl 1 * 4(%rdx), RL0d; xorl 2 * 4(%rdx), RR1d; xorl 3 * 4(%rdx), RL1d; xorl 4 * 
4(%rdx), RR2d; xorl 5 * 4(%rdx), RL2d; movl RR0d, 0 * 4(%rsi); movl RL0d, 1 * 4(%rsi); movl RR1d, 2 * 4(%rsi); movl RL1d, 3 * 4(%rsi); movl RR2d, 4 * 4(%rsi); movl RL2d, 5 * 4(%rsi); popq %r15; CFI_POP(%r15); popq %r14; CFI_POP(%r14); popq %r13; CFI_POP(%r13); popq %r12; CFI_POP(%r12); popq %rbx; CFI_POP(%rbx); popq %rbp; CFI_POP(%rbp); EXIT_SYSV_FUNC - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_3des_amd64_cbc_dec,.-_gcry_3des_amd64_cbc_dec;) .align 8 .globl _gcry_3des_amd64_cfb_dec ELF(.type _gcry_3des_amd64_cfb_dec,@function;) _gcry_3des_amd64_cfb_dec: /* input: * %rdi: ctx, CTX * %rsi: dst (3 blocks) * %rdx: src (3 blocks) * %rcx: iv (64bit) */ CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 pushq %rbp; CFI_PUSH(%rbp); pushq %rbx; CFI_PUSH(%rbx); pushq %r12; CFI_PUSH(%r12); pushq %r13; CFI_PUSH(%r13); pushq %r14; CFI_PUSH(%r14); pushq %r15; CFI_PUSH(%r15); pushq %rsi; /*dst*/ CFI_PUSH(%rsi); pushq %rdx; /*src*/ CFI_PUSH(%rdx); movq %rcx, RW2; /* Load input */ movl 0 * 4(RW2), RL0d; movl 1 * 4(RW2), RR0d; movl 0 * 4(%rdx), RL1d; movl 1 * 4(%rdx), RR1d; movl 2 * 4(%rdx), RL2d; movl 3 * 4(%rdx), RR2d; bswapl RL0d; bswapl RR0d; bswapl RL1d; bswapl RR1d; bswapl RL2d; bswapl RR2d; /* Update IV */ movq 4 * 4(%rdx), RW0; movq RW0, (RW2); call _gcry_3des_amd64_crypt_blk3; popq %rdx; /*src*/ CFI_POP_TMP_REG(); popq %rsi; /*dst*/ CFI_POP_TMP_REG(); bswapl RR0d; bswapl RL0d; bswapl RR1d; bswapl RL1d; bswapl RR2d; bswapl RL2d; xorl 0 * 4(%rdx), RR0d; xorl 1 * 4(%rdx), RL0d; xorl 2 * 4(%rdx), RR1d; xorl 3 * 4(%rdx), RL1d; xorl 4 * 4(%rdx), RR2d; xorl 5 * 4(%rdx), RL2d; movl RR0d, 0 * 4(%rsi); movl RL0d, 1 * 4(%rsi); movl RR1d, 2 * 4(%rsi); movl RL1d, 3 * 4(%rsi); movl RR2d, 4 * 4(%rsi); movl RL2d, 5 * 4(%rsi); popq %r15; CFI_POP(%r15); popq %r14; CFI_POP(%r14); popq %r13; CFI_POP(%r13); popq %r12; CFI_POP(%r12); popq %rbx; CFI_POP(%rbx); popq %rbp; CFI_POP(%rbp); EXIT_SYSV_FUNC - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_3des_amd64_cfb_dec,.-_gcry_3des_amd64_cfb_dec;) .align 16 .L_s1: .quad 0x0010100001010400, 0x0000000000000000 .quad 0x0000100000010000, 0x0010100001010404 .quad 0x0010100001010004, 0x0000100000010404 .quad 0x0000000000000004, 0x0000100000010000 .quad 0x0000000000000400, 0x0010100001010400 .quad 0x0010100001010404, 0x0000000000000400 .quad 0x0010000001000404, 0x0010100001010004 .quad 0x0010000001000000, 0x0000000000000004 .quad 0x0000000000000404, 0x0010000001000400 .quad 0x0010000001000400, 0x0000100000010400 .quad 0x0000100000010400, 0x0010100001010000 .quad 0x0010100001010000, 0x0010000001000404 .quad 0x0000100000010004, 0x0010000001000004 .quad 0x0010000001000004, 0x0000100000010004 .quad 0x0000000000000000, 0x0000000000000404 .quad 0x0000100000010404, 0x0010000001000000 .quad 0x0000100000010000, 0x0010100001010404 .quad 0x0000000000000004, 0x0010100001010000 .quad 0x0010100001010400, 0x0010000001000000 .quad 0x0010000001000000, 0x0000000000000400 .quad 0x0010100001010004, 0x0000100000010000 .quad 0x0000100000010400, 0x0010000001000004 .quad 0x0000000000000400, 0x0000000000000004 .quad 0x0010000001000404, 0x0000100000010404 .quad 0x0010100001010404, 0x0000100000010004 .quad 0x0010100001010000, 0x0010000001000404 .quad 0x0010000001000004, 0x0000000000000404 .quad 0x0000100000010404, 0x0010100001010400 .quad 0x0000000000000404, 0x0010000001000400 .quad 0x0010000001000400, 0x0000000000000000 .quad 0x0000100000010004, 0x0000100000010400 .quad 0x0000000000000000, 0x0010100001010004 .L_s2: .quad 0x0801080200100020, 0x0800080000000000 .quad 0x0000080000000000, 
0x0001080200100020 .quad 0x0001000000100000, 0x0000000200000020 .quad 0x0801000200100020, 0x0800080200000020 .quad 0x0800000200000020, 0x0801080200100020 .quad 0x0801080000100000, 0x0800000000000000 .quad 0x0800080000000000, 0x0001000000100000 .quad 0x0000000200000020, 0x0801000200100020 .quad 0x0001080000100000, 0x0001000200100020 .quad 0x0800080200000020, 0x0000000000000000 .quad 0x0800000000000000, 0x0000080000000000 .quad 0x0001080200100020, 0x0801000000100000 .quad 0x0001000200100020, 0x0800000200000020 .quad 0x0000000000000000, 0x0001080000100000 .quad 0x0000080200000020, 0x0801080000100000 .quad 0x0801000000100000, 0x0000080200000020 .quad 0x0000000000000000, 0x0001080200100020 .quad 0x0801000200100020, 0x0001000000100000 .quad 0x0800080200000020, 0x0801000000100000 .quad 0x0801080000100000, 0x0000080000000000 .quad 0x0801000000100000, 0x0800080000000000 .quad 0x0000000200000020, 0x0801080200100020 .quad 0x0001080200100020, 0x0000000200000020 .quad 0x0000080000000000, 0x0800000000000000 .quad 0x0000080200000020, 0x0801080000100000 .quad 0x0001000000100000, 0x0800000200000020 .quad 0x0001000200100020, 0x0800080200000020 .quad 0x0800000200000020, 0x0001000200100020 .quad 0x0001080000100000, 0x0000000000000000 .quad 0x0800080000000000, 0x0000080200000020 .quad 0x0800000000000000, 0x0801000200100020 .quad 0x0801080200100020, 0x0001080000100000 .L_s3: .quad 0x0000002000000208, 0x0000202008020200 .quad 0x0000000000000000, 0x0000200008020008 .quad 0x0000002008000200, 0x0000000000000000 .quad 0x0000202000020208, 0x0000002008000200 .quad 0x0000200000020008, 0x0000000008000008 .quad 0x0000000008000008, 0x0000200000020000 .quad 0x0000202008020208, 0x0000200000020008 .quad 0x0000200008020000, 0x0000002000000208 .quad 0x0000000008000000, 0x0000000000000008 .quad 0x0000202008020200, 0x0000002000000200 .quad 0x0000202000020200, 0x0000200008020000 .quad 0x0000200008020008, 0x0000202000020208 .quad 0x0000002008000208, 0x0000202000020200 .quad 0x0000200000020000, 0x0000002008000208 .quad 0x0000000000000008, 0x0000202008020208 .quad 0x0000002000000200, 0x0000000008000000 .quad 0x0000202008020200, 0x0000000008000000 .quad 0x0000200000020008, 0x0000002000000208 .quad 0x0000200000020000, 0x0000202008020200 .quad 0x0000002008000200, 0x0000000000000000 .quad 0x0000002000000200, 0x0000200000020008 .quad 0x0000202008020208, 0x0000002008000200 .quad 0x0000000008000008, 0x0000002000000200 .quad 0x0000000000000000, 0x0000200008020008 .quad 0x0000002008000208, 0x0000200000020000 .quad 0x0000000008000000, 0x0000202008020208 .quad 0x0000000000000008, 0x0000202000020208 .quad 0x0000202000020200, 0x0000000008000008 .quad 0x0000200008020000, 0x0000002008000208 .quad 0x0000002000000208, 0x0000200008020000 .quad 0x0000202000020208, 0x0000000000000008 .quad 0x0000200008020008, 0x0000202000020200 .L_s4: .quad 0x1008020000002001, 0x1000020800002001 .quad 0x1000020800002001, 0x0000000800000000 .quad 0x0008020800002000, 0x1008000800000001 .quad 0x1008000000000001, 0x1000020000002001 .quad 0x0000000000000000, 0x0008020000002000 .quad 0x0008020000002000, 0x1008020800002001 .quad 0x1000000800000001, 0x0000000000000000 .quad 0x0008000800000000, 0x1008000000000001 .quad 0x1000000000000001, 0x0000020000002000 .quad 0x0008000000000000, 0x1008020000002001 .quad 0x0000000800000000, 0x0008000000000000 .quad 0x1000020000002001, 0x0000020800002000 .quad 0x1008000800000001, 0x1000000000000001 .quad 0x0000020800002000, 0x0008000800000000 .quad 0x0000020000002000, 0x0008020800002000 .quad 0x1008020800002001, 0x1000000800000001 .quad 
0x0008000800000000, 0x1008000000000001 .quad 0x0008020000002000, 0x1008020800002001 .quad 0x1000000800000001, 0x0000000000000000 .quad 0x0000000000000000, 0x0008020000002000 .quad 0x0000020800002000, 0x0008000800000000 .quad 0x1008000800000001, 0x1000000000000001 .quad 0x1008020000002001, 0x1000020800002001 .quad 0x1000020800002001, 0x0000000800000000 .quad 0x1008020800002001, 0x1000000800000001 .quad 0x1000000000000001, 0x0000020000002000 .quad 0x1008000000000001, 0x1000020000002001 .quad 0x0008020800002000, 0x1008000800000001 .quad 0x1000020000002001, 0x0000020800002000 .quad 0x0008000000000000, 0x1008020000002001 .quad 0x0000000800000000, 0x0008000000000000 .quad 0x0000020000002000, 0x0008020800002000 .L_s5: .quad 0x0000001000000100, 0x0020001002080100 .quad 0x0020000002080000, 0x0420001002000100 .quad 0x0000000000080000, 0x0000001000000100 .quad 0x0400000000000000, 0x0020000002080000 .quad 0x0400001000080100, 0x0000000000080000 .quad 0x0020001002000100, 0x0400001000080100 .quad 0x0420001002000100, 0x0420000002080000 .quad 0x0000001000080100, 0x0400000000000000 .quad 0x0020000002000000, 0x0400000000080000 .quad 0x0400000000080000, 0x0000000000000000 .quad 0x0400001000000100, 0x0420001002080100 .quad 0x0420001002080100, 0x0020001002000100 .quad 0x0420000002080000, 0x0400001000000100 .quad 0x0000000000000000, 0x0420000002000000 .quad 0x0020001002080100, 0x0020000002000000 .quad 0x0420000002000000, 0x0000001000080100 .quad 0x0000000000080000, 0x0420001002000100 .quad 0x0000001000000100, 0x0020000002000000 .quad 0x0400000000000000, 0x0020000002080000 .quad 0x0420001002000100, 0x0400001000080100 .quad 0x0020001002000100, 0x0400000000000000 .quad 0x0420000002080000, 0x0020001002080100 .quad 0x0400001000080100, 0x0000001000000100 .quad 0x0020000002000000, 0x0420000002080000 .quad 0x0420001002080100, 0x0000001000080100 .quad 0x0420000002000000, 0x0420001002080100 .quad 0x0020000002080000, 0x0000000000000000 .quad 0x0400000000080000, 0x0420000002000000 .quad 0x0000001000080100, 0x0020001002000100 .quad 0x0400001000000100, 0x0000000000080000 .quad 0x0000000000000000, 0x0400000000080000 .quad 0x0020001002080100, 0x0400001000000100 .L_s6: .quad 0x0200000120000010, 0x0204000020000000 .quad 0x0000040000000000, 0x0204040120000010 .quad 0x0204000020000000, 0x0000000100000010 .quad 0x0204040120000010, 0x0004000000000000 .quad 0x0200040020000000, 0x0004040100000010 .quad 0x0004000000000000, 0x0200000120000010 .quad 0x0004000100000010, 0x0200040020000000 .quad 0x0200000020000000, 0x0000040100000010 .quad 0x0000000000000000, 0x0004000100000010 .quad 0x0200040120000010, 0x0000040000000000 .quad 0x0004040000000000, 0x0200040120000010 .quad 0x0000000100000010, 0x0204000120000010 .quad 0x0204000120000010, 0x0000000000000000 .quad 0x0004040100000010, 0x0204040020000000 .quad 0x0000040100000010, 0x0004040000000000 .quad 0x0204040020000000, 0x0200000020000000 .quad 0x0200040020000000, 0x0000000100000010 .quad 0x0204000120000010, 0x0004040000000000 .quad 0x0204040120000010, 0x0004000000000000 .quad 0x0000040100000010, 0x0200000120000010 .quad 0x0004000000000000, 0x0200040020000000 .quad 0x0200000020000000, 0x0000040100000010 .quad 0x0200000120000010, 0x0204040120000010 .quad 0x0004040000000000, 0x0204000020000000 .quad 0x0004040100000010, 0x0204040020000000 .quad 0x0000000000000000, 0x0204000120000010 .quad 0x0000000100000010, 0x0000040000000000 .quad 0x0204000020000000, 0x0004040100000010 .quad 0x0000040000000000, 0x0004000100000010 .quad 0x0200040120000010, 0x0000000000000000 .quad 0x0204040020000000, 
0x0200000020000000 .quad 0x0004000100000010, 0x0200040120000010 .L_s7: .quad 0x0002000000200000, 0x2002000004200002 .quad 0x2000000004000802, 0x0000000000000000 .quad 0x0000000000000800, 0x2000000004000802 .quad 0x2002000000200802, 0x0002000004200800 .quad 0x2002000004200802, 0x0002000000200000 .quad 0x0000000000000000, 0x2000000004000002 .quad 0x2000000000000002, 0x0000000004000000 .quad 0x2002000004200002, 0x2000000000000802 .quad 0x0000000004000800, 0x2002000000200802 .quad 0x2002000000200002, 0x0000000004000800 .quad 0x2000000004000002, 0x0002000004200000 .quad 0x0002000004200800, 0x2002000000200002 .quad 0x0002000004200000, 0x0000000000000800 .quad 0x2000000000000802, 0x2002000004200802 .quad 0x0002000000200800, 0x2000000000000002 .quad 0x0000000004000000, 0x0002000000200800 .quad 0x0000000004000000, 0x0002000000200800 .quad 0x0002000000200000, 0x2000000004000802 .quad 0x2000000004000802, 0x2002000004200002 .quad 0x2002000004200002, 0x2000000000000002 .quad 0x2002000000200002, 0x0000000004000000 .quad 0x0000000004000800, 0x0002000000200000 .quad 0x0002000004200800, 0x2000000000000802 .quad 0x2002000000200802, 0x0002000004200800 .quad 0x2000000000000802, 0x2000000004000002 .quad 0x2002000004200802, 0x0002000004200000 .quad 0x0002000000200800, 0x0000000000000000 .quad 0x2000000000000002, 0x2002000004200802 .quad 0x0000000000000000, 0x2002000000200802 .quad 0x0002000004200000, 0x0000000000000800 .quad 0x2000000004000002, 0x0000000004000800 .quad 0x0000000000000800, 0x2002000000200002 .L_s8: .quad 0x0100010410001000, 0x0000010000001000 .quad 0x0000000000040000, 0x0100010410041000 .quad 0x0100000010000000, 0x0100010410001000 .quad 0x0000000400000000, 0x0100000010000000 .quad 0x0000000400040000, 0x0100000010040000 .quad 0x0100010410041000, 0x0000010000041000 .quad 0x0100010010041000, 0x0000010400041000 .quad 0x0000010000001000, 0x0000000400000000 .quad 0x0100000010040000, 0x0100000410000000 .quad 0x0100010010001000, 0x0000010400001000 .quad 0x0000010000041000, 0x0000000400040000 .quad 0x0100000410040000, 0x0100010010041000 .quad 0x0000010400001000, 0x0000000000000000 .quad 0x0000000000000000, 0x0100000410040000 .quad 0x0100000410000000, 0x0100010010001000 .quad 0x0000010400041000, 0x0000000000040000 .quad 0x0000010400041000, 0x0000000000040000 .quad 0x0100010010041000, 0x0000010000001000 .quad 0x0000000400000000, 0x0100000410040000 .quad 0x0000010000001000, 0x0000010400041000 .quad 0x0100010010001000, 0x0000000400000000 .quad 0x0100000410000000, 0x0100000010040000 .quad 0x0100000410040000, 0x0100000010000000 .quad 0x0000000000040000, 0x0100010410001000 .quad 0x0000000000000000, 0x0100010410041000 .quad 0x0000000400040000, 0x0100000410000000 .quad 0x0100000010040000, 0x0100010010001000 .quad 0x0100010410001000, 0x0000000000000000 .quad 0x0100010410041000, 0x0000010000041000 .quad 0x0000010000041000, 0x0000010400001000 .quad 0x0000010400001000, 0x0000000400040000 .quad 0x0100000010000000, 0x0100010010041000 #endif #endif diff --git a/cipher/rijndael-amd64.S b/cipher/rijndael-amd64.S index 3dcaa856..6e3cc819 100644 --- a/cipher/rijndael-amd64.S +++ b/cipher/rijndael-amd64.S @@ -1,477 +1,477 @@ /* rinjdael-amd64.S - AMD64 assembly implementation of AES cipher * * Copyright (C) 2013 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. 
* * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . */ #ifdef __x86_64 #include #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && defined(USE_AES) #include "asm-common-amd64.h" .text /* table macros */ #define E0 (0) #define Es0 (1) #define Esize 4 #define Essize 4 #define D0 (0) #define Ds0 (4 * 256) #define Dsize 4 #define Dssize 1 /* register macros */ #define CTX %rdi #define RTAB %r12 #define RA %rax #define RB %rbx #define RC %rcx #define RD %rdx #define RAd %eax #define RBd %ebx #define RCd %ecx #define RDd %edx #define RAbl %al #define RBbl %bl #define RCbl %cl #define RDbl %dl #define RAbh %ah #define RBbh %bh #define RCbh %ch #define RDbh %dh #define RNA %r8 #define RNB %r9 #define RNC %r10 #define RND %r11 #define RNAd %r8d #define RNBd %r9d #define RNCd %r10d #define RNDd %r11d #define RT0 %rbp #define RT1 %rsi #define RT0d %ebp #define RT1d %esi /* helper macros */ #define do16bit(op, source, tablemul, table1, dest1, table2, dest2, t0, t1) \ movzbl source ## bl, t0 ## d; \ movzbl source ## bh, t1 ## d; \ op ## l table1(RTAB,t0,tablemul), dest1 ## d; \ op ## l table2(RTAB,t1,tablemul), dest2 ## d; #define do16bit_shr(shf, op, source, tablemul, table1, dest1, table2, dest2, t0, t1) \ movzbl source ## bl, t0 ## d; \ movzbl source ## bh, t1 ## d; \ shrl $(shf), source ## d; \ op ## l table1(RTAB,t0,tablemul), dest1 ## d; \ op ## l table2(RTAB,t1,tablemul), dest2 ## d; #define last_do16bit(op, source, tablemul, table1, dest1, table2, dest2, t0, t1) \ movzbl source ## bl, t0 ## d; \ movzbl source ## bh, t1 ## d; \ movzbl table1(RTAB,t0,tablemul), t0 ## d; \ movzbl table2(RTAB,t1,tablemul), t1 ## d; \ op ## l t0 ## d, dest1 ## d; \ op ## l t1 ## d, dest2 ## d; #define last_do16bit_shr(shf, op, source, tablemul, table1, dest1, table2, dest2, t0, t1) \ movzbl source ## bl, t0 ## d; \ movzbl source ## bh, t1 ## d; \ shrl $(shf), source ## d; \ movzbl table1(RTAB,t0,tablemul), t0 ## d; \ movzbl table2(RTAB,t1,tablemul), t1 ## d; \ op ## l t0 ## d, dest1 ## d; \ op ## l t1 ## d, dest2 ## d; /*********************************************************************** * AMD64 assembly implementation of the AES cipher ***********************************************************************/ #define addroundkey(round, ra, rb, rc, rd) \ xorl (((round) * 16) + 0 * 4)(CTX), ra ## d; \ xorl (((round) * 16) + 1 * 4)(CTX), rb ## d; \ xorl (((round) * 16) + 2 * 4)(CTX), rc ## d; \ xorl (((round) * 16) + 3 * 4)(CTX), rd ## d; #define do_encround(next_r) \ do16bit_shr(16, mov, RA, Esize, E0, RNA, E0, RND, RT0, RT1); \ do16bit( mov, RA, Esize, E0, RNC, E0, RNB, RT0, RT1); \ movl (((next_r) * 16) + 0 * 4)(CTX), RAd; \ roll $8, RNDd; \ xorl RNAd, RAd; \ roll $8, RNCd; \ roll $8, RNBd; \ roll $8, RAd; \ \ do16bit_shr(16, xor, RD, Esize, E0, RND, E0, RNC, RT0, RT1); \ do16bit( xor, RD, Esize, E0, RNB, E0, RA, RT0, RT1); \ movl (((next_r) * 16) + 3 * 4)(CTX), RDd; \ roll $8, RNCd; \ xorl RNDd, RDd; \ roll $8, RNBd; \ roll $8, RAd; \ roll $8, RDd; \ \ do16bit_shr(16, xor, RC, Esize, E0, RNC, E0, RNB, RT0, RT1); \ do16bit( xor, RC, Esize, E0, RA, E0, RD, RT0, RT1); \ movl (((next_r) * 16) + 2 * 4)(CTX), RCd; \ roll $8, RNBd; \ xorl RNCd, RCd; \ roll 
$8, RAd; \ roll $8, RDd; \ roll $8, RCd; \ \ do16bit_shr(16, xor, RB, Esize, E0, RNB, E0, RA, RT0, RT1); \ do16bit( xor, RB, Esize, E0, RD, E0, RC, RT0, RT1); \ movl (((next_r) * 16) + 1 * 4)(CTX), RBd; \ roll $8, RAd; \ xorl RNBd, RBd; \ roll $16, RDd; \ roll $24, RCd; #define do_lastencround(next_r) \ do16bit_shr(16, movzb, RA, Essize, Es0, RNA, Es0, RND, RT0, RT1); \ do16bit( movzb, RA, Essize, Es0, RNC, Es0, RNB, RT0, RT1); \ movl (((next_r) * 16) + 0 * 4)(CTX), RAd; \ roll $8, RNDd; \ xorl RNAd, RAd; \ roll $8, RNCd; \ roll $8, RNBd; \ roll $8, RAd; \ \ last_do16bit_shr(16, xor, RD, Essize, Es0, RND, Es0, RNC, RT0, RT1); \ last_do16bit( xor, RD, Essize, Es0, RNB, Es0, RA, RT0, RT1); \ movl (((next_r) * 16) + 3 * 4)(CTX), RDd; \ roll $8, RNCd; \ xorl RNDd, RDd; \ roll $8, RNBd; \ roll $8, RAd; \ roll $8, RDd; \ \ last_do16bit_shr(16, xor, RC, Essize, Es0, RNC, Es0, RNB, RT0, RT1); \ last_do16bit( xor, RC, Essize, Es0, RA, Es0, RD, RT0, RT1); \ movl (((next_r) * 16) + 2 * 4)(CTX), RCd; \ roll $8, RNBd; \ xorl RNCd, RCd; \ roll $8, RAd; \ roll $8, RDd; \ roll $8, RCd; \ \ last_do16bit_shr(16, xor, RB, Essize, Es0, RNB, Es0, RA, RT0, RT1); \ last_do16bit( xor, RB, Essize, Es0, RD, Es0, RC, RT0, RT1); \ movl (((next_r) * 16) + 1 * 4)(CTX), RBd; \ roll $8, RAd; \ xorl RNBd, RBd; \ roll $16, RDd; \ roll $24, RCd; #define firstencround(round) \ addroundkey(round, RA, RB, RC, RD); \ do_encround((round) + 1); #define encround(round) \ do_encround((round) + 1); #define lastencround(round) \ do_lastencround((round) + 1); .align 8 .globl _gcry_aes_amd64_encrypt_block ELF(.type _gcry_aes_amd64_encrypt_block,@function;) _gcry_aes_amd64_encrypt_block: /* input: * %rdi: keysched, CTX * %rsi: dst * %rdx: src * %ecx: number of rounds.. 10, 12 or 14 * %r8: encryption tables */ CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_5 subq $(5 * 8), %rsp; CFI_ADJUST_CFA_OFFSET(5 * 8); movq %rsi, (0 * 8)(%rsp); movl %ecx, (1 * 8)(%rsp); movq %rbp, (2 * 8)(%rsp); movq %rbx, (3 * 8)(%rsp); movq %r12, (4 * 8)(%rsp); CFI_REL_OFFSET(%rbp, 2 * 8); CFI_REL_OFFSET(%rbx, 3 * 8); CFI_REL_OFFSET(%r12, 4 * 8); leaq (%r8), RTAB; /* read input block */ movl 0 * 4(%rdx), RAd; movl 1 * 4(%rdx), RBd; movl 2 * 4(%rdx), RCd; movl 3 * 4(%rdx), RDd; firstencround(0); encround(1); encround(2); encround(3); encround(4); encround(5); encround(6); encround(7); encround(8); cmpl $12, (1 * 8)(%rsp); jnb .Lenc_not_128; lastencround(9); .align 4 .Lenc_done: /* write output block */ movq (0 * 8)(%rsp), %rsi; movl RAd, 0 * 4(%rsi); movl RBd, 1 * 4(%rsi); movl RCd, 2 * 4(%rsi); movl RDd, 3 * 4(%rsi); CFI_REMEMBER_STATE(); movq (4 * 8)(%rsp), %r12; movq (3 * 8)(%rsp), %rbx; movq (2 * 8)(%rsp), %rbp; CFI_RESTORE(%r12); CFI_RESTORE(%rbx); CFI_RESTORE(%rbp); addq $(5 * 8), %rsp; CFI_ADJUST_CFA_OFFSET(-5 * 8); movl $(6 * 8), %eax; EXIT_SYSV_FUNC - ret; + ret_spec_stop; CFI_RESTORE_STATE(); .align 4 .Lenc_not_128: je .Lenc_192 encround(9); encround(10); encround(11); encround(12); lastencround(13); jmp .Lenc_done; .align 4 .Lenc_192: encround(9); encround(10); lastencround(11); jmp .Lenc_done; CFI_ENDPROC(); ELF(.size _gcry_aes_amd64_encrypt_block,.-_gcry_aes_amd64_encrypt_block;) #define do_decround(next_r) \ do16bit_shr(16, mov, RA, Dsize, D0, RNA, D0, RNB, RT0, RT1); \ do16bit( mov, RA, Dsize, D0, RNC, D0, RND, RT0, RT1); \ movl (((next_r) * 16) + 0 * 4)(CTX), RAd; \ roll $8, RNBd; \ xorl RNAd, RAd; \ roll $8, RNCd; \ roll $8, RNDd; \ roll $8, RAd; \ \ do16bit_shr(16, xor, RB, Dsize, D0, RNB, D0, RNC, RT0, RT1); \ do16bit( xor, RB, Dsize, D0, RND, D0, RA, 
RT0, RT1); \ movl (((next_r) * 16) + 1 * 4)(CTX), RBd; \ roll $8, RNCd; \ xorl RNBd, RBd; \ roll $8, RNDd; \ roll $8, RAd; \ roll $8, RBd; \ \ do16bit_shr(16, xor, RC, Dsize, D0, RNC, D0, RND, RT0, RT1); \ do16bit( xor, RC, Dsize, D0, RA, D0, RB, RT0, RT1); \ movl (((next_r) * 16) + 2 * 4)(CTX), RCd; \ roll $8, RNDd; \ xorl RNCd, RCd; \ roll $8, RAd; \ roll $8, RBd; \ roll $8, RCd; \ \ do16bit_shr(16, xor, RD, Dsize, D0, RND, D0, RA, RT0, RT1); \ do16bit( xor, RD, Dsize, D0, RB, D0, RC, RT0, RT1); \ movl (((next_r) * 16) + 3 * 4)(CTX), RDd; \ roll $8, RAd; \ xorl RNDd, RDd; \ roll $16, RBd; \ roll $24, RCd; #define do_lastdecround(next_r) \ do16bit_shr(16, movzb, RA, Dssize, Ds0, RNA, Ds0, RNB, RT0, RT1); \ do16bit( movzb, RA, Dssize, Ds0, RNC, Ds0, RND, RT0, RT1); \ movl (((next_r) * 16) + 0 * 4)(CTX), RAd; \ roll $8, RNBd; \ xorl RNAd, RAd; \ roll $8, RNCd; \ roll $8, RNDd; \ roll $8, RAd; \ \ last_do16bit_shr(16, xor, RB, Dssize, Ds0, RNB, Ds0, RNC, RT0, RT1); \ last_do16bit( xor, RB, Dssize, Ds0, RND, Ds0, RA, RT0, RT1); \ movl (((next_r) * 16) + 1 * 4)(CTX), RBd; \ roll $8, RNCd; \ xorl RNBd, RBd; \ roll $8, RNDd; \ roll $8, RAd; \ roll $8, RBd; \ \ last_do16bit_shr(16, xor, RC, Dssize, Ds0, RNC, Ds0, RND, RT0, RT1); \ last_do16bit( xor, RC, Dssize, Ds0, RA, Ds0, RB, RT0, RT1); \ movl (((next_r) * 16) + 2 * 4)(CTX), RCd; \ roll $8, RNDd; \ xorl RNCd, RCd; \ roll $8, RAd; \ roll $8, RBd; \ roll $8, RCd; \ \ last_do16bit_shr(16, xor, RD, Dssize, Ds0, RND, Ds0, RA, RT0, RT1); \ last_do16bit( xor, RD, Dssize, Ds0, RB, Ds0, RC, RT0, RT1); \ movl (((next_r) * 16) + 3 * 4)(CTX), RDd; \ roll $8, RAd; \ xorl RNDd, RDd; \ roll $16, RBd; \ roll $24, RCd; #define firstdecround(round) \ addroundkey((round + 1), RA, RB, RC, RD); \ do_decround(round); #define decround(round) \ do_decround(round); #define lastdecround(round) \ do_lastdecround(round); .align 8 .globl _gcry_aes_amd64_decrypt_block ELF(.type _gcry_aes_amd64_decrypt_block,@function;) _gcry_aes_amd64_decrypt_block: /* input: * %rdi: keysched, CTX * %rsi: dst * %rdx: src * %ecx: number of rounds.. 
10, 12 or 14 * %r8: decryption tables */ CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_5 subq $(5 * 8), %rsp; CFI_ADJUST_CFA_OFFSET(5 * 8); movq %rsi, (0 * 8)(%rsp); movl %ecx, (1 * 8)(%rsp); movq %rbp, (2 * 8)(%rsp); movq %rbx, (3 * 8)(%rsp); movq %r12, (4 * 8)(%rsp); CFI_REL_OFFSET(%rbp, 2 * 8); CFI_REL_OFFSET(%rbx, 3 * 8); CFI_REL_OFFSET(%r12, 4 * 8); leaq (%r8), RTAB; /* read input block */ movl 0 * 4(%rdx), RAd; movl 1 * 4(%rdx), RBd; movl 2 * 4(%rdx), RCd; movl 3 * 4(%rdx), RDd; cmpl $12, (1 * 8)(%rsp); jnb .Ldec_256; firstdecround(9); .align 4 .Ldec_tail: decround(8); decround(7); decround(6); decround(5); decround(4); decround(3); decround(2); decround(1); lastdecround(0); /* write output block */ movq (0 * 8)(%rsp), %rsi; movl RAd, 0 * 4(%rsi); movl RBd, 1 * 4(%rsi); movl RCd, 2 * 4(%rsi); movl RDd, 3 * 4(%rsi); CFI_REMEMBER_STATE(); movq (4 * 8)(%rsp), %r12; movq (3 * 8)(%rsp), %rbx; movq (2 * 8)(%rsp), %rbp; CFI_RESTORE(%r12); CFI_RESTORE(%rbx); CFI_RESTORE(%rbp); addq $(5 * 8), %rsp; CFI_ADJUST_CFA_OFFSET(-5 * 8); movl $(6 * 8), %eax; EXIT_SYSV_FUNC - ret; + ret_spec_stop; CFI_RESTORE_STATE(); .align 4 .Ldec_256: je .Ldec_192; firstdecround(13); decround(12); decround(11); decround(10); decround(9); jmp .Ldec_tail; .align 4 .Ldec_192: firstdecround(11); decround(10); decround(9); jmp .Ldec_tail; CFI_ENDPROC(); ELF(.size _gcry_aes_amd64_decrypt_block,.-_gcry_aes_amd64_decrypt_block;) #endif /*USE_AES*/ #endif /*__x86_64*/ diff --git a/cipher/rijndael-ssse3-amd64-asm.S b/cipher/rijndael-ssse3-amd64-asm.S index 8124eb21..b98dca26 100644 --- a/cipher/rijndael-ssse3-amd64-asm.S +++ b/cipher/rijndael-ssse3-amd64-asm.S @@ -1,874 +1,874 @@ /* SSSE3 vector permutation AES for Libgcrypt * Copyright (C) 2014-2017 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . * * * The code is based on the public domain library libvpaes version 0.5 * available at http://crypto.stanford.edu/vpaes/ and which carries * this notice: * * libvpaes: constant-time SSSE3 AES encryption and decryption. * version 0.5 * * By Mike Hamburg, Stanford University, 2009. Public domain. * I wrote essentially all of this code. I did not write the test * vectors; they are the NIST known answer tests. I hereby release all * the code and documentation here that I wrote into the public domain. 
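The register comments above give the full calling convention for the two rijndael-amd64.S entry points touched by this patch. For orientation only, they imply C-level declarations roughly like the sketch below; the actual prototypes live on the C side (they are not part of this diff) and may differ, and the 'movl $(6 * 8), %eax' before each return suggests the value reported in %eax is a stack burn depth in bytes.

/* Illustrative prototypes inferred from the register comments above
 * (SysV ABI: %rdi, %rsi, %rdx, %ecx, %r8); not part of the patch. */
extern unsigned int _gcry_aes_amd64_encrypt_block (const void *keysched,
                                                   unsigned char *dst,
                                                   const unsigned char *src,
                                                   int nrounds,        /* 10, 12 or 14 */
                                                   const void *enc_tables);

extern unsigned int _gcry_aes_amd64_decrypt_block (const void *keysched,
                                                   unsigned char *dst,
                                                   const unsigned char *src,
                                                   int nrounds,        /* 10, 12 or 14 */
                                                   const void *dec_tables);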
* * This is an implementation of AES following my paper, * "Accelerating AES with Vector Permute Instructions * CHES 2009; http://shiftleft.org/papers/vector_aes/ */ #if defined(__x86_64__) #include #if defined(HAVE_GCC_INLINE_ASM_SSSE3) && \ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) #include "asm-common-amd64.h" .text ## ## _gcry_aes_ssse3_enc_preload ## ELF(.type _gcry_aes_ssse3_enc_preload,@function) .globl _gcry_aes_ssse3_enc_preload _gcry_aes_ssse3_enc_preload: CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 lea .Laes_consts(%rip), %rax movdqa (%rax), %xmm9 # 0F movdqa .Lk_inv (%rax), %xmm10 # inv movdqa .Lk_inv+16(%rax), %xmm11 # inva movdqa .Lk_sb1 (%rax), %xmm13 # sb1u movdqa .Lk_sb1+16(%rax), %xmm12 # sb1t movdqa .Lk_sb2 (%rax), %xmm15 # sb2u movdqa .Lk_sb2+16(%rax), %xmm14 # sb2t EXIT_SYSV_FUNC - ret + ret_spec_stop CFI_ENDPROC(); ELF(.size _gcry_aes_ssse3_enc_preload,.-_gcry_aes_ssse3_enc_preload) ## ## _gcry_aes_ssse3_dec_preload ## ELF(.type _gcry_aes_ssse3_dec_preload,@function) .globl _gcry_aes_ssse3_dec_preload _gcry_aes_ssse3_dec_preload: CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 lea .Laes_consts(%rip), %rax movdqa (%rax), %xmm9 # 0F movdqa .Lk_inv (%rax), %xmm10 # inv movdqa .Lk_inv+16(%rax), %xmm11 # inva movdqa .Lk_dsb9 (%rax), %xmm13 # sb9u movdqa .Lk_dsb9+16(%rax), %xmm12 # sb9t movdqa .Lk_dsbd (%rax), %xmm15 # sbdu movdqa .Lk_dsbb (%rax), %xmm14 # sbbu movdqa .Lk_dsbe (%rax), %xmm8 # sbeu EXIT_SYSV_FUNC - ret + ret_spec_stop CFI_ENDPROC(); ELF(.size _gcry_aes_ssse3_dec_preload,.-_gcry_aes_ssse3_dec_preload) ## ## Constant-time SSSE3 AES core implementation. ## ## By Mike Hamburg (Stanford University), 2009 ## Public domain. ## ## ## _aes_encrypt_core ## ## AES-encrypt %xmm0. ## ## Inputs: ## %xmm0 = input ## %xmm9-%xmm15 as in .Laes_preheat ## (%rdi) = scheduled keys ## %rsi = nrounds ## ## Output in %xmm0 ## Clobbers %xmm1-%xmm4, %r9, %r11, %rax, %rcx, %rdx ## Preserves %xmm6 - %xmm7 so you get some local vectors ## ## .align 16 ELF(.type _gcry_aes_ssse3_encrypt_core,@function) .globl _gcry_aes_ssse3_encrypt_core _gcry_aes_ssse3_encrypt_core: _aes_encrypt_core: CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 mov %rdi, %rdx leaq -1(%rsi), %rax lea .Laes_consts(%rip), %rcx leaq .Lk_mc_backward(%rcx), %rdi mov $16, %rsi movdqa .Lk_ipt (%rcx), %xmm2 # iptlo movdqa %xmm9, %xmm1 pandn %xmm0, %xmm1 psrld $4, %xmm1 pand %xmm9, %xmm0 pshufb %xmm0, %xmm2 movdqa .Lk_ipt+16(%rcx), %xmm0 # ipthi pshufb %xmm1, %xmm0 pxor (%rdx),%xmm2 pxor %xmm2, %xmm0 add $16, %rdx jmp .Laes_entry .align 8 .Laes_loop: # middle of middle round movdqa %xmm13, %xmm4 # 4 : sb1u pshufb %xmm2, %xmm4 # 4 = sb1u pxor (%rdx), %xmm4 # 4 = sb1u + k movdqa %xmm12, %xmm0 # 0 : sb1t pshufb %xmm3, %xmm0 # 0 = sb1t pxor %xmm4, %xmm0 # 0 = A movdqa %xmm15, %xmm4 # 4 : sb2u pshufb %xmm2, %xmm4 # 4 = sb2u movdqa .Lk_mc_forward-.Lk_mc_backward(%rsi,%rdi), %xmm1 movdqa %xmm14, %xmm2 # 2 : sb2t pshufb %xmm3, %xmm2 # 2 = sb2t pxor %xmm4, %xmm2 # 2 = 2A movdqa %xmm0, %xmm3 # 3 = A pshufb %xmm1, %xmm0 # 0 = B pxor %xmm2, %xmm0 # 0 = 2A+B pshufb (%rsi,%rdi), %xmm3 # 3 = D lea 16(%esi),%esi # next mc pxor %xmm0, %xmm3 # 3 = 2A+B+D lea 16(%rdx),%rdx # next key pshufb %xmm1, %xmm0 # 0 = 2B+C pxor %xmm3, %xmm0 # 0 = 2A+3B+C+D and $48, %rsi # ... 
mod 4 dec %rax # nr-- .Laes_entry: # top of round movdqa %xmm9, %xmm1 # 1 : i pandn %xmm0, %xmm1 # 1 = i<<4 psrld $4, %xmm1 # 1 = i pand %xmm9, %xmm0 # 0 = k movdqa %xmm11, %xmm2 # 2 : a/k pshufb %xmm0, %xmm2 # 2 = a/k pxor %xmm1, %xmm0 # 0 = j movdqa %xmm10, %xmm3 # 3 : 1/i pshufb %xmm1, %xmm3 # 3 = 1/i pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k movdqa %xmm10, %xmm4 # 4 : 1/j pshufb %xmm0, %xmm4 # 4 = 1/j pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k movdqa %xmm10, %xmm2 # 2 : 1/iak pshufb %xmm3, %xmm2 # 2 = 1/iak pxor %xmm0, %xmm2 # 2 = io movdqa %xmm10, %xmm3 # 3 : 1/jak pshufb %xmm4, %xmm3 # 3 = 1/jak pxor %xmm1, %xmm3 # 3 = jo jnz .Laes_loop # middle of last round movdqa .Lk_sbo(%rcx), %xmm4 # 3 : sbou pshufb %xmm2, %xmm4 # 4 = sbou pxor (%rdx), %xmm4 # 4 = sb1u + k movdqa .Lk_sbo+16(%rcx), %xmm0 # 0 : sbot pshufb %xmm3, %xmm0 # 0 = sb1t pxor %xmm4, %xmm0 # 0 = A pshufb .Lk_sr(%rsi,%rcx), %xmm0 EXIT_SYSV_FUNC - ret + ret_spec_stop CFI_ENDPROC(); ELF(.size _aes_encrypt_core,.-_aes_encrypt_core) ## ## Decryption core ## ## Same API as encryption core. ## .align 16 .globl _gcry_aes_ssse3_decrypt_core ELF(.type _gcry_aes_ssse3_decrypt_core,@function) _gcry_aes_ssse3_decrypt_core: _aes_decrypt_core: CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 mov %rdi, %rdx lea .Laes_consts(%rip), %rcx subl $1, %esi movl %esi, %eax shll $4, %esi xorl $48, %esi andl $48, %esi movdqa .Lk_dipt (%rcx), %xmm2 # iptlo movdqa %xmm9, %xmm1 pandn %xmm0, %xmm1 psrld $4, %xmm1 pand %xmm9, %xmm0 pshufb %xmm0, %xmm2 movdqa .Lk_dipt+16(%rcx), %xmm0 # ipthi pshufb %xmm1, %xmm0 pxor (%rdx), %xmm2 pxor %xmm2, %xmm0 movdqa .Lk_mc_forward+48(%rcx), %xmm5 lea 16(%rdx), %rdx neg %rax jmp .Laes_dec_entry .align 16 .Laes_dec_loop: ## ## Inverse mix columns ## movdqa %xmm13, %xmm4 # 4 : sb9u pshufb %xmm2, %xmm4 # 4 = sb9u pxor (%rdx), %xmm4 movdqa %xmm12, %xmm0 # 0 : sb9t pshufb %xmm3, %xmm0 # 0 = sb9t movdqa .Lk_dsbd+16(%rcx),%xmm1 # 1 : sbdt pxor %xmm4, %xmm0 # 0 = ch lea 16(%rdx), %rdx # next round key pshufb %xmm5, %xmm0 # MC ch movdqa %xmm15, %xmm4 # 4 : sbdu pshufb %xmm2, %xmm4 # 4 = sbdu pxor %xmm0, %xmm4 # 4 = ch pshufb %xmm3, %xmm1 # 1 = sbdt pxor %xmm4, %xmm1 # 1 = ch pshufb %xmm5, %xmm1 # MC ch movdqa %xmm14, %xmm4 # 4 : sbbu pshufb %xmm2, %xmm4 # 4 = sbbu inc %rax # nr-- pxor %xmm1, %xmm4 # 4 = ch movdqa .Lk_dsbb+16(%rcx),%xmm0 # 0 : sbbt pshufb %xmm3, %xmm0 # 0 = sbbt pxor %xmm4, %xmm0 # 0 = ch pshufb %xmm5, %xmm0 # MC ch movdqa %xmm8, %xmm4 # 4 : sbeu pshufb %xmm2, %xmm4 # 4 = sbeu pshufd $0x93, %xmm5, %xmm5 pxor %xmm0, %xmm4 # 4 = ch movdqa .Lk_dsbe+16(%rcx),%xmm0 # 0 : sbet pshufb %xmm3, %xmm0 # 0 = sbet pxor %xmm4, %xmm0 # 0 = ch .Laes_dec_entry: # top of round movdqa %xmm9, %xmm1 # 1 : i pandn %xmm0, %xmm1 # 1 = i<<4 psrld $4, %xmm1 # 1 = i pand %xmm9, %xmm0 # 0 = k movdqa %xmm11, %xmm2 # 2 : a/k pshufb %xmm0, %xmm2 # 2 = a/k pxor %xmm1, %xmm0 # 0 = j movdqa %xmm10, %xmm3 # 3 : 1/i pshufb %xmm1, %xmm3 # 3 = 1/i pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k movdqa %xmm10, %xmm4 # 4 : 1/j pshufb %xmm0, %xmm4 # 4 = 1/j pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k movdqa %xmm10, %xmm2 # 2 : 1/iak pshufb %xmm3, %xmm2 # 2 = 1/iak pxor %xmm0, %xmm2 # 2 = io movdqa %xmm10, %xmm3 # 3 : 1/jak pshufb %xmm4, %xmm3 # 3 = 1/jak pxor %xmm1, %xmm3 # 3 = jo jnz .Laes_dec_loop # middle of last round movdqa .Lk_dsbo(%rcx), %xmm4 # 3 : sbou pshufb %xmm2, %xmm4 # 4 = sbou pxor (%rdx), %xmm4 # 4 = sb1u + k movdqa .Lk_dsbo+16(%rcx), %xmm0 # 0 : sbot pshufb %xmm3, %xmm0 # 0 = sb1t pxor %xmm4, %xmm0 # 0 = A pshufb .Lk_sr(%rsi,%rcx), %xmm0 EXIT_SYSV_FUNC - 
ret + ret_spec_stop CFI_ENDPROC(); ELF(.size _aes_decrypt_core,.-_aes_decrypt_core) ######################################################## ## ## ## AES key schedule ## ## ## ######################################################## .align 16 .globl _gcry_aes_ssse3_schedule_core ELF(.type _gcry_aes_ssse3_schedule_core,@function) _gcry_aes_ssse3_schedule_core: _aes_schedule_core: # rdi = key # rsi = size in bits # rdx = buffer # rcx = direction. 0=encrypt, 1=decrypt # r8 = rotoffs CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_5 # load the tables lea .Laes_consts(%rip), %r10 movdqa (%r10), %xmm9 # 0F movdqa .Lk_inv (%r10), %xmm10 # inv movdqa .Lk_inv+16(%r10), %xmm11 # inva movdqa .Lk_sb1 (%r10), %xmm13 # sb1u movdqa .Lk_sb1+16(%r10), %xmm12 # sb1t movdqa .Lk_sb2 (%r10), %xmm15 # sb2u movdqa .Lk_sb2+16(%r10), %xmm14 # sb2t movdqa .Lk_rcon(%r10), %xmm8 # load rcon movdqu (%rdi), %xmm0 # load key (unaligned) # input transform movdqu %xmm0, %xmm3 lea .Lk_ipt(%r10), %r11 call .Laes_schedule_transform movdqu %xmm0, %xmm7 test %rcx, %rcx jnz .Laes_schedule_am_decrypting # encrypting, output zeroth round key after transform movdqa %xmm0, (%rdx) jmp .Laes_schedule_go .Laes_schedule_am_decrypting: # decrypting, output zeroth round key after shiftrows pshufb .Lk_sr(%r8,%r10),%xmm3 movdqa %xmm3, (%rdx) xor $48, %r8 .Laes_schedule_go: cmp $192, %rsi je .Laes_schedule_192 cmp $256, %rsi je .Laes_schedule_256 # 128: fall though ## ## .Laes_schedule_128 ## ## 128-bit specific part of key schedule. ## ## This schedule is really simple, because all its parts ## are accomplished by the subroutines. ## .Laes_schedule_128: mov $10, %rsi .Laes_schedule_128_L: call .Laes_schedule_round dec %rsi jz .Laes_schedule_mangle_last call .Laes_schedule_mangle # write output jmp .Laes_schedule_128_L ## ## .Laes_schedule_192 ## ## 192-bit specific part of key schedule. ## ## The main body of this schedule is the same as the 128-bit ## schedule, but with more smearing. The long, high side is ## stored in %xmm7 as before, and the short, low side is in ## the high bits of %xmm6. ## ## This schedule is somewhat nastier, however, because each ## round produces 192 bits of key material, or 1.5 round keys. ## Therefore, on each cycle we do 2 rounds and produce 3 round ## keys. ## .Laes_schedule_192: movdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned) call .Laes_schedule_transform # input transform pshufd $0x0E, %xmm0, %xmm6 pslldq $8, %xmm6 # clobber low side with zeros mov $4, %rsi .Laes_schedule_192_L: call .Laes_schedule_round palignr $8,%xmm6,%xmm0 call .Laes_schedule_mangle # save key n call .Laes_schedule_192_smear call .Laes_schedule_mangle # save key n+1 call .Laes_schedule_round dec %rsi jz .Laes_schedule_mangle_last call .Laes_schedule_mangle # save key n+2 call .Laes_schedule_192_smear jmp .Laes_schedule_192_L ## ## .Laes_schedule_192_smear ## ## Smear the short, low side in the 192-bit key schedule. ## ## Inputs: ## %xmm7: high side, b a x y ## %xmm6: low side, d c 0 0 ## %xmm13: 0 ## ## Outputs: ## %xmm6: b+c+d b+c 0 0 ## %xmm0: b+c+d b+c b a ## .Laes_schedule_192_smear: pshufd $0x80, %xmm6, %xmm0 # d c 0 0 -> c 0 0 0 pxor %xmm0, %xmm6 # -> c+d c 0 0 pshufd $0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a pxor %xmm6, %xmm0 # -> b+c+d b+c b a pshufd $0x0E, %xmm0, %xmm6 pslldq $8, %xmm6 # clobber low side with zeros - ret + ret_spec_stop ## ## .Laes_schedule_256 ## ## 256-bit specific part of key schedule. ## ## The structure here is very similar to the 128-bit ## schedule, but with an additional 'low side' in ## %xmm6. 
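As a quick check of the 192-bit schedule arithmetic described above (each schedule round yields 1.5 round keys, each loop cycle runs two rounds and writes three round keys, and the loop counter is set to 4): AES-192 performs 12 rounds and therefore needs 13 round keys, which is exactly four cycles' worth plus the zeroth round key written before the loop. A purely illustrative compile-time restatement of that count in C:

/* Illustrative count only; nothing here is part of the patch. */
enum
  {
    AES192_ROUNDS     = 12,
    AES192_ROUND_KEYS = AES192_ROUNDS + 1, /* 13 keys = 52 32-bit words    */
    KEYS_PER_CYCLE    = 3,                 /* two rounds at 1.5 keys each  */
    SCHEDULE_CYCLES   = 4,                 /* 'mov $4, %rsi' above         */
    ZEROTH_KEY        = 1                  /* written before entering loop */
  };
typedef char aes192_key_count_check
  [(SCHEDULE_CYCLES * KEYS_PER_CYCLE + ZEROTH_KEY == AES192_ROUND_KEYS) ? 1 : -1];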
The low side's rounds are the same as the ## high side's, except no rcon and no rotation. ## .Laes_schedule_256: movdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) call .Laes_schedule_transform # input transform mov $7, %rsi .Laes_schedule_256_L: call .Laes_schedule_mangle # output low result movdqa %xmm0, %xmm6 # save cur_lo in xmm6 # high round call .Laes_schedule_round dec %rsi jz .Laes_schedule_mangle_last call .Laes_schedule_mangle # low round. swap xmm7 and xmm6 pshufd $0xFF, %xmm0, %xmm0 movdqa %xmm7, %xmm5 movdqa %xmm6, %xmm7 call .Laes_schedule_low_round movdqa %xmm5, %xmm7 jmp .Laes_schedule_256_L ## ## .Laes_schedule_round ## ## Runs one main round of the key schedule on %xmm0, %xmm7 ## ## Specifically, runs subbytes on the high dword of %xmm0 ## then rotates it by one byte and xors into the low dword of ## %xmm7. ## ## Adds rcon from low byte of %xmm8, then rotates %xmm8 for ## next rcon. ## ## Smears the dwords of %xmm7 by xoring the low into the ## second low, result into third, result into highest. ## ## Returns results in %xmm7 = %xmm0. ## Clobbers %xmm1-%xmm4, %r11. ## .Laes_schedule_round: # extract rcon from xmm8 pxor %xmm1, %xmm1 palignr $15, %xmm8, %xmm1 palignr $15, %xmm8, %xmm8 pxor %xmm1, %xmm7 # rotate pshufd $0xFF, %xmm0, %xmm0 palignr $1, %xmm0, %xmm0 # fall through... # low round: same as high round, but no rotation and no rcon. .Laes_schedule_low_round: # smear xmm7 movdqa %xmm7, %xmm1 pslldq $4, %xmm7 pxor %xmm1, %xmm7 movdqa %xmm7, %xmm1 pslldq $8, %xmm7 pxor %xmm1, %xmm7 pxor .Lk_s63(%r10), %xmm7 # subbytes movdqa %xmm9, %xmm1 pandn %xmm0, %xmm1 psrld $4, %xmm1 # 1 = i pand %xmm9, %xmm0 # 0 = k movdqa %xmm11, %xmm2 # 2 : a/k pshufb %xmm0, %xmm2 # 2 = a/k pxor %xmm1, %xmm0 # 0 = j movdqa %xmm10, %xmm3 # 3 : 1/i pshufb %xmm1, %xmm3 # 3 = 1/i pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k movdqa %xmm10, %xmm4 # 4 : 1/j pshufb %xmm0, %xmm4 # 4 = 1/j pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k movdqa %xmm10, %xmm2 # 2 : 1/iak pshufb %xmm3, %xmm2 # 2 = 1/iak pxor %xmm0, %xmm2 # 2 = io movdqa %xmm10, %xmm3 # 3 : 1/jak pshufb %xmm4, %xmm3 # 3 = 1/jak pxor %xmm1, %xmm3 # 3 = jo movdqa .Lk_sb1(%r10), %xmm4 # 4 : sbou pshufb %xmm2, %xmm4 # 4 = sbou movdqa .Lk_sb1+16(%r10), %xmm0 # 0 : sbot pshufb %xmm3, %xmm0 # 0 = sb1t pxor %xmm4, %xmm0 # 0 = sbox output # add in smeared stuff pxor %xmm7, %xmm0 movdqa %xmm0, %xmm7 - ret + ret_spec_stop ## ## .Laes_schedule_transform ## ## Linear-transform %xmm0 according to tables at (%r11) ## ## Requires that %xmm9 = 0x0F0F... as in preheat ## Output in %xmm0 ## Clobbers %xmm1, %xmm2 ## .Laes_schedule_transform: movdqa %xmm9, %xmm1 pandn %xmm0, %xmm1 psrld $4, %xmm1 pand %xmm9, %xmm0 movdqa (%r11), %xmm2 # lo pshufb %xmm0, %xmm2 movdqa 16(%r11), %xmm0 # hi pshufb %xmm1, %xmm0 pxor %xmm2, %xmm0 - ret + ret_spec_stop ## ## .Laes_schedule_mangle ## ## Mangle xmm0 from (basis-transformed) standard version ## to our version. 
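The 'ret' to 'ret_spec_stop' substitutions in these hunks, and throughout the patch, are the actual change being made: every return, including the internal key-schedule subroutines reached via 'call', now goes through a macro that stops straight-line speculation past the return. The macro itself is provided by asm-common-amd64.h rather than shown here; a typical definition, given purely as an assumption about what it expands to, is a 'ret' followed by an instruction the CPU will not speculate through:

/* Assumed definition, for illustration only; the real macro comes
 * from asm-common-amd64.h and may use a different barrier. */
#define ret_spec_stop \
        ret; int3;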
## ## On encrypt, ## xor with 0x63 ## multiply by circulant 0,1,1,1 ## apply shiftrows transform ## ## On decrypt, ## xor with 0x63 ## multiply by 'inverse mixcolumns' circulant E,B,D,9 ## deskew ## apply shiftrows transform ## ## ## Writes out to (%rdx), and increments or decrements it ## Keeps track of round number mod 4 in %r8 ## Preserves xmm0 ## Clobbers xmm1-xmm5 ## .Laes_schedule_mangle: movdqa %xmm0, %xmm4 # save xmm0 for later movdqa .Lk_mc_forward(%r10),%xmm5 test %rcx, %rcx jnz .Laes_schedule_mangle_dec # encrypting add $16, %rdx pxor .Lk_s63(%r10),%xmm4 pshufb %xmm5, %xmm4 movdqa %xmm4, %xmm3 pshufb %xmm5, %xmm4 pxor %xmm4, %xmm3 pshufb %xmm5, %xmm4 pxor %xmm4, %xmm3 jmp .Laes_schedule_mangle_both .Laes_schedule_mangle_dec: lea .Lk_dks_1(%r10), %r11 # first table: *9 call .Laes_schedule_transform movdqa %xmm0, %xmm3 pshufb %xmm5, %xmm3 add $32, %r11 # next table: *B call .Laes_schedule_transform pxor %xmm0, %xmm3 pshufb %xmm5, %xmm3 add $32, %r11 # next table: *D call .Laes_schedule_transform pxor %xmm0, %xmm3 pshufb %xmm5, %xmm3 add $32, %r11 # next table: *E call .Laes_schedule_transform pxor %xmm0, %xmm3 pshufb %xmm5, %xmm3 movdqa %xmm4, %xmm0 # restore %xmm0 add $-16, %rdx .Laes_schedule_mangle_both: pshufb .Lk_sr(%r8,%r10),%xmm3 add $-16, %r8 and $48, %r8 movdqa %xmm3, (%rdx) - ret + ret_spec_stop ## ## .Laes_schedule_mangle_last ## ## Mangler for last round of key schedule ## Mangles %xmm0 ## when encrypting, outputs out(%xmm0) ^ 63 ## when decrypting, outputs unskew(%xmm0) ## ## Always called right before return... jumps to cleanup and exits ## .Laes_schedule_mangle_last: # schedule last round key from xmm0 lea .Lk_deskew(%r10),%r11 # prepare to deskew test %rcx, %rcx jnz .Laes_schedule_mangle_last_dec # encrypting pshufb .Lk_sr(%r8,%r10),%xmm0 # output permute lea .Lk_opt(%r10), %r11 # prepare to output transform add $32, %rdx .Laes_schedule_mangle_last_dec: add $-16, %rdx pxor .Lk_s63(%r10), %xmm0 call .Laes_schedule_transform # output transform movdqa %xmm0, (%rdx) # save last key #_aes_cleanup pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 pxor %xmm8, %xmm8 EXIT_SYSV_FUNC - ret + ret_spec_stop CFI_ENDPROC(); ELF(.size _gcry_aes_ssse3_schedule_core,.-_gcry_aes_ssse3_schedule_core) ######################################################## ## ## ## Constants ## ## ## ######################################################## .align 16 ELF(.type _aes_consts,@object) .Laes_consts: _aes_consts: # s0F .Lk_s0F = .-.Laes_consts .quad 0x0F0F0F0F0F0F0F0F .quad 0x0F0F0F0F0F0F0F0F # input transform (lo, hi) .Lk_ipt = .-.Laes_consts .quad 0xC2B2E8985A2A7000 .quad 0xCABAE09052227808 .quad 0x4C01307D317C4D00 .quad 0xCD80B1FCB0FDCC81 # inv, inva .Lk_inv = .-.Laes_consts .quad 0x0E05060F0D080180 .quad 0x040703090A0B0C02 .quad 0x01040A060F0B0780 .quad 0x030D0E0C02050809 # sb1u, sb1t .Lk_sb1 = .-.Laes_consts .quad 0xB19BE18FCB503E00 .quad 0xA5DF7A6E142AF544 .quad 0x3618D415FAE22300 .quad 0x3BF7CCC10D2ED9EF # sb2u, sb2t .Lk_sb2 = .-.Laes_consts .quad 0xE27A93C60B712400 .quad 0x5EB7E955BC982FCD .quad 0x69EB88400AE12900 .quad 0xC2A163C8AB82234A # sbou, sbot .Lk_sbo = .-.Laes_consts .quad 0xD0D26D176FBDC700 .quad 0x15AABF7AC502A878 .quad 0xCFE474A55FBB6A00 .quad 0x8E1E90D1412B35FA # mc_forward .Lk_mc_forward = .-.Laes_consts .quad 0x0407060500030201 .quad 0x0C0F0E0D080B0A09 .quad 0x080B0A0904070605 .quad 0x000302010C0F0E0D .quad 0x0C0F0E0D080B0A09 .quad 0x0407060500030201 .quad 0x000302010C0F0E0D 
.quad 0x080B0A0904070605 # mc_backward .Lk_mc_backward = .-.Laes_consts .quad 0x0605040702010003 .quad 0x0E0D0C0F0A09080B .quad 0x020100030E0D0C0F .quad 0x0A09080B06050407 .quad 0x0E0D0C0F0A09080B .quad 0x0605040702010003 .quad 0x0A09080B06050407 .quad 0x020100030E0D0C0F # sr .Lk_sr = .-.Laes_consts .quad 0x0706050403020100 .quad 0x0F0E0D0C0B0A0908 .quad 0x030E09040F0A0500 .quad 0x0B06010C07020D08 .quad 0x0F060D040B020900 .quad 0x070E050C030A0108 .quad 0x0B0E0104070A0D00 .quad 0x0306090C0F020508 # rcon .Lk_rcon = .-.Laes_consts .quad 0x1F8391B9AF9DEEB6 .quad 0x702A98084D7C7D81 # s63: all equal to 0x63 transformed .Lk_s63 = .-.Laes_consts .quad 0x5B5B5B5B5B5B5B5B .quad 0x5B5B5B5B5B5B5B5B # output transform .Lk_opt = .-.Laes_consts .quad 0xFF9F4929D6B66000 .quad 0xF7974121DEBE6808 .quad 0x01EDBD5150BCEC00 .quad 0xE10D5DB1B05C0CE0 # deskew tables: inverts the sbox's 'skew' .Lk_deskew = .-.Laes_consts .quad 0x07E4A34047A4E300 .quad 0x1DFEB95A5DBEF91A .quad 0x5F36B5DC83EA6900 .quad 0x2841C2ABF49D1E77 ## ## Decryption stuff ## Key schedule constants ## # decryption key schedule: x -> invskew x*9 .Lk_dks_1 = .-.Laes_consts .quad 0xB6116FC87ED9A700 .quad 0x4AED933482255BFC .quad 0x4576516227143300 .quad 0x8BB89FACE9DAFDCE # decryption key schedule: invskew x*9 -> invskew x*D .Lk_dks_2 = .-.Laes_consts .quad 0x27438FEBCCA86400 .quad 0x4622EE8AADC90561 .quad 0x815C13CE4F92DD00 .quad 0x73AEE13CBD602FF2 # decryption key schedule: invskew x*D -> invskew x*B .Lk_dks_3 = .-.Laes_consts .quad 0x03C4C50201C6C700 .quad 0xF83F3EF9FA3D3CFB .quad 0xEE1921D638CFF700 .quad 0xA5526A9D7384BC4B # decryption key schedule: invskew x*B -> invskew x*E + 0x63 .Lk_dks_4 = .-.Laes_consts .quad 0xE3C390B053732000 .quad 0xA080D3F310306343 .quad 0xA0CA214B036982E8 .quad 0x2F45AEC48CE60D67 ## ## Decryption stuff ## Round function constants ## # decryption input transform .Lk_dipt = .-.Laes_consts .quad 0x0F505B040B545F00 .quad 0x154A411E114E451A .quad 0x86E383E660056500 .quad 0x12771772F491F194 # decryption sbox output *9*u, *9*t .Lk_dsb9 = .-.Laes_consts .quad 0x851C03539A86D600 .quad 0xCAD51F504F994CC9 .quad 0xC03B1789ECD74900 .quad 0x725E2C9EB2FBA565 # decryption sbox output *D*u, *D*t .Lk_dsbd = .-.Laes_consts .quad 0x7D57CCDFE6B1A200 .quad 0xF56E9B13882A4439 .quad 0x3CE2FAF724C6CB00 .quad 0x2931180D15DEEFD3 # decryption sbox output *B*u, *B*t .Lk_dsbb = .-.Laes_consts .quad 0xD022649296B44200 .quad 0x602646F6B0F2D404 .quad 0xC19498A6CD596700 .quad 0xF3FF0C3E3255AA6B # decryption sbox output *E*u, *E*t .Lk_dsbe = .-.Laes_consts .quad 0x46F2929626D4D000 .quad 0x2242600464B4F6B0 .quad 0x0C55A6CDFFAAC100 .quad 0x9467F36B98593E32 # decryption sbox final output .Lk_dsbo = .-.Laes_consts .quad 0x1387EA537EF94000 .quad 0xC7AA6DB9D4943E2D .quad 0x12D7560F93441D00 .quad 0xCA4B8159D8C58E9C ELF(.size _aes_consts,.-_aes_consts) #endif #endif diff --git a/cipher/rijndael-vaes-avx2-amd64.S b/cipher/rijndael-vaes-avx2-amd64.S index d4ecf59f..f94b58db 100644 --- a/cipher/rijndael-vaes-avx2-amd64.S +++ b/cipher/rijndael-vaes-avx2-amd64.S @@ -1,3021 +1,3021 @@ /* VAES/AVX2 AMD64 accelerated AES for Libgcrypt * Copyright (C) 2021 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. 
* * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . */ #if defined(__x86_64__) #include #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT) && \ defined(HAVE_GCC_INLINE_ASM_VAES_VPCLMUL) #include "asm-common-amd64.h" .text /********************************************************************** helper macros **********************************************************************/ #define no(...) /*_*/ #define yes(...) __VA_ARGS__ #define AES_OP8(op, key, b0, b1, b2, b3, b4, b5, b6, b7) \ op key, b0, b0; \ op key, b1, b1; \ op key, b2, b2; \ op key, b3, b3; \ op key, b4, b4; \ op key, b5, b5; \ op key, b6, b6; \ op key, b7, b7; #define VAESENC8(key, b0, b1, b2, b3, b4, b5, b6, b7) \ AES_OP8(vaesenc, key, b0, b1, b2, b3, b4, b5, b6, b7) #define VAESDEC8(key, b0, b1, b2, b3, b4, b5, b6, b7) \ AES_OP8(vaesdec, key, b0, b1, b2, b3, b4, b5, b6, b7) #define XOR8(key, b0, b1, b2, b3, b4, b5, b6, b7) \ AES_OP8(vpxor, key, b0, b1, b2, b3, b4, b5, b6, b7) #define AES_OP4(op, key, b0, b1, b2, b3) \ op key, b0, b0; \ op key, b1, b1; \ op key, b2, b2; \ op key, b3, b3; #define VAESENC4(key, b0, b1, b2, b3) \ AES_OP4(vaesenc, key, b0, b1, b2, b3) #define VAESDEC4(key, b0, b1, b2, b3) \ AES_OP4(vaesdec, key, b0, b1, b2, b3) #define XOR4(key, b0, b1, b2, b3) \ AES_OP4(vpxor, key, b0, b1, b2, b3) #define AES_OP2(op, key, b0, b1) \ op key, b0, b0; \ op key, b1, b1; #define VAESENC2(key, b0, b1) \ AES_OP2(vaesenc, key, b0, b1) #define VAESDEC2(key, b0, b1) \ AES_OP2(vaesdec, key, b0, b1) #define XOR2(key, b0, b1) \ AES_OP2(vpxor, key, b0, b1) /********************************************************************** CBC-mode decryption **********************************************************************/ ELF(.type _gcry_vaes_avx2_cbc_dec_amd64,@function) .globl _gcry_vaes_avx2_cbc_dec_amd64 _gcry_vaes_avx2_cbc_dec_amd64: /* input: * %rdi: round keys * %rsi: iv * %rdx: dst * %rcx: src * %r8: nblocks * %r9: nrounds */ CFI_STARTPROC(); /* Load IV. */ vmovdqu (%rsi), %xmm15; /* Process 16 blocks per loop. */ .align 8 .Lcbc_dec_blk16: cmpq $16, %r8; jb .Lcbc_dec_blk8; leaq -16(%r8), %r8; /* Load input and xor first key. Update IV. 
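The input comment above spells out the calling convention for _gcry_vaes_avx2_cbc_dec_amd64. As an orientation aid only, it implies a C-level declaration along the lines of the sketch below; the real prototype is declared on the C side and is not part of this diff. The routine then consumes the data in shrinking chunks (16, 8, 4, then single blocks), and the recurring 'cmpl $12, %r9d' / 'jb' / 'jz' tests pick 10, 12 or 14 rounds per chunk.

/* Illustrative prototype inferred from the register comments above
 * (SysV ABI: %rdi, %rsi, %rdx, %rcx, %r8, %r9); may differ from the
 * actual C-side declaration. */
#include <stddef.h>

extern void _gcry_vaes_avx2_cbc_dec_amd64 (const void *round_keys,
                                           unsigned char *iv,     /* updated on return */
                                           void *dst,
                                           const void *src,
                                           size_t nblocks,
                                           unsigned int nrounds); /* 10, 12 or 14 */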
*/ vbroadcasti128 (0 * 16)(%rdi), %ymm8; vmovdqu (0 * 16)(%rcx), %ymm0; vmovdqu (2 * 16)(%rcx), %ymm1; vmovdqu (4 * 16)(%rcx), %ymm2; vmovdqu (6 * 16)(%rcx), %ymm3; vmovdqu (8 * 16)(%rcx), %ymm4; vmovdqu (10 * 16)(%rcx), %ymm5; vmovdqu (12 * 16)(%rcx), %ymm6; vmovdqu (14 * 16)(%rcx), %ymm7; vpxor %ymm8, %ymm0, %ymm0; vpxor %ymm8, %ymm1, %ymm1; vpxor %ymm8, %ymm2, %ymm2; vpxor %ymm8, %ymm3, %ymm3; vpxor %ymm8, %ymm4, %ymm4; vpxor %ymm8, %ymm5, %ymm5; vpxor %ymm8, %ymm6, %ymm6; vpxor %ymm8, %ymm7, %ymm7; vbroadcasti128 (1 * 16)(%rdi), %ymm8; vinserti128 $1, (0 * 16)(%rcx), %ymm15, %ymm9; vmovdqu (1 * 16)(%rcx), %ymm10; vmovdqu (3 * 16)(%rcx), %ymm11; vmovdqu (5 * 16)(%rcx), %ymm12; vmovdqu (7 * 16)(%rcx), %ymm13; vmovdqu (9 * 16)(%rcx), %ymm14; vmovdqu (15 * 16)(%rcx), %xmm15; leaq (16 * 16)(%rcx), %rcx; /* AES rounds */ VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); vbroadcasti128 (2 * 16)(%rdi), %ymm8; VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); vbroadcasti128 (3 * 16)(%rdi), %ymm8; VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); vbroadcasti128 (4 * 16)(%rdi), %ymm8; VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); vbroadcasti128 (5 * 16)(%rdi), %ymm8; VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); vbroadcasti128 (6 * 16)(%rdi), %ymm8; VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); vbroadcasti128 (7 * 16)(%rdi), %ymm8; VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); vbroadcasti128 (8 * 16)(%rdi), %ymm8; VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); vbroadcasti128 (9 * 16)(%rdi), %ymm8; VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); vbroadcasti128 (10 * 16)(%rdi), %ymm8; cmpl $12, %r9d; jb .Lcbc_dec_blk16_last; VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); vbroadcasti128 (11 * 16)(%rdi), %ymm8; VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); vbroadcasti128 (12 * 16)(%rdi), %ymm8; jz .Lcbc_dec_blk16_last; VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); vbroadcasti128 (13 * 16)(%rdi), %ymm8; VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); vbroadcasti128 (14 * 16)(%rdi), %ymm8; /* Last round and output handling. */ .Lcbc_dec_blk16_last: vpxor %ymm8, %ymm9, %ymm9; vpxor %ymm8, %ymm10, %ymm10; vpxor %ymm8, %ymm11, %ymm11; vpxor %ymm8, %ymm12, %ymm12; vpxor %ymm8, %ymm13, %ymm13; vpxor %ymm8, %ymm14, %ymm14; vaesdeclast %ymm9, %ymm0, %ymm0; vaesdeclast %ymm10, %ymm1, %ymm1; vpxor (-5 * 16)(%rcx), %ymm8, %ymm9; vpxor (-3 * 16)(%rcx), %ymm8, %ymm10; vaesdeclast %ymm11, %ymm2, %ymm2; vaesdeclast %ymm12, %ymm3, %ymm3; vaesdeclast %ymm13, %ymm4, %ymm4; vaesdeclast %ymm14, %ymm5, %ymm5; vaesdeclast %ymm9, %ymm6, %ymm6; vaesdeclast %ymm10, %ymm7, %ymm7; vmovdqu %ymm0, (0 * 16)(%rdx); vmovdqu %ymm1, (2 * 16)(%rdx); vmovdqu %ymm2, (4 * 16)(%rdx); vmovdqu %ymm3, (6 * 16)(%rdx); vmovdqu %ymm4, (8 * 16)(%rdx); vmovdqu %ymm5, (10 * 16)(%rdx); vmovdqu %ymm6, (12 * 16)(%rdx); vmovdqu %ymm7, (14 * 16)(%rdx); leaq (16 * 16)(%rdx), %rdx; jmp .Lcbc_dec_blk16; /* Handle trailing eight blocks. */ .align 8 .Lcbc_dec_blk8: cmpq $8, %r8; jb .Lcbc_dec_blk4; leaq -8(%r8), %r8; /* Load input and xor first key. Update IV. 
*/ vbroadcasti128 (0 * 16)(%rdi), %ymm4; vmovdqu (0 * 16)(%rcx), %ymm0; vmovdqu (2 * 16)(%rcx), %ymm1; vmovdqu (4 * 16)(%rcx), %ymm2; vmovdqu (6 * 16)(%rcx), %ymm3; vpxor %ymm4, %ymm0, %ymm0; vpxor %ymm4, %ymm1, %ymm1; vpxor %ymm4, %ymm2, %ymm2; vpxor %ymm4, %ymm3, %ymm3; vbroadcasti128 (1 * 16)(%rdi), %ymm4; vinserti128 $1, (0 * 16)(%rcx), %ymm15, %ymm10; vmovdqu (1 * 16)(%rcx), %ymm11; vmovdqu (3 * 16)(%rcx), %ymm12; vmovdqu (5 * 16)(%rcx), %ymm13; vmovdqu (7 * 16)(%rcx), %xmm15; leaq (8 * 16)(%rcx), %rcx; /* AES rounds */ VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (2 * 16)(%rdi), %ymm4; VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (3 * 16)(%rdi), %ymm4; VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (4 * 16)(%rdi), %ymm4; VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (5 * 16)(%rdi), %ymm4; VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (6 * 16)(%rdi), %ymm4; VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (7 * 16)(%rdi), %ymm4; VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (8 * 16)(%rdi), %ymm4; VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (9 * 16)(%rdi), %ymm4; VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (10 * 16)(%rdi), %ymm4; cmpl $12, %r9d; jb .Lcbc_dec_blk8_last; VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (11 * 16)(%rdi), %ymm4; VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (12 * 16)(%rdi), %ymm4; jz .Lcbc_dec_blk8_last; VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (13 * 16)(%rdi), %ymm4; VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (14 * 16)(%rdi), %ymm4; /* Last round and output handling. */ .Lcbc_dec_blk8_last: vpxor %ymm4, %ymm10, %ymm10; vpxor %ymm4, %ymm11, %ymm11; vpxor %ymm4, %ymm12, %ymm12; vpxor %ymm4, %ymm13, %ymm13; vaesdeclast %ymm10, %ymm0, %ymm0; vaesdeclast %ymm11, %ymm1, %ymm1; vaesdeclast %ymm12, %ymm2, %ymm2; vaesdeclast %ymm13, %ymm3, %ymm3; vmovdqu %ymm0, (0 * 16)(%rdx); vmovdqu %ymm1, (2 * 16)(%rdx); vmovdqu %ymm2, (4 * 16)(%rdx); vmovdqu %ymm3, (6 * 16)(%rdx); leaq (8 * 16)(%rdx), %rdx; /* Handle trailing four blocks. */ .align 8 .Lcbc_dec_blk4: cmpq $4, %r8; jb .Lcbc_dec_blk1; leaq -4(%r8), %r8; /* Load input and xor first key. Update IV. 
*/ vbroadcasti128 (0 * 16)(%rdi), %ymm4; vmovdqu (0 * 16)(%rcx), %ymm0; vmovdqu (2 * 16)(%rcx), %ymm1; vpxor %ymm4, %ymm0, %ymm0; vpxor %ymm4, %ymm1, %ymm1; vbroadcasti128 (1 * 16)(%rdi), %ymm4; vinserti128 $1, (0 * 16)(%rcx), %ymm15, %ymm10; vmovdqu (1 * 16)(%rcx), %ymm11; vmovdqu (3 * 16)(%rcx), %xmm15; leaq (4 * 16)(%rcx), %rcx; /* AES rounds */ VAESDEC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (2 * 16)(%rdi), %ymm4; VAESDEC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (3 * 16)(%rdi), %ymm4; VAESDEC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (4 * 16)(%rdi), %ymm4; VAESDEC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (5 * 16)(%rdi), %ymm4; VAESDEC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (6 * 16)(%rdi), %ymm4; VAESDEC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (7 * 16)(%rdi), %ymm4; VAESDEC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (8 * 16)(%rdi), %ymm4; VAESDEC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (9 * 16)(%rdi), %ymm4; VAESDEC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (10 * 16)(%rdi), %ymm4; cmpl $12, %r9d; jb .Lcbc_dec_blk4_last; VAESDEC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (11 * 16)(%rdi), %ymm4; VAESDEC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (12 * 16)(%rdi), %ymm4; jz .Lcbc_dec_blk4_last; VAESDEC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (13 * 16)(%rdi), %ymm4; VAESDEC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (14 * 16)(%rdi), %ymm4; /* Last round and output handling. */ .Lcbc_dec_blk4_last: vpxor %ymm4, %ymm10, %ymm10; vpxor %ymm4, %ymm11, %ymm11; vaesdeclast %ymm10, %ymm0, %ymm0; vaesdeclast %ymm11, %ymm1, %ymm1; vmovdqu %ymm0, (0 * 16)(%rdx); vmovdqu %ymm1, (2 * 16)(%rdx); leaq (4 * 16)(%rdx), %rdx; /* Process trailing one to three blocks, one per loop. */ .align 8 .Lcbc_dec_blk1: cmpq $1, %r8; jb .Ldone_cbc_dec; leaq -1(%r8), %r8; /* Load input. */ vmovdqu (%rcx), %xmm2; leaq 16(%rcx), %rcx; /* Xor first key. */ vpxor (0 * 16)(%rdi), %xmm2, %xmm0; /* AES rounds. */ vaesdec (1 * 16)(%rdi), %xmm0, %xmm0; vaesdec (2 * 16)(%rdi), %xmm0, %xmm0; vaesdec (3 * 16)(%rdi), %xmm0, %xmm0; vaesdec (4 * 16)(%rdi), %xmm0, %xmm0; vaesdec (5 * 16)(%rdi), %xmm0, %xmm0; vaesdec (6 * 16)(%rdi), %xmm0, %xmm0; vaesdec (7 * 16)(%rdi), %xmm0, %xmm0; vaesdec (8 * 16)(%rdi), %xmm0, %xmm0; vaesdec (9 * 16)(%rdi), %xmm0, %xmm0; vmovdqa (10 * 16)(%rdi), %xmm1; cmpl $12, %r9d; jb .Lcbc_dec_blk1_last; vaesdec %xmm1, %xmm0, %xmm0; vaesdec (11 * 16)(%rdi), %xmm0, %xmm0; vmovdqa (12 * 16)(%rdi), %xmm1; jz .Lcbc_dec_blk1_last; vaesdec %xmm1, %xmm0, %xmm0; vaesdec (13 * 16)(%rdi), %xmm0, %xmm0; vmovdqa (14 * 16)(%rdi), %xmm1; /* Last round and output handling. */ .Lcbc_dec_blk1_last: vpxor %xmm1, %xmm15, %xmm15; vaesdeclast %xmm15, %xmm0, %xmm0; vmovdqa %xmm2, %xmm15; vmovdqu %xmm0, (%rdx); leaq 16(%rdx), %rdx; jmp .Lcbc_dec_blk1; .align 8 .Ldone_cbc_dec: /* Store IV. */ vmovdqu %xmm15, (%rsi); vzeroall; - ret + ret_spec_stop CFI_ENDPROC(); ELF(.size _gcry_vaes_avx2_cbc_dec_amd64,.-_gcry_vaes_avx2_cbc_dec_amd64) /********************************************************************** CFB-mode decryption **********************************************************************/ ELF(.type _gcry_vaes_avx2_cfb_dec_amd64,@function) .globl _gcry_vaes_avx2_cfb_dec_amd64 _gcry_vaes_avx2_cfb_dec_amd64: /* input: * %rdi: round keys * %rsi: iv * %rdx: dst * %rcx: src * %r8: nblocks * %r9: nrounds */ CFI_STARTPROC(); /* Load IV. */ vmovdqu (%rsi), %xmm15; /* Process 16 blocks per loop. */ .align 8 .Lcfb_dec_blk16: cmpq $16, %r8; jb .Lcfb_dec_blk8; leaq -16(%r8), %r8; /* Load input and xor first key. Update IV. 
*/ vbroadcasti128 (0 * 16)(%rdi), %ymm8; vinserti128 $1, (0 * 16)(%rcx), %ymm15, %ymm0; vmovdqu (1 * 16)(%rcx), %ymm1; vmovdqu (3 * 16)(%rcx), %ymm2; vmovdqu (5 * 16)(%rcx), %ymm3; vmovdqu (7 * 16)(%rcx), %ymm4; vmovdqu (9 * 16)(%rcx), %ymm5; vmovdqu (11 * 16)(%rcx), %ymm6; vmovdqu (13 * 16)(%rcx), %ymm7; vmovdqu (15 * 16)(%rcx), %xmm15; vpxor %ymm8, %ymm0, %ymm0; vpxor %ymm8, %ymm1, %ymm1; vpxor %ymm8, %ymm2, %ymm2; vpxor %ymm8, %ymm3, %ymm3; vpxor %ymm8, %ymm4, %ymm4; vpxor %ymm8, %ymm5, %ymm5; vpxor %ymm8, %ymm6, %ymm6; vpxor %ymm8, %ymm7, %ymm7; vbroadcasti128 (1 * 16)(%rdi), %ymm8; vmovdqu (0 * 16)(%rcx), %ymm9; vmovdqu (2 * 16)(%rcx), %ymm10; vmovdqu (4 * 16)(%rcx), %ymm11; vmovdqu (6 * 16)(%rcx), %ymm12; vmovdqu (8 * 16)(%rcx), %ymm13; vmovdqu (10 * 16)(%rcx), %ymm14; leaq (16 * 16)(%rcx), %rcx; /* AES rounds */ VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); vbroadcasti128 (2 * 16)(%rdi), %ymm8; VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); vbroadcasti128 (3 * 16)(%rdi), %ymm8; VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); vbroadcasti128 (4 * 16)(%rdi), %ymm8; VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); vbroadcasti128 (5 * 16)(%rdi), %ymm8; VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); vbroadcasti128 (6 * 16)(%rdi), %ymm8; VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); vbroadcasti128 (7 * 16)(%rdi), %ymm8; VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); vbroadcasti128 (8 * 16)(%rdi), %ymm8; VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); vbroadcasti128 (9 * 16)(%rdi), %ymm8; VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); vbroadcasti128 (10 * 16)(%rdi), %ymm8; cmpl $12, %r9d; jb .Lcfb_dec_blk16_last; VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); vbroadcasti128 (11 * 16)(%rdi), %ymm8; VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); vbroadcasti128 (12 * 16)(%rdi), %ymm8; jz .Lcfb_dec_blk16_last; VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); vbroadcasti128 (13 * 16)(%rdi), %ymm8; VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); vbroadcasti128 (14 * 16)(%rdi), %ymm8; /* Last round and output handling. */ .Lcfb_dec_blk16_last: vpxor %ymm8, %ymm9, %ymm9; vpxor %ymm8, %ymm10, %ymm10; vpxor %ymm8, %ymm11, %ymm11; vpxor %ymm8, %ymm12, %ymm12; vpxor %ymm8, %ymm13, %ymm13; vpxor %ymm8, %ymm14, %ymm14; vaesenclast %ymm9, %ymm0, %ymm0; vaesenclast %ymm10, %ymm1, %ymm1; vpxor (-4 * 16)(%rcx), %ymm8, %ymm9; vpxor (-2 * 16)(%rcx), %ymm8, %ymm10; vaesenclast %ymm11, %ymm2, %ymm2; vaesenclast %ymm12, %ymm3, %ymm3; vaesenclast %ymm13, %ymm4, %ymm4; vaesenclast %ymm14, %ymm5, %ymm5; vaesenclast %ymm9, %ymm6, %ymm6; vaesenclast %ymm10, %ymm7, %ymm7; vmovdqu %ymm0, (0 * 16)(%rdx); vmovdqu %ymm1, (2 * 16)(%rdx); vmovdqu %ymm2, (4 * 16)(%rdx); vmovdqu %ymm3, (6 * 16)(%rdx); vmovdqu %ymm4, (8 * 16)(%rdx); vmovdqu %ymm5, (10 * 16)(%rdx); vmovdqu %ymm6, (12 * 16)(%rdx); vmovdqu %ymm7, (14 * 16)(%rdx); leaq (16 * 16)(%rdx), %rdx; jmp .Lcfb_dec_blk16; /* Handle trailing eight blocks. */ .align 8 .Lcfb_dec_blk8: cmpq $8, %r8; jb .Lcfb_dec_blk4; leaq -8(%r8), %r8; /* Load input and xor first key. Update IV. 
*/ vbroadcasti128 (0 * 16)(%rdi), %ymm4; vinserti128 $1, (0 * 16)(%rcx), %ymm15, %ymm0; vmovdqu (1 * 16)(%rcx), %ymm1; vmovdqu (3 * 16)(%rcx), %ymm2; vmovdqu (5 * 16)(%rcx), %ymm3; vmovdqu (7 * 16)(%rcx), %xmm15; vpxor %ymm4, %ymm0, %ymm0; vpxor %ymm4, %ymm1, %ymm1; vpxor %ymm4, %ymm2, %ymm2; vpxor %ymm4, %ymm3, %ymm3; vbroadcasti128 (1 * 16)(%rdi), %ymm4; vmovdqu (0 * 16)(%rcx), %ymm10; vmovdqu (2 * 16)(%rcx), %ymm11; vmovdqu (4 * 16)(%rcx), %ymm12; vmovdqu (6 * 16)(%rcx), %ymm13; leaq (8 * 16)(%rcx), %rcx; /* AES rounds */ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (2 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (3 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (4 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (5 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (6 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (7 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (8 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (9 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (10 * 16)(%rdi), %ymm4; cmpl $12, %r9d; jb .Lcfb_dec_blk8_last; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (11 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (12 * 16)(%rdi), %ymm4; jz .Lcfb_dec_blk8_last; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (13 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (14 * 16)(%rdi), %ymm4; /* Last round and output handling. */ .Lcfb_dec_blk8_last: vpxor %ymm4, %ymm10, %ymm10; vpxor %ymm4, %ymm11, %ymm11; vpxor %ymm4, %ymm12, %ymm12; vpxor %ymm4, %ymm13, %ymm13; vaesenclast %ymm10, %ymm0, %ymm0; vaesenclast %ymm11, %ymm1, %ymm1; vaesenclast %ymm12, %ymm2, %ymm2; vaesenclast %ymm13, %ymm3, %ymm3; vmovdqu %ymm0, (0 * 16)(%rdx); vmovdqu %ymm1, (2 * 16)(%rdx); vmovdqu %ymm2, (4 * 16)(%rdx); vmovdqu %ymm3, (6 * 16)(%rdx); leaq (8 * 16)(%rdx), %rdx; /* Handle trailing four blocks. */ .align 8 .Lcfb_dec_blk4: cmpq $4, %r8; jb .Lcfb_dec_blk1; leaq -4(%r8), %r8; /* Load input and xor first key. Update IV. 
*/ vbroadcasti128 (0 * 16)(%rdi), %ymm4; vinserti128 $1, (0 * 16)(%rcx), %ymm15, %ymm0; vmovdqu (1 * 16)(%rcx), %ymm1; vmovdqu (3 * 16)(%rcx), %xmm15; vpxor %ymm4, %ymm0, %ymm0; vpxor %ymm4, %ymm1, %ymm1; vbroadcasti128 (1 * 16)(%rdi), %ymm4; vmovdqu (0 * 16)(%rcx), %ymm10; vmovdqu (2 * 16)(%rcx), %ymm11; leaq (4 * 16)(%rcx), %rcx; /* AES rounds */ VAESENC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (2 * 16)(%rdi), %ymm4; VAESENC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (3 * 16)(%rdi), %ymm4; VAESENC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (4 * 16)(%rdi), %ymm4; VAESENC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (5 * 16)(%rdi), %ymm4; VAESENC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (6 * 16)(%rdi), %ymm4; VAESENC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (7 * 16)(%rdi), %ymm4; VAESENC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (8 * 16)(%rdi), %ymm4; VAESENC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (9 * 16)(%rdi), %ymm4; VAESENC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (10 * 16)(%rdi), %ymm4; cmpl $12, %r9d; jb .Lcfb_dec_blk4_last; VAESENC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (11 * 16)(%rdi), %ymm4; VAESENC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (12 * 16)(%rdi), %ymm4; jz .Lcfb_dec_blk4_last; VAESENC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (13 * 16)(%rdi), %ymm4; VAESENC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (14 * 16)(%rdi), %ymm4; /* Last round and output handling. */ .Lcfb_dec_blk4_last: vpxor %ymm4, %ymm10, %ymm10; vpxor %ymm4, %ymm11, %ymm11; vaesenclast %ymm10, %ymm0, %ymm0; vaesenclast %ymm11, %ymm1, %ymm1; vmovdqu %ymm0, (0 * 16)(%rdx); vmovdqu %ymm1, (2 * 16)(%rdx); leaq (4 * 16)(%rdx), %rdx; /* Process trailing one to three blocks, one per loop. */ .align 8 .Lcfb_dec_blk1: cmpq $1, %r8; jb .Ldone_cfb_dec; leaq -1(%r8), %r8; /* Xor first key. */ vpxor (0 * 16)(%rdi), %xmm15, %xmm0; /* Load input as next IV. */ vmovdqu (%rcx), %xmm15; leaq 16(%rcx), %rcx; /* AES rounds. */ vaesenc (1 * 16)(%rdi), %xmm0, %xmm0; vaesenc (2 * 16)(%rdi), %xmm0, %xmm0; vaesenc (3 * 16)(%rdi), %xmm0, %xmm0; vaesenc (4 * 16)(%rdi), %xmm0, %xmm0; vaesenc (5 * 16)(%rdi), %xmm0, %xmm0; vaesenc (6 * 16)(%rdi), %xmm0, %xmm0; vaesenc (7 * 16)(%rdi), %xmm0, %xmm0; vaesenc (8 * 16)(%rdi), %xmm0, %xmm0; vaesenc (9 * 16)(%rdi), %xmm0, %xmm0; vmovdqa (10 * 16)(%rdi), %xmm1; cmpl $12, %r9d; jb .Lcfb_dec_blk1_last; vaesenc %xmm1, %xmm0, %xmm0; vaesenc (11 * 16)(%rdi), %xmm0, %xmm0; vmovdqa (12 * 16)(%rdi), %xmm1; jz .Lcfb_dec_blk1_last; vaesenc %xmm1, %xmm0, %xmm0; vaesenc (13 * 16)(%rdi), %xmm0, %xmm0; vmovdqa (14 * 16)(%rdi), %xmm1; /* Last round and output handling. */ .Lcfb_dec_blk1_last: vpxor %xmm15, %xmm1, %xmm1; vaesenclast %xmm1, %xmm0, %xmm0; vmovdqu %xmm0, (%rdx); leaq 16(%rdx), %rdx; jmp .Lcfb_dec_blk1; .align 8 .Ldone_cfb_dec: /* Store IV. 
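Note that the CFB decryption routine above runs the AES rounds with vaesenc/vaesenclast rather than the vaesdec forms: CFB decryption is P[i] = E_K(C[i-1]) xor C[i], so only the cipher's encryption direction is ever needed, and the ciphertext is folded in by XORing it into the last round key before vaesenclast. A small C model of the same computation, with the block primitive passed in as a stand-in function pointer (an assumption, not libgcrypt's internal interface):

/* C model of CFB decryption as computed above; illustrative only. */
#include <stddef.h>
#include <string.h>

typedef void (*aes_encrypt_fn) (const void *round_keys,
                                unsigned char out[16],
                                const unsigned char in[16]);

static void
cfb_decrypt_model (aes_encrypt_fn aes_encrypt, const void *round_keys,
                   unsigned char iv[16], unsigned char *dst,
                   const unsigned char *src, size_t nblocks)
{
  unsigned char keystream[16];

  while (nblocks--)
    {
      aes_encrypt (round_keys, keystream, iv);  /* E_K(previous ciphertext / IV) */
      memcpy (iv, src, 16);                     /* this ciphertext is the next IV */
      for (int i = 0; i < 16; i++)
        dst[i] = keystream[i] ^ src[i];         /* P = E_K(C_prev) ^ C */
      dst += 16;
      src += 16;
    }
}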
*/ vmovdqu %xmm15, (%rsi); vzeroall; - ret + ret_spec_stop CFI_ENDPROC(); ELF(.size _gcry_vaes_avx2_cfb_dec_amd64,.-_gcry_vaes_avx2_cfb_dec_amd64) /********************************************************************** CTR-mode encryption **********************************************************************/ ELF(.type _gcry_vaes_avx2_ctr_enc_amd64,@function) .globl _gcry_vaes_avx2_ctr_enc_amd64 _gcry_vaes_avx2_ctr_enc_amd64: /* input: * %rdi: round keys * %rsi: counter * %rdx: dst * %rcx: src * %r8: nblocks * %r9: nrounds */ CFI_STARTPROC(); movq 8(%rsi), %r10; movq 0(%rsi), %r11; bswapq %r10; bswapq %r11; vpcmpeqd %ymm15, %ymm15, %ymm15; vpsrldq $8, %ymm15, %ymm15; // 0:-1 vpaddq %ymm15, %ymm15, %ymm14; // 0:-2 vbroadcasti128 .Lbswap128_mask rRIP, %ymm13; #define inc_le128(x, minus_one, tmp) \ vpcmpeqq minus_one, x, tmp; \ vpsubq minus_one, x, x; \ vpslldq $8, tmp, tmp; \ vpsubq tmp, x, x; #define add2_le128(x, minus_one, minus_two, tmp1, tmp2) \ vpcmpeqq minus_one, x, tmp1; \ vpcmpeqq minus_two, x, tmp2; \ vpor tmp1, tmp2, tmp2; \ vpsubq minus_two, x, x; \ vpslldq $8, tmp2, tmp2; \ vpsubq tmp2, x, x; /* Process 16 blocks per loop. */ .align 8 .Lctr_enc_blk16: cmpq $16, %r8; jb .Lctr_enc_blk8; leaq -16(%r8), %r8; vbroadcasti128 (%rsi), %ymm7; vbroadcasti128 (0 * 16)(%rdi), %ymm8; /* detect if carry handling is needed */ addb $16, 15(%rsi); jc .Lctr_enc_blk16_handle_carry; /* Increment counters. */ vpaddb .Lbige_addb_0 rRIP, %ymm7, %ymm0; vpaddb .Lbige_addb_2 rRIP, %ymm7, %ymm1; vpaddb .Lbige_addb_4 rRIP, %ymm7, %ymm2; vpaddb .Lbige_addb_6 rRIP, %ymm7, %ymm3; vpaddb .Lbige_addb_8 rRIP, %ymm7, %ymm4; vpaddb .Lbige_addb_10 rRIP, %ymm7, %ymm5; vpaddb .Lbige_addb_12 rRIP, %ymm7, %ymm6; vpaddb .Lbige_addb_14 rRIP, %ymm7, %ymm7; leaq 16(%r10), %r10; .Lctr_enc_blk16_rounds: /* AES rounds */ XOR8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); vbroadcasti128 (1 * 16)(%rdi), %ymm8; VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); vbroadcasti128 (2 * 16)(%rdi), %ymm8; VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); vbroadcasti128 (3 * 16)(%rdi), %ymm8; VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); vbroadcasti128 (4 * 16)(%rdi), %ymm8; VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); vbroadcasti128 (5 * 16)(%rdi), %ymm8; VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); vbroadcasti128 (6 * 16)(%rdi), %ymm8; VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); vbroadcasti128 (7 * 16)(%rdi), %ymm8; VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); vbroadcasti128 (8 * 16)(%rdi), %ymm8; VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); vbroadcasti128 (9 * 16)(%rdi), %ymm8; VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); vbroadcasti128 (10 * 16)(%rdi), %ymm8; cmpl $12, %r9d; jb .Lctr_enc_blk16_last; VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); vbroadcasti128 (11 * 16)(%rdi), %ymm8; VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); vbroadcasti128 (12 * 16)(%rdi), %ymm8; jz .Lctr_enc_blk16_last; VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); vbroadcasti128 (13 * 16)(%rdi), %ymm8; VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); vbroadcasti128 (14 * 16)(%rdi), %ymm8; /* Last round and output handling. 
*/ .Lctr_enc_blk16_last: vpxor (0 * 16)(%rcx), %ymm8, %ymm9; /* Xor src to last round key. */ vpxor (2 * 16)(%rcx), %ymm8, %ymm10; vpxor (4 * 16)(%rcx), %ymm8, %ymm11; vpxor (6 * 16)(%rcx), %ymm8, %ymm12; vaesenclast %ymm9, %ymm0, %ymm0; vaesenclast %ymm10, %ymm1, %ymm1; vaesenclast %ymm11, %ymm2, %ymm2; vaesenclast %ymm12, %ymm3, %ymm3; vpxor (8 * 16)(%rcx), %ymm8, %ymm9; vpxor (10 * 16)(%rcx), %ymm8, %ymm10; vpxor (12 * 16)(%rcx), %ymm8, %ymm11; vpxor (14 * 16)(%rcx), %ymm8, %ymm8; leaq (16 * 16)(%rcx), %rcx; vaesenclast %ymm9, %ymm4, %ymm4; vaesenclast %ymm10, %ymm5, %ymm5; vaesenclast %ymm11, %ymm6, %ymm6; vaesenclast %ymm8, %ymm7, %ymm7; vmovdqu %ymm0, (0 * 16)(%rdx); vmovdqu %ymm1, (2 * 16)(%rdx); vmovdqu %ymm2, (4 * 16)(%rdx); vmovdqu %ymm3, (6 * 16)(%rdx); vmovdqu %ymm4, (8 * 16)(%rdx); vmovdqu %ymm5, (10 * 16)(%rdx); vmovdqu %ymm6, (12 * 16)(%rdx); vmovdqu %ymm7, (14 * 16)(%rdx); leaq (16 * 16)(%rdx), %rdx; jmp .Lctr_enc_blk16; .align 8 .Lctr_enc_blk16_handle_carry: /* Increment counters (handle carry). */ vpshufb %xmm13, %xmm7, %xmm1; /* be => le */ vmovdqa %xmm1, %xmm0; inc_le128(%xmm1, %xmm15, %xmm5); vinserti128 $1, %xmm1, %ymm0, %ymm7; /* ctr: +1:+0 */ vpshufb %ymm13, %ymm7, %ymm0; addq $16, %r10; adcq $0, %r11; bswapq %r10; bswapq %r11; movq %r10, 8(%rsi); movq %r11, 0(%rsi); bswapq %r10; bswapq %r11; add2_le128(%ymm7, %ymm15, %ymm14, %ymm9, %ymm10); /* ctr: +3:+2 */ vpshufb %ymm13, %ymm7, %ymm1; add2_le128(%ymm7, %ymm15, %ymm14, %ymm9, %ymm10); /* ctr: +5:+4 */ vpshufb %ymm13, %ymm7, %ymm2; add2_le128(%ymm7, %ymm15, %ymm14, %ymm9, %ymm10); /* ctr: +7:+6 */ vpshufb %ymm13, %ymm7, %ymm3; add2_le128(%ymm7, %ymm15, %ymm14, %ymm9, %ymm10); /* ctr: +9:+8 */ vpshufb %ymm13, %ymm7, %ymm4; add2_le128(%ymm7, %ymm15, %ymm14, %ymm9, %ymm10); /* ctr: +11:+10 */ vpshufb %ymm13, %ymm7, %ymm5; add2_le128(%ymm7, %ymm15, %ymm14, %ymm9, %ymm10); /* ctr: +13:+12 */ vpshufb %ymm13, %ymm7, %ymm6; add2_le128(%ymm7, %ymm15, %ymm14, %ymm9, %ymm10); /* ctr: +15:+14 */ vpshufb %ymm13, %ymm7, %ymm7; jmp .Lctr_enc_blk16_rounds; /* Handle trailing eight blocks. */ .align 8 .Lctr_enc_blk8: cmpq $8, %r8; jb .Lctr_enc_blk4; leaq -8(%r8), %r8; vbroadcasti128 (%rsi), %ymm3; vbroadcasti128 (0 * 16)(%rdi), %ymm4; /* detect if carry handling is needed */ addb $8, 15(%rsi); jc .Lctr_enc_blk8_handle_carry; /* Increment counters. 
*/ vpaddb .Lbige_addb_0 rRIP, %ymm3, %ymm0; vpaddb .Lbige_addb_2 rRIP, %ymm3, %ymm1; vpaddb .Lbige_addb_4 rRIP, %ymm3, %ymm2; vpaddb .Lbige_addb_6 rRIP, %ymm3, %ymm3; leaq 8(%r10), %r10; .Lctr_enc_blk8_rounds: /* AES rounds */ XOR4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (1 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (2 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (3 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (4 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (5 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (6 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (7 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (8 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (9 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (10 * 16)(%rdi), %ymm4; cmpl $12, %r9d; jb .Lctr_enc_blk8_last; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (11 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (12 * 16)(%rdi), %ymm4; jz .Lctr_enc_blk8_last; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (13 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (14 * 16)(%rdi), %ymm4; /* Last round and output handling. */ .Lctr_enc_blk8_last: vpxor (0 * 16)(%rcx), %ymm4, %ymm5; /* Xor src to last round key. */ vpxor (2 * 16)(%rcx), %ymm4, %ymm6; vpxor (4 * 16)(%rcx), %ymm4, %ymm7; vpxor (6 * 16)(%rcx), %ymm4, %ymm4; leaq (8 * 16)(%rcx), %rcx; vaesenclast %ymm5, %ymm0, %ymm0; vaesenclast %ymm6, %ymm1, %ymm1; vaesenclast %ymm7, %ymm2, %ymm2; vaesenclast %ymm4, %ymm3, %ymm3; vmovdqu %ymm0, (0 * 16)(%rdx); vmovdqu %ymm1, (2 * 16)(%rdx); vmovdqu %ymm2, (4 * 16)(%rdx); vmovdqu %ymm3, (6 * 16)(%rdx); leaq (8 * 16)(%rdx), %rdx; jmp .Lctr_enc_blk4; .align 8 .Lctr_enc_blk8_handle_carry: /* Increment counters (handle carry). */ vpshufb %xmm13, %xmm3, %xmm1; /* be => le */ vmovdqa %xmm1, %xmm0; inc_le128(%xmm1, %xmm15, %xmm5); vinserti128 $1, %xmm1, %ymm0, %ymm3; /* ctr: +1:+0 */ vpshufb %ymm13, %ymm3, %ymm0; addq $8, %r10; adcq $0, %r11; bswapq %r10; bswapq %r11; movq %r10, 8(%rsi); movq %r11, 0(%rsi); bswapq %r10; bswapq %r11; add2_le128(%ymm3, %ymm15, %ymm14, %ymm5, %ymm6); /* ctr: +3:+2 */ vpshufb %ymm13, %ymm3, %ymm1; add2_le128(%ymm3, %ymm15, %ymm14, %ymm5, %ymm6); /* ctr: +5:+4 */ vpshufb %ymm13, %ymm3, %ymm2; add2_le128(%ymm3, %ymm15, %ymm14, %ymm5, %ymm6); /* ctr: +7:+6 */ vpshufb %ymm13, %ymm3, %ymm3; jmp .Lctr_enc_blk8_rounds; /* Handle trailing four blocks. */ .align 8 .Lctr_enc_blk4: cmpq $4, %r8; jb .Lctr_enc_blk1; leaq -4(%r8), %r8; vbroadcasti128 (%rsi), %ymm3; vbroadcasti128 (0 * 16)(%rdi), %ymm4; /* detect if carry handling is needed */ addb $4, 15(%rsi); jc .Lctr_enc_blk4_handle_carry; /* Increment counters. 
*/ vpaddb .Lbige_addb_0 rRIP, %ymm3, %ymm0; vpaddb .Lbige_addb_2 rRIP, %ymm3, %ymm1; leaq 4(%r10), %r10; .Lctr_enc_blk4_rounds: /* AES rounds */ XOR2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (1 * 16)(%rdi), %ymm4; VAESENC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (2 * 16)(%rdi), %ymm4; VAESENC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (3 * 16)(%rdi), %ymm4; VAESENC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (4 * 16)(%rdi), %ymm4; VAESENC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (5 * 16)(%rdi), %ymm4; VAESENC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (6 * 16)(%rdi), %ymm4; VAESENC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (7 * 16)(%rdi), %ymm4; VAESENC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (8 * 16)(%rdi), %ymm4; VAESENC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (9 * 16)(%rdi), %ymm4; VAESENC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (10 * 16)(%rdi), %ymm4; cmpl $12, %r9d; jb .Lctr_enc_blk4_last; VAESENC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (11 * 16)(%rdi), %ymm4; VAESENC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (12 * 16)(%rdi), %ymm4; jz .Lctr_enc_blk4_last; VAESENC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (13 * 16)(%rdi), %ymm4; VAESENC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (14 * 16)(%rdi), %ymm4; /* Last round and output handling. */ .Lctr_enc_blk4_last: vpxor (0 * 16)(%rcx), %ymm4, %ymm5; /* Xor src to last round key. */ vpxor (2 * 16)(%rcx), %ymm4, %ymm6; leaq (4 * 16)(%rcx), %rcx; vaesenclast %ymm5, %ymm0, %ymm0; vaesenclast %ymm6, %ymm1, %ymm1; vmovdqu %ymm0, (0 * 16)(%rdx); vmovdqu %ymm1, (2 * 16)(%rdx); leaq (4 * 16)(%rdx), %rdx; jmp .Lctr_enc_blk1; .align 8 .Lctr_enc_blk4_handle_carry: /* Increment counters (handle carry). */ vpshufb %xmm13, %xmm3, %xmm1; /* be => le */ vmovdqa %xmm1, %xmm0; inc_le128(%xmm1, %xmm15, %xmm5); vinserti128 $1, %xmm1, %ymm0, %ymm3; /* ctr: +1:+0 */ vpshufb %ymm13, %ymm3, %ymm0; addq $4, %r10; adcq $0, %r11; bswapq %r10; bswapq %r11; movq %r10, 8(%rsi); movq %r11, 0(%rsi); bswapq %r10; bswapq %r11; add2_le128(%ymm3, %ymm15, %ymm14, %ymm5, %ymm6); /* ctr: +3:+2 */ vpshufb %ymm13, %ymm3, %ymm1; jmp .Lctr_enc_blk4_rounds; /* Process trailing one to three blocks, one per loop. */ .align 8 .Lctr_enc_blk1: cmpq $1, %r8; jb .Ldone_ctr_enc; leaq -1(%r8), %r8; /* Load and increament counter. */ vmovdqu (%rsi), %xmm0; addq $1, %r10; adcq $0, %r11; bswapq %r10; bswapq %r11; movq %r10, 8(%rsi); movq %r11, 0(%rsi); bswapq %r10; bswapq %r11; /* AES rounds. */ vpxor (0 * 16)(%rdi), %xmm0, %xmm0; vaesenc (1 * 16)(%rdi), %xmm0, %xmm0; vaesenc (2 * 16)(%rdi), %xmm0, %xmm0; vaesenc (3 * 16)(%rdi), %xmm0, %xmm0; vaesenc (4 * 16)(%rdi), %xmm0, %xmm0; vaesenc (5 * 16)(%rdi), %xmm0, %xmm0; vaesenc (6 * 16)(%rdi), %xmm0, %xmm0; vaesenc (7 * 16)(%rdi), %xmm0, %xmm0; vaesenc (8 * 16)(%rdi), %xmm0, %xmm0; vaesenc (9 * 16)(%rdi), %xmm0, %xmm0; vmovdqa (10 * 16)(%rdi), %xmm1; cmpl $12, %r9d; jb .Lctr_enc_blk1_last; vaesenc %xmm1, %xmm0, %xmm0; vaesenc (11 * 16)(%rdi), %xmm0, %xmm0; vmovdqa (12 * 16)(%rdi), %xmm1; jz .Lctr_enc_blk1_last; vaesenc %xmm1, %xmm0, %xmm0; vaesenc (13 * 16)(%rdi), %xmm0, %xmm0; vmovdqa (14 * 16)(%rdi), %xmm1; /* Last round and output handling. */ .Lctr_enc_blk1_last: vpxor (%rcx), %xmm1, %xmm1; /* Xor src to last round key. */ leaq 16(%rcx), %rcx; vaesenclast %xmm1, %xmm0, %xmm0; /* Last round and xor with xmm1. 
*/ vmovdqu %xmm0, (%rdx); leaq 16(%rdx), %rdx; jmp .Lctr_enc_blk1; .align 8 .Ldone_ctr_enc: vzeroall; xorl %r10d, %r10d; xorl %r11d, %r11d; - ret + ret_spec_stop CFI_ENDPROC(); ELF(.size _gcry_vaes_avx2_ctr_enc_amd64,.-_gcry_vaes_avx2_ctr_enc_amd64) /********************************************************************** Little-endian 32-bit CTR-mode encryption (GCM-SIV) **********************************************************************/ ELF(.type _gcry_vaes_avx2_ctr32le_enc_amd64,@function) .globl _gcry_vaes_avx2_ctr32le_enc_amd64 _gcry_vaes_avx2_ctr32le_enc_amd64: /* input: * %rdi: round keys * %rsi: counter * %rdx: dst * %rcx: src * %r8: nblocks * %r9: nrounds */ CFI_STARTPROC(); vbroadcasti128 (%rsi), %ymm15; // CTR /* Process 16 blocks per loop. */ .align 8 .Lctr32le_enc_blk16: cmpq $16, %r8; jb .Lctr32le_enc_blk8; leaq -16(%r8), %r8; vbroadcasti128 (0 * 16)(%rdi), %ymm8; /* Increment counters. */ vpaddd .Lle_addd_0 rRIP, %ymm15, %ymm0; vpaddd .Lle_addd_2 rRIP, %ymm15, %ymm1; vpaddd .Lle_addd_4 rRIP, %ymm15, %ymm2; vpaddd .Lle_addd_6 rRIP, %ymm15, %ymm3; vpaddd .Lle_addd_8 rRIP, %ymm15, %ymm4; vpaddd .Lle_addd_10 rRIP, %ymm15, %ymm5; vpaddd .Lle_addd_12 rRIP, %ymm15, %ymm6; vpaddd .Lle_addd_14 rRIP, %ymm15, %ymm7; vpaddd .Lle_addd_16_2 rRIP, %ymm15, %ymm15; /* AES rounds */ XOR8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); vbroadcasti128 (1 * 16)(%rdi), %ymm8; VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); vbroadcasti128 (2 * 16)(%rdi), %ymm8; VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); vbroadcasti128 (3 * 16)(%rdi), %ymm8; VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); vbroadcasti128 (4 * 16)(%rdi), %ymm8; VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); vbroadcasti128 (5 * 16)(%rdi), %ymm8; VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); vbroadcasti128 (6 * 16)(%rdi), %ymm8; VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); vbroadcasti128 (7 * 16)(%rdi), %ymm8; VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); vbroadcasti128 (8 * 16)(%rdi), %ymm8; VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); vbroadcasti128 (9 * 16)(%rdi), %ymm8; VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); vbroadcasti128 (10 * 16)(%rdi), %ymm8; cmpl $12, %r9d; jb .Lctr32le_enc_blk16_last; VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); vbroadcasti128 (11 * 16)(%rdi), %ymm8; VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); vbroadcasti128 (12 * 16)(%rdi), %ymm8; jz .Lctr32le_enc_blk16_last; VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); vbroadcasti128 (13 * 16)(%rdi), %ymm8; VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); vbroadcasti128 (14 * 16)(%rdi), %ymm8; /* Last round and output handling. */ .Lctr32le_enc_blk16_last: vpxor (0 * 16)(%rcx), %ymm8, %ymm9; /* Xor src to last round key. 
*/ vpxor (2 * 16)(%rcx), %ymm8, %ymm10; vpxor (4 * 16)(%rcx), %ymm8, %ymm11; vpxor (6 * 16)(%rcx), %ymm8, %ymm12; vaesenclast %ymm9, %ymm0, %ymm0; vaesenclast %ymm10, %ymm1, %ymm1; vaesenclast %ymm11, %ymm2, %ymm2; vaesenclast %ymm12, %ymm3, %ymm3; vpxor (8 * 16)(%rcx), %ymm8, %ymm9; vpxor (10 * 16)(%rcx), %ymm8, %ymm10; vpxor (12 * 16)(%rcx), %ymm8, %ymm11; vpxor (14 * 16)(%rcx), %ymm8, %ymm8; leaq (16 * 16)(%rcx), %rcx; vaesenclast %ymm9, %ymm4, %ymm4; vaesenclast %ymm10, %ymm5, %ymm5; vaesenclast %ymm11, %ymm6, %ymm6; vaesenclast %ymm8, %ymm7, %ymm7; vmovdqu %ymm0, (0 * 16)(%rdx); vmovdqu %ymm1, (2 * 16)(%rdx); vmovdqu %ymm2, (4 * 16)(%rdx); vmovdqu %ymm3, (6 * 16)(%rdx); vmovdqu %ymm4, (8 * 16)(%rdx); vmovdqu %ymm5, (10 * 16)(%rdx); vmovdqu %ymm6, (12 * 16)(%rdx); vmovdqu %ymm7, (14 * 16)(%rdx); leaq (16 * 16)(%rdx), %rdx; jmp .Lctr32le_enc_blk16; /* Handle trailing eight blocks. */ .align 8 .Lctr32le_enc_blk8: cmpq $8, %r8; jb .Lctr32le_enc_blk4; leaq -8(%r8), %r8; vbroadcasti128 (0 * 16)(%rdi), %ymm4; /* Increment counters. */ vpaddd .Lle_addd_0 rRIP, %ymm15, %ymm0; vpaddd .Lle_addd_2 rRIP, %ymm15, %ymm1; vpaddd .Lle_addd_4 rRIP, %ymm15, %ymm2; vpaddd .Lle_addd_6 rRIP, %ymm15, %ymm3; vpaddd .Lle_addd_8_2 rRIP, %ymm15, %ymm15; /* AES rounds */ XOR4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (1 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (2 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (3 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (4 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (5 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (6 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (7 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (8 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (9 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (10 * 16)(%rdi), %ymm4; cmpl $12, %r9d; jb .Lctr32le_enc_blk8_last; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (11 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (12 * 16)(%rdi), %ymm4; jz .Lctr32le_enc_blk8_last; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (13 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (14 * 16)(%rdi), %ymm4; /* Last round and output handling. */ .Lctr32le_enc_blk8_last: vpxor (0 * 16)(%rcx), %ymm4, %ymm5; /* Xor src to last round key. */ vpxor (2 * 16)(%rcx), %ymm4, %ymm6; vpxor (4 * 16)(%rcx), %ymm4, %ymm7; vpxor (6 * 16)(%rcx), %ymm4, %ymm4; leaq (8 * 16)(%rcx), %rcx; vaesenclast %ymm5, %ymm0, %ymm0; vaesenclast %ymm6, %ymm1, %ymm1; vaesenclast %ymm7, %ymm2, %ymm2; vaesenclast %ymm4, %ymm3, %ymm3; vmovdqu %ymm0, (0 * 16)(%rdx); vmovdqu %ymm1, (2 * 16)(%rdx); vmovdqu %ymm2, (4 * 16)(%rdx); vmovdqu %ymm3, (6 * 16)(%rdx); leaq (8 * 16)(%rdx), %rdx; /* Handle trailing four blocks. */ .align 8 .Lctr32le_enc_blk4: cmpq $4, %r8; jb .Lctr32le_enc_blk1; leaq -4(%r8), %r8; vbroadcasti128 (0 * 16)(%rdi), %ymm4; /* Increment counters. 
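 *
 * In this GCM-SIV style mode only the low 32-bit little-endian word of the
 * counter block is incremented, wrapping mod 2^32, which is why plain vpaddd
 * adds against the .Lle_addd_* tables are enough and no carry handling is
 * needed.  A portable C sketch of the counter update (illustrative only,
 * hypothetical name):
 *
 *   #include <stdint.h>
 *
 *   static void ctr32le_add(unsigned char ctr[16], uint32_t n)
 *   {
 *     uint32_t w = (uint32_t)ctr[0] | ((uint32_t)ctr[1] << 8)
 *                | ((uint32_t)ctr[2] << 16) | ((uint32_t)ctr[3] << 24);
 *     w += n;                              // wraps naturally mod 2^32
 *     ctr[0] = (unsigned char)w;
 *     ctr[1] = (unsigned char)(w >> 8);
 *     ctr[2] = (unsigned char)(w >> 16);
 *     ctr[3] = (unsigned char)(w >> 24);   // bytes 4..15 are left untouched
 *   }
 *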
*/ vpaddd .Lle_addd_0 rRIP, %ymm15, %ymm0; vpaddd .Lle_addd_2 rRIP, %ymm15, %ymm1; vpaddd .Lle_addd_4_2 rRIP, %ymm15, %ymm15; /* AES rounds */ XOR2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (1 * 16)(%rdi), %ymm4; VAESENC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (2 * 16)(%rdi), %ymm4; VAESENC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (3 * 16)(%rdi), %ymm4; VAESENC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (4 * 16)(%rdi), %ymm4; VAESENC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (5 * 16)(%rdi), %ymm4; VAESENC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (6 * 16)(%rdi), %ymm4; VAESENC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (7 * 16)(%rdi), %ymm4; VAESENC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (8 * 16)(%rdi), %ymm4; VAESENC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (9 * 16)(%rdi), %ymm4; VAESENC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (10 * 16)(%rdi), %ymm4; cmpl $12, %r9d; jb .Lctr32le_enc_blk4_last; VAESENC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (11 * 16)(%rdi), %ymm4; VAESENC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (12 * 16)(%rdi), %ymm4; jz .Lctr32le_enc_blk4_last; VAESENC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (13 * 16)(%rdi), %ymm4; VAESENC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (14 * 16)(%rdi), %ymm4; /* Last round and output handling. */ .Lctr32le_enc_blk4_last: vpxor (0 * 16)(%rcx), %ymm4, %ymm5; /* Xor src to last round key. */ vpxor (2 * 16)(%rcx), %ymm4, %ymm6; leaq (4 * 16)(%rcx), %rcx; vaesenclast %ymm5, %ymm0, %ymm0; vaesenclast %ymm6, %ymm1, %ymm1; vmovdqu %ymm0, (0 * 16)(%rdx); vmovdqu %ymm1, (2 * 16)(%rdx); leaq (4 * 16)(%rdx), %rdx; /* Process trailing one to three blocks, one per loop. */ .align 8 .Lctr32le_enc_blk1: cmpq $1, %r8; jb .Ldone_ctr32le_enc; leaq -1(%r8), %r8; /* Load and increment counter. */ vmovdqu %xmm15, %xmm0; vpaddd .Lle_addd_1 rRIP, %xmm15, %xmm15; /* AES rounds. */ vpxor (0 * 16)(%rdi), %xmm0, %xmm0; vaesenc (1 * 16)(%rdi), %xmm0, %xmm0; vaesenc (2 * 16)(%rdi), %xmm0, %xmm0; vaesenc (3 * 16)(%rdi), %xmm0, %xmm0; vaesenc (4 * 16)(%rdi), %xmm0, %xmm0; vaesenc (5 * 16)(%rdi), %xmm0, %xmm0; vaesenc (6 * 16)(%rdi), %xmm0, %xmm0; vaesenc (7 * 16)(%rdi), %xmm0, %xmm0; vaesenc (8 * 16)(%rdi), %xmm0, %xmm0; vaesenc (9 * 16)(%rdi), %xmm0, %xmm0; vmovdqa (10 * 16)(%rdi), %xmm1; cmpl $12, %r9d; jb .Lctr32le_enc_blk1_last; vaesenc %xmm1, %xmm0, %xmm0; vaesenc (11 * 16)(%rdi), %xmm0, %xmm0; vmovdqa (12 * 16)(%rdi), %xmm1; jz .Lctr32le_enc_blk1_last; vaesenc %xmm1, %xmm0, %xmm0; vaesenc (13 * 16)(%rdi), %xmm0, %xmm0; vmovdqa (14 * 16)(%rdi), %xmm1; /* Last round and output handling. */ .Lctr32le_enc_blk1_last: vpxor (%rcx), %xmm1, %xmm1; /* Xor src to last round key. */ leaq 16(%rcx), %rcx; vaesenclast %xmm1, %xmm0, %xmm0; /* Last round and xor with xmm1. 
*/ vmovdqu %xmm0, (%rdx); leaq 16(%rdx), %rdx; jmp .Lctr32le_enc_blk1; .align 8 .Ldone_ctr32le_enc: vmovdqu %xmm15, (%rsi); vzeroall; - ret + ret_spec_stop CFI_ENDPROC(); ELF(.size _gcry_vaes_avx2_ctr32le_enc_amd64,.-_gcry_vaes_avx2_ctr32le_enc_amd64) /********************************************************************** OCB-mode encryption/decryption **********************************************************************/ ELF(.type _gcry_vaes_avx2_ocb_checksum,@function) _gcry_vaes_avx2_ocb_checksum: /* input: * %rax: offset pointer * %r10: plaintext pointer * %r11: nblocks */ CFI_STARTPROC(); vpxor %xmm0, %xmm0, %xmm0; cmpq $4, %r11; jb .Locb_checksum_blk1; vpxor %xmm1, %xmm1, %xmm1; vpxor %xmm2, %xmm2, %xmm2; vpxor %xmm3, %xmm3, %xmm3; cmpq $16, %r11; jb .Locb_checksum_blk4; vpxor %xmm4, %xmm4, %xmm4; vpxor %xmm5, %xmm5, %xmm5; vpxor %xmm6, %xmm6, %xmm6; vpxor %xmm7, %xmm7, %xmm7; cmpq $32, %r11; jb .Locb_checksum_blk16; vpxor %xmm8, %xmm8, %xmm8; vpxor %xmm9, %xmm9, %xmm9; vpxor %xmm10, %xmm10, %xmm10; vpxor %xmm11, %xmm11, %xmm11; vpxor %xmm12, %xmm12, %xmm12; vpxor %xmm13, %xmm13, %xmm13; vpxor %xmm14, %xmm14, %xmm14; vpxor %xmm15, %xmm15, %xmm15; .align 8 .Locb_checksum_blk32: cmpq $32, %r11; jb .Locb_checksum_blk32_done; leaq -32(%r11), %r11; vpxor (0 * 16)(%r10), %ymm0, %ymm0; vpxor (2 * 16)(%r10), %ymm1, %ymm1; vpxor (4 * 16)(%r10), %ymm2, %ymm2; vpxor (6 * 16)(%r10), %ymm3, %ymm3; vpxor (8 * 16)(%r10), %ymm4, %ymm4; vpxor (10 * 16)(%r10), %ymm5, %ymm5; vpxor (12 * 16)(%r10), %ymm6, %ymm6; vpxor (14 * 16)(%r10), %ymm7, %ymm7; vpxor (16 * 16)(%r10), %ymm8, %ymm8; vpxor (18 * 16)(%r10), %ymm9, %ymm9; vpxor (20 * 16)(%r10), %ymm10, %ymm10; vpxor (22 * 16)(%r10), %ymm11, %ymm11; vpxor (24 * 16)(%r10), %ymm12, %ymm12; vpxor (26 * 16)(%r10), %ymm13, %ymm13; vpxor (28 * 16)(%r10), %ymm14, %ymm14; vpxor (30 * 16)(%r10), %ymm15, %ymm15; leaq (32 * 16)(%r10), %r10; jmp .Locb_checksum_blk32; .align 8 .Locb_checksum_blk32_done: vpxor %ymm8, %ymm0, %ymm0; vpxor %ymm9, %ymm1, %ymm1; vpxor %ymm10, %ymm2, %ymm2; vpxor %ymm11, %ymm3, %ymm3; vpxor %ymm12, %ymm4, %ymm4; vpxor %ymm13, %ymm5, %ymm5; vpxor %ymm14, %ymm6, %ymm6; vpxor %ymm15, %ymm7, %ymm7; .align 8 .Locb_checksum_blk16: cmpq $16, %r11; jb .Locb_checksum_blk16_done; leaq -16(%r11), %r11; vpxor (0 * 16)(%r10), %ymm0, %ymm0; vpxor (2 * 16)(%r10), %ymm1, %ymm1; vpxor (4 * 16)(%r10), %ymm2, %ymm2; vpxor (6 * 16)(%r10), %ymm3, %ymm3; vpxor (8 * 16)(%r10), %ymm4, %ymm4; vpxor (10 * 16)(%r10), %ymm5, %ymm5; vpxor (12 * 16)(%r10), %ymm6, %ymm6; vpxor (14 * 16)(%r10), %ymm7, %ymm7; leaq (16 * 16)(%r10), %r10; jmp .Locb_checksum_blk16; .align 8 .Locb_checksum_blk16_done: vpxor %ymm4, %ymm0, %ymm0; vpxor %ymm5, %ymm1, %ymm1; vpxor %ymm6, %ymm2, %ymm2; vpxor %ymm7, %ymm3, %ymm3; vextracti128 $1, %ymm0, %xmm4; vextracti128 $1, %ymm1, %xmm5; vextracti128 $1, %ymm2, %xmm6; vextracti128 $1, %ymm3, %xmm7; vpxor %xmm4, %xmm0, %xmm0; vpxor %xmm5, %xmm1, %xmm1; vpxor %xmm6, %xmm2, %xmm2; vpxor %xmm7, %xmm3, %xmm3; .align 8 .Locb_checksum_blk4: cmpq $4, %r11; jb .Locb_checksum_blk4_done; leaq -4(%r11), %r11; vpxor (0 * 16)(%r10), %xmm0, %xmm0; vpxor (1 * 16)(%r10), %xmm1, %xmm1; vpxor (2 * 16)(%r10), %xmm2, %xmm2; vpxor (3 * 16)(%r10), %xmm3, %xmm3; leaq (4 * 16)(%r10), %r10; jmp .Locb_checksum_blk4; .align 8 .Locb_checksum_blk4_done: vpxor %xmm1, %xmm0, %xmm0; vpxor %xmm3, %xmm2, %xmm2; vpxor %xmm2, %xmm0, %xmm0; .align 8 .Locb_checksum_blk1: cmpq $1, %r11; jb .Locb_checksum_done; leaq -1(%r11), %r11; vpxor (%r10), %xmm0, %xmm0; leaq 16(%r10), %r10; jmp 
.Locb_checksum_blk1; .align 8 .Locb_checksum_done: vpxor (%rax), %xmm0, %xmm0; vmovdqu %xmm0, (%rax); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_vaes_avx2_ocb_checksum,.-_gcry_vaes_avx2_ocb_checksum) ELF(.type _gcry_vaes_avx2_ocb_crypt_amd64,@function) .globl _gcry_vaes_avx2_ocb_crypt_amd64 _gcry_vaes_avx2_ocb_crypt_amd64: /* input: * %rdi: round keys * %esi: nblk * %rdx: dst * %rcx: src * %r8: nblocks * %r9: nrounds * 16(%rbp): offset * 24(%rbp): checksum * 32(%rbp): L-array * 40(%rbp): encrypt (%r15d) */ CFI_STARTPROC(); #define STACK_REGS_POS (16 * 16 + 4 * 16) #define STACK_ALLOC (STACK_REGS_POS + 6 * 8) pushq %rbp; CFI_PUSH(%rbp); movq %rsp, %rbp; CFI_DEF_CFA_REGISTER(%rbp); subq $STACK_ALLOC, %rsp; andq $~63, %rsp; movq %r12, (STACK_REGS_POS + 0 * 8)(%rsp); CFI_REG_ON_STACK(r12, STACK_REGS_POS + 0 * 8); movq %r13, (STACK_REGS_POS + 1 * 8)(%rsp); CFI_REG_ON_STACK(r13, STACK_REGS_POS + 1 * 8); movq %r14, (STACK_REGS_POS + 2 * 8)(%rsp); CFI_REG_ON_STACK(r14, STACK_REGS_POS + 2 * 8); movq %r15, (STACK_REGS_POS + 3 * 8)(%rsp); CFI_REG_ON_STACK(r15, STACK_REGS_POS + 3 * 8); movl 40(%rbp), %r15d; /* encrypt-flag. */ movq 16(%rbp), %r14; /* offset ptr. */ /* Handle encryption checksumming. */ testl %r15d, %r15d; jz .Locb_dec_checksum_prepare; movq 24(%rbp), %rax; /* checksum ptr. */ movq %rcx, %r10; movq %r8, %r11; call _gcry_vaes_avx2_ocb_checksum; jmp .Locb_enc_checksum_done; .Locb_dec_checksum_prepare: /* Store plaintext address and number of blocks for decryption * checksumming. */ movq %rdx, (STACK_REGS_POS + 4 * 8)(%rsp); movq %r8, (STACK_REGS_POS + 5 * 8)(%rsp); .Locb_enc_checksum_done: vmovdqu (%r14), %xmm15; /* Load offset. */ movq 32(%rbp), %r14; /* L-array ptr. */ vmovdqa (0 * 16)(%rdi), %xmm0; /* first key */ movl $(10 * 16), %eax; cmpl $12, %r9d; jb .Llast_key_ptr; movl $(12 * 16), %eax; je .Llast_key_ptr; movl $(14 * 16), %eax; .align 8 .Llast_key_ptr: vpxor (%rdi, %rax), %xmm0, %xmm0; /* first key ^ last key */ vpxor (0 * 16)(%rdi), %xmm15, %xmm15; /* offset ^ first key */ vmovdqa %xmm0, (14 * 16)(%rsp); vmovdqa %xmm0, (15 * 16)(%rsp); .align 8 .Lhandle_unaligned_ocb: /* Get number of blocks to align nblk to 16 (and L-array optimization). */ movl %esi, %r10d; negl %r10d; andl $15, %r10d; cmpq %r8, %r10; cmovaq %r8, %r10; cmpq $1, %r10; jb .Lunaligned_ocb_done; /* Number of blocks after alignment. */ movq %r8, %r11; subq %r10, %r11; /* If number after alignment is less than 16, skip aligned handling * completely. */ cmp $16, %r11; cmovbq %r8, %r10; /* Unaligned: Process eight blocks per loop. 
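 *
 * Each OCB block is whitened with a running offset that absorbs one L-value
 * per block: Offset_i = Offset_{i-1} ^ L[ntz(i)], and for encryption
 * C_i = Offset_i ^ E_K(P_i ^ Offset_i), while the checksum accumulates the
 * XOR of the plaintext blocks (already folded in above by the
 * _gcry_vaes_avx2_ocb_checksum call for the encryption direction).  The
 * tzcnt/shll pairs below compute ntz(nblk + j) * 16 as the byte index into
 * the L-array.  A per-block C sketch (illustrative only; aes_ecb_encrypt()
 * and ntz() are hypothetical helpers, not the libgcrypt API):
 *
 *   static void ocb_enc_block(const unsigned char p[16], unsigned char c[16],
 *                             unsigned char offset[16],
 *                             const unsigned char (*L)[16], uint64_t i)
 *   {
 *     const unsigned char *Li = L[ntz(i)];
 *     unsigned char t[16];
 *     for (int j = 0; j < 16; j++)
 *       {
 *         offset[j] ^= Li[j];          // Offset_i = Offset_{i-1} ^ L[ntz(i)]
 *         t[j] = p[j] ^ offset[j];     // whiten the input block
 *       }
 *     aes_ecb_encrypt(t, t);           // raw single-block AES encryption
 *     for (int j = 0; j < 16; j++)
 *       c[j] = t[j] ^ offset[j];       // C_i = E_K(P_i ^ Offset_i) ^ Offset_i
 *   }
 *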
*/ .align 8 .Locb_unaligned_blk8: cmpq $8, %r10; jb .Locb_unaligned_blk4; leaq -8(%r8), %r8; leaq -8(%r10), %r10; leal 1(%esi), %r11d; leal 2(%esi), %r12d; leal 3(%esi), %r13d; leal 4(%esi), %eax; tzcntl %r11d, %r11d; tzcntl %r12d, %r12d; tzcntl %r13d, %r13d; tzcntl %eax, %eax; shll $4, %r11d; shll $4, %r12d; shll $4, %r13d; shll $4, %eax; vpxor (%r14, %r11), %xmm15, %xmm5; vpxor (%r14, %r12), %xmm5, %xmm6; vpxor (%r14, %r13), %xmm6, %xmm7; vpxor (%r14, %rax), %xmm7, %xmm8; leal 5(%esi), %r11d; leal 6(%esi), %r12d; leal 7(%esi), %r13d; leal 8(%esi), %esi; tzcntl %r11d, %r11d; tzcntl %r12d, %r12d; tzcntl %r13d, %r13d; tzcntl %esi, %eax; shll $4, %r11d; shll $4, %r12d; shll $4, %r13d; shll $4, %eax; vpxor (%r14, %r11), %xmm8, %xmm9; vpxor (%r14, %r12), %xmm9, %xmm10; vpxor (%r14, %r13), %xmm10, %xmm11; vpxor (%r14, %rax), %xmm11, %xmm15; vinserti128 $1, %xmm6, %ymm5, %ymm5; vinserti128 $1, %xmm8, %ymm7, %ymm6; vinserti128 $1, %xmm10, %ymm9, %ymm7; vinserti128 $1, %xmm15, %ymm11, %ymm8; vpxor (0 * 16)(%rcx), %ymm5, %ymm0; vpxor (2 * 16)(%rcx), %ymm6, %ymm1; vpxor (4 * 16)(%rcx), %ymm7, %ymm2; vpxor (6 * 16)(%rcx), %ymm8, %ymm3; leaq (8 * 16)(%rcx), %rcx; vmovdqa (14 * 16)(%rsp), %ymm9; testl %r15d, %r15d; jz .Locb_unaligned_blk8_dec; /* AES rounds */ vbroadcasti128 (1 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (2 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (3 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (4 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (5 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (6 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (7 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (8 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (9 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); cmpl $12, %r9d; jb .Locb_unaligned_blk8_enc_last; vbroadcasti128 (10 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (11 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); jz .Locb_unaligned_blk8_enc_last; vbroadcasti128 (12 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (13 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); /* Last round and output handling. */ .Locb_unaligned_blk8_enc_last: vpxor %ymm5, %ymm9, %ymm5; /* Xor src to last round key. 
*/ vpxor %ymm6, %ymm9, %ymm6; vpxor %ymm7, %ymm9, %ymm7; vpxor %ymm8, %ymm9, %ymm4; vaesenclast %ymm5, %ymm0, %ymm0; vaesenclast %ymm6, %ymm1, %ymm1; vaesenclast %ymm7, %ymm2, %ymm2; vaesenclast %ymm4, %ymm3, %ymm3; vmovdqu %ymm0, (0 * 16)(%rdx); vmovdqu %ymm1, (2 * 16)(%rdx); vmovdqu %ymm2, (4 * 16)(%rdx); vmovdqu %ymm3, (6 * 16)(%rdx); leaq (8 * 16)(%rdx), %rdx; jmp .Locb_unaligned_blk8; .align 8 .Locb_unaligned_blk8_dec: /* AES rounds */ vbroadcasti128 (1 * 16)(%rdi), %ymm4; VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (2 * 16)(%rdi), %ymm4; VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (3 * 16)(%rdi), %ymm4; VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (4 * 16)(%rdi), %ymm4; VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (5 * 16)(%rdi), %ymm4; VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (6 * 16)(%rdi), %ymm4; VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (7 * 16)(%rdi), %ymm4; VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (8 * 16)(%rdi), %ymm4; VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (9 * 16)(%rdi), %ymm4; VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); cmpl $12, %r9d; jb .Locb_unaligned_blk8_dec_last; vbroadcasti128 (10 * 16)(%rdi), %ymm4; VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (11 * 16)(%rdi), %ymm4; VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); jz .Locb_unaligned_blk8_dec_last; vbroadcasti128 (12 * 16)(%rdi), %ymm4; VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (13 * 16)(%rdi), %ymm4; VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); /* Last round and output handling. */ .Locb_unaligned_blk8_dec_last: vpxor %ymm5, %ymm9, %ymm5; /* Xor src to last round key. */ vpxor %ymm6, %ymm9, %ymm6; vpxor %ymm7, %ymm9, %ymm7; vpxor %ymm8, %ymm9, %ymm4; vaesdeclast %ymm5, %ymm0, %ymm0; vaesdeclast %ymm6, %ymm1, %ymm1; vaesdeclast %ymm7, %ymm2, %ymm2; vaesdeclast %ymm4, %ymm3, %ymm3; vmovdqu %ymm0, (0 * 16)(%rdx); vmovdqu %ymm1, (2 * 16)(%rdx); vmovdqu %ymm2, (4 * 16)(%rdx); vmovdqu %ymm3, (6 * 16)(%rdx); leaq (8 * 16)(%rdx), %rdx; jmp .Locb_unaligned_blk8; /* Unaligned: Process four blocks. 
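 *
 * Note that %xmm15 carries (offset ^ first round key) and the stack slot at
 * (14 * 16)(%rsp) carries (first ^ last round key), so XORing the two, as the
 * *_last blocks here do, yields (offset ^ last round key): the OCB whitening
 * is merged into the initial and final AddRoundKey steps.  Sketch with AES-NI
 * intrinsics from <immintrin.h> (illustrative only; K[], nrounds, offset and
 * plain are assumed inputs):
 *
 *   __m128i masked = _mm_xor_si128(offset, K[0]);      // offset ^ K[0]
 *   __m128i state  = _mm_xor_si128(plain, masked);     // P ^ offset ^ K[0]
 *   for (int r = 1; r < nrounds; r++)
 *     state = _mm_aesenc_si128(state, K[r]);
 *   __m128i fold   = _mm_xor_si128(masked, _mm_xor_si128(K[0], K[nrounds]));
 *   __m128i out    = _mm_aesenclast_si128(state, fold); // E_K(P^offset) ^ offset
 *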
*/ .align 8 .Locb_unaligned_blk4: cmpq $4, %r10; jb .Locb_unaligned_blk1; leaq -4(%r8), %r8; leaq -4(%r10), %r10; leal 1(%esi), %r11d; leal 2(%esi), %r12d; leal 3(%esi), %r13d; leal 4(%esi), %esi; tzcntl %r11d, %r11d; tzcntl %r12d, %r12d; tzcntl %r13d, %r13d; tzcntl %esi, %eax; shll $4, %r11d; shll $4, %r12d; shll $4, %r13d; shll $4, %eax; vpxor (%r14, %r11), %xmm15, %xmm5; vpxor (%r14, %r12), %xmm5, %xmm6; vinserti128 $1, %xmm6, %ymm5, %ymm5; vpxor (%r14, %r13), %xmm6, %xmm7; vpxor (%r14, %rax), %xmm7, %xmm15; vinserti128 $1, %xmm15, %ymm7, %ymm6; vpxor (0 * 16)(%rcx), %ymm5, %ymm0; vpxor (2 * 16)(%rcx), %ymm6, %ymm1; leaq (4 * 16)(%rcx), %rcx; testl %r15d, %r15d; jz .Locb_unaligned_blk4_dec; /* AES rounds */ vbroadcasti128 (1 * 16)(%rdi), %ymm4; VAESENC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (2 * 16)(%rdi), %ymm4; VAESENC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (3 * 16)(%rdi), %ymm4; VAESENC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (4 * 16)(%rdi), %ymm4; VAESENC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (5 * 16)(%rdi), %ymm4; VAESENC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (6 * 16)(%rdi), %ymm4; VAESENC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (7 * 16)(%rdi), %ymm4; VAESENC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (8 * 16)(%rdi), %ymm4; VAESENC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (9 * 16)(%rdi), %ymm4; VAESENC2(%ymm4, %ymm0, %ymm1); cmpl $12, %r9d; jb .Locb_unaligned_blk4_enc_last; vbroadcasti128 (10 * 16)(%rdi), %ymm4; VAESENC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (11 * 16)(%rdi), %ymm4; VAESENC2(%ymm4, %ymm0, %ymm1); jz .Locb_unaligned_blk4_enc_last; vbroadcasti128 (12 * 16)(%rdi), %ymm4; VAESENC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (13 * 16)(%rdi), %ymm4; VAESENC2(%ymm4, %ymm0, %ymm1); /* Last round and output handling. */ .Locb_unaligned_blk4_enc_last: vmovdqa (14 * 16)(%rsp), %ymm8; vpxor %ymm5, %ymm8, %ymm5; /* Xor src to last round key. */ vpxor %ymm6, %ymm8, %ymm6; vaesenclast %ymm5, %ymm0, %ymm0; vaesenclast %ymm6, %ymm1, %ymm1; vmovdqu %ymm0, (0 * 16)(%rdx); vmovdqu %ymm1, (2 * 16)(%rdx); leaq (4 * 16)(%rdx), %rdx; jmp .Locb_unaligned_blk1; .align 8 .Locb_unaligned_blk4_dec: /* AES rounds */ vbroadcasti128 (1 * 16)(%rdi), %ymm4; VAESDEC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (2 * 16)(%rdi), %ymm4; VAESDEC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (3 * 16)(%rdi), %ymm4; VAESDEC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (4 * 16)(%rdi), %ymm4; VAESDEC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (5 * 16)(%rdi), %ymm4; VAESDEC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (6 * 16)(%rdi), %ymm4; VAESDEC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (7 * 16)(%rdi), %ymm4; VAESDEC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (8 * 16)(%rdi), %ymm4; VAESDEC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (9 * 16)(%rdi), %ymm4; VAESDEC2(%ymm4, %ymm0, %ymm1); cmpl $12, %r9d; jb .Locb_unaligned_blk4_dec_last; vbroadcasti128 (10 * 16)(%rdi), %ymm4; VAESDEC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (11 * 16)(%rdi), %ymm4; VAESDEC2(%ymm4, %ymm0, %ymm1); jz .Locb_unaligned_blk4_dec_last; vbroadcasti128 (12 * 16)(%rdi), %ymm4; VAESDEC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (13 * 16)(%rdi), %ymm4; VAESDEC2(%ymm4, %ymm0, %ymm1); /* Last round and output handling. */ .Locb_unaligned_blk4_dec_last: vmovdqa (14 * 16)(%rsp), %ymm8; vpxor %ymm5, %ymm8, %ymm5; /* Xor src to last round key. */ vpxor %ymm6, %ymm8, %ymm6; vaesdeclast %ymm5, %ymm0, %ymm0; vaesdeclast %ymm6, %ymm1, %ymm1; vmovdqu %ymm0, (0 * 16)(%rdx); vmovdqu %ymm1, (2 * 16)(%rdx); leaq (4 * 16)(%rdx), %rdx; /* Unaligned: Process one block per loop. 
*/ .align 8 .Locb_unaligned_blk1: cmpq $1, %r10; jb .Lunaligned_ocb_done; leaq -1(%r8), %r8; leaq -1(%r10), %r10; leal 1(%esi), %esi; tzcntl %esi, %r11d; shll $4, %r11d; vpxor (%r14, %r11), %xmm15, %xmm15; vpxor (%rcx), %xmm15, %xmm0; leaq 16(%rcx), %rcx; testl %r15d, %r15d; jz .Locb_unaligned_blk1_dec; /* AES rounds. */ vaesenc (1 * 16)(%rdi), %xmm0, %xmm0; vaesenc (2 * 16)(%rdi), %xmm0, %xmm0; vaesenc (3 * 16)(%rdi), %xmm0, %xmm0; vaesenc (4 * 16)(%rdi), %xmm0, %xmm0; vaesenc (5 * 16)(%rdi), %xmm0, %xmm0; vaesenc (6 * 16)(%rdi), %xmm0, %xmm0; vaesenc (7 * 16)(%rdi), %xmm0, %xmm0; vaesenc (8 * 16)(%rdi), %xmm0, %xmm0; vaesenc (9 * 16)(%rdi), %xmm0, %xmm0; cmpl $12, %r9d; jb .Locb_unaligned_blk1_enc_last; vaesenc (10 * 16)(%rdi), %xmm0, %xmm0; vaesenc (11 * 16)(%rdi), %xmm0, %xmm0; jz .Locb_unaligned_blk1_enc_last; vaesenc (12 * 16)(%rdi), %xmm0, %xmm0; vaesenc (13 * 16)(%rdi), %xmm0, %xmm0; /* Last round and output handling. */ .Locb_unaligned_blk1_enc_last: vpxor (14 * 16)(%rsp), %xmm15, %xmm1; vaesenclast %xmm1, %xmm0, %xmm0; vmovdqu %xmm0, (%rdx); leaq 16(%rdx), %rdx; jmp .Locb_unaligned_blk1; .align 8 .Locb_unaligned_blk1_dec: /* AES rounds. */ vaesdec (1 * 16)(%rdi), %xmm0, %xmm0; vaesdec (2 * 16)(%rdi), %xmm0, %xmm0; vaesdec (3 * 16)(%rdi), %xmm0, %xmm0; vaesdec (4 * 16)(%rdi), %xmm0, %xmm0; vaesdec (5 * 16)(%rdi), %xmm0, %xmm0; vaesdec (6 * 16)(%rdi), %xmm0, %xmm0; vaesdec (7 * 16)(%rdi), %xmm0, %xmm0; vaesdec (8 * 16)(%rdi), %xmm0, %xmm0; vaesdec (9 * 16)(%rdi), %xmm0, %xmm0; cmpl $12, %r9d; jb .Locb_unaligned_blk1_dec_last; vaesdec (10 * 16)(%rdi), %xmm0, %xmm0; vaesdec (11 * 16)(%rdi), %xmm0, %xmm0; jz .Locb_unaligned_blk1_dec_last; vaesdec (12 * 16)(%rdi), %xmm0, %xmm0; vaesdec (13 * 16)(%rdi), %xmm0, %xmm0; /* Last round and output handling. */ .Locb_unaligned_blk1_dec_last: vpxor (14 * 16)(%rsp), %xmm15, %xmm1; vaesdeclast %xmm1, %xmm0, %xmm0; vmovdqu %xmm0, (%rdx); leaq 16(%rdx), %rdx; jmp .Locb_unaligned_blk1; .align 8 .Lunaligned_ocb_done: cmpq $1, %r8; jb .Ldone_ocb; /* Short buffers do not benefit from L-array optimization. */ movq %r8, %r10; cmpq $16, %r8; jb .Locb_unaligned_blk8; vinserti128 $1, %xmm15, %ymm15, %ymm15; /* Prepare L-array optimization. 
* Since nblk is aligned to 16, offsets will have following * construction: * - block1 = ntz{0} = offset ^ L[0] * - block2 = ntz{1} = offset ^ L[0] ^ L[1] * - block3 = ntz{0} = offset ^ L[1] * - block4 = ntz{2} = offset ^ L[1] ^ L[2] * - block5 = ntz{0} = offset ^ L[0] ^ L[1] ^ L[2] * - block6 = ntz{1} = offset ^ L[0] ^ L[2] * - block7 = ntz{0} = offset ^ L[2] * - block8 = ntz{3} = offset ^ L[2] ^ L[3] * - block9 = ntz{0} = offset ^ L[0] ^ L[2] ^ L[3] * - block10 = ntz{1} = offset ^ L[0] ^ L[1] ^ L[2] ^ L[3] * - block11 = ntz{0} = offset ^ L[1] ^ L[2] ^ L[3] * - block12 = ntz{2} = offset ^ L[1] ^ L[3] * - block13 = ntz{0} = offset ^ L[0] ^ L[1] ^ L[3] * - block14 = ntz{1} = offset ^ L[0] ^ L[3] * - block15 = ntz{0} = offset ^ L[3] * - block16 = ntz{x} = offset ^ L[3] ^ L[ntz{x}] */ vmovdqu (0 * 16)(%r14), %xmm0; vmovdqu (1 * 16)(%r14), %xmm1; vmovdqu (2 * 16)(%r14), %xmm2; vmovdqu (3 * 16)(%r14), %xmm3; vpxor %xmm0, %xmm1, %xmm4; /* L[0] ^ L[1] */ vpxor %xmm0, %xmm2, %xmm5; /* L[0] ^ L[2] */ vpxor %xmm0, %xmm3, %xmm6; /* L[0] ^ L[3] */ vpxor %xmm1, %xmm2, %xmm7; /* L[1] ^ L[2] */ vpxor %xmm1, %xmm3, %xmm8; /* L[1] ^ L[3] */ vpxor %xmm2, %xmm3, %xmm9; /* L[2] ^ L[3] */ vpxor %xmm4, %xmm2, %xmm10; /* L[0] ^ L[1] ^ L[2] */ vpxor %xmm5, %xmm3, %xmm11; /* L[0] ^ L[2] ^ L[3] */ vpxor %xmm7, %xmm3, %xmm12; /* L[1] ^ L[2] ^ L[3] */ vpxor %xmm0, %xmm8, %xmm13; /* L[0] ^ L[1] ^ L[3] */ vpxor %xmm4, %xmm9, %xmm14; /* L[0] ^ L[1] ^ L[2] ^ L[3] */ vinserti128 $1, %xmm4, %ymm0, %ymm0; vinserti128 $1, %xmm7, %ymm1, %ymm1; vinserti128 $1, %xmm5, %ymm10, %ymm10; vinserti128 $1, %xmm9, %ymm2, %ymm2; vinserti128 $1, %xmm14, %ymm11, %ymm11; vinserti128 $1, %xmm8, %ymm12, %ymm12; vinserti128 $1, %xmm6, %ymm13, %ymm13; vmovdqa %ymm0, (0 * 16)(%rsp); vmovdqa %ymm1, (2 * 16)(%rsp); vmovdqa %ymm10, (4 * 16)(%rsp); vmovdqa %ymm2, (6 * 16)(%rsp); vmovdqa %ymm11, (8 * 16)(%rsp); vmovdqa %ymm12, (10 * 16)(%rsp); vmovdqa %ymm13, (12 * 16)(%rsp); /* Aligned: Process 16 blocks per loop. 
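 *
 * Because the masks listed above depend only on the block position within a
 * 16-aligned group (ntz(nblk + j) = ntz(j) for j = 1..15 when 16 divides
 * nblk), they can be precomputed once from L[0..3], which is what the stack
 * table built above holds; only the 16th block's mask still needs a runtime
 * tzcnt.  A C sketch of that precomputation (illustrative only; L is the
 * hypothetical L-array, gcc/clang __builtin_ctz assumed):
 *
 *   #include <string.h>
 *
 *   unsigned char mask[15][16];          // cumulative L-xor for blocks 1..15
 *   memset(mask[0], 0, 16);
 *   for (int j = 1; j <= 15; j++)
 *     {
 *       if (j > 1)
 *         memcpy(mask[j - 1], mask[j - 2], 16);
 *       const unsigned char *Lt = L[__builtin_ctz(j)];
 *       for (int b = 0; b < 16; b++)
 *         mask[j - 1][b] ^= Lt[b];
 *     }
 *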
*/ .align 8 .Locb_aligned_blk16: cmpq $16, %r8; jb .Locb_aligned_blk8; leaq -16(%r8), %r8; leal 16(%esi), %esi; tzcntl %esi, %eax; shll $4, %eax; vpxor (0 * 16)(%rsp), %ymm15, %ymm8; vpxor (2 * 16)(%rsp), %ymm15, %ymm9; vpxor (4 * 16)(%rsp), %ymm15, %ymm10; vpxor (6 * 16)(%rsp), %ymm15, %ymm11; vpxor (8 * 16)(%rsp), %ymm15, %ymm12; vpxor (3 * 16)(%r14), %xmm15, %xmm13; /* offset ^ first key ^ L[3] */ vpxor (%r14, %rax), %xmm13, %xmm14; /* offset ^ first key ^ L[3] ^ L[ntz{nblk+16}] */ vinserti128 $1, %xmm14, %ymm13, %ymm14; vpxor (10 * 16)(%rsp), %ymm15, %ymm13; vpxor (14 * 16)(%rcx), %ymm14, %ymm7; vpxor (0 * 16)(%rcx), %ymm8, %ymm0; vpxor (2 * 16)(%rcx), %ymm9, %ymm1; vpxor (4 * 16)(%rcx), %ymm10, %ymm2; vpxor (6 * 16)(%rcx), %ymm11, %ymm3; vpxor (8 * 16)(%rcx), %ymm12, %ymm4; vpxor (10 * 16)(%rcx), %ymm13, %ymm5; vmovdqa %ymm13, (16 * 16)(%rsp); vpxor (12 * 16)(%rsp), %ymm15, %ymm13; vpxor (12 * 16)(%rcx), %ymm13, %ymm6; vmovdqa %ymm13, (18 * 16)(%rsp); leaq (16 * 16)(%rcx), %rcx; vperm2i128 $0x11, %ymm14, %ymm14, %ymm15; testl %r15d, %r15d; jz .Locb_aligned_blk16_dec; /* AES rounds */ vbroadcasti128 (1 * 16)(%rdi), %ymm13; VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); vbroadcasti128 (2 * 16)(%rdi), %ymm13; VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); vbroadcasti128 (3 * 16)(%rdi), %ymm13; VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); vbroadcasti128 (4 * 16)(%rdi), %ymm13; VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); vbroadcasti128 (5 * 16)(%rdi), %ymm13; VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); vbroadcasti128 (6 * 16)(%rdi), %ymm13; VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); vbroadcasti128 (7 * 16)(%rdi), %ymm13; VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); vbroadcasti128 (8 * 16)(%rdi), %ymm13; VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); vbroadcasti128 (9 * 16)(%rdi), %ymm13; VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); cmpl $12, %r9d; jb .Locb_aligned_blk16_enc_last; vbroadcasti128 (10 * 16)(%rdi), %ymm13; VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); vbroadcasti128 (11 * 16)(%rdi), %ymm13; VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); jz .Locb_aligned_blk16_enc_last; vbroadcasti128 (12 * 16)(%rdi), %ymm13; VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); vbroadcasti128 (13 * 16)(%rdi), %ymm13; VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); /* Last round and output handling. 
*/ .Locb_aligned_blk16_enc_last: vmovdqa (14 * 16)(%rsp), %ymm13; vpxor %ymm8, %ymm13, %ymm8; vpxor %ymm9, %ymm13, %ymm9; vpxor %ymm10, %ymm13, %ymm10; vpxor %ymm11, %ymm13, %ymm11; vaesenclast %ymm8, %ymm0, %ymm0; vaesenclast %ymm9, %ymm1, %ymm1; vaesenclast %ymm10, %ymm2, %ymm2; vaesenclast %ymm11, %ymm3, %ymm3; vpxor %ymm12, %ymm13, %ymm12; vpxor (16 * 16)(%rsp), %ymm13, %ymm8; vpxor (18 * 16)(%rsp), %ymm13, %ymm9; vpxor %ymm14, %ymm13, %ymm13; vaesenclast %ymm12, %ymm4, %ymm4; vaesenclast %ymm8, %ymm5, %ymm5; vaesenclast %ymm9, %ymm6, %ymm6; vaesenclast %ymm13, %ymm7, %ymm7; vmovdqu %ymm0, (0 * 16)(%rdx); vmovdqu %ymm1, (2 * 16)(%rdx); vmovdqu %ymm2, (4 * 16)(%rdx); vmovdqu %ymm3, (6 * 16)(%rdx); vmovdqu %ymm4, (8 * 16)(%rdx); vmovdqu %ymm5, (10 * 16)(%rdx); vmovdqu %ymm6, (12 * 16)(%rdx); vmovdqu %ymm7, (14 * 16)(%rdx); leaq (16 * 16)(%rdx), %rdx; jmp .Locb_aligned_blk16; .align 8 .Locb_aligned_blk16_dec: /* AES rounds */ vbroadcasti128 (1 * 16)(%rdi), %ymm13; VAESDEC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); vbroadcasti128 (2 * 16)(%rdi), %ymm13; VAESDEC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); vbroadcasti128 (3 * 16)(%rdi), %ymm13; VAESDEC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); vbroadcasti128 (4 * 16)(%rdi), %ymm13; VAESDEC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); vbroadcasti128 (5 * 16)(%rdi), %ymm13; VAESDEC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); vbroadcasti128 (6 * 16)(%rdi), %ymm13; VAESDEC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); vbroadcasti128 (7 * 16)(%rdi), %ymm13; VAESDEC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); vbroadcasti128 (8 * 16)(%rdi), %ymm13; VAESDEC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); vbroadcasti128 (9 * 16)(%rdi), %ymm13; VAESDEC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); cmpl $12, %r9d; jb .Locb_aligned_blk16_dec_last; vbroadcasti128 (10 * 16)(%rdi), %ymm13; VAESDEC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); vbroadcasti128 (11 * 16)(%rdi), %ymm13; VAESDEC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); jz .Locb_aligned_blk16_dec_last; vbroadcasti128 (12 * 16)(%rdi), %ymm13; VAESDEC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); vbroadcasti128 (13 * 16)(%rdi), %ymm13; VAESDEC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); /* Last round and output handling. */ .Locb_aligned_blk16_dec_last: vmovdqa (14 * 16)(%rsp), %ymm13; vpxor %ymm8, %ymm13, %ymm8; vpxor %ymm9, %ymm13, %ymm9; vpxor %ymm10, %ymm13, %ymm10; vpxor %ymm11, %ymm13, %ymm11; vaesdeclast %ymm8, %ymm0, %ymm0; vaesdeclast %ymm9, %ymm1, %ymm1; vaesdeclast %ymm10, %ymm2, %ymm2; vaesdeclast %ymm11, %ymm3, %ymm3; vpxor %ymm12, %ymm13, %ymm12; vpxor (16 * 16)(%rsp), %ymm13, %ymm8; vpxor (18 * 16)(%rsp), %ymm13, %ymm9; vpxor %ymm14, %ymm13, %ymm13; vaesdeclast %ymm12, %ymm4, %ymm4; vaesdeclast %ymm8, %ymm5, %ymm5; vaesdeclast %ymm9, %ymm6, %ymm6; vaesdeclast %ymm13, %ymm7, %ymm7; vmovdqu %ymm0, (0 * 16)(%rdx); vmovdqu %ymm1, (2 * 16)(%rdx); vmovdqu %ymm2, (4 * 16)(%rdx); vmovdqu %ymm3, (6 * 16)(%rdx); vmovdqu %ymm4, (8 * 16)(%rdx); vmovdqu %ymm5, (10 * 16)(%rdx); vmovdqu %ymm6, (12 * 16)(%rdx); vmovdqu %ymm7, (14 * 16)(%rdx); leaq (16 * 16)(%rdx), %rdx; jmp .Locb_aligned_blk16; /* Aligned: Process trailing eight blocks. 
*/ .align 8 .Locb_aligned_blk8: cmpq $8, %r8; jb .Locb_aligned_done; leaq -8(%r8), %r8; leal 8(%esi), %esi; tzcntl %esi, %eax; shll $4, %eax; vpxor (0 * 16)(%rsp), %ymm15, %ymm5; vpxor (2 * 16)(%rsp), %ymm15, %ymm6; vpxor (4 * 16)(%rsp), %ymm15, %ymm7; vpxor (2 * 16)(%r14), %xmm15, %xmm13; /* offset ^ first key ^ L[2] */ vpxor (%r14, %rax), %xmm13, %xmm14; /* offset ^ first key ^ L[2] ^ L[ntz{nblk+8}] */ vinserti128 $1, %xmm14, %ymm13, %ymm14; vpxor (0 * 16)(%rcx), %ymm5, %ymm0; vpxor (2 * 16)(%rcx), %ymm6, %ymm1; vpxor (4 * 16)(%rcx), %ymm7, %ymm2; vpxor (6 * 16)(%rcx), %ymm14, %ymm3; leaq (8 * 16)(%rcx), %rcx; vperm2i128 $0x11, %ymm14, %ymm14, %ymm15; vmovdqa (14 * 16)(%rsp), %ymm8; testl %r15d, %r15d; jz .Locb_aligned_blk8_dec; /* AES rounds */ vbroadcasti128 (1 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (2 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (3 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (4 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (5 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (6 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (7 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (8 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (9 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); cmpl $12, %r9d; jb .Locb_aligned_blk8_enc_last; vbroadcasti128 (10 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (11 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); jz .Locb_aligned_blk8_enc_last; vbroadcasti128 (12 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (13 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); /* Last round and output handling. 
*/ .Locb_aligned_blk8_enc_last: vpxor %ymm5, %ymm8, %ymm5; vpxor %ymm6, %ymm8, %ymm6; vpxor %ymm7, %ymm8, %ymm7; vpxor %ymm14, %ymm8, %ymm4; vaesenclast %ymm5, %ymm0, %ymm0; vaesenclast %ymm6, %ymm1, %ymm1; vaesenclast %ymm7, %ymm2, %ymm2; vaesenclast %ymm4, %ymm3, %ymm3; vmovdqu %ymm0, (0 * 16)(%rdx); vmovdqu %ymm1, (2 * 16)(%rdx); vmovdqu %ymm2, (4 * 16)(%rdx); vmovdqu %ymm3, (6 * 16)(%rdx); leaq (8 * 16)(%rdx), %rdx; jmp .Locb_aligned_done; .align 8 .Locb_aligned_blk8_dec: /* AES rounds */ vbroadcasti128 (1 * 16)(%rdi), %ymm4; VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (2 * 16)(%rdi), %ymm4; VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (3 * 16)(%rdi), %ymm4; VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (4 * 16)(%rdi), %ymm4; VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (5 * 16)(%rdi), %ymm4; VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (6 * 16)(%rdi), %ymm4; VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (7 * 16)(%rdi), %ymm4; VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (8 * 16)(%rdi), %ymm4; VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (9 * 16)(%rdi), %ymm4; VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); cmpl $12, %r9d; jb .Locb_aligned_blk8_dec_last; vbroadcasti128 (10 * 16)(%rdi), %ymm4; VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (11 * 16)(%rdi), %ymm4; VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); jz .Locb_aligned_blk8_dec_last; vbroadcasti128 (12 * 16)(%rdi), %ymm4; VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (13 * 16)(%rdi), %ymm4; VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (14 * 16)(%rdi), %ymm4; /* Last round and output handling. */ .Locb_aligned_blk8_dec_last: vpxor %ymm5, %ymm8, %ymm5; vpxor %ymm6, %ymm8, %ymm6; vpxor %ymm7, %ymm8, %ymm7; vpxor %ymm14, %ymm8, %ymm4; vaesdeclast %ymm5, %ymm0, %ymm0; vaesdeclast %ymm6, %ymm1, %ymm1; vaesdeclast %ymm7, %ymm2, %ymm2; vaesdeclast %ymm4, %ymm3, %ymm3; vmovdqu %ymm0, (0 * 16)(%rdx); vmovdqu %ymm1, (2 * 16)(%rdx); vmovdqu %ymm2, (4 * 16)(%rdx); vmovdqu %ymm3, (6 * 16)(%rdx); leaq (8 * 16)(%rdx), %rdx; .align 8 .Locb_aligned_done: /* Burn stack. */ vpxor %ymm0, %ymm0, %ymm0; vmovdqa %ymm0, (0 * 16)(%rsp); vmovdqa %ymm0, (2 * 16)(%rsp); vmovdqa %ymm0, (4 * 16)(%rsp); vmovdqa %ymm0, (6 * 16)(%rsp); vmovdqa %ymm0, (8 * 16)(%rsp); vmovdqa %ymm0, (10 * 16)(%rsp); vmovdqa %ymm0, (12 * 16)(%rsp); vmovdqa %ymm0, (16 * 16)(%rsp); vmovdqa %ymm0, (18 * 16)(%rsp); /* Handle tailing 1…7 blocks in nblk-unaligned loop. */ movq %r8, %r10; cmpq $1, %r8; jnb .Locb_unaligned_blk8; .align 8 .Ldone_ocb: movq 16(%rbp), %r14; /* offset ptr. */ vpxor (0 * 16)(%rdi), %xmm15, %xmm15; /* offset ^ first key ^ first key */ vmovdqu %xmm15, (%r14); /* Store offset. */ /* Handle decryption checksumming. */ testl %r15d, %r15d; jnz .Locb_dec_checksum_done; movq 24(%rbp), %rax; /* checksum ptr. */ movq (STACK_REGS_POS + 4 * 8)(%rsp), %r10; movq (STACK_REGS_POS + 5 * 8)(%rsp), %r11; call _gcry_vaes_avx2_ocb_checksum; .Locb_dec_checksum_done: /* Burn stack. 
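 *
 * The remaining stack slot still holds key-derived material (the folded
 * first ^ last round key), so it is overwritten with zeros here and vzeroall
 * then clears the vector registers before returning.  A C-level analogue is a
 * wipe the compiler cannot optimize away (sketch only; scratch and
 * scratch_len are hypothetical names):
 *
 *   volatile unsigned char *p = (volatile unsigned char *)scratch;
 *   for (size_t i = 0; i < scratch_len; i++)
 *     p[i] = 0;                        // volatile stores are not elided
 *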
*/ vpxor %ymm0, %ymm0, %ymm0; vmovdqa %ymm0, (14 * 16)(%rsp); vzeroall; movq (STACK_REGS_POS + 0 * 8)(%rsp), %r12; CFI_RESTORE(%r12); movq (STACK_REGS_POS + 1 * 8)(%rsp), %r13; CFI_RESTORE(%r13); movq (STACK_REGS_POS + 2 * 8)(%rsp), %r14; CFI_RESTORE(%r14); movq (STACK_REGS_POS + 3 * 8)(%rsp), %r15; CFI_RESTORE(%r15); leave; CFI_LEAVE(); - ret + ret_spec_stop #undef STACK_REGS_POS #undef STACK_ALLOC CFI_ENDPROC(); ELF(.size _gcry_vaes_avx2_ocb_crypt_amd64,.-_gcry_vaes_avx2_ocb_crypt_amd64) /********************************************************************** CTR-mode encryption **********************************************************************/ ELF(.type _gcry_vaes_avx2_xts_crypt_amd64,@function) .globl _gcry_vaes_avx2_xts_crypt_amd64 _gcry_vaes_avx2_xts_crypt_amd64: /* input: * %rdi: round keys * %rsi: tweak * %rdx: dst * %rcx: src * %r8: nblocks * %r9: nrounds * 8(%rsp): encrypt */ CFI_STARTPROC(); movl 8(%rsp), %eax; #define tweak_clmul(shift, out, tweak, hi_tweak, tmp1, tmp2) \ vpsrld $(32-(shift)), hi_tweak, tmp2; \ vpsllq $(shift), tweak, out; \ vpclmulqdq $0, .Lxts_gfmul_clmul rRIP, tmp2, tmp1; \ vpunpckhqdq tmp2, tmp1, tmp1; \ vpxor tmp1, out, out; /* Prepare tweak. */ vmovdqu (%rsi), %xmm15; vpshufb .Lxts_high_bit_shuf rRIP, %xmm15, %xmm13; tweak_clmul(1, %xmm11, %xmm15, %xmm13, %xmm0, %xmm1); vinserti128 $1, %xmm11, %ymm15, %ymm15; /* tweak:tweak1 */ vpshufb .Lxts_high_bit_shuf rRIP, %ymm15, %ymm13; cmpq $8, %r8; jb .Lxts_crypt_blk4; /* Process eight blocks per loop. */ leaq -8(%r8), %r8; vmovdqa %ymm15, %ymm5; tweak_clmul(2, %ymm6, %ymm15, %ymm13, %ymm0, %ymm1); tweak_clmul(4, %ymm7, %ymm15, %ymm13, %ymm0, %ymm1); tweak_clmul(6, %ymm8, %ymm15, %ymm13, %ymm0, %ymm1); tweak_clmul(8, %ymm15, %ymm15, %ymm13, %ymm0, %ymm1); vpshufb .Lxts_high_bit_shuf rRIP, %ymm15, %ymm13; vbroadcasti128 (0 * 16)(%rdi), %ymm4; vpxor (0 * 16)(%rcx), %ymm5, %ymm0; vpxor (2 * 16)(%rcx), %ymm6, %ymm1; vpxor (4 * 16)(%rcx), %ymm7, %ymm2; vpxor (6 * 16)(%rcx), %ymm8, %ymm3; leaq (8 * 16)(%rcx), %rcx; .align 8 .Lxts_crypt_blk8_loop: cmpq $8, %r8; jb .Lxts_crypt_blk8_tail; leaq -8(%r8), %r8; testl %eax, %eax; jz .Lxts_dec_blk8; /* AES rounds */ XOR4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (1 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (2 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (3 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (4 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vmovdqa %ymm15, %ymm9; tweak_clmul(2, %ymm10, %ymm15, %ymm13, %ymm12, %ymm14); tweak_clmul(4, %ymm11, %ymm15, %ymm13, %ymm12, %ymm14); vbroadcasti128 (5 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (6 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (7 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (8 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (9 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (10 * 16)(%rdi), %ymm4; cmpl $12, %r9d; jb .Lxts_enc_blk8_last; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (11 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (12 * 16)(%rdi), %ymm4; jz .Lxts_enc_blk8_last; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (13 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (14 * 16)(%rdi), %ymm4; /* Last round and 
output handling. */ .Lxts_enc_blk8_last: vpxor %ymm4, %ymm5, %ymm5; /* Xor tweak to last round key. */ vpxor %ymm4, %ymm6, %ymm6; vpxor %ymm4, %ymm7, %ymm7; vpxor %ymm4, %ymm8, %ymm4; tweak_clmul(6, %ymm8, %ymm15, %ymm13, %ymm12, %ymm14); tweak_clmul(8, %ymm15, %ymm15, %ymm13, %ymm12, %ymm14); vpshufb .Lxts_high_bit_shuf rRIP, %ymm15, %ymm13; vaesenclast %ymm5, %ymm0, %ymm0; vaesenclast %ymm6, %ymm1, %ymm1; vaesenclast %ymm7, %ymm2, %ymm2; vaesenclast %ymm4, %ymm3, %ymm3; vmovdqu %ymm0, (0 * 16)(%rdx); vmovdqu %ymm1, (2 * 16)(%rdx); vmovdqu %ymm2, (4 * 16)(%rdx); vmovdqu %ymm3, (6 * 16)(%rdx); leaq (8 * 16)(%rdx), %rdx; vbroadcasti128 (0 * 16)(%rdi), %ymm4; vpxor (0 * 16)(%rcx), %ymm9, %ymm0; vpxor (2 * 16)(%rcx), %ymm10, %ymm1; vpxor (4 * 16)(%rcx), %ymm11, %ymm2; vpxor (6 * 16)(%rcx), %ymm8, %ymm3; vmovdqa %ymm9, %ymm5; vmovdqa %ymm10, %ymm6; vmovdqa %ymm11, %ymm7; leaq (8 * 16)(%rcx), %rcx; jmp .Lxts_crypt_blk8_loop; .align 8 .Lxts_dec_blk8: /* AES rounds */ XOR4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (1 * 16)(%rdi), %ymm4; VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (2 * 16)(%rdi), %ymm4; VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (3 * 16)(%rdi), %ymm4; VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (4 * 16)(%rdi), %ymm4; VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vmovdqa %ymm15, %ymm9; tweak_clmul(2, %ymm10, %ymm15, %ymm13, %ymm12, %ymm14); tweak_clmul(4, %ymm11, %ymm15, %ymm13, %ymm12, %ymm14); vbroadcasti128 (5 * 16)(%rdi), %ymm4; VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (6 * 16)(%rdi), %ymm4; VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (7 * 16)(%rdi), %ymm4; VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (8 * 16)(%rdi), %ymm4; VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (9 * 16)(%rdi), %ymm4; VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (10 * 16)(%rdi), %ymm4; cmpl $12, %r9d; jb .Lxts_dec_blk8_last; VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (11 * 16)(%rdi), %ymm4; VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (12 * 16)(%rdi), %ymm4; jz .Lxts_dec_blk8_last; VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (13 * 16)(%rdi), %ymm4; VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (14 * 16)(%rdi), %ymm4; /* Last round and output handling. */ .Lxts_dec_blk8_last: vpxor %ymm4, %ymm5, %ymm5; /* Xor tweak to last round key. 
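 *
 * XTS processes each block as out = AES(in ^ T_i) ^ T_i (encrypt or decrypt
 * direction), and the tweak advances as T_{i+1} = T_i * x in GF(2^128) with
 * reduction polynomial x^128 + x^7 + x^2 + x + 1 (the 0x87 constant in
 * .Lxts_gfmul_clmul); the tweak_clmul macro above multiplies by x^shift in
 * one step, using vpclmulqdq to fold the bits shifted out of the top back
 * into the low end.  A scalar C sketch of a single multiply-by-x
 * (illustrative only, hypothetical name):
 *
 *   #include <stdint.h>
 *
 *   // t[0] is the low, t[1] the high little-endian 64-bit half of the tweak.
 *   static void xts_mult_x(uint64_t t[2])
 *   {
 *     uint64_t carry = t[1] >> 63;                  // bit 127 about to fall out
 *     t[1] = (t[1] << 1) | (t[0] >> 63);
 *     t[0] = (t[0] << 1) ^ (0x87 & (0 - carry));    // branchless reduction
 *   }
 *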
*/ vpxor %ymm4, %ymm6, %ymm6; vpxor %ymm4, %ymm7, %ymm7; vpxor %ymm4, %ymm8, %ymm4; tweak_clmul(6, %ymm8, %ymm15, %ymm13, %ymm12, %ymm14); tweak_clmul(8, %ymm15, %ymm15, %ymm13, %ymm12, %ymm14); vpshufb .Lxts_high_bit_shuf rRIP, %ymm15, %ymm13; vaesdeclast %ymm5, %ymm0, %ymm0; vaesdeclast %ymm6, %ymm1, %ymm1; vaesdeclast %ymm7, %ymm2, %ymm2; vaesdeclast %ymm4, %ymm3, %ymm3; vmovdqu %ymm0, (0 * 16)(%rdx); vmovdqu %ymm1, (2 * 16)(%rdx); vmovdqu %ymm2, (4 * 16)(%rdx); vmovdqu %ymm3, (6 * 16)(%rdx); leaq (8 * 16)(%rdx), %rdx; vbroadcasti128 (0 * 16)(%rdi), %ymm4; vpxor (0 * 16)(%rcx), %ymm9, %ymm0; vpxor (2 * 16)(%rcx), %ymm10, %ymm1; vpxor (4 * 16)(%rcx), %ymm11, %ymm2; vpxor (6 * 16)(%rcx), %ymm8, %ymm3; vmovdqa %ymm9, %ymm5; vmovdqa %ymm10, %ymm6; vmovdqa %ymm11, %ymm7; leaq (8 * 16)(%rcx), %rcx; jmp .Lxts_crypt_blk8_loop; .align 8 .Lxts_crypt_blk8_tail: testl %eax, %eax; jz .Lxts_dec_tail_blk8; /* AES rounds */ XOR4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (1 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (2 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (3 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (4 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (5 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (6 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (7 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (8 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (9 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (10 * 16)(%rdi), %ymm4; cmpl $12, %r9d; jb .Lxts_enc_blk8_tail_last; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (11 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (12 * 16)(%rdi), %ymm4; jz .Lxts_enc_blk8_tail_last; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (13 * 16)(%rdi), %ymm4; VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (14 * 16)(%rdi), %ymm4; /* Last round and output handling. */ .Lxts_enc_blk8_tail_last: vpxor %ymm4, %ymm5, %ymm5; /* Xor tweak to last round key. 
*/ vpxor %ymm4, %ymm6, %ymm6; vpxor %ymm4, %ymm7, %ymm7; vpxor %ymm4, %ymm8, %ymm4; vaesenclast %ymm5, %ymm0, %ymm0; vaesenclast %ymm6, %ymm1, %ymm1; vaesenclast %ymm7, %ymm2, %ymm2; vaesenclast %ymm4, %ymm3, %ymm3; vmovdqu %ymm0, (0 * 16)(%rdx); vmovdqu %ymm1, (2 * 16)(%rdx); vmovdqu %ymm2, (4 * 16)(%rdx); vmovdqu %ymm3, (6 * 16)(%rdx); leaq (8 * 16)(%rdx), %rdx; jmp .Lxts_crypt_blk4; .align 8 .Lxts_dec_tail_blk8: /* AES rounds */ XOR4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (1 * 16)(%rdi), %ymm4; VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (2 * 16)(%rdi), %ymm4; VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (3 * 16)(%rdi), %ymm4; VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (4 * 16)(%rdi), %ymm4; VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (5 * 16)(%rdi), %ymm4; VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (6 * 16)(%rdi), %ymm4; VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (7 * 16)(%rdi), %ymm4; VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (8 * 16)(%rdi), %ymm4; VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (9 * 16)(%rdi), %ymm4; VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (10 * 16)(%rdi), %ymm4; cmpl $12, %r9d; jb .Lxts_dec_blk8_tail_last; VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (11 * 16)(%rdi), %ymm4; VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (12 * 16)(%rdi), %ymm4; jz .Lxts_dec_blk8_tail_last; VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (13 * 16)(%rdi), %ymm4; VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3); vbroadcasti128 (14 * 16)(%rdi), %ymm4; /* Last round and output handling. */ .Lxts_dec_blk8_tail_last: vpxor %ymm4, %ymm5, %ymm5; /* Xor tweak to last round key. */ vpxor %ymm4, %ymm6, %ymm6; vpxor %ymm4, %ymm7, %ymm7; vpxor %ymm4, %ymm8, %ymm4; vaesdeclast %ymm5, %ymm0, %ymm0; vaesdeclast %ymm6, %ymm1, %ymm1; vaesdeclast %ymm7, %ymm2, %ymm2; vaesdeclast %ymm4, %ymm3, %ymm3; vmovdqu %ymm0, (0 * 16)(%rdx); vmovdqu %ymm1, (2 * 16)(%rdx); vmovdqu %ymm2, (4 * 16)(%rdx); vmovdqu %ymm3, (6 * 16)(%rdx); leaq (8 * 16)(%rdx), %rdx; /* Handle trailing four blocks. */ .align 8 .Lxts_crypt_blk4: /* Try exit early as typically input length is large power of 2. 
*/ cmpq $0, %r8; jb .Ldone_xts_crypt; cmpq $4, %r8; jb .Lxts_crypt_blk1; leaq -4(%r8), %r8; vmovdqa %ymm15, %ymm5; tweak_clmul(2, %ymm6, %ymm15, %ymm13, %ymm0, %ymm1); tweak_clmul(4, %ymm15, %ymm15, %ymm13, %ymm0, %ymm1); vpshufb .Lxts_high_bit_shuf rRIP, %ymm15, %ymm13; vbroadcasti128 (0 * 16)(%rdi), %ymm4; vpxor (0 * 16)(%rcx), %ymm5, %ymm0; vpxor (2 * 16)(%rcx), %ymm6, %ymm1; leaq (4 * 16)(%rcx), %rcx; testl %eax, %eax; jz .Lxts_dec_blk4; /* AES rounds */ XOR2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (1 * 16)(%rdi), %ymm4; VAESENC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (2 * 16)(%rdi), %ymm4; VAESENC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (3 * 16)(%rdi), %ymm4; VAESENC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (4 * 16)(%rdi), %ymm4; VAESENC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (5 * 16)(%rdi), %ymm4; VAESENC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (6 * 16)(%rdi), %ymm4; VAESENC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (7 * 16)(%rdi), %ymm4; VAESENC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (8 * 16)(%rdi), %ymm4; VAESENC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (9 * 16)(%rdi), %ymm4; VAESENC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (10 * 16)(%rdi), %ymm4; cmpl $12, %r9d; jb .Lxts_enc_blk4_last; VAESENC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (11 * 16)(%rdi), %ymm4; VAESENC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (12 * 16)(%rdi), %ymm4; jz .Lxts_enc_blk4_last; VAESENC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (13 * 16)(%rdi), %ymm4; VAESENC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (14 * 16)(%rdi), %ymm4; /* Last round and output handling. */ .Lxts_enc_blk4_last: vpxor %ymm4, %ymm5, %ymm5; /* Xor tweak to last round key. */ vpxor %ymm4, %ymm6, %ymm6; vaesenclast %ymm5, %ymm0, %ymm0; vaesenclast %ymm6, %ymm1, %ymm1; vmovdqu %ymm0, (0 * 16)(%rdx); vmovdqu %ymm1, (2 * 16)(%rdx); leaq (4 * 16)(%rdx), %rdx; jmp .Lxts_crypt_blk1; .align 8 .Lxts_dec_blk4: /* AES rounds */ XOR2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (1 * 16)(%rdi), %ymm4; VAESDEC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (2 * 16)(%rdi), %ymm4; VAESDEC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (3 * 16)(%rdi), %ymm4; VAESDEC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (4 * 16)(%rdi), %ymm4; VAESDEC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (5 * 16)(%rdi), %ymm4; VAESDEC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (6 * 16)(%rdi), %ymm4; VAESDEC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (7 * 16)(%rdi), %ymm4; VAESDEC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (8 * 16)(%rdi), %ymm4; VAESDEC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (9 * 16)(%rdi), %ymm4; VAESDEC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (10 * 16)(%rdi), %ymm4; cmpl $12, %r9d; jb .Lxts_dec_blk4_last; VAESDEC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (11 * 16)(%rdi), %ymm4; VAESDEC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (12 * 16)(%rdi), %ymm4; jz .Lxts_dec_blk4_last; VAESDEC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (13 * 16)(%rdi), %ymm4; VAESDEC2(%ymm4, %ymm0, %ymm1); vbroadcasti128 (14 * 16)(%rdi), %ymm4; /* Last round and output handling. */ .Lxts_dec_blk4_last: vpxor %ymm4, %ymm5, %ymm5; /* Xor tweak to last round key. */ vpxor %ymm4, %ymm6, %ymm6; vaesdeclast %ymm5, %ymm0, %ymm0; vaesdeclast %ymm6, %ymm1, %ymm1; vmovdqu %ymm0, (0 * 16)(%rdx); vmovdqu %ymm1, (2 * 16)(%rdx); leaq (4 * 16)(%rdx), %rdx; /* Process trailing one to three blocks, one per loop. 
*/ .align 8 .Lxts_crypt_blk1: cmpq $1, %r8; jb .Ldone_xts_crypt; leaq -1(%r8), %r8; vpxor (%rcx), %xmm15, %xmm0; vmovdqa %xmm15, %xmm5; tweak_clmul(1, %xmm15, %xmm15, %xmm13, %xmm2, %xmm3); vpshufb .Lxts_high_bit_shuf rRIP, %xmm15, %xmm13; leaq 16(%rcx), %rcx; testl %eax, %eax; jz .Lxts_dec_blk1; /* AES rounds. */ vpxor (0 * 16)(%rdi), %xmm0, %xmm0; vaesenc (1 * 16)(%rdi), %xmm0, %xmm0; vaesenc (2 * 16)(%rdi), %xmm0, %xmm0; vaesenc (3 * 16)(%rdi), %xmm0, %xmm0; vaesenc (4 * 16)(%rdi), %xmm0, %xmm0; vaesenc (5 * 16)(%rdi), %xmm0, %xmm0; vaesenc (6 * 16)(%rdi), %xmm0, %xmm0; vaesenc (7 * 16)(%rdi), %xmm0, %xmm0; vaesenc (8 * 16)(%rdi), %xmm0, %xmm0; vaesenc (9 * 16)(%rdi), %xmm0, %xmm0; vmovdqa (10 * 16)(%rdi), %xmm1; cmpl $12, %r9d; jb .Lxts_enc_blk1_last; vaesenc %xmm1, %xmm0, %xmm0; vaesenc (11 * 16)(%rdi), %xmm0, %xmm0; vmovdqa (12 * 16)(%rdi), %xmm1; jz .Lxts_enc_blk1_last; vaesenc %xmm1, %xmm0, %xmm0; vaesenc (13 * 16)(%rdi), %xmm0, %xmm0; vmovdqa (14 * 16)(%rdi), %xmm1; /* Last round and output handling. */ .Lxts_enc_blk1_last: vpxor %xmm1, %xmm5, %xmm5; /* Xor tweak to last round key. */ vaesenclast %xmm5, %xmm0, %xmm0; vmovdqu %xmm0, (%rdx); leaq 16(%rdx), %rdx; jmp .Lxts_crypt_blk1; .align 8 .Lxts_dec_blk1: /* AES rounds. */ vpxor (0 * 16)(%rdi), %xmm0, %xmm0; vaesdec (1 * 16)(%rdi), %xmm0, %xmm0; vaesdec (2 * 16)(%rdi), %xmm0, %xmm0; vaesdec (3 * 16)(%rdi), %xmm0, %xmm0; vaesdec (4 * 16)(%rdi), %xmm0, %xmm0; vaesdec (5 * 16)(%rdi), %xmm0, %xmm0; vaesdec (6 * 16)(%rdi), %xmm0, %xmm0; vaesdec (7 * 16)(%rdi), %xmm0, %xmm0; vaesdec (8 * 16)(%rdi), %xmm0, %xmm0; vaesdec (9 * 16)(%rdi), %xmm0, %xmm0; vmovdqa (10 * 16)(%rdi), %xmm1; cmpl $12, %r9d; jb .Lxts_dec_blk1_last; vaesdec %xmm1, %xmm0, %xmm0; vaesdec (11 * 16)(%rdi), %xmm0, %xmm0; vmovdqa (12 * 16)(%rdi), %xmm1; jz .Lxts_dec_blk1_last; vaesdec %xmm1, %xmm0, %xmm0; vaesdec (13 * 16)(%rdi), %xmm0, %xmm0; vmovdqa (14 * 16)(%rdi), %xmm1; /* Last round and output handling. */ .Lxts_dec_blk1_last: vpxor %xmm1, %xmm5, %xmm5; /* Xor tweak to last round key. */ vaesdeclast %xmm5, %xmm0, %xmm0; vmovdqu %xmm0, (%rdx); leaq 16(%rdx), %rdx; jmp .Lxts_crypt_blk1; .align 8 .Ldone_xts_crypt: /* Store IV. 
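 *
 * %xmm15 still holds the running tweak here.  The tweak_clmul steps
 * above advance it by multiplying by powers of x in GF(2^128), reduced
 * with the polynomial encoded in .Lxts_gfmul_clmul (0x87).  In rough C
 * terms, advancing the tweak by one block looks like this (variable
 * names are illustrative only):
 *
 *   carry    = tweak_hi >> 63;
 *   tweak_hi = (tweak_hi << 1) | (tweak_lo >> 63);
 *   tweak_lo = (tweak_lo << 1) ^ (carry ? 0x87 : 0);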
*/ vmovdqu %xmm15, (%rsi); vzeroall; xorl %eax, %eax - ret + ret_spec_stop CFI_ENDPROC(); ELF(.size _gcry_vaes_avx2_xts_crypt_amd64,.-_gcry_vaes_avx2_xts_crypt_amd64) /********************************************************************** constants **********************************************************************/ ELF(.type _gcry_vaes_consts,@object) _gcry_vaes_consts: .align 32 .Lbige_addb_0: .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 .Lbige_addb_1: .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 .Lbige_addb_2: .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2 .Lbige_addb_3: .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3 .Lbige_addb_4: .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4 .Lbige_addb_5: .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5 .Lbige_addb_6: .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6 .Lbige_addb_7: .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7 .Lbige_addb_8: .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8 .Lbige_addb_9: .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9 .Lbige_addb_10: .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10 .Lbige_addb_11: .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11 .Lbige_addb_12: .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12 .Lbige_addb_13: .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13 .Lbige_addb_14: .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14 .Lbige_addb_15: .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15 .Lle_addd_0: .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 .Lle_addd_1: .byte 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 .Lle_addd_2: .byte 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 .Lle_addd_3: .byte 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 .Lle_addd_4: .byte 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 .Lle_addd_5: .byte 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 .Lle_addd_6: .byte 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 .Lle_addd_7: .byte 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 .Lle_addd_8: .byte 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 .Lle_addd_9: .byte 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 .Lle_addd_10: .byte 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 .Lle_addd_11: .byte 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 .Lle_addd_12: .byte 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 .Lle_addd_13: .byte 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 .Lle_addd_14: .byte 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 .Lle_addd_15: .byte 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 .Lle_addd_4_2: .byte 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 .byte 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 .Lle_addd_8_2: .byte 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 .byte 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 .Lle_addd_16_2: .byte 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 .byte 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 .Lxts_gfmul_clmul: .long 0x00, 0x87, 0x00, 0x00 .long 0x00, 0x87, 0x00, 0x00 .Lxts_high_bit_shuf: .byte -1, -1, -1, -1, 12, 13, 14, 15 .byte 4, 5, 6, 7, -1, -1, -1, -1 .byte -1, -1, -1, -1, 12, 13, 14, 15 .byte 4, 5, 6, 7, -1, -1, -1, -1 .Lbswap128_mask: .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 ELF(.size _gcry_vaes_consts,.-_gcry_vaes_consts) #endif /* HAVE_GCC_INLINE_ASM_VAES */ #endif /* __x86_64__ */ diff --git a/cipher/salsa20-amd64.S b/cipher/salsa20-amd64.S index ae8f2715..64626063 100644 --- a/cipher/salsa20-amd64.S +++ b/cipher/salsa20-amd64.S @@ -1,940 +1,940 @@ /* salsa20-amd64.S - AMD64 
implementation of Salsa20 * * Copyright (C) 2013 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . */ /* * Based on public domain implementation by D. J. Bernstein at * http://cr.yp.to/snuffle.html */ #ifdef __x86_64 #include #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && defined(USE_SALSA20) #include "asm-common-amd64.h" .text .align 8 .globl _gcry_salsa20_amd64_keysetup ELF(.type _gcry_salsa20_amd64_keysetup,@function;) _gcry_salsa20_amd64_keysetup: CFI_STARTPROC(); movl 0(%rsi),%r8d movl 4(%rsi),%r9d movl 8(%rsi),%eax movl 12(%rsi),%r10d movl %r8d,20(%rdi) movl %r9d,40(%rdi) movl %eax,60(%rdi) movl %r10d,48(%rdi) cmp $256,%rdx jb .L_kbits128 .L_kbits256: movl 16(%rsi),%edx movl 20(%rsi),%ecx movl 24(%rsi),%r8d movl 28(%rsi),%esi movl %edx,28(%rdi) movl %ecx,16(%rdi) movl %r8d,36(%rdi) movl %esi,56(%rdi) mov $1634760805,%rsi mov $857760878,%rdx mov $2036477234,%rcx mov $1797285236,%r8 movl %esi,0(%rdi) movl %edx,4(%rdi) movl %ecx,8(%rdi) movl %r8d,12(%rdi) jmp .L_keysetupdone .L_kbits128: movl 0(%rsi),%edx movl 4(%rsi),%ecx movl 8(%rsi),%r8d movl 12(%rsi),%esi movl %edx,28(%rdi) movl %ecx,16(%rdi) movl %r8d,36(%rdi) movl %esi,56(%rdi) mov $1634760805,%rsi mov $824206446,%rdx mov $2036477238,%rcx mov $1797285236,%r8 movl %esi,0(%rdi) movl %edx,4(%rdi) movl %ecx,8(%rdi) movl %r8d,12(%rdi) .L_keysetupdone: - ret + ret_spec_stop CFI_ENDPROC(); .align 8 .globl _gcry_salsa20_amd64_ivsetup ELF(.type _gcry_salsa20_amd64_ivsetup,@function;) _gcry_salsa20_amd64_ivsetup: CFI_STARTPROC(); movl 0(%rsi),%r8d movl 4(%rsi),%esi mov $0,%r9 mov $0,%rax movl %r8d,24(%rdi) movl %esi,44(%rdi) movl %r9d,32(%rdi) movl %eax,52(%rdi) - ret + ret_spec_stop CFI_ENDPROC(); .align 8 .globl _gcry_salsa20_amd64_encrypt_blocks ELF(.type _gcry_salsa20_amd64_encrypt_blocks,@function;) _gcry_salsa20_amd64_encrypt_blocks: /* * Modifications to original implementation: * - Number of rounds passing in register %r8 (for Salsa20/12). * - Length is input as number of blocks, so don't handle tail bytes * (this is done in salsa20.c). 
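 *
 * Judging from the register use below (%rdi = state, %rsi = src,
 * %rdx = dst, %rcx = block count, %r8 = rounds) and from the stack
 * accounting at .L_done, the C-side declaration is roughly the
 * following, with the return value being the number of stack bytes
 * used (types and names here are only an approximation):
 *
 *   unsigned int _gcry_salsa20_amd64_encrypt_blocks(u32 *state,
 *                                                   const byte *src,
 *                                                   byte *dst,
 *                                                   size_t nblks,
 *                                                   unsigned int rounds);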
*/ CFI_STARTPROC(); push %rbx CFI_PUSH(%rbx); shlq $6, %rcx /* blocks to bytes */ mov %r8, %rbx mov %rsp,%r11 CFI_DEF_CFA_REGISTER(%r11); sub $384,%rsp and $~31,%rsp mov %rdi,%r8 mov %rsi,%rsi mov %rdx,%rdi mov %rcx,%rdx cmp $0,%rdx jbe .L_done .L_start: cmp $256,%rdx jb .L_bytes_are_64_128_or_192 movdqa 0(%r8),%xmm0 pshufd $0x55,%xmm0,%xmm1 pshufd $0xaa,%xmm0,%xmm2 pshufd $0xff,%xmm0,%xmm3 pshufd $0x00,%xmm0,%xmm0 movdqa %xmm1,0(%rsp) movdqa %xmm2,16(%rsp) movdqa %xmm3,32(%rsp) movdqa %xmm0,48(%rsp) movdqa 16(%r8),%xmm0 pshufd $0xaa,%xmm0,%xmm1 pshufd $0xff,%xmm0,%xmm2 pshufd $0x00,%xmm0,%xmm3 pshufd $0x55,%xmm0,%xmm0 movdqa %xmm1,64(%rsp) movdqa %xmm2,80(%rsp) movdqa %xmm3,96(%rsp) movdqa %xmm0,112(%rsp) movdqa 32(%r8),%xmm0 pshufd $0xff,%xmm0,%xmm1 pshufd $0x55,%xmm0,%xmm2 pshufd $0xaa,%xmm0,%xmm0 movdqa %xmm1,128(%rsp) movdqa %xmm2,144(%rsp) movdqa %xmm0,160(%rsp) movdqa 48(%r8),%xmm0 pshufd $0x00,%xmm0,%xmm1 pshufd $0xaa,%xmm0,%xmm2 pshufd $0xff,%xmm0,%xmm0 movdqa %xmm1,176(%rsp) movdqa %xmm2,192(%rsp) movdqa %xmm0,208(%rsp) .L_bytesatleast256: movl 32(%r8),%ecx movl 52(%r8),%r9d movl %ecx,224(%rsp) movl %r9d,240(%rsp) add $1,%ecx adc $0,%r9d movl %ecx,4+224(%rsp) movl %r9d,4+240(%rsp) add $1,%ecx adc $0,%r9d movl %ecx,8+224(%rsp) movl %r9d,8+240(%rsp) add $1,%ecx adc $0,%r9d movl %ecx,12+224(%rsp) movl %r9d,12+240(%rsp) add $1,%ecx adc $0,%r9d movl %ecx,32(%r8) movl %r9d,52(%r8) movq %rdx,288(%rsp) mov %rbx,%rdx movdqa 0(%rsp),%xmm0 movdqa 16(%rsp),%xmm1 movdqa 32(%rsp),%xmm2 movdqa 192(%rsp),%xmm3 movdqa 208(%rsp),%xmm4 movdqa 64(%rsp),%xmm5 movdqa 80(%rsp),%xmm6 movdqa 112(%rsp),%xmm7 movdqa 128(%rsp),%xmm8 movdqa 144(%rsp),%xmm9 movdqa 160(%rsp),%xmm10 movdqa 240(%rsp),%xmm11 movdqa 48(%rsp),%xmm12 movdqa 96(%rsp),%xmm13 movdqa 176(%rsp),%xmm14 movdqa 224(%rsp),%xmm15 .L_mainloop1: movdqa %xmm1,256(%rsp) movdqa %xmm2,272(%rsp) movdqa %xmm13,%xmm1 paddd %xmm12,%xmm1 movdqa %xmm1,%xmm2 pslld $7,%xmm1 pxor %xmm1,%xmm14 psrld $25,%xmm2 pxor %xmm2,%xmm14 movdqa %xmm7,%xmm1 paddd %xmm0,%xmm1 movdqa %xmm1,%xmm2 pslld $7,%xmm1 pxor %xmm1,%xmm11 psrld $25,%xmm2 pxor %xmm2,%xmm11 movdqa %xmm12,%xmm1 paddd %xmm14,%xmm1 movdqa %xmm1,%xmm2 pslld $9,%xmm1 pxor %xmm1,%xmm15 psrld $23,%xmm2 pxor %xmm2,%xmm15 movdqa %xmm0,%xmm1 paddd %xmm11,%xmm1 movdqa %xmm1,%xmm2 pslld $9,%xmm1 pxor %xmm1,%xmm9 psrld $23,%xmm2 pxor %xmm2,%xmm9 movdqa %xmm14,%xmm1 paddd %xmm15,%xmm1 movdqa %xmm1,%xmm2 pslld $13,%xmm1 pxor %xmm1,%xmm13 psrld $19,%xmm2 pxor %xmm2,%xmm13 movdqa %xmm11,%xmm1 paddd %xmm9,%xmm1 movdqa %xmm1,%xmm2 pslld $13,%xmm1 pxor %xmm1,%xmm7 psrld $19,%xmm2 pxor %xmm2,%xmm7 movdqa %xmm15,%xmm1 paddd %xmm13,%xmm1 movdqa %xmm1,%xmm2 pslld $18,%xmm1 pxor %xmm1,%xmm12 psrld $14,%xmm2 pxor %xmm2,%xmm12 movdqa 256(%rsp),%xmm1 movdqa %xmm12,256(%rsp) movdqa %xmm9,%xmm2 paddd %xmm7,%xmm2 movdqa %xmm2,%xmm12 pslld $18,%xmm2 pxor %xmm2,%xmm0 psrld $14,%xmm12 pxor %xmm12,%xmm0 movdqa %xmm5,%xmm2 paddd %xmm1,%xmm2 movdqa %xmm2,%xmm12 pslld $7,%xmm2 pxor %xmm2,%xmm3 psrld $25,%xmm12 pxor %xmm12,%xmm3 movdqa 272(%rsp),%xmm2 movdqa %xmm0,272(%rsp) movdqa %xmm6,%xmm0 paddd %xmm2,%xmm0 movdqa %xmm0,%xmm12 pslld $7,%xmm0 pxor %xmm0,%xmm4 psrld $25,%xmm12 pxor %xmm12,%xmm4 movdqa %xmm1,%xmm0 paddd %xmm3,%xmm0 movdqa %xmm0,%xmm12 pslld $9,%xmm0 pxor %xmm0,%xmm10 psrld $23,%xmm12 pxor %xmm12,%xmm10 movdqa %xmm2,%xmm0 paddd %xmm4,%xmm0 movdqa %xmm0,%xmm12 pslld $9,%xmm0 pxor %xmm0,%xmm8 psrld $23,%xmm12 pxor %xmm12,%xmm8 movdqa %xmm3,%xmm0 paddd %xmm10,%xmm0 movdqa %xmm0,%xmm12 pslld $13,%xmm0 pxor %xmm0,%xmm5 psrld 
$19,%xmm12 pxor %xmm12,%xmm5 movdqa %xmm4,%xmm0 paddd %xmm8,%xmm0 movdqa %xmm0,%xmm12 pslld $13,%xmm0 pxor %xmm0,%xmm6 psrld $19,%xmm12 pxor %xmm12,%xmm6 movdqa %xmm10,%xmm0 paddd %xmm5,%xmm0 movdqa %xmm0,%xmm12 pslld $18,%xmm0 pxor %xmm0,%xmm1 psrld $14,%xmm12 pxor %xmm12,%xmm1 movdqa 256(%rsp),%xmm0 movdqa %xmm1,256(%rsp) movdqa %xmm4,%xmm1 paddd %xmm0,%xmm1 movdqa %xmm1,%xmm12 pslld $7,%xmm1 pxor %xmm1,%xmm7 psrld $25,%xmm12 pxor %xmm12,%xmm7 movdqa %xmm8,%xmm1 paddd %xmm6,%xmm1 movdqa %xmm1,%xmm12 pslld $18,%xmm1 pxor %xmm1,%xmm2 psrld $14,%xmm12 pxor %xmm12,%xmm2 movdqa 272(%rsp),%xmm12 movdqa %xmm2,272(%rsp) movdqa %xmm14,%xmm1 paddd %xmm12,%xmm1 movdqa %xmm1,%xmm2 pslld $7,%xmm1 pxor %xmm1,%xmm5 psrld $25,%xmm2 pxor %xmm2,%xmm5 movdqa %xmm0,%xmm1 paddd %xmm7,%xmm1 movdqa %xmm1,%xmm2 pslld $9,%xmm1 pxor %xmm1,%xmm10 psrld $23,%xmm2 pxor %xmm2,%xmm10 movdqa %xmm12,%xmm1 paddd %xmm5,%xmm1 movdqa %xmm1,%xmm2 pslld $9,%xmm1 pxor %xmm1,%xmm8 psrld $23,%xmm2 pxor %xmm2,%xmm8 movdqa %xmm7,%xmm1 paddd %xmm10,%xmm1 movdqa %xmm1,%xmm2 pslld $13,%xmm1 pxor %xmm1,%xmm4 psrld $19,%xmm2 pxor %xmm2,%xmm4 movdqa %xmm5,%xmm1 paddd %xmm8,%xmm1 movdqa %xmm1,%xmm2 pslld $13,%xmm1 pxor %xmm1,%xmm14 psrld $19,%xmm2 pxor %xmm2,%xmm14 movdqa %xmm10,%xmm1 paddd %xmm4,%xmm1 movdqa %xmm1,%xmm2 pslld $18,%xmm1 pxor %xmm1,%xmm0 psrld $14,%xmm2 pxor %xmm2,%xmm0 movdqa 256(%rsp),%xmm1 movdqa %xmm0,256(%rsp) movdqa %xmm8,%xmm0 paddd %xmm14,%xmm0 movdqa %xmm0,%xmm2 pslld $18,%xmm0 pxor %xmm0,%xmm12 psrld $14,%xmm2 pxor %xmm2,%xmm12 movdqa %xmm11,%xmm0 paddd %xmm1,%xmm0 movdqa %xmm0,%xmm2 pslld $7,%xmm0 pxor %xmm0,%xmm6 psrld $25,%xmm2 pxor %xmm2,%xmm6 movdqa 272(%rsp),%xmm2 movdqa %xmm12,272(%rsp) movdqa %xmm3,%xmm0 paddd %xmm2,%xmm0 movdqa %xmm0,%xmm12 pslld $7,%xmm0 pxor %xmm0,%xmm13 psrld $25,%xmm12 pxor %xmm12,%xmm13 movdqa %xmm1,%xmm0 paddd %xmm6,%xmm0 movdqa %xmm0,%xmm12 pslld $9,%xmm0 pxor %xmm0,%xmm15 psrld $23,%xmm12 pxor %xmm12,%xmm15 movdqa %xmm2,%xmm0 paddd %xmm13,%xmm0 movdqa %xmm0,%xmm12 pslld $9,%xmm0 pxor %xmm0,%xmm9 psrld $23,%xmm12 pxor %xmm12,%xmm9 movdqa %xmm6,%xmm0 paddd %xmm15,%xmm0 movdqa %xmm0,%xmm12 pslld $13,%xmm0 pxor %xmm0,%xmm11 psrld $19,%xmm12 pxor %xmm12,%xmm11 movdqa %xmm13,%xmm0 paddd %xmm9,%xmm0 movdqa %xmm0,%xmm12 pslld $13,%xmm0 pxor %xmm0,%xmm3 psrld $19,%xmm12 pxor %xmm12,%xmm3 movdqa %xmm15,%xmm0 paddd %xmm11,%xmm0 movdqa %xmm0,%xmm12 pslld $18,%xmm0 pxor %xmm0,%xmm1 psrld $14,%xmm12 pxor %xmm12,%xmm1 movdqa %xmm9,%xmm0 paddd %xmm3,%xmm0 movdqa %xmm0,%xmm12 pslld $18,%xmm0 pxor %xmm0,%xmm2 psrld $14,%xmm12 pxor %xmm12,%xmm2 movdqa 256(%rsp),%xmm12 movdqa 272(%rsp),%xmm0 sub $2,%rdx ja .L_mainloop1 paddd 48(%rsp),%xmm12 paddd 112(%rsp),%xmm7 paddd 160(%rsp),%xmm10 paddd 208(%rsp),%xmm4 movd %xmm12,%rdx movd %xmm7,%rcx movd %xmm10,%r9 movd %xmm4,%rax pshufd $0x39,%xmm12,%xmm12 pshufd $0x39,%xmm7,%xmm7 pshufd $0x39,%xmm10,%xmm10 pshufd $0x39,%xmm4,%xmm4 xorl 0(%rsi),%edx xorl 4(%rsi),%ecx xorl 8(%rsi),%r9d xorl 12(%rsi),%eax movl %edx,0(%rdi) movl %ecx,4(%rdi) movl %r9d,8(%rdi) movl %eax,12(%rdi) movd %xmm12,%rdx movd %xmm7,%rcx movd %xmm10,%r9 movd %xmm4,%rax pshufd $0x39,%xmm12,%xmm12 pshufd $0x39,%xmm7,%xmm7 pshufd $0x39,%xmm10,%xmm10 pshufd $0x39,%xmm4,%xmm4 xorl 64(%rsi),%edx xorl 68(%rsi),%ecx xorl 72(%rsi),%r9d xorl 76(%rsi),%eax movl %edx,64(%rdi) movl %ecx,68(%rdi) movl %r9d,72(%rdi) movl %eax,76(%rdi) movd %xmm12,%rdx movd %xmm7,%rcx movd %xmm10,%r9 movd %xmm4,%rax pshufd $0x39,%xmm12,%xmm12 pshufd $0x39,%xmm7,%xmm7 pshufd $0x39,%xmm10,%xmm10 pshufd $0x39,%xmm4,%xmm4 
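/* Output stage of the four-block path: the per-block copies of the
 * initial state kept on the stack are re-added with paddd, so each xmm
 * register carries one keystream word for four consecutive 64-byte
 * blocks.  The lanes are peeled off with movd + pshufd $0x39 and xored
 * into the input at the same word position of blocks 0..3 (base
 * offsets 0, 64, 128 and 192), roughly:
 * dst[64*b + pos] = src[64*b + pos] ^ keystream_word[b].
 */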
xorl 128(%rsi),%edx xorl 132(%rsi),%ecx xorl 136(%rsi),%r9d xorl 140(%rsi),%eax movl %edx,128(%rdi) movl %ecx,132(%rdi) movl %r9d,136(%rdi) movl %eax,140(%rdi) movd %xmm12,%rdx movd %xmm7,%rcx movd %xmm10,%r9 movd %xmm4,%rax xorl 192(%rsi),%edx xorl 196(%rsi),%ecx xorl 200(%rsi),%r9d xorl 204(%rsi),%eax movl %edx,192(%rdi) movl %ecx,196(%rdi) movl %r9d,200(%rdi) movl %eax,204(%rdi) paddd 176(%rsp),%xmm14 paddd 0(%rsp),%xmm0 paddd 64(%rsp),%xmm5 paddd 128(%rsp),%xmm8 movd %xmm14,%rdx movd %xmm0,%rcx movd %xmm5,%r9 movd %xmm8,%rax pshufd $0x39,%xmm14,%xmm14 pshufd $0x39,%xmm0,%xmm0 pshufd $0x39,%xmm5,%xmm5 pshufd $0x39,%xmm8,%xmm8 xorl 16(%rsi),%edx xorl 20(%rsi),%ecx xorl 24(%rsi),%r9d xorl 28(%rsi),%eax movl %edx,16(%rdi) movl %ecx,20(%rdi) movl %r9d,24(%rdi) movl %eax,28(%rdi) movd %xmm14,%rdx movd %xmm0,%rcx movd %xmm5,%r9 movd %xmm8,%rax pshufd $0x39,%xmm14,%xmm14 pshufd $0x39,%xmm0,%xmm0 pshufd $0x39,%xmm5,%xmm5 pshufd $0x39,%xmm8,%xmm8 xorl 80(%rsi),%edx xorl 84(%rsi),%ecx xorl 88(%rsi),%r9d xorl 92(%rsi),%eax movl %edx,80(%rdi) movl %ecx,84(%rdi) movl %r9d,88(%rdi) movl %eax,92(%rdi) movd %xmm14,%rdx movd %xmm0,%rcx movd %xmm5,%r9 movd %xmm8,%rax pshufd $0x39,%xmm14,%xmm14 pshufd $0x39,%xmm0,%xmm0 pshufd $0x39,%xmm5,%xmm5 pshufd $0x39,%xmm8,%xmm8 xorl 144(%rsi),%edx xorl 148(%rsi),%ecx xorl 152(%rsi),%r9d xorl 156(%rsi),%eax movl %edx,144(%rdi) movl %ecx,148(%rdi) movl %r9d,152(%rdi) movl %eax,156(%rdi) movd %xmm14,%rdx movd %xmm0,%rcx movd %xmm5,%r9 movd %xmm8,%rax xorl 208(%rsi),%edx xorl 212(%rsi),%ecx xorl 216(%rsi),%r9d xorl 220(%rsi),%eax movl %edx,208(%rdi) movl %ecx,212(%rdi) movl %r9d,216(%rdi) movl %eax,220(%rdi) paddd 224(%rsp),%xmm15 paddd 240(%rsp),%xmm11 paddd 16(%rsp),%xmm1 paddd 80(%rsp),%xmm6 movd %xmm15,%rdx movd %xmm11,%rcx movd %xmm1,%r9 movd %xmm6,%rax pshufd $0x39,%xmm15,%xmm15 pshufd $0x39,%xmm11,%xmm11 pshufd $0x39,%xmm1,%xmm1 pshufd $0x39,%xmm6,%xmm6 xorl 32(%rsi),%edx xorl 36(%rsi),%ecx xorl 40(%rsi),%r9d xorl 44(%rsi),%eax movl %edx,32(%rdi) movl %ecx,36(%rdi) movl %r9d,40(%rdi) movl %eax,44(%rdi) movd %xmm15,%rdx movd %xmm11,%rcx movd %xmm1,%r9 movd %xmm6,%rax pshufd $0x39,%xmm15,%xmm15 pshufd $0x39,%xmm11,%xmm11 pshufd $0x39,%xmm1,%xmm1 pshufd $0x39,%xmm6,%xmm6 xorl 96(%rsi),%edx xorl 100(%rsi),%ecx xorl 104(%rsi),%r9d xorl 108(%rsi),%eax movl %edx,96(%rdi) movl %ecx,100(%rdi) movl %r9d,104(%rdi) movl %eax,108(%rdi) movd %xmm15,%rdx movd %xmm11,%rcx movd %xmm1,%r9 movd %xmm6,%rax pshufd $0x39,%xmm15,%xmm15 pshufd $0x39,%xmm11,%xmm11 pshufd $0x39,%xmm1,%xmm1 pshufd $0x39,%xmm6,%xmm6 xorl 160(%rsi),%edx xorl 164(%rsi),%ecx xorl 168(%rsi),%r9d xorl 172(%rsi),%eax movl %edx,160(%rdi) movl %ecx,164(%rdi) movl %r9d,168(%rdi) movl %eax,172(%rdi) movd %xmm15,%rdx movd %xmm11,%rcx movd %xmm1,%r9 movd %xmm6,%rax xorl 224(%rsi),%edx xorl 228(%rsi),%ecx xorl 232(%rsi),%r9d xorl 236(%rsi),%eax movl %edx,224(%rdi) movl %ecx,228(%rdi) movl %r9d,232(%rdi) movl %eax,236(%rdi) paddd 96(%rsp),%xmm13 paddd 144(%rsp),%xmm9 paddd 192(%rsp),%xmm3 paddd 32(%rsp),%xmm2 movd %xmm13,%rdx movd %xmm9,%rcx movd %xmm3,%r9 movd %xmm2,%rax pshufd $0x39,%xmm13,%xmm13 pshufd $0x39,%xmm9,%xmm9 pshufd $0x39,%xmm3,%xmm3 pshufd $0x39,%xmm2,%xmm2 xorl 48(%rsi),%edx xorl 52(%rsi),%ecx xorl 56(%rsi),%r9d xorl 60(%rsi),%eax movl %edx,48(%rdi) movl %ecx,52(%rdi) movl %r9d,56(%rdi) movl %eax,60(%rdi) movd %xmm13,%rdx movd %xmm9,%rcx movd %xmm3,%r9 movd %xmm2,%rax pshufd $0x39,%xmm13,%xmm13 pshufd $0x39,%xmm9,%xmm9 pshufd $0x39,%xmm3,%xmm3 pshufd $0x39,%xmm2,%xmm2 xorl 112(%rsi),%edx xorl 
116(%rsi),%ecx xorl 120(%rsi),%r9d xorl 124(%rsi),%eax movl %edx,112(%rdi) movl %ecx,116(%rdi) movl %r9d,120(%rdi) movl %eax,124(%rdi) movd %xmm13,%rdx movd %xmm9,%rcx movd %xmm3,%r9 movd %xmm2,%rax pshufd $0x39,%xmm13,%xmm13 pshufd $0x39,%xmm9,%xmm9 pshufd $0x39,%xmm3,%xmm3 pshufd $0x39,%xmm2,%xmm2 xorl 176(%rsi),%edx xorl 180(%rsi),%ecx xorl 184(%rsi),%r9d xorl 188(%rsi),%eax movl %edx,176(%rdi) movl %ecx,180(%rdi) movl %r9d,184(%rdi) movl %eax,188(%rdi) movd %xmm13,%rdx movd %xmm9,%rcx movd %xmm3,%r9 movd %xmm2,%rax xorl 240(%rsi),%edx xorl 244(%rsi),%ecx xorl 248(%rsi),%r9d xorl 252(%rsi),%eax movl %edx,240(%rdi) movl %ecx,244(%rdi) movl %r9d,248(%rdi) movl %eax,252(%rdi) movq 288(%rsp),%rdx sub $256,%rdx add $256,%rsi add $256,%rdi cmp $256,%rdx jae .L_bytesatleast256 cmp $0,%rdx jbe .L_done .L_bytes_are_64_128_or_192: movq %rdx,288(%rsp) movdqa 0(%r8),%xmm0 movdqa 16(%r8),%xmm1 movdqa 32(%r8),%xmm2 movdqa 48(%r8),%xmm3 movdqa %xmm1,%xmm4 mov %rbx,%rdx .L_mainloop2: paddd %xmm0,%xmm4 movdqa %xmm0,%xmm5 movdqa %xmm4,%xmm6 pslld $7,%xmm4 psrld $25,%xmm6 pxor %xmm4,%xmm3 pxor %xmm6,%xmm3 paddd %xmm3,%xmm5 movdqa %xmm3,%xmm4 movdqa %xmm5,%xmm6 pslld $9,%xmm5 psrld $23,%xmm6 pxor %xmm5,%xmm2 pshufd $0x93,%xmm3,%xmm3 pxor %xmm6,%xmm2 paddd %xmm2,%xmm4 movdqa %xmm2,%xmm5 movdqa %xmm4,%xmm6 pslld $13,%xmm4 psrld $19,%xmm6 pxor %xmm4,%xmm1 pshufd $0x4e,%xmm2,%xmm2 pxor %xmm6,%xmm1 paddd %xmm1,%xmm5 movdqa %xmm3,%xmm4 movdqa %xmm5,%xmm6 pslld $18,%xmm5 psrld $14,%xmm6 pxor %xmm5,%xmm0 pshufd $0x39,%xmm1,%xmm1 pxor %xmm6,%xmm0 paddd %xmm0,%xmm4 movdqa %xmm0,%xmm5 movdqa %xmm4,%xmm6 pslld $7,%xmm4 psrld $25,%xmm6 pxor %xmm4,%xmm1 pxor %xmm6,%xmm1 paddd %xmm1,%xmm5 movdqa %xmm1,%xmm4 movdqa %xmm5,%xmm6 pslld $9,%xmm5 psrld $23,%xmm6 pxor %xmm5,%xmm2 pshufd $0x93,%xmm1,%xmm1 pxor %xmm6,%xmm2 paddd %xmm2,%xmm4 movdqa %xmm2,%xmm5 movdqa %xmm4,%xmm6 pslld $13,%xmm4 psrld $19,%xmm6 pxor %xmm4,%xmm3 pshufd $0x4e,%xmm2,%xmm2 pxor %xmm6,%xmm3 paddd %xmm3,%xmm5 movdqa %xmm1,%xmm4 movdqa %xmm5,%xmm6 pslld $18,%xmm5 psrld $14,%xmm6 pxor %xmm5,%xmm0 pshufd $0x39,%xmm3,%xmm3 pxor %xmm6,%xmm0 paddd %xmm0,%xmm4 movdqa %xmm0,%xmm5 movdqa %xmm4,%xmm6 pslld $7,%xmm4 psrld $25,%xmm6 pxor %xmm4,%xmm3 pxor %xmm6,%xmm3 paddd %xmm3,%xmm5 movdqa %xmm3,%xmm4 movdqa %xmm5,%xmm6 pslld $9,%xmm5 psrld $23,%xmm6 pxor %xmm5,%xmm2 pshufd $0x93,%xmm3,%xmm3 pxor %xmm6,%xmm2 paddd %xmm2,%xmm4 movdqa %xmm2,%xmm5 movdqa %xmm4,%xmm6 pslld $13,%xmm4 psrld $19,%xmm6 pxor %xmm4,%xmm1 pshufd $0x4e,%xmm2,%xmm2 pxor %xmm6,%xmm1 paddd %xmm1,%xmm5 movdqa %xmm3,%xmm4 movdqa %xmm5,%xmm6 pslld $18,%xmm5 psrld $14,%xmm6 pxor %xmm5,%xmm0 pshufd $0x39,%xmm1,%xmm1 pxor %xmm6,%xmm0 paddd %xmm0,%xmm4 movdqa %xmm0,%xmm5 movdqa %xmm4,%xmm6 pslld $7,%xmm4 psrld $25,%xmm6 pxor %xmm4,%xmm1 pxor %xmm6,%xmm1 paddd %xmm1,%xmm5 movdqa %xmm1,%xmm4 movdqa %xmm5,%xmm6 pslld $9,%xmm5 psrld $23,%xmm6 pxor %xmm5,%xmm2 pshufd $0x93,%xmm1,%xmm1 pxor %xmm6,%xmm2 paddd %xmm2,%xmm4 movdqa %xmm2,%xmm5 movdqa %xmm4,%xmm6 pslld $13,%xmm4 psrld $19,%xmm6 pxor %xmm4,%xmm3 pshufd $0x4e,%xmm2,%xmm2 pxor %xmm6,%xmm3 sub $4,%rdx paddd %xmm3,%xmm5 movdqa %xmm1,%xmm4 movdqa %xmm5,%xmm6 pslld $18,%xmm5 pxor %xmm7,%xmm7 psrld $14,%xmm6 pxor %xmm5,%xmm0 pshufd $0x39,%xmm3,%xmm3 pxor %xmm6,%xmm0 ja .L_mainloop2 paddd 0(%r8),%xmm0 paddd 16(%r8),%xmm1 paddd 32(%r8),%xmm2 paddd 48(%r8),%xmm3 movd %xmm0,%rdx movd %xmm1,%rcx movd %xmm2,%rax movd %xmm3,%r10 pshufd $0x39,%xmm0,%xmm0 pshufd $0x39,%xmm1,%xmm1 pshufd $0x39,%xmm2,%xmm2 pshufd $0x39,%xmm3,%xmm3 xorl 0(%rsi),%edx xorl 48(%rsi),%ecx 
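/* Few-block path output: %xmm0..%xmm3 keep the 16 state words in a
 * permuted layout so that whole-register quarter-rounds need no data
 * movement between registers; the scattered xor/store offsets simply
 * return each extracted lane to its proper position within the 64-byte
 * output block.
 */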
xorl 32(%rsi),%eax xorl 16(%rsi),%r10d movl %edx,0(%rdi) movl %ecx,48(%rdi) movl %eax,32(%rdi) movl %r10d,16(%rdi) movd %xmm0,%rdx movd %xmm1,%rcx movd %xmm2,%rax movd %xmm3,%r10 pshufd $0x39,%xmm0,%xmm0 pshufd $0x39,%xmm1,%xmm1 pshufd $0x39,%xmm2,%xmm2 pshufd $0x39,%xmm3,%xmm3 xorl 20(%rsi),%edx xorl 4(%rsi),%ecx xorl 52(%rsi),%eax xorl 36(%rsi),%r10d movl %edx,20(%rdi) movl %ecx,4(%rdi) movl %eax,52(%rdi) movl %r10d,36(%rdi) movd %xmm0,%rdx movd %xmm1,%rcx movd %xmm2,%rax movd %xmm3,%r10 pshufd $0x39,%xmm0,%xmm0 pshufd $0x39,%xmm1,%xmm1 pshufd $0x39,%xmm2,%xmm2 pshufd $0x39,%xmm3,%xmm3 xorl 40(%rsi),%edx xorl 24(%rsi),%ecx xorl 8(%rsi),%eax xorl 56(%rsi),%r10d movl %edx,40(%rdi) movl %ecx,24(%rdi) movl %eax,8(%rdi) movl %r10d,56(%rdi) movd %xmm0,%rdx movd %xmm1,%rcx movd %xmm2,%rax movd %xmm3,%r10 xorl 60(%rsi),%edx xorl 44(%rsi),%ecx xorl 28(%rsi),%eax xorl 12(%rsi),%r10d movl %edx,60(%rdi) movl %ecx,44(%rdi) movl %eax,28(%rdi) movl %r10d,12(%rdi) movq 288(%rsp),%rdx movl 32(%r8),%ecx movl 52(%r8),%eax add $1,%ecx adc $0,%eax movl %ecx,32(%r8) movl %eax,52(%r8) cmp $64,%rdx ja .L_bytes_are_128_or_192 .L_done: CFI_REMEMBER_STATE(); mov %r11,%rax sub %rsp,%rax mov %r11,%rsp CFI_REGISTER(%r11, %rsp) CFI_DEF_CFA_REGISTER(%rsp) pop %rbx CFI_POP(%rbx) - ret + ret_spec_stop CFI_RESTORE_STATE(); .L_bytes_are_128_or_192: sub $64,%rdx add $64,%rdi add $64,%rsi jmp .L_bytes_are_64_128_or_192 CFI_ENDPROC(); ELF(.size _gcry_salsa20_amd64_encrypt_blocks,.-_gcry_salsa20_amd64_encrypt_blocks;) #endif /*defined(USE_SALSA20)*/ #endif /*__x86_64*/ diff --git a/cipher/serpent-avx2-amd64.S b/cipher/serpent-avx2-amd64.S index dcee9b62..d3515a21 100644 --- a/cipher/serpent-avx2-amd64.S +++ b/cipher/serpent-avx2-amd64.S @@ -1,1160 +1,1160 @@ /* serpent-avx2-amd64.S - AVX2 implementation of Serpent cipher * * Copyright (C) 2013-2015 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . 
*/ #include #ifdef __x86_64 #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && defined(USE_SERPENT) && \ defined(ENABLE_AVX2_SUPPORT) #include "asm-common-amd64.h" /* struct serpent_context: */ #define ctx_keys 0 /* register macros */ #define CTX %rdi /* vector registers */ #define RA0 %ymm0 #define RA1 %ymm1 #define RA2 %ymm2 #define RA3 %ymm3 #define RA4 %ymm4 #define RB0 %ymm5 #define RB1 %ymm6 #define RB2 %ymm7 #define RB3 %ymm8 #define RB4 %ymm9 #define RNOT %ymm10 #define RTMP0 %ymm11 #define RTMP1 %ymm12 #define RTMP2 %ymm13 #define RTMP3 %ymm14 #define RTMP4 %ymm15 #define RNOTx %xmm10 #define RTMP0x %xmm11 #define RTMP1x %xmm12 #define RTMP2x %xmm13 #define RTMP3x %xmm14 #define RTMP4x %xmm15 /********************************************************************** helper macros **********************************************************************/ /* vector 32-bit rotation to left */ #define vec_rol(reg, nleft, tmp) \ vpslld $(nleft), reg, tmp; \ vpsrld $(32 - (nleft)), reg, reg; \ vpor tmp, reg, reg; /* vector 32-bit rotation to right */ #define vec_ror(reg, nright, tmp) \ vec_rol(reg, 32 - nright, tmp) /* 4x4 32-bit integer matrix transpose */ #define transpose_4x4(x0, x1, x2, x3, t1, t2, t3) \ vpunpckhdq x1, x0, t2; \ vpunpckldq x1, x0, x0; \ \ vpunpckldq x3, x2, t1; \ vpunpckhdq x3, x2, x2; \ \ vpunpckhqdq t1, x0, x1; \ vpunpcklqdq t1, x0, x0; \ \ vpunpckhqdq x2, t2, x3; \ vpunpcklqdq x2, t2, x2; /********************************************************************** 16-way serpent **********************************************************************/ /* * These are the S-Boxes of Serpent from following research paper. * * D. A. Osvik, “Speeding up Serpent,” in Third AES Candidate Conference, * (New York, New York, USA), p. 317–329, National Institute of Standards and * Technology, 2000. 
* * Paper is also available at: http://www.ii.uib.no/~osvik/pub/aes3.pdf * */ #define SBOX0(r0, r1, r2, r3, r4) \ vpxor r0, r3, r3; vmovdqa r1, r4; \ vpand r3, r1, r1; vpxor r2, r4, r4; \ vpxor r0, r1, r1; vpor r3, r0, r0; \ vpxor r4, r0, r0; vpxor r3, r4, r4; \ vpxor r2, r3, r3; vpor r1, r2, r2; \ vpxor r4, r2, r2; vpxor RNOT, r4, r4; \ vpor r1, r4, r4; vpxor r3, r1, r1; \ vpxor r4, r1, r1; vpor r0, r3, r3; \ vpxor r3, r1, r1; vpxor r3, r4, r4; #define SBOX0_INVERSE(r0, r1, r2, r3, r4) \ vpxor RNOT, r2, r2; vmovdqa r1, r4; \ vpor r0, r1, r1; vpxor RNOT, r4, r4; \ vpxor r2, r1, r1; vpor r4, r2, r2; \ vpxor r3, r1, r1; vpxor r4, r0, r0; \ vpxor r0, r2, r2; vpand r3, r0, r0; \ vpxor r0, r4, r4; vpor r1, r0, r0; \ vpxor r2, r0, r0; vpxor r4, r3, r3; \ vpxor r1, r2, r2; vpxor r0, r3, r3; \ vpxor r1, r3, r3; \ vpand r3, r2, r2; \ vpxor r2, r4, r4; #define SBOX1(r0, r1, r2, r3, r4) \ vpxor RNOT, r0, r0; vpxor RNOT, r2, r2; \ vmovdqa r0, r4; vpand r1, r0, r0; \ vpxor r0, r2, r2; vpor r3, r0, r0; \ vpxor r2, r3, r3; vpxor r0, r1, r1; \ vpxor r4, r0, r0; vpor r1, r4, r4; \ vpxor r3, r1, r1; vpor r0, r2, r2; \ vpand r4, r2, r2; vpxor r1, r0, r0; \ vpand r2, r1, r1; \ vpxor r0, r1, r1; vpand r2, r0, r0; \ vpxor r4, r0, r0; #define SBOX1_INVERSE(r0, r1, r2, r3, r4) \ vmovdqa r1, r4; vpxor r3, r1, r1; \ vpand r1, r3, r3; vpxor r2, r4, r4; \ vpxor r0, r3, r3; vpor r1, r0, r0; \ vpxor r3, r2, r2; vpxor r4, r0, r0; \ vpor r2, r0, r0; vpxor r3, r1, r1; \ vpxor r1, r0, r0; vpor r3, r1, r1; \ vpxor r0, r1, r1; vpxor RNOT, r4, r4; \ vpxor r1, r4, r4; vpor r0, r1, r1; \ vpxor r0, r1, r1; \ vpor r4, r1, r1; \ vpxor r1, r3, r3; #define SBOX2(r0, r1, r2, r3, r4) \ vmovdqa r0, r4; vpand r2, r0, r0; \ vpxor r3, r0, r0; vpxor r1, r2, r2; \ vpxor r0, r2, r2; vpor r4, r3, r3; \ vpxor r1, r3, r3; vpxor r2, r4, r4; \ vmovdqa r3, r1; vpor r4, r3, r3; \ vpxor r0, r3, r3; vpand r1, r0, r0; \ vpxor r0, r4, r4; vpxor r3, r1, r1; \ vpxor r4, r1, r1; vpxor RNOT, r4, r4; #define SBOX2_INVERSE(r0, r1, r2, r3, r4) \ vpxor r3, r2, r2; vpxor r0, r3, r3; \ vmovdqa r3, r4; vpand r2, r3, r3; \ vpxor r1, r3, r3; vpor r2, r1, r1; \ vpxor r4, r1, r1; vpand r3, r4, r4; \ vpxor r3, r2, r2; vpand r0, r4, r4; \ vpxor r2, r4, r4; vpand r1, r2, r2; \ vpor r0, r2, r2; vpxor RNOT, r3, r3; \ vpxor r3, r2, r2; vpxor r3, r0, r0; \ vpand r1, r0, r0; vpxor r4, r3, r3; \ vpxor r0, r3, r3; #define SBOX3(r0, r1, r2, r3, r4) \ vmovdqa r0, r4; vpor r3, r0, r0; \ vpxor r1, r3, r3; vpand r4, r1, r1; \ vpxor r2, r4, r4; vpxor r3, r2, r2; \ vpand r0, r3, r3; vpor r1, r4, r4; \ vpxor r4, r3, r3; vpxor r1, r0, r0; \ vpand r0, r4, r4; vpxor r3, r1, r1; \ vpxor r2, r4, r4; vpor r0, r1, r1; \ vpxor r2, r1, r1; vpxor r3, r0, r0; \ vmovdqa r1, r2; vpor r3, r1, r1; \ vpxor r0, r1, r1; #define SBOX3_INVERSE(r0, r1, r2, r3, r4) \ vmovdqa r2, r4; vpxor r1, r2, r2; \ vpxor r2, r0, r0; vpand r2, r4, r4; \ vpxor r0, r4, r4; vpand r1, r0, r0; \ vpxor r3, r1, r1; vpor r4, r3, r3; \ vpxor r3, r2, r2; vpxor r3, r0, r0; \ vpxor r4, r1, r1; vpand r2, r3, r3; \ vpxor r1, r3, r3; vpxor r0, r1, r1; \ vpor r2, r1, r1; vpxor r3, r0, r0; \ vpxor r4, r1, r1; \ vpxor r1, r0, r0; #define SBOX4(r0, r1, r2, r3, r4) \ vpxor r3, r1, r1; vpxor RNOT, r3, r3; \ vpxor r3, r2, r2; vpxor r0, r3, r3; \ vmovdqa r1, r4; vpand r3, r1, r1; \ vpxor r2, r1, r1; vpxor r3, r4, r4; \ vpxor r4, r0, r0; vpand r4, r2, r2; \ vpxor r0, r2, r2; vpand r1, r0, r0; \ vpxor r0, r3, r3; vpor r1, r4, r4; \ vpxor r0, r4, r4; vpor r3, r0, r0; \ vpxor r2, r0, r0; vpand r3, r2, r2; \ vpxor RNOT, r0, r0; vpxor r2, r4, r4; 
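/* Note on the SBOXn/SBOXn_INVERSE macros above and below: they are the
 * bit-sliced boolean formulas from the paper cited above.  r0..r3 hold
 * the four 32-bit words of the Serpent state, one word per 32-bit lane
 * for eight blocks in each ymm register; the and/or/xor sequence then
 * evaluates the 4-bit S-box on every bit position of every lane at
 * once.  r4 is scratch, and RNOT (all ones, set up by vpcmpeqd) turns
 * vpxor into a bitwise NOT.
 */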
#define SBOX4_INVERSE(r0, r1, r2, r3, r4) \ vmovdqa r2, r4; vpand r3, r2, r2; \ vpxor r1, r2, r2; vpor r3, r1, r1; \ vpand r0, r1, r1; vpxor r2, r4, r4; \ vpxor r1, r4, r4; vpand r2, r1, r1; \ vpxor RNOT, r0, r0; vpxor r4, r3, r3; \ vpxor r3, r1, r1; vpand r0, r3, r3; \ vpxor r2, r3, r3; vpxor r1, r0, r0; \ vpand r0, r2, r2; vpxor r0, r3, r3; \ vpxor r4, r2, r2; \ vpor r3, r2, r2; vpxor r0, r3, r3; \ vpxor r1, r2, r2; #define SBOX5(r0, r1, r2, r3, r4) \ vpxor r1, r0, r0; vpxor r3, r1, r1; \ vpxor RNOT, r3, r3; vmovdqa r1, r4; \ vpand r0, r1, r1; vpxor r3, r2, r2; \ vpxor r2, r1, r1; vpor r4, r2, r2; \ vpxor r3, r4, r4; vpand r1, r3, r3; \ vpxor r0, r3, r3; vpxor r1, r4, r4; \ vpxor r2, r4, r4; vpxor r0, r2, r2; \ vpand r3, r0, r0; vpxor RNOT, r2, r2; \ vpxor r4, r0, r0; vpor r3, r4, r4; \ vpxor r4, r2, r2; #define SBOX5_INVERSE(r0, r1, r2, r3, r4) \ vpxor RNOT, r1, r1; vmovdqa r3, r4; \ vpxor r1, r2, r2; vpor r0, r3, r3; \ vpxor r2, r3, r3; vpor r1, r2, r2; \ vpand r0, r2, r2; vpxor r3, r4, r4; \ vpxor r4, r2, r2; vpor r0, r4, r4; \ vpxor r1, r4, r4; vpand r2, r1, r1; \ vpxor r3, r1, r1; vpxor r2, r4, r4; \ vpand r4, r3, r3; vpxor r1, r4, r4; \ vpxor r4, r3, r3; vpxor RNOT, r4, r4; \ vpxor r0, r3, r3; #define SBOX6(r0, r1, r2, r3, r4) \ vpxor RNOT, r2, r2; vmovdqa r3, r4; \ vpand r0, r3, r3; vpxor r4, r0, r0; \ vpxor r2, r3, r3; vpor r4, r2, r2; \ vpxor r3, r1, r1; vpxor r0, r2, r2; \ vpor r1, r0, r0; vpxor r1, r2, r2; \ vpxor r0, r4, r4; vpor r3, r0, r0; \ vpxor r2, r0, r0; vpxor r3, r4, r4; \ vpxor r0, r4, r4; vpxor RNOT, r3, r3; \ vpand r4, r2, r2; \ vpxor r3, r2, r2; #define SBOX6_INVERSE(r0, r1, r2, r3, r4) \ vpxor r2, r0, r0; vmovdqa r2, r4; \ vpand r0, r2, r2; vpxor r3, r4, r4; \ vpxor RNOT, r2, r2; vpxor r1, r3, r3; \ vpxor r3, r2, r2; vpor r0, r4, r4; \ vpxor r2, r0, r0; vpxor r4, r3, r3; \ vpxor r1, r4, r4; vpand r3, r1, r1; \ vpxor r0, r1, r1; vpxor r3, r0, r0; \ vpor r2, r0, r0; vpxor r1, r3, r3; \ vpxor r0, r4, r4; #define SBOX7(r0, r1, r2, r3, r4) \ vmovdqa r1, r4; vpor r2, r1, r1; \ vpxor r3, r1, r1; vpxor r2, r4, r4; \ vpxor r1, r2, r2; vpor r4, r3, r3; \ vpand r0, r3, r3; vpxor r2, r4, r4; \ vpxor r1, r3, r3; vpor r4, r1, r1; \ vpxor r0, r1, r1; vpor r4, r0, r0; \ vpxor r2, r0, r0; vpxor r4, r1, r1; \ vpxor r1, r2, r2; vpand r0, r1, r1; \ vpxor r4, r1, r1; vpxor RNOT, r2, r2; \ vpor r0, r2, r2; \ vpxor r2, r4, r4; #define SBOX7_INVERSE(r0, r1, r2, r3, r4) \ vmovdqa r2, r4; vpxor r0, r2, r2; \ vpand r3, r0, r0; vpor r3, r4, r4; \ vpxor RNOT, r2, r2; vpxor r1, r3, r3; \ vpor r0, r1, r1; vpxor r2, r0, r0; \ vpand r4, r2, r2; vpand r4, r3, r3; \ vpxor r2, r1, r1; vpxor r0, r2, r2; \ vpor r2, r0, r0; vpxor r1, r4, r4; \ vpxor r3, r0, r0; vpxor r4, r3, r3; \ vpor r0, r4, r4; vpxor r2, r3, r3; \ vpxor r2, r4, r4; /* Apply SBOX number WHICH to to the block. */ #define SBOX(which, r0, r1, r2, r3, r4) \ SBOX##which (r0, r1, r2, r3, r4) /* Apply inverse SBOX number WHICH to to the block. */ #define SBOX_INVERSE(which, r0, r1, r2, r3, r4) \ SBOX##which##_INVERSE (r0, r1, r2, r3, r4) /* XOR round key into block state in r0,r1,r2,r3. r4 used as temporary. */ #define BLOCK_XOR_KEY(r0, r1, r2, r3, r4, round) \ vpbroadcastd (ctx_keys + (round) * 16 + 0 * 4)(CTX), r4; \ vpxor r4, r0, r0; \ vpbroadcastd (ctx_keys + (round) * 16 + 1 * 4)(CTX), r4; \ vpxor r4, r1, r1; \ vpbroadcastd (ctx_keys + (round) * 16 + 2 * 4)(CTX), r4; \ vpxor r4, r2, r2; \ vpbroadcastd (ctx_keys + (round) * 16 + 3 * 4)(CTX), r4; \ vpxor r4, r3, r3; /* Apply the linear transformation to BLOCK. 
*/ #define LINEAR_TRANSFORMATION(r0, r1, r2, r3, r4) \ vec_rol(r0, 13, r4); \ vec_rol(r2, 3, r4); \ vpxor r0, r1, r1; \ vpxor r2, r1, r1; \ vpslld $3, r0, r4; \ vpxor r2, r3, r3; \ vpxor r4, r3, r3; \ vec_rol(r1, 1, r4); \ vec_rol(r3, 7, r4); \ vpxor r1, r0, r0; \ vpxor r3, r0, r0; \ vpslld $7, r1, r4; \ vpxor r3, r2, r2; \ vpxor r4, r2, r2; \ vec_rol(r0, 5, r4); \ vec_rol(r2, 22, r4); /* Apply the inverse linear transformation to BLOCK. */ #define LINEAR_TRANSFORMATION_INVERSE(r0, r1, r2, r3, r4) \ vec_ror(r2, 22, r4); \ vec_ror(r0, 5, r4); \ vpslld $7, r1, r4; \ vpxor r3, r2, r2; \ vpxor r4, r2, r2; \ vpxor r1, r0, r0; \ vpxor r3, r0, r0; \ vec_ror(r3, 7, r4); \ vec_ror(r1, 1, r4); \ vpslld $3, r0, r4; \ vpxor r2, r3, r3; \ vpxor r4, r3, r3; \ vpxor r0, r1, r1; \ vpxor r2, r1, r1; \ vec_ror(r2, 3, r4); \ vec_ror(r0, 13, r4); /* Apply a Serpent round to sixteen parallel blocks. This macro increments `round'. */ #define ROUND(round, which, a0, a1, a2, a3, a4, na0, na1, na2, na3, na4, \ b0, b1, b2, b3, b4, nb0, nb1, nb2, nb3, nb4) \ BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \ SBOX (which, a0, a1, a2, a3, a4); \ BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \ SBOX (which, b0, b1, b2, b3, b4); \ LINEAR_TRANSFORMATION (na0, na1, na2, na3, na4); \ LINEAR_TRANSFORMATION (nb0, nb1, nb2, nb3, nb4); /* Apply the last Serpent round to sixteen parallel blocks. This macro increments `round'. */ #define ROUND_LAST(round, which, a0, a1, a2, a3, a4, na0, na1, na2, na3, na4, \ b0, b1, b2, b3, b4, nb0, nb1, nb2, nb3, nb4) \ BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \ SBOX (which, a0, a1, a2, a3, a4); \ BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \ SBOX (which, b0, b1, b2, b3, b4); \ BLOCK_XOR_KEY (na0, na1, na2, na3, na4, ((round) + 1)); \ BLOCK_XOR_KEY (nb0, nb1, nb2, nb3, nb4, ((round) + 1)); /* Apply an inverse Serpent round to sixteen parallel blocks. This macro increments `round'. */ #define ROUND_INVERSE(round, which, a0, a1, a2, a3, a4, \ na0, na1, na2, na3, na4, \ b0, b1, b2, b3, b4, \ nb0, nb1, nb2, nb3, nb4) \ LINEAR_TRANSFORMATION_INVERSE (a0, a1, a2, a3, a4); \ LINEAR_TRANSFORMATION_INVERSE (b0, b1, b2, b3, b4); \ SBOX_INVERSE (which, a0, a1, a2, a3, a4); \ BLOCK_XOR_KEY (na0, na1, na2, na3, na4, round); \ SBOX_INVERSE (which, b0, b1, b2, b3, b4); \ BLOCK_XOR_KEY (nb0, nb1, nb2, nb3, nb4, round); /* Apply the first inverse Serpent round to sixteen parallel blocks. This macro increments `round'. 
*/ #define ROUND_FIRST_INVERSE(round, which, a0, a1, a2, a3, a4, \ na0, na1, na2, na3, na4, \ b0, b1, b2, b3, b4, \ nb0, nb1, nb2, nb3, nb4) \ BLOCK_XOR_KEY (a0, a1, a2, a3, a4, ((round) + 1)); \ BLOCK_XOR_KEY (b0, b1, b2, b3, b4, ((round) + 1)); \ SBOX_INVERSE (which, a0, a1, a2, a3, a4); \ BLOCK_XOR_KEY (na0, na1, na2, na3, na4, round); \ SBOX_INVERSE (which, b0, b1, b2, b3, b4); \ BLOCK_XOR_KEY (nb0, nb1, nb2, nb3, nb4, round); .text .align 8 ELF(.type __serpent_enc_blk16,@function;) __serpent_enc_blk16: /* input: * %rdi: ctx, CTX * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: sixteen parallel * plaintext blocks * output: * RA4, RA1, RA2, RA0, RB4, RB1, RB2, RB0: sixteen parallel * ciphertext blocks */ CFI_STARTPROC(); vpcmpeqd RNOT, RNOT, RNOT; transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0, RTMP1); transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1); ROUND (0, 0, RA0, RA1, RA2, RA3, RA4, RA1, RA4, RA2, RA0, RA3, RB0, RB1, RB2, RB3, RB4, RB1, RB4, RB2, RB0, RB3); ROUND (1, 1, RA1, RA4, RA2, RA0, RA3, RA2, RA1, RA0, RA4, RA3, RB1, RB4, RB2, RB0, RB3, RB2, RB1, RB0, RB4, RB3); ROUND (2, 2, RA2, RA1, RA0, RA4, RA3, RA0, RA4, RA1, RA3, RA2, RB2, RB1, RB0, RB4, RB3, RB0, RB4, RB1, RB3, RB2); ROUND (3, 3, RA0, RA4, RA1, RA3, RA2, RA4, RA1, RA3, RA2, RA0, RB0, RB4, RB1, RB3, RB2, RB4, RB1, RB3, RB2, RB0); ROUND (4, 4, RA4, RA1, RA3, RA2, RA0, RA1, RA0, RA4, RA2, RA3, RB4, RB1, RB3, RB2, RB0, RB1, RB0, RB4, RB2, RB3); ROUND (5, 5, RA1, RA0, RA4, RA2, RA3, RA0, RA2, RA1, RA4, RA3, RB1, RB0, RB4, RB2, RB3, RB0, RB2, RB1, RB4, RB3); ROUND (6, 6, RA0, RA2, RA1, RA4, RA3, RA0, RA2, RA3, RA1, RA4, RB0, RB2, RB1, RB4, RB3, RB0, RB2, RB3, RB1, RB4); ROUND (7, 7, RA0, RA2, RA3, RA1, RA4, RA4, RA1, RA2, RA0, RA3, RB0, RB2, RB3, RB1, RB4, RB4, RB1, RB2, RB0, RB3); ROUND (8, 0, RA4, RA1, RA2, RA0, RA3, RA1, RA3, RA2, RA4, RA0, RB4, RB1, RB2, RB0, RB3, RB1, RB3, RB2, RB4, RB0); ROUND (9, 1, RA1, RA3, RA2, RA4, RA0, RA2, RA1, RA4, RA3, RA0, RB1, RB3, RB2, RB4, RB0, RB2, RB1, RB4, RB3, RB0); ROUND (10, 2, RA2, RA1, RA4, RA3, RA0, RA4, RA3, RA1, RA0, RA2, RB2, RB1, RB4, RB3, RB0, RB4, RB3, RB1, RB0, RB2); ROUND (11, 3, RA4, RA3, RA1, RA0, RA2, RA3, RA1, RA0, RA2, RA4, RB4, RB3, RB1, RB0, RB2, RB3, RB1, RB0, RB2, RB4); ROUND (12, 4, RA3, RA1, RA0, RA2, RA4, RA1, RA4, RA3, RA2, RA0, RB3, RB1, RB0, RB2, RB4, RB1, RB4, RB3, RB2, RB0); ROUND (13, 5, RA1, RA4, RA3, RA2, RA0, RA4, RA2, RA1, RA3, RA0, RB1, RB4, RB3, RB2, RB0, RB4, RB2, RB1, RB3, RB0); ROUND (14, 6, RA4, RA2, RA1, RA3, RA0, RA4, RA2, RA0, RA1, RA3, RB4, RB2, RB1, RB3, RB0, RB4, RB2, RB0, RB1, RB3); ROUND (15, 7, RA4, RA2, RA0, RA1, RA3, RA3, RA1, RA2, RA4, RA0, RB4, RB2, RB0, RB1, RB3, RB3, RB1, RB2, RB4, RB0); ROUND (16, 0, RA3, RA1, RA2, RA4, RA0, RA1, RA0, RA2, RA3, RA4, RB3, RB1, RB2, RB4, RB0, RB1, RB0, RB2, RB3, RB4); ROUND (17, 1, RA1, RA0, RA2, RA3, RA4, RA2, RA1, RA3, RA0, RA4, RB1, RB0, RB2, RB3, RB4, RB2, RB1, RB3, RB0, RB4); ROUND (18, 2, RA2, RA1, RA3, RA0, RA4, RA3, RA0, RA1, RA4, RA2, RB2, RB1, RB3, RB0, RB4, RB3, RB0, RB1, RB4, RB2); ROUND (19, 3, RA3, RA0, RA1, RA4, RA2, RA0, RA1, RA4, RA2, RA3, RB3, RB0, RB1, RB4, RB2, RB0, RB1, RB4, RB2, RB3); ROUND (20, 4, RA0, RA1, RA4, RA2, RA3, RA1, RA3, RA0, RA2, RA4, RB0, RB1, RB4, RB2, RB3, RB1, RB3, RB0, RB2, RB4); ROUND (21, 5, RA1, RA3, RA0, RA2, RA4, RA3, RA2, RA1, RA0, RA4, RB1, RB3, RB0, RB2, RB4, RB3, RB2, RB1, RB0, RB4); ROUND (22, 6, RA3, RA2, RA1, RA0, RA4, RA3, RA2, RA4, RA1, RA0, RB3, RB2, RB1, RB0, RB4, RB3, RB2, RB4, RB1, RB0); ROUND (23, 7, RA3, RA2, RA4, RA1, RA0, RA0, RA1, RA2, 
RA3, RA4, RB3, RB2, RB4, RB1, RB0, RB0, RB1, RB2, RB3, RB4); ROUND (24, 0, RA0, RA1, RA2, RA3, RA4, RA1, RA4, RA2, RA0, RA3, RB0, RB1, RB2, RB3, RB4, RB1, RB4, RB2, RB0, RB3); ROUND (25, 1, RA1, RA4, RA2, RA0, RA3, RA2, RA1, RA0, RA4, RA3, RB1, RB4, RB2, RB0, RB3, RB2, RB1, RB0, RB4, RB3); ROUND (26, 2, RA2, RA1, RA0, RA4, RA3, RA0, RA4, RA1, RA3, RA2, RB2, RB1, RB0, RB4, RB3, RB0, RB4, RB1, RB3, RB2); ROUND (27, 3, RA0, RA4, RA1, RA3, RA2, RA4, RA1, RA3, RA2, RA0, RB0, RB4, RB1, RB3, RB2, RB4, RB1, RB3, RB2, RB0); ROUND (28, 4, RA4, RA1, RA3, RA2, RA0, RA1, RA0, RA4, RA2, RA3, RB4, RB1, RB3, RB2, RB0, RB1, RB0, RB4, RB2, RB3); ROUND (29, 5, RA1, RA0, RA4, RA2, RA3, RA0, RA2, RA1, RA4, RA3, RB1, RB0, RB4, RB2, RB3, RB0, RB2, RB1, RB4, RB3); ROUND (30, 6, RA0, RA2, RA1, RA4, RA3, RA0, RA2, RA3, RA1, RA4, RB0, RB2, RB1, RB4, RB3, RB0, RB2, RB3, RB1, RB4); ROUND_LAST (31, 7, RA0, RA2, RA3, RA1, RA4, RA4, RA1, RA2, RA0, RA3, RB0, RB2, RB3, RB1, RB4, RB4, RB1, RB2, RB0, RB3); transpose_4x4(RA4, RA1, RA2, RA0, RA3, RTMP0, RTMP1); transpose_4x4(RB4, RB1, RB2, RB0, RB3, RTMP0, RTMP1); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size __serpent_enc_blk16,.-__serpent_enc_blk16;) .align 8 ELF(.type __serpent_dec_blk16,@function;) __serpent_dec_blk16: /* input: * %rdi: ctx, CTX * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: sixteen parallel * ciphertext blocks * output: * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: sixteen parallel * plaintext blocks */ CFI_STARTPROC(); vpcmpeqd RNOT, RNOT, RNOT; transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0, RTMP1); transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1); ROUND_FIRST_INVERSE (31, 7, RA0, RA1, RA2, RA3, RA4, RA3, RA0, RA1, RA4, RA2, RB0, RB1, RB2, RB3, RB4, RB3, RB0, RB1, RB4, RB2); ROUND_INVERSE (30, 6, RA3, RA0, RA1, RA4, RA2, RA0, RA1, RA2, RA4, RA3, RB3, RB0, RB1, RB4, RB2, RB0, RB1, RB2, RB4, RB3); ROUND_INVERSE (29, 5, RA0, RA1, RA2, RA4, RA3, RA1, RA3, RA4, RA2, RA0, RB0, RB1, RB2, RB4, RB3, RB1, RB3, RB4, RB2, RB0); ROUND_INVERSE (28, 4, RA1, RA3, RA4, RA2, RA0, RA1, RA2, RA4, RA0, RA3, RB1, RB3, RB4, RB2, RB0, RB1, RB2, RB4, RB0, RB3); ROUND_INVERSE (27, 3, RA1, RA2, RA4, RA0, RA3, RA4, RA2, RA0, RA1, RA3, RB1, RB2, RB4, RB0, RB3, RB4, RB2, RB0, RB1, RB3); ROUND_INVERSE (26, 2, RA4, RA2, RA0, RA1, RA3, RA2, RA3, RA0, RA1, RA4, RB4, RB2, RB0, RB1, RB3, RB2, RB3, RB0, RB1, RB4); ROUND_INVERSE (25, 1, RA2, RA3, RA0, RA1, RA4, RA4, RA2, RA1, RA0, RA3, RB2, RB3, RB0, RB1, RB4, RB4, RB2, RB1, RB0, RB3); ROUND_INVERSE (24, 0, RA4, RA2, RA1, RA0, RA3, RA4, RA3, RA2, RA0, RA1, RB4, RB2, RB1, RB0, RB3, RB4, RB3, RB2, RB0, RB1); ROUND_INVERSE (23, 7, RA4, RA3, RA2, RA0, RA1, RA0, RA4, RA3, RA1, RA2, RB4, RB3, RB2, RB0, RB1, RB0, RB4, RB3, RB1, RB2); ROUND_INVERSE (22, 6, RA0, RA4, RA3, RA1, RA2, RA4, RA3, RA2, RA1, RA0, RB0, RB4, RB3, RB1, RB2, RB4, RB3, RB2, RB1, RB0); ROUND_INVERSE (21, 5, RA4, RA3, RA2, RA1, RA0, RA3, RA0, RA1, RA2, RA4, RB4, RB3, RB2, RB1, RB0, RB3, RB0, RB1, RB2, RB4); ROUND_INVERSE (20, 4, RA3, RA0, RA1, RA2, RA4, RA3, RA2, RA1, RA4, RA0, RB3, RB0, RB1, RB2, RB4, RB3, RB2, RB1, RB4, RB0); ROUND_INVERSE (19, 3, RA3, RA2, RA1, RA4, RA0, RA1, RA2, RA4, RA3, RA0, RB3, RB2, RB1, RB4, RB0, RB1, RB2, RB4, RB3, RB0); ROUND_INVERSE (18, 2, RA1, RA2, RA4, RA3, RA0, RA2, RA0, RA4, RA3, RA1, RB1, RB2, RB4, RB3, RB0, RB2, RB0, RB4, RB3, RB1); ROUND_INVERSE (17, 1, RA2, RA0, RA4, RA3, RA1, RA1, RA2, RA3, RA4, RA0, RB2, RB0, RB4, RB3, RB1, RB1, RB2, RB3, RB4, RB0); ROUND_INVERSE (16, 0, RA1, RA2, RA3, RA4, RA0, RA1, RA0, RA2, RA4, RA3, RB1, RB2, RB3, RB4, 
RB0, RB1, RB0, RB2, RB4, RB3); ROUND_INVERSE (15, 7, RA1, RA0, RA2, RA4, RA3, RA4, RA1, RA0, RA3, RA2, RB1, RB0, RB2, RB4, RB3, RB4, RB1, RB0, RB3, RB2); ROUND_INVERSE (14, 6, RA4, RA1, RA0, RA3, RA2, RA1, RA0, RA2, RA3, RA4, RB4, RB1, RB0, RB3, RB2, RB1, RB0, RB2, RB3, RB4); ROUND_INVERSE (13, 5, RA1, RA0, RA2, RA3, RA4, RA0, RA4, RA3, RA2, RA1, RB1, RB0, RB2, RB3, RB4, RB0, RB4, RB3, RB2, RB1); ROUND_INVERSE (12, 4, RA0, RA4, RA3, RA2, RA1, RA0, RA2, RA3, RA1, RA4, RB0, RB4, RB3, RB2, RB1, RB0, RB2, RB3, RB1, RB4); ROUND_INVERSE (11, 3, RA0, RA2, RA3, RA1, RA4, RA3, RA2, RA1, RA0, RA4, RB0, RB2, RB3, RB1, RB4, RB3, RB2, RB1, RB0, RB4); ROUND_INVERSE (10, 2, RA3, RA2, RA1, RA0, RA4, RA2, RA4, RA1, RA0, RA3, RB3, RB2, RB1, RB0, RB4, RB2, RB4, RB1, RB0, RB3); ROUND_INVERSE (9, 1, RA2, RA4, RA1, RA0, RA3, RA3, RA2, RA0, RA1, RA4, RB2, RB4, RB1, RB0, RB3, RB3, RB2, RB0, RB1, RB4); ROUND_INVERSE (8, 0, RA3, RA2, RA0, RA1, RA4, RA3, RA4, RA2, RA1, RA0, RB3, RB2, RB0, RB1, RB4, RB3, RB4, RB2, RB1, RB0); ROUND_INVERSE (7, 7, RA3, RA4, RA2, RA1, RA0, RA1, RA3, RA4, RA0, RA2, RB3, RB4, RB2, RB1, RB0, RB1, RB3, RB4, RB0, RB2); ROUND_INVERSE (6, 6, RA1, RA3, RA4, RA0, RA2, RA3, RA4, RA2, RA0, RA1, RB1, RB3, RB4, RB0, RB2, RB3, RB4, RB2, RB0, RB1); ROUND_INVERSE (5, 5, RA3, RA4, RA2, RA0, RA1, RA4, RA1, RA0, RA2, RA3, RB3, RB4, RB2, RB0, RB1, RB4, RB1, RB0, RB2, RB3); ROUND_INVERSE (4, 4, RA4, RA1, RA0, RA2, RA3, RA4, RA2, RA0, RA3, RA1, RB4, RB1, RB0, RB2, RB3, RB4, RB2, RB0, RB3, RB1); ROUND_INVERSE (3, 3, RA4, RA2, RA0, RA3, RA1, RA0, RA2, RA3, RA4, RA1, RB4, RB2, RB0, RB3, RB1, RB0, RB2, RB3, RB4, RB1); ROUND_INVERSE (2, 2, RA0, RA2, RA3, RA4, RA1, RA2, RA1, RA3, RA4, RA0, RB0, RB2, RB3, RB4, RB1, RB2, RB1, RB3, RB4, RB0); ROUND_INVERSE (1, 1, RA2, RA1, RA3, RA4, RA0, RA0, RA2, RA4, RA3, RA1, RB2, RB1, RB3, RB4, RB0, RB0, RB2, RB4, RB3, RB1); ROUND_INVERSE (0, 0, RA0, RA2, RA4, RA3, RA1, RA0, RA1, RA2, RA3, RA4, RB0, RB2, RB4, RB3, RB1, RB0, RB1, RB2, RB3, RB4); transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0, RTMP1); transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size __serpent_dec_blk16,.-__serpent_dec_blk16;) #define inc_le128(x, minus_one, tmp) \ vpcmpeqq minus_one, x, tmp; \ vpsubq minus_one, x, x; \ vpslldq $8, tmp, tmp; \ vpsubq tmp, x, x; .align 8 .globl _gcry_serpent_avx2_ctr_enc ELF(.type _gcry_serpent_avx2_ctr_enc,@function;) _gcry_serpent_avx2_ctr_enc: /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) * %rdx: src (16 blocks) * %rcx: iv (big endian, 128bit) */ CFI_STARTPROC(); movq 8(%rcx), %rax; bswapq %rax; vzeroupper; vbroadcasti128 .Lbswap128_mask rRIP, RTMP3; vpcmpeqd RNOT, RNOT, RNOT; vpsrldq $8, RNOT, RNOT; /* ab: -1:0 ; cd: -1:0 */ vpaddq RNOT, RNOT, RTMP2; /* ab: -2:0 ; cd: -2:0 */ /* load IV and byteswap */ vmovdqu (%rcx), RTMP4x; vpshufb RTMP3x, RTMP4x, RTMP4x; vmovdqa RTMP4x, RTMP0x; inc_le128(RTMP4x, RNOTx, RTMP1x); vinserti128 $1, RTMP4x, RTMP0, RTMP0; vpshufb RTMP3, RTMP0, RA0; /* +1 ; +0 */ /* check need for handling 64-bit overflow and carry */ cmpq $(0xffffffffffffffff - 16), %rax; ja .Lhandle_ctr_carry; /* construct IVs */ vpsubq RTMP2, RTMP0, RTMP0; /* +3 ; +2 */ vpshufb RTMP3, RTMP0, RA1; vpsubq RTMP2, RTMP0, RTMP0; /* +5 ; +4 */ vpshufb RTMP3, RTMP0, RA2; vpsubq RTMP2, RTMP0, RTMP0; /* +7 ; +6 */ vpshufb RTMP3, RTMP0, RA3; vpsubq RTMP2, RTMP0, RTMP0; /* +9 ; +8 */ vpshufb RTMP3, RTMP0, RB0; vpsubq RTMP2, RTMP0, RTMP0; /* +11 ; +10 */ vpshufb RTMP3, RTMP0, RB1; vpsubq RTMP2, RTMP0, RTMP0; /* +13 ; +12 */ vpshufb 
RTMP3, RTMP0, RB2; vpsubq RTMP2, RTMP0, RTMP0; /* +15 ; +14 */ vpshufb RTMP3, RTMP0, RB3; vpsubq RTMP2, RTMP0, RTMP0; /* +16 */ vpshufb RTMP3x, RTMP0x, RTMP0x; jmp .Lctr_carry_done; .Lhandle_ctr_carry: /* construct IVs */ inc_le128(RTMP0, RNOT, RTMP1); inc_le128(RTMP0, RNOT, RTMP1); vpshufb RTMP3, RTMP0, RA1; /* +3 ; +2 */ inc_le128(RTMP0, RNOT, RTMP1); inc_le128(RTMP0, RNOT, RTMP1); vpshufb RTMP3, RTMP0, RA2; /* +5 ; +4 */ inc_le128(RTMP0, RNOT, RTMP1); inc_le128(RTMP0, RNOT, RTMP1); vpshufb RTMP3, RTMP0, RA3; /* +7 ; +6 */ inc_le128(RTMP0, RNOT, RTMP1); inc_le128(RTMP0, RNOT, RTMP1); vpshufb RTMP3, RTMP0, RB0; /* +9 ; +8 */ inc_le128(RTMP0, RNOT, RTMP1); inc_le128(RTMP0, RNOT, RTMP1); vpshufb RTMP3, RTMP0, RB1; /* +11 ; +10 */ inc_le128(RTMP0, RNOT, RTMP1); inc_le128(RTMP0, RNOT, RTMP1); vpshufb RTMP3, RTMP0, RB2; /* +13 ; +12 */ inc_le128(RTMP0, RNOT, RTMP1); inc_le128(RTMP0, RNOT, RTMP1); vpshufb RTMP3, RTMP0, RB3; /* +15 ; +14 */ inc_le128(RTMP0, RNOT, RTMP1); vextracti128 $1, RTMP0, RTMP0x; vpshufb RTMP3x, RTMP0x, RTMP0x; /* +16 */ .align 4 .Lctr_carry_done: /* store new IV */ vmovdqu RTMP0x, (%rcx); call __serpent_enc_blk16; vpxor (0 * 32)(%rdx), RA4, RA4; vpxor (1 * 32)(%rdx), RA1, RA1; vpxor (2 * 32)(%rdx), RA2, RA2; vpxor (3 * 32)(%rdx), RA0, RA0; vpxor (4 * 32)(%rdx), RB4, RB4; vpxor (5 * 32)(%rdx), RB1, RB1; vpxor (6 * 32)(%rdx), RB2, RB2; vpxor (7 * 32)(%rdx), RB0, RB0; vmovdqu RA4, (0 * 32)(%rsi); vmovdqu RA1, (1 * 32)(%rsi); vmovdqu RA2, (2 * 32)(%rsi); vmovdqu RA0, (3 * 32)(%rsi); vmovdqu RB4, (4 * 32)(%rsi); vmovdqu RB1, (5 * 32)(%rsi); vmovdqu RB2, (6 * 32)(%rsi); vmovdqu RB0, (7 * 32)(%rsi); vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_serpent_avx2_ctr_enc,.-_gcry_serpent_avx2_ctr_enc;) .align 8 .globl _gcry_serpent_avx2_cbc_dec ELF(.type _gcry_serpent_avx2_cbc_dec,@function;) _gcry_serpent_avx2_cbc_dec: /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) * %rdx: src (16 blocks) * %rcx: iv */ CFI_STARTPROC(); vzeroupper; vmovdqu (0 * 32)(%rdx), RA0; vmovdqu (1 * 32)(%rdx), RA1; vmovdqu (2 * 32)(%rdx), RA2; vmovdqu (3 * 32)(%rdx), RA3; vmovdqu (4 * 32)(%rdx), RB0; vmovdqu (5 * 32)(%rdx), RB1; vmovdqu (6 * 32)(%rdx), RB2; vmovdqu (7 * 32)(%rdx), RB3; call __serpent_dec_blk16; vmovdqu (%rcx), RNOTx; vinserti128 $1, (%rdx), RNOT, RNOT; vpxor RNOT, RA0, RA0; vpxor (0 * 32 + 16)(%rdx), RA1, RA1; vpxor (1 * 32 + 16)(%rdx), RA2, RA2; vpxor (2 * 32 + 16)(%rdx), RA3, RA3; vpxor (3 * 32 + 16)(%rdx), RB0, RB0; vpxor (4 * 32 + 16)(%rdx), RB1, RB1; vpxor (5 * 32 + 16)(%rdx), RB2, RB2; vpxor (6 * 32 + 16)(%rdx), RB3, RB3; vmovdqu (7 * 32 + 16)(%rdx), RNOTx; vmovdqu RNOTx, (%rcx); /* store new IV */ vmovdqu RA0, (0 * 32)(%rsi); vmovdqu RA1, (1 * 32)(%rsi); vmovdqu RA2, (2 * 32)(%rsi); vmovdqu RA3, (3 * 32)(%rsi); vmovdqu RB0, (4 * 32)(%rsi); vmovdqu RB1, (5 * 32)(%rsi); vmovdqu RB2, (6 * 32)(%rsi); vmovdqu RB3, (7 * 32)(%rsi); vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_serpent_avx2_cbc_dec,.-_gcry_serpent_avx2_cbc_dec;) .align 8 .globl _gcry_serpent_avx2_cfb_dec ELF(.type _gcry_serpent_avx2_cfb_dec,@function;) _gcry_serpent_avx2_cfb_dec: /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) * %rdx: src (16 blocks) * %rcx: iv */ CFI_STARTPROC(); vzeroupper; /* Load input */ vmovdqu (%rcx), RNOTx; vinserti128 $1, (%rdx), RNOT, RA0; vmovdqu (0 * 32 + 16)(%rdx), RA1; vmovdqu (1 * 32 + 16)(%rdx), RA2; vmovdqu (2 * 32 + 16)(%rdx), RA3; vmovdqu (3 * 32 + 16)(%rdx), RB0; vmovdqu (4 * 32 + 16)(%rdx), RB1; vmovdqu (5 * 32 + 16)(%rdx), RB2; vmovdqu (6 * 32 + 
16)(%rdx), RB3; /* Update IV */ vmovdqu (7 * 32 + 16)(%rdx), RNOTx; vmovdqu RNOTx, (%rcx); call __serpent_enc_blk16; vpxor (0 * 32)(%rdx), RA4, RA4; vpxor (1 * 32)(%rdx), RA1, RA1; vpxor (2 * 32)(%rdx), RA2, RA2; vpxor (3 * 32)(%rdx), RA0, RA0; vpxor (4 * 32)(%rdx), RB4, RB4; vpxor (5 * 32)(%rdx), RB1, RB1; vpxor (6 * 32)(%rdx), RB2, RB2; vpxor (7 * 32)(%rdx), RB0, RB0; vmovdqu RA4, (0 * 32)(%rsi); vmovdqu RA1, (1 * 32)(%rsi); vmovdqu RA2, (2 * 32)(%rsi); vmovdqu RA0, (3 * 32)(%rsi); vmovdqu RB4, (4 * 32)(%rsi); vmovdqu RB1, (5 * 32)(%rsi); vmovdqu RB2, (6 * 32)(%rsi); vmovdqu RB0, (7 * 32)(%rsi); vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_serpent_avx2_cfb_dec,.-_gcry_serpent_avx2_cfb_dec;) .align 8 .globl _gcry_serpent_avx2_ocb_enc ELF(.type _gcry_serpent_avx2_ocb_enc,@function;) _gcry_serpent_avx2_ocb_enc: /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) * %rdx: src (16 blocks) * %rcx: offset * %r8 : checksum * %r9 : L pointers (void *L[16]) */ CFI_STARTPROC(); vzeroupper; subq $(4 * 8), %rsp; CFI_ADJUST_CFA_OFFSET(4 * 8); movq %r10, (0 * 8)(%rsp); movq %r11, (1 * 8)(%rsp); movq %r12, (2 * 8)(%rsp); movq %r13, (3 * 8)(%rsp); CFI_REL_OFFSET(%r10, 0 * 8); CFI_REL_OFFSET(%r11, 1 * 8); CFI_REL_OFFSET(%r12, 2 * 8); CFI_REL_OFFSET(%r13, 3 * 8); vmovdqu (%rcx), RTMP0x; vmovdqu (%r8), RTMP1x; /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* Checksum_i = Checksum_{i-1} xor P_i */ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ #define OCB_INPUT(n, l0reg, l1reg, yreg) \ vmovdqu (n * 32)(%rdx), yreg; \ vpxor (l0reg), RTMP0x, RNOTx; \ vpxor (l1reg), RNOTx, RTMP0x; \ vinserti128 $1, RTMP0x, RNOT, RNOT; \ vpxor yreg, RTMP1, RTMP1; \ vpxor yreg, RNOT, yreg; \ vmovdqu RNOT, (n * 32)(%rsi); movq (0 * 8)(%r9), %r10; movq (1 * 8)(%r9), %r11; movq (2 * 8)(%r9), %r12; movq (3 * 8)(%r9), %r13; OCB_INPUT(0, %r10, %r11, RA0); OCB_INPUT(1, %r12, %r13, RA1); movq (4 * 8)(%r9), %r10; movq (5 * 8)(%r9), %r11; movq (6 * 8)(%r9), %r12; movq (7 * 8)(%r9), %r13; OCB_INPUT(2, %r10, %r11, RA2); OCB_INPUT(3, %r12, %r13, RA3); movq (8 * 8)(%r9), %r10; movq (9 * 8)(%r9), %r11; movq (10 * 8)(%r9), %r12; movq (11 * 8)(%r9), %r13; OCB_INPUT(4, %r10, %r11, RB0); OCB_INPUT(5, %r12, %r13, RB1); movq (12 * 8)(%r9), %r10; movq (13 * 8)(%r9), %r11; movq (14 * 8)(%r9), %r12; movq (15 * 8)(%r9), %r13; OCB_INPUT(6, %r10, %r11, RB2); OCB_INPUT(7, %r12, %r13, RB3); #undef OCB_INPUT vextracti128 $1, RTMP1, RNOTx; vmovdqu RTMP0x, (%rcx); vpxor RNOTx, RTMP1x, RTMP1x; vmovdqu RTMP1x, (%r8); movq (0 * 8)(%rsp), %r10; movq (1 * 8)(%rsp), %r11; movq (2 * 8)(%rsp), %r12; movq (3 * 8)(%rsp), %r13; CFI_RESTORE(%r10); CFI_RESTORE(%r11); CFI_RESTORE(%r12); CFI_RESTORE(%r13); call __serpent_enc_blk16; addq $(4 * 8), %rsp; CFI_ADJUST_CFA_OFFSET(-4 * 8); vpxor (0 * 32)(%rsi), RA4, RA4; vpxor (1 * 32)(%rsi), RA1, RA1; vpxor (2 * 32)(%rsi), RA2, RA2; vpxor (3 * 32)(%rsi), RA0, RA0; vpxor (4 * 32)(%rsi), RB4, RB4; vpxor (5 * 32)(%rsi), RB1, RB1; vpxor (6 * 32)(%rsi), RB2, RB2; vpxor (7 * 32)(%rsi), RB0, RB0; vmovdqu RA4, (0 * 32)(%rsi); vmovdqu RA1, (1 * 32)(%rsi); vmovdqu RA2, (2 * 32)(%rsi); vmovdqu RA0, (3 * 32)(%rsi); vmovdqu RB4, (4 * 32)(%rsi); vmovdqu RB1, (5 * 32)(%rsi); vmovdqu RB2, (6 * 32)(%rsi); vmovdqu RB0, (7 * 32)(%rsi); vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_serpent_avx2_ocb_enc,.-_gcry_serpent_avx2_ocb_enc;) .align 8 .globl _gcry_serpent_avx2_ocb_dec ELF(.type _gcry_serpent_avx2_ocb_dec,@function;) _gcry_serpent_avx2_ocb_dec: /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) * %rdx: 
src (16 blocks) * %rcx: offset * %r8 : checksum * %r9 : L pointers (void *L[16]) */ CFI_STARTPROC(); vzeroupper; subq $(4 * 8), %rsp; CFI_ADJUST_CFA_OFFSET(4 * 8); movq %r10, (0 * 8)(%rsp); movq %r11, (1 * 8)(%rsp); movq %r12, (2 * 8)(%rsp); movq %r13, (3 * 8)(%rsp); CFI_REL_OFFSET(%r10, 0 * 8); CFI_REL_OFFSET(%r11, 1 * 8); CFI_REL_OFFSET(%r12, 2 * 8); CFI_REL_OFFSET(%r13, 3 * 8); vmovdqu (%rcx), RTMP0x; /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ #define OCB_INPUT(n, l0reg, l1reg, yreg) \ vmovdqu (n * 32)(%rdx), yreg; \ vpxor (l0reg), RTMP0x, RNOTx; \ vpxor (l1reg), RNOTx, RTMP0x; \ vinserti128 $1, RTMP0x, RNOT, RNOT; \ vpxor yreg, RNOT, yreg; \ vmovdqu RNOT, (n * 32)(%rsi); movq (0 * 8)(%r9), %r10; movq (1 * 8)(%r9), %r11; movq (2 * 8)(%r9), %r12; movq (3 * 8)(%r9), %r13; OCB_INPUT(0, %r10, %r11, RA0); OCB_INPUT(1, %r12, %r13, RA1); movq (4 * 8)(%r9), %r10; movq (5 * 8)(%r9), %r11; movq (6 * 8)(%r9), %r12; movq (7 * 8)(%r9), %r13; OCB_INPUT(2, %r10, %r11, RA2); OCB_INPUT(3, %r12, %r13, RA3); movq (8 * 8)(%r9), %r10; movq (9 * 8)(%r9), %r11; movq (10 * 8)(%r9), %r12; movq (11 * 8)(%r9), %r13; OCB_INPUT(4, %r10, %r11, RB0); OCB_INPUT(5, %r12, %r13, RB1); movq (12 * 8)(%r9), %r10; movq (13 * 8)(%r9), %r11; movq (14 * 8)(%r9), %r12; movq (15 * 8)(%r9), %r13; OCB_INPUT(6, %r10, %r11, RB2); OCB_INPUT(7, %r12, %r13, RB3); #undef OCB_INPUT vmovdqu RTMP0x, (%rcx); movq (0 * 8)(%rsp), %r10; movq (1 * 8)(%rsp), %r11; movq (2 * 8)(%rsp), %r12; movq (3 * 8)(%rsp), %r13; CFI_RESTORE(%r10); CFI_RESTORE(%r11); CFI_RESTORE(%r12); CFI_RESTORE(%r13); call __serpent_dec_blk16; addq $(4 * 8), %rsp; CFI_ADJUST_CFA_OFFSET(-4 * 8); vmovdqu (%r8), RTMP1x; vpxor (0 * 32)(%rsi), RA0, RA0; vpxor (1 * 32)(%rsi), RA1, RA1; vpxor (2 * 32)(%rsi), RA2, RA2; vpxor (3 * 32)(%rsi), RA3, RA3; vpxor (4 * 32)(%rsi), RB0, RB0; vpxor (5 * 32)(%rsi), RB1, RB1; vpxor (6 * 32)(%rsi), RB2, RB2; vpxor (7 * 32)(%rsi), RB3, RB3; /* Checksum_i = Checksum_{i-1} xor P_i */ vmovdqu RA0, (0 * 32)(%rsi); vpxor RA0, RTMP1, RTMP1; vmovdqu RA1, (1 * 32)(%rsi); vpxor RA1, RTMP1, RTMP1; vmovdqu RA2, (2 * 32)(%rsi); vpxor RA2, RTMP1, RTMP1; vmovdqu RA3, (3 * 32)(%rsi); vpxor RA3, RTMP1, RTMP1; vmovdqu RB0, (4 * 32)(%rsi); vpxor RB0, RTMP1, RTMP1; vmovdqu RB1, (5 * 32)(%rsi); vpxor RB1, RTMP1, RTMP1; vmovdqu RB2, (6 * 32)(%rsi); vpxor RB2, RTMP1, RTMP1; vmovdqu RB3, (7 * 32)(%rsi); vpxor RB3, RTMP1, RTMP1; vextracti128 $1, RTMP1, RNOTx; vpxor RNOTx, RTMP1x, RTMP1x; vmovdqu RTMP1x, (%r8); vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_serpent_avx2_ocb_dec,.-_gcry_serpent_avx2_ocb_dec;) .align 8 .globl _gcry_serpent_avx2_ocb_auth ELF(.type _gcry_serpent_avx2_ocb_auth,@function;) _gcry_serpent_avx2_ocb_auth: /* input: * %rdi: ctx, CTX * %rsi: abuf (16 blocks) * %rdx: offset * %rcx: checksum * %r8 : L pointers (void *L[16]) */ CFI_STARTPROC(); vzeroupper; subq $(4 * 8), %rsp; CFI_ADJUST_CFA_OFFSET(4 * 8); movq %r10, (0 * 8)(%rsp); movq %r11, (1 * 8)(%rsp); movq %r12, (2 * 8)(%rsp); movq %r13, (3 * 8)(%rsp); CFI_REL_OFFSET(%r10, 0 * 8); CFI_REL_OFFSET(%r11, 1 * 8); CFI_REL_OFFSET(%r12, 2 * 8); CFI_REL_OFFSET(%r13, 3 * 8); vmovdqu (%rdx), RTMP0x; /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ #define OCB_INPUT(n, l0reg, l1reg, yreg) \ vmovdqu (n * 32)(%rsi), yreg; \ vpxor (l0reg), RTMP0x, RNOTx; \ vpxor (l1reg), RNOTx, RTMP0x; \ vinserti128 $1, RTMP0x, RNOT, RNOT; \ vpxor yreg, RNOT, yreg; movq (0 * 8)(%r8), %r10; movq (1 * 
8)(%r8), %r11; movq (2 * 8)(%r8), %r12; movq (3 * 8)(%r8), %r13; OCB_INPUT(0, %r10, %r11, RA0); OCB_INPUT(1, %r12, %r13, RA1); movq (4 * 8)(%r8), %r10; movq (5 * 8)(%r8), %r11; movq (6 * 8)(%r8), %r12; movq (7 * 8)(%r8), %r13; OCB_INPUT(2, %r10, %r11, RA2); OCB_INPUT(3, %r12, %r13, RA3); movq (8 * 8)(%r8), %r10; movq (9 * 8)(%r8), %r11; movq (10 * 8)(%r8), %r12; movq (11 * 8)(%r8), %r13; OCB_INPUT(4, %r10, %r11, RB0); OCB_INPUT(5, %r12, %r13, RB1); movq (12 * 8)(%r8), %r10; movq (13 * 8)(%r8), %r11; movq (14 * 8)(%r8), %r12; movq (15 * 8)(%r8), %r13; OCB_INPUT(6, %r10, %r11, RB2); OCB_INPUT(7, %r12, %r13, RB3); #undef OCB_INPUT vmovdqu RTMP0x, (%rdx); movq (0 * 8)(%rsp), %r10; movq (1 * 8)(%rsp), %r11; movq (2 * 8)(%rsp), %r12; movq (3 * 8)(%rsp), %r13; CFI_RESTORE(%r10); CFI_RESTORE(%r11); CFI_RESTORE(%r12); CFI_RESTORE(%r13); call __serpent_enc_blk16; addq $(4 * 8), %rsp; CFI_ADJUST_CFA_OFFSET(-4 * 8); vpxor RA4, RB4, RA4; vpxor RA1, RB1, RA1; vpxor RA2, RB2, RA2; vpxor RA0, RB0, RA0; vpxor RA4, RA1, RA1; vpxor RA2, RA0, RA0; vpxor RA1, RA0, RTMP1; vextracti128 $1, RTMP1, RNOTx; vpxor (%rcx), RTMP1x, RTMP1x; vpxor RNOTx, RTMP1x, RTMP1x; vmovdqu RTMP1x, (%rcx); vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_serpent_avx2_ocb_auth,.-_gcry_serpent_avx2_ocb_auth;) .align 16 /* For CTR-mode IV byteswap */ .Lbswap128_mask: .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 #endif /*defined(USE_SERPENT) && defined(ENABLE_AVX2_SUPPORT)*/ #endif /*__x86_64*/ diff --git a/cipher/serpent-sse2-amd64.S b/cipher/serpent-sse2-amd64.S index 39cba002..b5935095 100644 --- a/cipher/serpent-sse2-amd64.S +++ b/cipher/serpent-sse2-amd64.S @@ -1,1211 +1,1211 @@ /* serpent-sse2-amd64.S - SSE2 implementation of Serpent cipher * * Copyright (C) 2013-2015 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . 
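The OCB bookkeeping annotated in the _gcry_serpent_avx2_ocb_enc/_dec/_auth routines above (Offset_i = Offset_{i-1} xor L_{ntz(i)}, Checksum_i = Checksum_{i-1} xor P_i, C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)) amounts to a handful of 128-bit XORs per block; the assembly simply performs it for sixteen blocks around one __serpent_enc_blk16 call. A minimal scalar C sketch of the encryption-side bookkeeping, assuming 16-byte blocks and a caller-supplied single-block cipher (block_encrypt, the L table and the 1-based block index i are illustrative placeholders, not libgcrypt interfaces):

  #include <stdint.h>
  #include <string.h>

  #define OCB_BLOCK 16

  /* dst ^= src over one 16-byte block. */
  static void xor_block (uint8_t *dst, const uint8_t *src)
  {
    for (int i = 0; i < OCB_BLOCK; i++)
      dst[i] ^= src[i];
  }

  /* Hypothetical single-block cipher; stands in for the 16-block
   * __serpent_enc_blk16 pipeline used by the assembly. */
  extern void block_encrypt (const void *key, uint8_t out[OCB_BLOCK],
                             const uint8_t in[OCB_BLOCK]);

  /* OCB-encrypt nblks full blocks starting at 1-based block index i.
   * offset and checksum play the roles of the (%rcx) and (%r8) values
   * that the assembly updates in place. */
  static void
  ocb_enc_blocks (const void *key, uint8_t *dst, const uint8_t *src,
                  size_t nblks, uint64_t i, uint8_t offset[OCB_BLOCK],
                  uint8_t checksum[OCB_BLOCK], const uint8_t L[][OCB_BLOCK])
  {
    uint8_t tmp[OCB_BLOCK];

    for (size_t n = 0; n < nblks; n++, i++, src += OCB_BLOCK, dst += OCB_BLOCK)
      {
        xor_block (offset, L[__builtin_ctzll (i)]); /* Offset_i           */
        xor_block (checksum, src);                  /* Checksum_i ^= P_i  */
        memcpy (tmp, src, OCB_BLOCK);
        xor_block (tmp, offset);                    /* P_i xor Offset_i   */
        block_encrypt (key, dst, tmp);
        xor_block (dst, offset);                    /* C_i                */
      }
  }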
*/ #ifdef __x86_64 #include #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && defined(USE_SERPENT) #include "asm-common-amd64.h" /* struct serpent_context: */ #define ctx_keys 0 /* register macros */ #define CTX %rdi /* vector registers */ #define RA0 %xmm0 #define RA1 %xmm1 #define RA2 %xmm2 #define RA3 %xmm3 #define RA4 %xmm4 #define RB0 %xmm5 #define RB1 %xmm6 #define RB2 %xmm7 #define RB3 %xmm8 #define RB4 %xmm9 #define RNOT %xmm10 #define RTMP0 %xmm11 #define RTMP1 %xmm12 #define RTMP2 %xmm13 /********************************************************************** helper macros **********************************************************************/ /* vector 32-bit rotation to left */ #define vec_rol(reg, nleft, tmp) \ movdqa reg, tmp; \ pslld $(nleft), tmp; \ psrld $(32 - (nleft)), reg; \ por tmp, reg; /* vector 32-bit rotation to right */ #define vec_ror(reg, nright, tmp) \ vec_rol(reg, 32 - nright, tmp) /* 4x4 32-bit integer matrix transpose */ #define transpose_4x4(x0, x1, x2, x3, t1, t2, t3) \ movdqa x0, t2; \ punpckhdq x1, t2; \ punpckldq x1, x0; \ \ movdqa x2, t1; \ punpckldq x3, t1; \ punpckhdq x3, x2; \ \ movdqa x0, x1; \ punpckhqdq t1, x1; \ punpcklqdq t1, x0; \ \ movdqa t2, x3; \ punpckhqdq x2, x3; \ punpcklqdq x2, t2; \ movdqa t2, x2; /* fill xmm register with 32-bit value from memory */ #define pbroadcastd(mem32, xreg) \ movd mem32, xreg; \ pshufd $0, xreg, xreg; /* xor with unaligned memory operand */ #define pxor_u(umem128, xreg, t) \ movdqu umem128, t; \ pxor t, xreg; /* 128-bit wide byte swap */ #define pbswap(xreg, t0) \ /* reorder 32-bit words, [a,b,c,d] => [d,c,b,a] */ \ pshufd $0x1b, xreg, xreg; \ /* reorder high&low 16-bit words, [d0,d1,c0,c1] => [d1,d0,c1,c0] */ \ pshuflw $0xb1, xreg, xreg; \ pshufhw $0xb1, xreg, xreg; \ /* reorder bytes in 16-bit words */ \ movdqa xreg, t0; \ psrlw $8, t0; \ psllw $8, xreg; \ por t0, xreg; /********************************************************************** 8-way serpent **********************************************************************/ /* * These are the S-Boxes of Serpent from following research paper. * * D. A. Osvik, “Speeding up Serpent,” in Third AES Candidate Conference, * (New York, New York, USA), p. 317–329, National Institute of Standards and * Technology, 2000. 
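The SSE2 helper macros above synthesize operations that are single instructions in scalar code: vec_rol/vec_ror build a 32-bit rotate out of pslld/psrld/por, and pbswap reverses the bytes of a whole 128-bit register by reordering 32-bit words, then 16-bit halves, then the bytes within them. For reference, a scalar C sketch of those two primitives on one 32-bit word (helper names are illustrative, not libgcrypt functions; n must stay in 1..31):

  #include <stdint.h>

  /* Scalar counterpart of vec_rol(); vec_ror(x, n) is vec_rol(x, 32 - n). */
  static inline uint32_t rol32 (uint32_t x, unsigned int n)
  {
    return (x << n) | (x >> (32 - n));
  }

  /* Scalar counterpart of pbswap() restricted to one word: a full
   * big-endian <-> little-endian byte reversal. */
  static inline uint32_t bswap32 (uint32_t x)
  {
    return (x >> 24) | ((x >> 8) & 0x0000ff00u)
         | ((x << 8) & 0x00ff0000u) | (x << 24);
  }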
* * Paper is also available at: http://www.ii.uib.no/~osvik/pub/aes3.pdf * */ #define SBOX0(r0, r1, r2, r3, r4) \ pxor r0, r3; movdqa r1, r4; \ pand r3, r1; pxor r2, r4; \ pxor r0, r1; por r3, r0; \ pxor r4, r0; pxor r3, r4; \ pxor r2, r3; por r1, r2; \ pxor r4, r2; pxor RNOT, r4; \ por r1, r4; pxor r3, r1; \ pxor r4, r1; por r0, r3; \ pxor r3, r1; pxor r3, r4; #define SBOX0_INVERSE(r0, r1, r2, r3, r4) \ pxor RNOT, r2; movdqa r1, r4; \ por r0, r1; pxor RNOT, r4; \ pxor r2, r1; por r4, r2; \ pxor r3, r1; pxor r4, r0; \ pxor r0, r2; pand r3, r0; \ pxor r0, r4; por r1, r0; \ pxor r2, r0; pxor r4, r3; \ pxor r1, r2; pxor r0, r3; \ pxor r1, r3; \ pand r3, r2; \ pxor r2, r4; #define SBOX1(r0, r1, r2, r3, r4) \ pxor RNOT, r0; pxor RNOT, r2; \ movdqa r0, r4; pand r1, r0; \ pxor r0, r2; por r3, r0; \ pxor r2, r3; pxor r0, r1; \ pxor r4, r0; por r1, r4; \ pxor r3, r1; por r0, r2; \ pand r4, r2; pxor r1, r0; \ pand r2, r1; \ pxor r0, r1; pand r2, r0; \ pxor r4, r0; #define SBOX1_INVERSE(r0, r1, r2, r3, r4) \ movdqa r1, r4; pxor r3, r1; \ pand r1, r3; pxor r2, r4; \ pxor r0, r3; por r1, r0; \ pxor r3, r2; pxor r4, r0; \ por r2, r0; pxor r3, r1; \ pxor r1, r0; por r3, r1; \ pxor r0, r1; pxor RNOT, r4; \ pxor r1, r4; por r0, r1; \ pxor r0, r1; \ por r4, r1; \ pxor r1, r3; #define SBOX2(r0, r1, r2, r3, r4) \ movdqa r0, r4; pand r2, r0; \ pxor r3, r0; pxor r1, r2; \ pxor r0, r2; por r4, r3; \ pxor r1, r3; pxor r2, r4; \ movdqa r3, r1; por r4, r3; \ pxor r0, r3; pand r1, r0; \ pxor r0, r4; pxor r3, r1; \ pxor r4, r1; pxor RNOT, r4; #define SBOX2_INVERSE(r0, r1, r2, r3, r4) \ pxor r3, r2; pxor r0, r3; \ movdqa r3, r4; pand r2, r3; \ pxor r1, r3; por r2, r1; \ pxor r4, r1; pand r3, r4; \ pxor r3, r2; pand r0, r4; \ pxor r2, r4; pand r1, r2; \ por r0, r2; pxor RNOT, r3; \ pxor r3, r2; pxor r3, r0; \ pand r1, r0; pxor r4, r3; \ pxor r0, r3; #define SBOX3(r0, r1, r2, r3, r4) \ movdqa r0, r4; por r3, r0; \ pxor r1, r3; pand r4, r1; \ pxor r2, r4; pxor r3, r2; \ pand r0, r3; por r1, r4; \ pxor r4, r3; pxor r1, r0; \ pand r0, r4; pxor r3, r1; \ pxor r2, r4; por r0, r1; \ pxor r2, r1; pxor r3, r0; \ movdqa r1, r2; por r3, r1; \ pxor r0, r1; #define SBOX3_INVERSE(r0, r1, r2, r3, r4) \ movdqa r2, r4; pxor r1, r2; \ pxor r2, r0; pand r2, r4; \ pxor r0, r4; pand r1, r0; \ pxor r3, r1; por r4, r3; \ pxor r3, r2; pxor r3, r0; \ pxor r4, r1; pand r2, r3; \ pxor r1, r3; pxor r0, r1; \ por r2, r1; pxor r3, r0; \ pxor r4, r1; \ pxor r1, r0; #define SBOX4(r0, r1, r2, r3, r4) \ pxor r3, r1; pxor RNOT, r3; \ pxor r3, r2; pxor r0, r3; \ movdqa r1, r4; pand r3, r1; \ pxor r2, r1; pxor r3, r4; \ pxor r4, r0; pand r4, r2; \ pxor r0, r2; pand r1, r0; \ pxor r0, r3; por r1, r4; \ pxor r0, r4; por r3, r0; \ pxor r2, r0; pand r3, r2; \ pxor RNOT, r0; pxor r2, r4; #define SBOX4_INVERSE(r0, r1, r2, r3, r4) \ movdqa r2, r4; pand r3, r2; \ pxor r1, r2; por r3, r1; \ pand r0, r1; pxor r2, r4; \ pxor r1, r4; pand r2, r1; \ pxor RNOT, r0; pxor r4, r3; \ pxor r3, r1; pand r0, r3; \ pxor r2, r3; pxor r1, r0; \ pand r0, r2; pxor r0, r3; \ pxor r4, r2; \ por r3, r2; pxor r0, r3; \ pxor r1, r2; #define SBOX5(r0, r1, r2, r3, r4) \ pxor r1, r0; pxor r3, r1; \ pxor RNOT, r3; movdqa r1, r4; \ pand r0, r1; pxor r3, r2; \ pxor r2, r1; por r4, r2; \ pxor r3, r4; pand r1, r3; \ pxor r0, r3; pxor r1, r4; \ pxor r2, r4; pxor r0, r2; \ pand r3, r0; pxor RNOT, r2; \ pxor r4, r0; por r3, r4; \ pxor r4, r2; #define SBOX5_INVERSE(r0, r1, r2, r3, r4) \ pxor RNOT, r1; movdqa r3, r4; \ pxor r1, r2; por r0, r3; \ pxor r2, r3; por r1, r2; \ pand r0, r2; pxor r3, r4; 
\ pxor r4, r2; por r0, r4; \ pxor r1, r4; pand r2, r1; \ pxor r3, r1; pxor r2, r4; \ pand r4, r3; pxor r1, r4; \ pxor r4, r3; pxor RNOT, r4; \ pxor r0, r3; #define SBOX6(r0, r1, r2, r3, r4) \ pxor RNOT, r2; movdqa r3, r4; \ pand r0, r3; pxor r4, r0; \ pxor r2, r3; por r4, r2; \ pxor r3, r1; pxor r0, r2; \ por r1, r0; pxor r1, r2; \ pxor r0, r4; por r3, r0; \ pxor r2, r0; pxor r3, r4; \ pxor r0, r4; pxor RNOT, r3; \ pand r4, r2; \ pxor r3, r2; #define SBOX6_INVERSE(r0, r1, r2, r3, r4) \ pxor r2, r0; movdqa r2, r4; \ pand r0, r2; pxor r3, r4; \ pxor RNOT, r2; pxor r1, r3; \ pxor r3, r2; por r0, r4; \ pxor r2, r0; pxor r4, r3; \ pxor r1, r4; pand r3, r1; \ pxor r0, r1; pxor r3, r0; \ por r2, r0; pxor r1, r3; \ pxor r0, r4; #define SBOX7(r0, r1, r2, r3, r4) \ movdqa r1, r4; por r2, r1; \ pxor r3, r1; pxor r2, r4; \ pxor r1, r2; por r4, r3; \ pand r0, r3; pxor r2, r4; \ pxor r1, r3; por r4, r1; \ pxor r0, r1; por r4, r0; \ pxor r2, r0; pxor r4, r1; \ pxor r1, r2; pand r0, r1; \ pxor r4, r1; pxor RNOT, r2; \ por r0, r2; \ pxor r2, r4; #define SBOX7_INVERSE(r0, r1, r2, r3, r4) \ movdqa r2, r4; pxor r0, r2; \ pand r3, r0; por r3, r4; \ pxor RNOT, r2; pxor r1, r3; \ por r0, r1; pxor r2, r0; \ pand r4, r2; pand r4, r3; \ pxor r2, r1; pxor r0, r2; \ por r2, r0; pxor r1, r4; \ pxor r3, r0; pxor r4, r3; \ por r0, r4; pxor r2, r3; \ pxor r2, r4; /* Apply SBOX number WHICH to to the block. */ #define SBOX(which, r0, r1, r2, r3, r4) \ SBOX##which (r0, r1, r2, r3, r4) /* Apply inverse SBOX number WHICH to to the block. */ #define SBOX_INVERSE(which, r0, r1, r2, r3, r4) \ SBOX##which##_INVERSE (r0, r1, r2, r3, r4) /* XOR round key into block state in r0,r1,r2,r3. r4 used as temporary. */ #define BLOCK_XOR_KEY(r0, r1, r2, r3, r4, round) \ pbroadcastd ((ctx_keys + (round) * 16 + 0 * 4)(CTX), r4); \ pxor r4, r0; \ pbroadcastd ((ctx_keys + (round) * 16 + 1 * 4)(CTX), r4); \ pxor r4, r1; \ pbroadcastd ((ctx_keys + (round) * 16 + 2 * 4)(CTX), r4); \ pxor r4, r2; \ pbroadcastd ((ctx_keys + (round) * 16 + 3 * 4)(CTX), r4); \ pxor r4, r3; /* Apply the linear transformation to BLOCK. */ #define LINEAR_TRANSFORMATION(r0, r1, r2, r3, r4) \ vec_rol(r0, 13, r4); \ vec_rol(r2, 3, r4); \ pxor r0, r1; \ pxor r2, r1; \ movdqa r0, r4; \ pslld $3, r4; \ pxor r2, r3; \ pxor r4, r3; \ vec_rol(r1, 1, r4); \ vec_rol(r3, 7, r4); \ pxor r1, r0; \ pxor r3, r0; \ movdqa r1, r4; \ pslld $7, r4; \ pxor r3, r2; \ pxor r4, r2; \ vec_rol(r0, 5, r4); \ vec_rol(r2, 22, r4); /* Apply the inverse linear transformation to BLOCK. */ #define LINEAR_TRANSFORMATION_INVERSE(r0, r1, r2, r3, r4) \ vec_ror(r2, 22, r4); \ vec_ror(r0, 5, r4); \ movdqa r1, r4; \ pslld $7, r4; \ pxor r3, r2; \ pxor r4, r2; \ pxor r1, r0; \ pxor r3, r0; \ vec_ror(r3, 7, r4); \ vec_ror(r1, 1, r4); \ movdqa r0, r4; \ pslld $3, r4; \ pxor r2, r3; \ pxor r4, r3; \ pxor r0, r1; \ pxor r2, r1; \ vec_ror(r2, 3, r4); \ vec_ror(r0, 13, r4); /* Apply a Serpent round to eight parallel blocks. This macro increments `round'. */ #define ROUND(round, which, a0, a1, a2, a3, a4, na0, na1, na2, na3, na4, \ b0, b1, b2, b3, b4, nb0, nb1, nb2, nb3, nb4) \ BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \ SBOX (which, a0, a1, a2, a3, a4); \ BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \ SBOX (which, b0, b1, b2, b3, b4); \ LINEAR_TRANSFORMATION (na0, na1, na2, na3, na4); \ LINEAR_TRANSFORMATION (nb0, nb1, nb2, nb3, nb4); /* Apply the last Serpent round to eight parallel blocks. This macro increments `round'. 
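The LINEAR_TRANSFORMATION macro defined above is the standard Serpent linear layer applied lane-wise to four 32-bit words; the spare register r4 only exists because SSE2 has no non-destructive shift. A scalar C sketch of the same transformation (x0..x3 stand for the four words of one block; rol32 as in the earlier helper sketch):

  #include <stdint.h>

  static inline uint32_t rol32 (uint32_t x, unsigned int n)
  {
    return (x << n) | (x >> (32 - n));
  }

  /* Serpent linear transformation on one block of four 32-bit words. */
  static void serpent_lt (uint32_t x[4])
  {
    x[0] = rol32 (x[0], 13);
    x[2] = rol32 (x[2], 3);
    x[1] ^= x[0] ^ x[2];
    x[3] ^= x[2] ^ (x[0] << 3);
    x[1] = rol32 (x[1], 1);
    x[3] = rol32 (x[3], 7);
    x[0] ^= x[1] ^ x[3];
    x[2] ^= x[3] ^ (x[1] << 7);
    x[0] = rol32 (x[0], 5);
    x[2] = rol32 (x[2], 22);
  }

  /* LINEAR_TRANSFORMATION_INVERSE undoes these steps in reverse order. */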
*/ #define ROUND_LAST(round, which, a0, a1, a2, a3, a4, na0, na1, na2, na3, na4, \ b0, b1, b2, b3, b4, nb0, nb1, nb2, nb3, nb4) \ BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \ SBOX (which, a0, a1, a2, a3, a4); \ BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \ SBOX (which, b0, b1, b2, b3, b4); \ BLOCK_XOR_KEY (na0, na1, na2, na3, na4, ((round) + 1)); \ BLOCK_XOR_KEY (nb0, nb1, nb2, nb3, nb4, ((round) + 1)); /* Apply an inverse Serpent round to eight parallel blocks. This macro increments `round'. */ #define ROUND_INVERSE(round, which, a0, a1, a2, a3, a4, \ na0, na1, na2, na3, na4, \ b0, b1, b2, b3, b4, \ nb0, nb1, nb2, nb3, nb4) \ LINEAR_TRANSFORMATION_INVERSE (a0, a1, a2, a3, a4); \ LINEAR_TRANSFORMATION_INVERSE (b0, b1, b2, b3, b4); \ SBOX_INVERSE (which, a0, a1, a2, a3, a4); \ BLOCK_XOR_KEY (na0, na1, na2, na3, na4, round); \ SBOX_INVERSE (which, b0, b1, b2, b3, b4); \ BLOCK_XOR_KEY (nb0, nb1, nb2, nb3, nb4, round); /* Apply the first inverse Serpent round to eight parallel blocks. This macro increments `round'. */ #define ROUND_FIRST_INVERSE(round, which, a0, a1, a2, a3, a4, \ na0, na1, na2, na3, na4, \ b0, b1, b2, b3, b4, \ nb0, nb1, nb2, nb3, nb4) \ BLOCK_XOR_KEY (a0, a1, a2, a3, a4, ((round) + 1)); \ BLOCK_XOR_KEY (b0, b1, b2, b3, b4, ((round) + 1)); \ SBOX_INVERSE (which, a0, a1, a2, a3, a4); \ BLOCK_XOR_KEY (na0, na1, na2, na3, na4, round); \ SBOX_INVERSE (which, b0, b1, b2, b3, b4); \ BLOCK_XOR_KEY (nb0, nb1, nb2, nb3, nb4, round); .text .align 8 ELF(.type __serpent_enc_blk8,@function;) __serpent_enc_blk8: /* input: * %rdi: ctx, CTX * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel plaintext * blocks * output: * RA4, RA1, RA2, RA0, RB4, RB1, RB2, RB0: eight parallel * ciphertext blocks */ CFI_STARTPROC(); pcmpeqd RNOT, RNOT; transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0, RTMP1); transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1); ROUND (0, 0, RA0, RA1, RA2, RA3, RA4, RA1, RA4, RA2, RA0, RA3, RB0, RB1, RB2, RB3, RB4, RB1, RB4, RB2, RB0, RB3); ROUND (1, 1, RA1, RA4, RA2, RA0, RA3, RA2, RA1, RA0, RA4, RA3, RB1, RB4, RB2, RB0, RB3, RB2, RB1, RB0, RB4, RB3); ROUND (2, 2, RA2, RA1, RA0, RA4, RA3, RA0, RA4, RA1, RA3, RA2, RB2, RB1, RB0, RB4, RB3, RB0, RB4, RB1, RB3, RB2); ROUND (3, 3, RA0, RA4, RA1, RA3, RA2, RA4, RA1, RA3, RA2, RA0, RB0, RB4, RB1, RB3, RB2, RB4, RB1, RB3, RB2, RB0); ROUND (4, 4, RA4, RA1, RA3, RA2, RA0, RA1, RA0, RA4, RA2, RA3, RB4, RB1, RB3, RB2, RB0, RB1, RB0, RB4, RB2, RB3); ROUND (5, 5, RA1, RA0, RA4, RA2, RA3, RA0, RA2, RA1, RA4, RA3, RB1, RB0, RB4, RB2, RB3, RB0, RB2, RB1, RB4, RB3); ROUND (6, 6, RA0, RA2, RA1, RA4, RA3, RA0, RA2, RA3, RA1, RA4, RB0, RB2, RB1, RB4, RB3, RB0, RB2, RB3, RB1, RB4); ROUND (7, 7, RA0, RA2, RA3, RA1, RA4, RA4, RA1, RA2, RA0, RA3, RB0, RB2, RB3, RB1, RB4, RB4, RB1, RB2, RB0, RB3); ROUND (8, 0, RA4, RA1, RA2, RA0, RA3, RA1, RA3, RA2, RA4, RA0, RB4, RB1, RB2, RB0, RB3, RB1, RB3, RB2, RB4, RB0); ROUND (9, 1, RA1, RA3, RA2, RA4, RA0, RA2, RA1, RA4, RA3, RA0, RB1, RB3, RB2, RB4, RB0, RB2, RB1, RB4, RB3, RB0); ROUND (10, 2, RA2, RA1, RA4, RA3, RA0, RA4, RA3, RA1, RA0, RA2, RB2, RB1, RB4, RB3, RB0, RB4, RB3, RB1, RB0, RB2); ROUND (11, 3, RA4, RA3, RA1, RA0, RA2, RA3, RA1, RA0, RA2, RA4, RB4, RB3, RB1, RB0, RB2, RB3, RB1, RB0, RB2, RB4); ROUND (12, 4, RA3, RA1, RA0, RA2, RA4, RA1, RA4, RA3, RA2, RA0, RB3, RB1, RB0, RB2, RB4, RB1, RB4, RB3, RB2, RB0); ROUND (13, 5, RA1, RA4, RA3, RA2, RA0, RA4, RA2, RA1, RA3, RA0, RB1, RB4, RB3, RB2, RB0, RB4, RB2, RB1, RB3, RB0); ROUND (14, 6, RA4, RA2, RA1, RA3, RA0, RA4, RA2, RA0, RA1, RA3, RB4, RB2, RB1, 
RB3, RB0, RB4, RB2, RB0, RB1, RB3); ROUND (15, 7, RA4, RA2, RA0, RA1, RA3, RA3, RA1, RA2, RA4, RA0, RB4, RB2, RB0, RB1, RB3, RB3, RB1, RB2, RB4, RB0); ROUND (16, 0, RA3, RA1, RA2, RA4, RA0, RA1, RA0, RA2, RA3, RA4, RB3, RB1, RB2, RB4, RB0, RB1, RB0, RB2, RB3, RB4); ROUND (17, 1, RA1, RA0, RA2, RA3, RA4, RA2, RA1, RA3, RA0, RA4, RB1, RB0, RB2, RB3, RB4, RB2, RB1, RB3, RB0, RB4); ROUND (18, 2, RA2, RA1, RA3, RA0, RA4, RA3, RA0, RA1, RA4, RA2, RB2, RB1, RB3, RB0, RB4, RB3, RB0, RB1, RB4, RB2); ROUND (19, 3, RA3, RA0, RA1, RA4, RA2, RA0, RA1, RA4, RA2, RA3, RB3, RB0, RB1, RB4, RB2, RB0, RB1, RB4, RB2, RB3); ROUND (20, 4, RA0, RA1, RA4, RA2, RA3, RA1, RA3, RA0, RA2, RA4, RB0, RB1, RB4, RB2, RB3, RB1, RB3, RB0, RB2, RB4); ROUND (21, 5, RA1, RA3, RA0, RA2, RA4, RA3, RA2, RA1, RA0, RA4, RB1, RB3, RB0, RB2, RB4, RB3, RB2, RB1, RB0, RB4); ROUND (22, 6, RA3, RA2, RA1, RA0, RA4, RA3, RA2, RA4, RA1, RA0, RB3, RB2, RB1, RB0, RB4, RB3, RB2, RB4, RB1, RB0); ROUND (23, 7, RA3, RA2, RA4, RA1, RA0, RA0, RA1, RA2, RA3, RA4, RB3, RB2, RB4, RB1, RB0, RB0, RB1, RB2, RB3, RB4); ROUND (24, 0, RA0, RA1, RA2, RA3, RA4, RA1, RA4, RA2, RA0, RA3, RB0, RB1, RB2, RB3, RB4, RB1, RB4, RB2, RB0, RB3); ROUND (25, 1, RA1, RA4, RA2, RA0, RA3, RA2, RA1, RA0, RA4, RA3, RB1, RB4, RB2, RB0, RB3, RB2, RB1, RB0, RB4, RB3); ROUND (26, 2, RA2, RA1, RA0, RA4, RA3, RA0, RA4, RA1, RA3, RA2, RB2, RB1, RB0, RB4, RB3, RB0, RB4, RB1, RB3, RB2); ROUND (27, 3, RA0, RA4, RA1, RA3, RA2, RA4, RA1, RA3, RA2, RA0, RB0, RB4, RB1, RB3, RB2, RB4, RB1, RB3, RB2, RB0); ROUND (28, 4, RA4, RA1, RA3, RA2, RA0, RA1, RA0, RA4, RA2, RA3, RB4, RB1, RB3, RB2, RB0, RB1, RB0, RB4, RB2, RB3); ROUND (29, 5, RA1, RA0, RA4, RA2, RA3, RA0, RA2, RA1, RA4, RA3, RB1, RB0, RB4, RB2, RB3, RB0, RB2, RB1, RB4, RB3); ROUND (30, 6, RA0, RA2, RA1, RA4, RA3, RA0, RA2, RA3, RA1, RA4, RB0, RB2, RB1, RB4, RB3, RB0, RB2, RB3, RB1, RB4); ROUND_LAST (31, 7, RA0, RA2, RA3, RA1, RA4, RA4, RA1, RA2, RA0, RA3, RB0, RB2, RB3, RB1, RB4, RB4, RB1, RB2, RB0, RB3); transpose_4x4(RA4, RA1, RA2, RA0, RA3, RTMP0, RTMP1); transpose_4x4(RB4, RB1, RB2, RB0, RB3, RTMP0, RTMP1); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size __serpent_enc_blk8,.-__serpent_enc_blk8;) .align 8 ELF(.type __serpent_dec_blk8,@function;) __serpent_dec_blk8: /* input: * %rdi: ctx, CTX * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel * ciphertext blocks * output: * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel plaintext * blocks */ CFI_STARTPROC(); pcmpeqd RNOT, RNOT; transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0, RTMP1); transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1); ROUND_FIRST_INVERSE (31, 7, RA0, RA1, RA2, RA3, RA4, RA3, RA0, RA1, RA4, RA2, RB0, RB1, RB2, RB3, RB4, RB3, RB0, RB1, RB4, RB2); ROUND_INVERSE (30, 6, RA3, RA0, RA1, RA4, RA2, RA0, RA1, RA2, RA4, RA3, RB3, RB0, RB1, RB4, RB2, RB0, RB1, RB2, RB4, RB3); ROUND_INVERSE (29, 5, RA0, RA1, RA2, RA4, RA3, RA1, RA3, RA4, RA2, RA0, RB0, RB1, RB2, RB4, RB3, RB1, RB3, RB4, RB2, RB0); ROUND_INVERSE (28, 4, RA1, RA3, RA4, RA2, RA0, RA1, RA2, RA4, RA0, RA3, RB1, RB3, RB4, RB2, RB0, RB1, RB2, RB4, RB0, RB3); ROUND_INVERSE (27, 3, RA1, RA2, RA4, RA0, RA3, RA4, RA2, RA0, RA1, RA3, RB1, RB2, RB4, RB0, RB3, RB4, RB2, RB0, RB1, RB3); ROUND_INVERSE (26, 2, RA4, RA2, RA0, RA1, RA3, RA2, RA3, RA0, RA1, RA4, RB4, RB2, RB0, RB1, RB3, RB2, RB3, RB0, RB1, RB4); ROUND_INVERSE (25, 1, RA2, RA3, RA0, RA1, RA4, RA4, RA2, RA1, RA0, RA3, RB2, RB3, RB0, RB1, RB4, RB4, RB2, RB1, RB0, RB3); ROUND_INVERSE (24, 0, RA4, RA2, RA1, RA0, RA3, RA4, RA3, RA2, RA0, RA1, RB4, RB2, 
RB1, RB0, RB3, RB4, RB3, RB2, RB0, RB1); ROUND_INVERSE (23, 7, RA4, RA3, RA2, RA0, RA1, RA0, RA4, RA3, RA1, RA2, RB4, RB3, RB2, RB0, RB1, RB0, RB4, RB3, RB1, RB2); ROUND_INVERSE (22, 6, RA0, RA4, RA3, RA1, RA2, RA4, RA3, RA2, RA1, RA0, RB0, RB4, RB3, RB1, RB2, RB4, RB3, RB2, RB1, RB0); ROUND_INVERSE (21, 5, RA4, RA3, RA2, RA1, RA0, RA3, RA0, RA1, RA2, RA4, RB4, RB3, RB2, RB1, RB0, RB3, RB0, RB1, RB2, RB4); ROUND_INVERSE (20, 4, RA3, RA0, RA1, RA2, RA4, RA3, RA2, RA1, RA4, RA0, RB3, RB0, RB1, RB2, RB4, RB3, RB2, RB1, RB4, RB0); ROUND_INVERSE (19, 3, RA3, RA2, RA1, RA4, RA0, RA1, RA2, RA4, RA3, RA0, RB3, RB2, RB1, RB4, RB0, RB1, RB2, RB4, RB3, RB0); ROUND_INVERSE (18, 2, RA1, RA2, RA4, RA3, RA0, RA2, RA0, RA4, RA3, RA1, RB1, RB2, RB4, RB3, RB0, RB2, RB0, RB4, RB3, RB1); ROUND_INVERSE (17, 1, RA2, RA0, RA4, RA3, RA1, RA1, RA2, RA3, RA4, RA0, RB2, RB0, RB4, RB3, RB1, RB1, RB2, RB3, RB4, RB0); ROUND_INVERSE (16, 0, RA1, RA2, RA3, RA4, RA0, RA1, RA0, RA2, RA4, RA3, RB1, RB2, RB3, RB4, RB0, RB1, RB0, RB2, RB4, RB3); ROUND_INVERSE (15, 7, RA1, RA0, RA2, RA4, RA3, RA4, RA1, RA0, RA3, RA2, RB1, RB0, RB2, RB4, RB3, RB4, RB1, RB0, RB3, RB2); ROUND_INVERSE (14, 6, RA4, RA1, RA0, RA3, RA2, RA1, RA0, RA2, RA3, RA4, RB4, RB1, RB0, RB3, RB2, RB1, RB0, RB2, RB3, RB4); ROUND_INVERSE (13, 5, RA1, RA0, RA2, RA3, RA4, RA0, RA4, RA3, RA2, RA1, RB1, RB0, RB2, RB3, RB4, RB0, RB4, RB3, RB2, RB1); ROUND_INVERSE (12, 4, RA0, RA4, RA3, RA2, RA1, RA0, RA2, RA3, RA1, RA4, RB0, RB4, RB3, RB2, RB1, RB0, RB2, RB3, RB1, RB4); ROUND_INVERSE (11, 3, RA0, RA2, RA3, RA1, RA4, RA3, RA2, RA1, RA0, RA4, RB0, RB2, RB3, RB1, RB4, RB3, RB2, RB1, RB0, RB4); ROUND_INVERSE (10, 2, RA3, RA2, RA1, RA0, RA4, RA2, RA4, RA1, RA0, RA3, RB3, RB2, RB1, RB0, RB4, RB2, RB4, RB1, RB0, RB3); ROUND_INVERSE (9, 1, RA2, RA4, RA1, RA0, RA3, RA3, RA2, RA0, RA1, RA4, RB2, RB4, RB1, RB0, RB3, RB3, RB2, RB0, RB1, RB4); ROUND_INVERSE (8, 0, RA3, RA2, RA0, RA1, RA4, RA3, RA4, RA2, RA1, RA0, RB3, RB2, RB0, RB1, RB4, RB3, RB4, RB2, RB1, RB0); ROUND_INVERSE (7, 7, RA3, RA4, RA2, RA1, RA0, RA1, RA3, RA4, RA0, RA2, RB3, RB4, RB2, RB1, RB0, RB1, RB3, RB4, RB0, RB2); ROUND_INVERSE (6, 6, RA1, RA3, RA4, RA0, RA2, RA3, RA4, RA2, RA0, RA1, RB1, RB3, RB4, RB0, RB2, RB3, RB4, RB2, RB0, RB1); ROUND_INVERSE (5, 5, RA3, RA4, RA2, RA0, RA1, RA4, RA1, RA0, RA2, RA3, RB3, RB4, RB2, RB0, RB1, RB4, RB1, RB0, RB2, RB3); ROUND_INVERSE (4, 4, RA4, RA1, RA0, RA2, RA3, RA4, RA2, RA0, RA3, RA1, RB4, RB1, RB0, RB2, RB3, RB4, RB2, RB0, RB3, RB1); ROUND_INVERSE (3, 3, RA4, RA2, RA0, RA3, RA1, RA0, RA2, RA3, RA4, RA1, RB4, RB2, RB0, RB3, RB1, RB0, RB2, RB3, RB4, RB1); ROUND_INVERSE (2, 2, RA0, RA2, RA3, RA4, RA1, RA2, RA1, RA3, RA4, RA0, RB0, RB2, RB3, RB4, RB1, RB2, RB1, RB3, RB4, RB0); ROUND_INVERSE (1, 1, RA2, RA1, RA3, RA4, RA0, RA0, RA2, RA4, RA3, RA1, RB2, RB1, RB3, RB4, RB0, RB0, RB2, RB4, RB3, RB1); ROUND_INVERSE (0, 0, RA0, RA2, RA4, RA3, RA1, RA0, RA1, RA2, RA3, RA4, RB0, RB2, RB4, RB3, RB1, RB0, RB1, RB2, RB3, RB4); transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0, RTMP1); transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size __serpent_dec_blk8,.-__serpent_dec_blk8;) .align 8 .globl _gcry_serpent_sse2_ctr_enc ELF(.type _gcry_serpent_sse2_ctr_enc,@function;) _gcry_serpent_sse2_ctr_enc: /* input: * %rdi: ctx, CTX * %rsi: dst (8 blocks) * %rdx: src (8 blocks) * %rcx: iv (big endian, 128bit) */ CFI_STARTPROC(); /* load IV and byteswap */ movdqu (%rcx), RA0; movdqa RA0, RTMP0; pbswap(RTMP0, RTMP1); /* be => le */ pcmpeqd RNOT, RNOT; 
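The CTR entry point that begins here treats the IV as a 128-bit big-endian counter and must produce eight consecutive counter values per call. After the byte swap to little-endian lane order, the code below adds 1..8 to the low 64-bit lane with psubq and, only when the low half is about to wrap (the cmpl/.Lcarry_* ladder), also bumps the high lane of the affected counters. The scalar operation being vectorized is just a big-endian increment with carry; a portable C sketch (function name is illustrative):

  #include <stdint.h>

  /* Add 1 to a 128-bit big-endian counter, rippling the carry from the
   * least significant byte upwards -- the per-block step that the
   * psubq-based construction and the .Lcarry_* fix-ups implement for
   * eight counters at a time. */
  static void ctr_be128_inc (uint8_t ctr[16])
  {
    for (int i = 15; i >= 0; i--)
      if (++ctr[i] != 0)
        break; /* no carry into the next, more significant byte */
  }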
psrldq $8, RNOT; /* low: -1, high: 0 */ movdqa RNOT, RTMP2; paddq RTMP2, RTMP2; /* low: -2, high: 0 */ /* construct IVs */ movdqa RTMP0, RTMP1; psubq RNOT, RTMP0; /* +1 */ movdqa RTMP0, RA1; psubq RTMP2, RTMP1; /* +2 */ movdqa RTMP1, RA2; psubq RTMP2, RTMP0; /* +3 */ movdqa RTMP0, RA3; psubq RTMP2, RTMP1; /* +4 */ movdqa RTMP1, RB0; psubq RTMP2, RTMP0; /* +5 */ movdqa RTMP0, RB1; psubq RTMP2, RTMP1; /* +6 */ movdqa RTMP1, RB2; psubq RTMP2, RTMP0; /* +7 */ movdqa RTMP0, RB3; psubq RTMP2, RTMP1; /* +8 */ /* check need for handling 64-bit overflow and carry */ cmpl $0xffffffff, 8(%rcx); jne .Lno_ctr_carry; movl 12(%rcx), %eax; bswapl %eax; cmpl $-8, %eax; jb .Lno_ctr_carry; pslldq $8, RNOT; /* low: 0, high: -1 */ je .Lcarry_RTMP0; cmpl $-6, %eax; jb .Lcarry_RB3; je .Lcarry_RB2; cmpl $-4, %eax; jb .Lcarry_RB1; je .Lcarry_RB0; cmpl $-2, %eax; jb .Lcarry_RA3; je .Lcarry_RA2; psubq RNOT, RA1; .Lcarry_RA2: psubq RNOT, RA2; .Lcarry_RA3: psubq RNOT, RA3; .Lcarry_RB0: psubq RNOT, RB0; .Lcarry_RB1: psubq RNOT, RB1; .Lcarry_RB2: psubq RNOT, RB2; .Lcarry_RB3: psubq RNOT, RB3; .Lcarry_RTMP0: psubq RNOT, RTMP1; .Lno_ctr_carry: /* le => be */ pbswap(RA1, RTMP0); pbswap(RA2, RTMP0); pbswap(RA3, RTMP0); pbswap(RB0, RTMP0); pbswap(RB1, RTMP0); pbswap(RB2, RTMP0); pbswap(RB3, RTMP0); pbswap(RTMP1, RTMP0); /* store new IV */ movdqu RTMP1, (%rcx); call __serpent_enc_blk8; pxor_u((0 * 16)(%rdx), RA4, RTMP0); pxor_u((1 * 16)(%rdx), RA1, RTMP0); pxor_u((2 * 16)(%rdx), RA2, RTMP0); pxor_u((3 * 16)(%rdx), RA0, RTMP0); pxor_u((4 * 16)(%rdx), RB4, RTMP0); pxor_u((5 * 16)(%rdx), RB1, RTMP0); pxor_u((6 * 16)(%rdx), RB2, RTMP0); pxor_u((7 * 16)(%rdx), RB0, RTMP0); movdqu RA4, (0 * 16)(%rsi); movdqu RA1, (1 * 16)(%rsi); movdqu RA2, (2 * 16)(%rsi); movdqu RA0, (3 * 16)(%rsi); movdqu RB4, (4 * 16)(%rsi); movdqu RB1, (5 * 16)(%rsi); movdqu RB2, (6 * 16)(%rsi); movdqu RB0, (7 * 16)(%rsi); /* clear the used registers */ pxor RA0, RA0; pxor RA1, RA1; pxor RA2, RA2; pxor RA3, RA3; pxor RA4, RA4; pxor RB0, RB0; pxor RB1, RB1; pxor RB2, RB2; pxor RB3, RB3; pxor RB4, RB4; pxor RTMP0, RTMP0; pxor RTMP1, RTMP1; pxor RTMP2, RTMP2; pxor RNOT, RNOT; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_serpent_sse2_ctr_enc,.-_gcry_serpent_sse2_ctr_enc;) .align 8 .globl _gcry_serpent_sse2_cbc_dec ELF(.type _gcry_serpent_sse2_cbc_dec,@function;) _gcry_serpent_sse2_cbc_dec: /* input: * %rdi: ctx, CTX * %rsi: dst (8 blocks) * %rdx: src (8 blocks) * %rcx: iv */ CFI_STARTPROC(); movdqu (0 * 16)(%rdx), RA0; movdqu (1 * 16)(%rdx), RA1; movdqu (2 * 16)(%rdx), RA2; movdqu (3 * 16)(%rdx), RA3; movdqu (4 * 16)(%rdx), RB0; movdqu (5 * 16)(%rdx), RB1; movdqu (6 * 16)(%rdx), RB2; movdqu (7 * 16)(%rdx), RB3; call __serpent_dec_blk8; movdqu (7 * 16)(%rdx), RNOT; pxor_u((%rcx), RA0, RTMP0); pxor_u((0 * 16)(%rdx), RA1, RTMP0); pxor_u((1 * 16)(%rdx), RA2, RTMP0); pxor_u((2 * 16)(%rdx), RA3, RTMP0); pxor_u((3 * 16)(%rdx), RB0, RTMP0); pxor_u((4 * 16)(%rdx), RB1, RTMP0); pxor_u((5 * 16)(%rdx), RB2, RTMP0); pxor_u((6 * 16)(%rdx), RB3, RTMP0); movdqu RNOT, (%rcx); /* store new IV */ movdqu RA0, (0 * 16)(%rsi); movdqu RA1, (1 * 16)(%rsi); movdqu RA2, (2 * 16)(%rsi); movdqu RA3, (3 * 16)(%rsi); movdqu RB0, (4 * 16)(%rsi); movdqu RB1, (5 * 16)(%rsi); movdqu RB2, (6 * 16)(%rsi); movdqu RB3, (7 * 16)(%rsi); /* clear the used registers */ pxor RA0, RA0; pxor RA1, RA1; pxor RA2, RA2; pxor RA3, RA3; pxor RA4, RA4; pxor RB0, RB0; pxor RB1, RB1; pxor RB2, RB2; pxor RB3, RB3; pxor RB4, RB4; pxor RTMP0, RTMP0; pxor RTMP1, RTMP1; pxor RTMP2, RTMP2; pxor RNOT, RNOT; - ret; 
+ ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_serpent_sse2_cbc_dec,.-_gcry_serpent_sse2_cbc_dec;) .align 8 .globl _gcry_serpent_sse2_cfb_dec ELF(.type _gcry_serpent_sse2_cfb_dec,@function;) _gcry_serpent_sse2_cfb_dec: /* input: * %rdi: ctx, CTX * %rsi: dst (8 blocks) * %rdx: src (8 blocks) * %rcx: iv */ CFI_STARTPROC(); /* Load input */ movdqu (%rcx), RA0; movdqu 0 * 16(%rdx), RA1; movdqu 1 * 16(%rdx), RA2; movdqu 2 * 16(%rdx), RA3; movdqu 3 * 16(%rdx), RB0; movdqu 4 * 16(%rdx), RB1; movdqu 5 * 16(%rdx), RB2; movdqu 6 * 16(%rdx), RB3; /* Update IV */ movdqu 7 * 16(%rdx), RNOT; movdqu RNOT, (%rcx); call __serpent_enc_blk8; pxor_u((0 * 16)(%rdx), RA4, RTMP0); pxor_u((1 * 16)(%rdx), RA1, RTMP0); pxor_u((2 * 16)(%rdx), RA2, RTMP0); pxor_u((3 * 16)(%rdx), RA0, RTMP0); pxor_u((4 * 16)(%rdx), RB4, RTMP0); pxor_u((5 * 16)(%rdx), RB1, RTMP0); pxor_u((6 * 16)(%rdx), RB2, RTMP0); pxor_u((7 * 16)(%rdx), RB0, RTMP0); movdqu RA4, (0 * 16)(%rsi); movdqu RA1, (1 * 16)(%rsi); movdqu RA2, (2 * 16)(%rsi); movdqu RA0, (3 * 16)(%rsi); movdqu RB4, (4 * 16)(%rsi); movdqu RB1, (5 * 16)(%rsi); movdqu RB2, (6 * 16)(%rsi); movdqu RB0, (7 * 16)(%rsi); /* clear the used registers */ pxor RA0, RA0; pxor RA1, RA1; pxor RA2, RA2; pxor RA3, RA3; pxor RA4, RA4; pxor RB0, RB0; pxor RB1, RB1; pxor RB2, RB2; pxor RB3, RB3; pxor RB4, RB4; pxor RTMP0, RTMP0; pxor RTMP1, RTMP1; pxor RTMP2, RTMP2; pxor RNOT, RNOT; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_serpent_sse2_cfb_dec,.-_gcry_serpent_sse2_cfb_dec;) .align 8 .globl _gcry_serpent_sse2_ocb_enc ELF(.type _gcry_serpent_sse2_ocb_enc,@function;) _gcry_serpent_sse2_ocb_enc: /* input: * %rdi: ctx, CTX * %rsi: dst (8 blocks) * %rdx: src (8 blocks) * %rcx: offset * %r8 : checksum * %r9 : L pointers (void *L[8]) */ CFI_STARTPROC(); subq $(4 * 8), %rsp; CFI_ADJUST_CFA_OFFSET(4 * 8); movq %r10, (0 * 8)(%rsp); movq %r11, (1 * 8)(%rsp); movq %r12, (2 * 8)(%rsp); movq %r13, (3 * 8)(%rsp); CFI_REL_OFFSET(%r10, 0 * 8); CFI_REL_OFFSET(%r11, 1 * 8); CFI_REL_OFFSET(%r12, 2 * 8); CFI_REL_OFFSET(%r13, 3 * 8); movdqu (%rcx), RTMP0; movdqu (%r8), RTMP1; /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* Checksum_i = Checksum_{i-1} xor P_i */ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ #define OCB_INPUT(n, lreg, xreg) \ movdqu (n * 16)(%rdx), xreg; \ movdqu (lreg), RNOT; \ pxor RNOT, RTMP0; \ pxor xreg, RTMP1; \ pxor RTMP0, xreg; \ movdqu RTMP0, (n * 16)(%rsi); movq (0 * 8)(%r9), %r10; movq (1 * 8)(%r9), %r11; movq (2 * 8)(%r9), %r12; movq (3 * 8)(%r9), %r13; OCB_INPUT(0, %r10, RA0); OCB_INPUT(1, %r11, RA1); OCB_INPUT(2, %r12, RA2); OCB_INPUT(3, %r13, RA3); movq (4 * 8)(%r9), %r10; movq (5 * 8)(%r9), %r11; movq (6 * 8)(%r9), %r12; movq (7 * 8)(%r9), %r13; OCB_INPUT(4, %r10, RB0); OCB_INPUT(5, %r11, RB1); OCB_INPUT(6, %r12, RB2); OCB_INPUT(7, %r13, RB3); #undef OCB_INPUT movdqu RTMP0, (%rcx); movdqu RTMP1, (%r8); movq (0 * 8)(%rsp), %r10; movq (1 * 8)(%rsp), %r11; movq (2 * 8)(%rsp), %r12; movq (3 * 8)(%rsp), %r13; CFI_RESTORE(%r10); CFI_RESTORE(%r11); CFI_RESTORE(%r12); CFI_RESTORE(%r13); call __serpent_enc_blk8; addq $(4 * 8), %rsp; CFI_ADJUST_CFA_OFFSET(-4 * 8); pxor_u((0 * 16)(%rsi), RA4, RTMP0); pxor_u((1 * 16)(%rsi), RA1, RTMP0); pxor_u((2 * 16)(%rsi), RA2, RTMP0); pxor_u((3 * 16)(%rsi), RA0, RTMP0); pxor_u((4 * 16)(%rsi), RB4, RTMP0); pxor_u((5 * 16)(%rsi), RB1, RTMP0); pxor_u((6 * 16)(%rsi), RB2, RTMP0); pxor_u((7 * 16)(%rsi), RB0, RTMP0); movdqu RA4, (0 * 16)(%rsi); movdqu RA1, (1 * 16)(%rsi); movdqu RA2, (2 * 16)(%rsi); movdqu RA0, (3 * 16)(%rsi); movdqu RB4, (4 * 
16)(%rsi); movdqu RB1, (5 * 16)(%rsi); movdqu RB2, (6 * 16)(%rsi); movdqu RB0, (7 * 16)(%rsi); /* clear the used registers */ pxor RA0, RA0; pxor RA1, RA1; pxor RA2, RA2; pxor RA3, RA3; pxor RA4, RA4; pxor RB0, RB0; pxor RB1, RB1; pxor RB2, RB2; pxor RB3, RB3; pxor RB4, RB4; pxor RTMP0, RTMP0; pxor RTMP1, RTMP1; pxor RTMP2, RTMP2; pxor RNOT, RNOT; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_serpent_sse2_ocb_enc,.-_gcry_serpent_sse2_ocb_enc;) .align 8 .globl _gcry_serpent_sse2_ocb_dec ELF(.type _gcry_serpent_sse2_ocb_dec,@function;) _gcry_serpent_sse2_ocb_dec: /* input: * %rdi: ctx, CTX * %rsi: dst (8 blocks) * %rdx: src (8 blocks) * %rcx: offset * %r8 : checksum * %r9 : L pointers (void *L[8]) */ CFI_STARTPROC(); subq $(4 * 8), %rsp; CFI_ADJUST_CFA_OFFSET(4 * 8); movq %r10, (0 * 8)(%rsp); movq %r11, (1 * 8)(%rsp); movq %r12, (2 * 8)(%rsp); movq %r13, (3 * 8)(%rsp); CFI_REL_OFFSET(%r10, 0 * 8); CFI_REL_OFFSET(%r11, 1 * 8); CFI_REL_OFFSET(%r12, 2 * 8); CFI_REL_OFFSET(%r13, 3 * 8); movdqu (%rcx), RTMP0; /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */ #define OCB_INPUT(n, lreg, xreg) \ movdqu (n * 16)(%rdx), xreg; \ movdqu (lreg), RNOT; \ pxor RNOT, RTMP0; \ pxor RTMP0, xreg; \ movdqu RTMP0, (n * 16)(%rsi); movq (0 * 8)(%r9), %r10; movq (1 * 8)(%r9), %r11; movq (2 * 8)(%r9), %r12; movq (3 * 8)(%r9), %r13; OCB_INPUT(0, %r10, RA0); OCB_INPUT(1, %r11, RA1); OCB_INPUT(2, %r12, RA2); OCB_INPUT(3, %r13, RA3); movq (4 * 8)(%r9), %r10; movq (5 * 8)(%r9), %r11; movq (6 * 8)(%r9), %r12; movq (7 * 8)(%r9), %r13; OCB_INPUT(4, %r10, RB0); OCB_INPUT(5, %r11, RB1); OCB_INPUT(6, %r12, RB2); OCB_INPUT(7, %r13, RB3); #undef OCB_INPUT movdqu RTMP0, (%rcx); movq (0 * 8)(%rsp), %r10; movq (1 * 8)(%rsp), %r11; movq (2 * 8)(%rsp), %r12; movq (3 * 8)(%rsp), %r13; CFI_RESTORE(%r10); CFI_RESTORE(%r11); CFI_RESTORE(%r12); CFI_RESTORE(%r13); call __serpent_dec_blk8; addq $(4 * 8), %rsp; CFI_ADJUST_CFA_OFFSET(-4 * 8); movdqu (%r8), RTMP0; pxor_u((0 * 16)(%rsi), RA0, RTMP1); pxor_u((1 * 16)(%rsi), RA1, RTMP1); pxor_u((2 * 16)(%rsi), RA2, RTMP1); pxor_u((3 * 16)(%rsi), RA3, RTMP1); pxor_u((4 * 16)(%rsi), RB0, RTMP1); pxor_u((5 * 16)(%rsi), RB1, RTMP1); pxor_u((6 * 16)(%rsi), RB2, RTMP1); pxor_u((7 * 16)(%rsi), RB3, RTMP1); /* Checksum_i = Checksum_{i-1} xor P_i */ movdqu RA0, (0 * 16)(%rsi); pxor RA0, RTMP0; movdqu RA1, (1 * 16)(%rsi); pxor RA1, RTMP0; movdqu RA2, (2 * 16)(%rsi); pxor RA2, RTMP0; movdqu RA3, (3 * 16)(%rsi); pxor RA3, RTMP0; movdqu RB0, (4 * 16)(%rsi); pxor RB0, RTMP0; movdqu RB1, (5 * 16)(%rsi); pxor RB1, RTMP0; movdqu RB2, (6 * 16)(%rsi); pxor RB2, RTMP0; movdqu RB3, (7 * 16)(%rsi); pxor RB3, RTMP0; movdqu RTMP0, (%r8); /* clear the used registers */ pxor RA0, RA0; pxor RA1, RA1; pxor RA2, RA2; pxor RA3, RA3; pxor RA4, RA4; pxor RB0, RB0; pxor RB1, RB1; pxor RB2, RB2; pxor RB3, RB3; pxor RB4, RB4; pxor RTMP0, RTMP0; pxor RTMP1, RTMP1; pxor RTMP2, RTMP2; pxor RNOT, RNOT; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_serpent_sse2_ocb_dec,.-_gcry_serpent_sse2_ocb_dec;) .align 8 .globl _gcry_serpent_sse2_ocb_auth ELF(.type _gcry_serpent_sse2_ocb_auth,@function;) _gcry_serpent_sse2_ocb_auth: /* input: * %rdi: ctx, CTX * %rsi: abuf (8 blocks) * %rdx: offset * %rcx: checksum * %r8 : L pointers (void *L[8]) */ CFI_STARTPROC(); subq $(4 * 8), %rsp; CFI_ADJUST_CFA_OFFSET(4 * 8); movq %r10, (0 * 8)(%rsp); movq %r11, (1 * 8)(%rsp); movq %r12, (2 * 8)(%rsp); movq %r13, (3 * 8)(%rsp); CFI_REL_OFFSET(%r10, 0 * 8); CFI_REL_OFFSET(%r11, 1 * 8); 
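The authentication-only entry point whose prologue starts here follows the same pattern as the OCB encryption path, but nothing is written back per block: each associated-data block is masked with the running offset, encrypted, and XORed into the running Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i). A short scalar sketch, reusing the xor_block, block_encrypt and L placeholders from the OCB sketch given earlier (all of them illustrative, not library interfaces):

  /* OCB authentication pass over nblks 16-byte blocks of associated
   * data, starting at 1-based block index i. */
  static void
  ocb_auth_blocks (const void *key, const uint8_t *abuf, size_t nblks,
                   uint64_t i, uint8_t offset[16], uint8_t sum[16],
                   const uint8_t L[][16])
  {
    uint8_t tmp[16];

    for (size_t n = 0; n < nblks; n++, i++, abuf += 16)
      {
        xor_block (offset, L[__builtin_ctzll (i)]); /* Offset_i          */
        memcpy (tmp, abuf, 16);
        xor_block (tmp, offset);                    /* A_i xor Offset_i  */
        block_encrypt (key, tmp, tmp);
        xor_block (sum, tmp);                       /* Sum_i             */
      }
  }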
CFI_REL_OFFSET(%r12, 2 * 8); CFI_REL_OFFSET(%r13, 3 * 8); movdqu (%rdx), RTMP0; /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ #define OCB_INPUT(n, lreg, xreg) \ movdqu (n * 16)(%rsi), xreg; \ movdqu (lreg), RNOT; \ pxor RNOT, RTMP0; \ pxor RTMP0, xreg; movq (0 * 8)(%r8), %r10; movq (1 * 8)(%r8), %r11; movq (2 * 8)(%r8), %r12; movq (3 * 8)(%r8), %r13; OCB_INPUT(0, %r10, RA0); OCB_INPUT(1, %r11, RA1); OCB_INPUT(2, %r12, RA2); OCB_INPUT(3, %r13, RA3); movq (4 * 8)(%r8), %r10; movq (5 * 8)(%r8), %r11; movq (6 * 8)(%r8), %r12; movq (7 * 8)(%r8), %r13; OCB_INPUT(4, %r10, RB0); OCB_INPUT(5, %r11, RB1); OCB_INPUT(6, %r12, RB2); OCB_INPUT(7, %r13, RB3); #undef OCB_INPUT movdqu RTMP0, (%rdx); movq (0 * 8)(%rsp), %r10; movq (1 * 8)(%rsp), %r11; movq (2 * 8)(%rsp), %r12; movq (3 * 8)(%rsp), %r13; CFI_RESTORE(%r10); CFI_RESTORE(%r11); CFI_RESTORE(%r12); CFI_RESTORE(%r13); call __serpent_enc_blk8; addq $(4 * 8), %rsp; CFI_ADJUST_CFA_OFFSET(-4 * 8); movdqu (%rcx), RTMP0; pxor RB4, RA4; pxor RB1, RA1; pxor RB2, RA2; pxor RB0, RA0; pxor RTMP0, RA2; pxor RA4, RA1; pxor RA2, RA0; pxor RA1, RA0; movdqu RA0, (%rcx); /* clear the used registers */ pxor RA0, RA0; pxor RA1, RA1; pxor RA2, RA2; pxor RA3, RA3; pxor RA4, RA4; pxor RB0, RB0; pxor RB1, RB1; pxor RB2, RB2; pxor RB3, RB3; pxor RB4, RB4; pxor RTMP0, RTMP0; pxor RTMP1, RTMP1; pxor RTMP2, RTMP2; pxor RNOT, RNOT; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_serpent_sse2_ocb_auth,.-_gcry_serpent_sse2_ocb_auth;) #endif /*defined(USE_SERPENT)*/ #endif /*__x86_64*/ diff --git a/cipher/sha1-avx-amd64.S b/cipher/sha1-avx-amd64.S index 85876ad4..acada960 100644 --- a/cipher/sha1-avx-amd64.S +++ b/cipher/sha1-avx-amd64.S @@ -1,429 +1,429 @@ /* sha1-avx-amd64.S - Intel AVX accelerated SHA-1 transform function * Copyright (C) 2013 Jussi Kivilinna * * Based on sha1.c: * Copyright (C) 1998, 2001, 2002, 2003, 2008 Free Software Foundation, Inc. * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . 
*/ /* * Intel SSSE3 accelerated SHA-1 implementation based on white paper: * "Improving the Performance of the Secure Hash Algorithm (SHA-1)" * http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1 */ #ifdef __x86_64__ #include #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ defined(HAVE_GCC_INLINE_ASM_AVX) && defined(USE_SHA1) #include "asm-common-amd64.h" /* Context structure */ #define state_h0 0 #define state_h1 4 #define state_h2 8 #define state_h3 12 #define state_h4 16 /* Constants */ .text #define K1 0x5A827999 #define K2 0x6ED9EBA1 #define K3 0x8F1BBCDC #define K4 0xCA62C1D6 .align 16 .LK_XMM: .LK1: .long K1, K1, K1, K1 .LK2: .long K2, K2, K2, K2 .LK3: .long K3, K3, K3, K3 .LK4: .long K4, K4, K4, K4 .Lbswap_shufb_ctl: .long 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f /* Register macros */ #define RSTATE %r8 #define RDATA %r9 #define ROLDSTACK %r10 #define RNBLKS %r11 #define a %eax #define b %ebx #define c %ecx #define d %edx #define e %edi #define RT0 %esi #define RT1 %ebp #define Wtmp0 %xmm0 #define Wtmp1 %xmm1 #define W0 %xmm2 #define W1 %xmm3 #define W2 %xmm4 #define W3 %xmm5 #define W4 %xmm6 #define W5 %xmm7 #define W6 %xmm8 #define W7 %xmm9 #define BSWAP_REG %xmm10 /* Round function macros. */ #define WK(i) (((i) & 15) * 4)(%rsp) #define R_F1(a,b,c,d,e,i) \ movl c, RT0; \ addl WK(i), e; \ xorl d, RT0; \ movl a, RT1; \ andl b, RT0; \ shldl $30, b, b; \ xorl d, RT0; \ leal (RT0,e), e; \ shldl $5, RT1, RT1; \ addl RT1, e; #define R_F2(a,b,c,d,e,i) \ movl c, RT0; \ addl WK(i), e; \ xorl b, RT0; \ shldl $30, b, b; \ xorl d, RT0; \ movl a, RT1; \ leal (RT0,e), e; \ shldl $5, RT1, RT1; \ addl RT1, e; #define R_F3(a,b,c,d,e,i) \ movl c, RT0; \ movl b, RT1; \ xorl b, RT0; \ andl c, RT1; \ andl d, RT0; \ addl RT1, e; \ addl WK(i), e; \ shldl $30, b, b; \ movl a, RT1; \ leal (RT0,e), e; \ shldl $5, RT1, RT1; \ addl RT1, e; #define R_F4(a,b,c,d,e,i) R_F2(a,b,c,d,e,i) #define R(a,b,c,d,e,f,i) \ R_##f(a,b,c,d,e,i) /* Input expansion macros. 
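The R_F1..R_F4 macros above are the four SHA-1 round variants with the round constant already folded into the per-round word stored at WK(i) by the precalc macros that follow; each round updates e and rotates b, while the rotation of the a..e roles happens in the unrolled R(...) call sites. In scalar C one round looks like this (sketch; wk is W[i] + K for the round):

  #include <stdint.h>

  static inline uint32_t rol32 (uint32_t x, unsigned int n)
  {
    return (x << n) | (x >> (32 - n));
  }

  /* f = 1: Ch (rounds 0-19), f = 3: Maj (40-59), otherwise Parity
   * (20-39 and 60-79, which is why R_F4 is simply R_F2). */
  static uint32_t sha1_f (int f, uint32_t b, uint32_t c, uint32_t d)
  {
    switch (f)
      {
      case 1:  return (b & c) | (~b & d);
      case 3:  return (b & c) | (b & d) | (c & d);
      default: return b ^ c ^ d;
      }
  }

  /* One SHA-1 round; the caller rotates the roles of a..e, exactly as
   * the R(a,b,c,d,e,f,i) invocations do in the unrolled assembly. */
  static void sha1_round (int f, uint32_t a, uint32_t *b, uint32_t c,
                          uint32_t d, uint32_t *e, uint32_t wk)
  {
    *e += rol32 (a, 5) + sha1_f (f, *b, c, d) + wk;
    *b = rol32 (*b, 30);
  }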
*/ #define W_PRECALC_00_15_0(i, W, tmp0) \ vmovdqu (4*(i))(RDATA), tmp0; #define W_PRECALC_00_15_1(i, W, tmp0) \ vpshufb BSWAP_REG, tmp0, W; #define W_PRECALC_00_15_2(i, W, tmp0) \ vpaddd (.LK_XMM + ((i)/20)*16) rRIP, W, tmp0; #define W_PRECALC_00_15_3(i, W, tmp0) \ vmovdqa tmp0, WK(i&~3); #define W_PRECALC_16_31_0(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \ vpalignr $8, W_m16, W_m12, W; \ vpsrldq $4, W_m04, tmp0; \ vpxor W_m08, W, W; #define W_PRECALC_16_31_1(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \ vpxor W_m16, tmp0, tmp0; \ vpxor tmp0, W, W; \ vpslld $1, W, tmp0; \ vpslldq $12, W, tmp1; \ vpsrld $31, W, W; #define W_PRECALC_16_31_2(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \ vpor W, tmp0, tmp0; \ vpsrld $30, tmp1, W; \ vpslld $2, tmp1, tmp1; #define W_PRECALC_16_31_3(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \ vpxor W, tmp0, tmp0; \ vpxor tmp1, tmp0, W; \ vpaddd (.LK_XMM + ((i)/20)*16) rRIP, W, tmp0; \ vmovdqa tmp0, WK((i)&~3); #define W_PRECALC_32_79_0(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \ vpxor W_m28, W, W; \ vpalignr $8, W_m08, W_m04, tmp0; #define W_PRECALC_32_79_1(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \ vpxor W_m16, W, W; \ vpxor tmp0, W, W; #define W_PRECALC_32_79_2(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \ vpsrld $30, W, tmp0; \ vpslld $2, W, W; #define W_PRECALC_32_79_3(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \ vpor W, tmp0, W; \ vpaddd (.LK_XMM + ((i)/20)*16) rRIP, W, tmp0; \ vmovdqa tmp0, WK((i)&~3); /* * Transform nblks*64 bytes (nblks*16 32-bit words) at DATA. * * unsigned int * _gcry_sha1_transform_amd64_avx (void *ctx, const unsigned char *data, * size_t nblks) */ .globl _gcry_sha1_transform_amd64_avx ELF(.type _gcry_sha1_transform_amd64_avx,@function) .align 16 _gcry_sha1_transform_amd64_avx: /* input: * %rdi: ctx, CTX * %rsi: data (64*nblks bytes) * %rdx: nblks */ CFI_STARTPROC(); xorl %eax, %eax; cmpq $0, %rdx; jz .Lret; vzeroupper; movq %rdx, RNBLKS; movq %rdi, RSTATE; movq %rsi, RDATA; pushq %rbx; CFI_PUSH(%rbx); pushq %rbp; CFI_PUSH(%rbp); movq %rsp, ROLDSTACK; CFI_DEF_CFA_REGISTER(ROLDSTACK); subq $(16*4), %rsp; andq $(~31), %rsp; /* Get the values of the chaining variables. */ movl state_h0(RSTATE), a; movl state_h1(RSTATE), b; movl state_h2(RSTATE), c; movl state_h3(RSTATE), d; movl state_h4(RSTATE), e; vmovdqa .Lbswap_shufb_ctl rRIP, BSWAP_REG; /* Precalc 0-15. */ W_PRECALC_00_15_0(0, W0, Wtmp0); W_PRECALC_00_15_1(1, W0, Wtmp0); W_PRECALC_00_15_2(2, W0, Wtmp0); W_PRECALC_00_15_3(3, W0, Wtmp0); W_PRECALC_00_15_0(4, W7, Wtmp0); W_PRECALC_00_15_1(5, W7, Wtmp0); W_PRECALC_00_15_2(6, W7, Wtmp0); W_PRECALC_00_15_3(7, W7, Wtmp0); W_PRECALC_00_15_0(8, W6, Wtmp0); W_PRECALC_00_15_1(9, W6, Wtmp0); W_PRECALC_00_15_2(10, W6, Wtmp0); W_PRECALC_00_15_3(11, W6, Wtmp0); W_PRECALC_00_15_0(12, W5, Wtmp0); W_PRECALC_00_15_1(13, W5, Wtmp0); W_PRECALC_00_15_2(14, W5, Wtmp0); W_PRECALC_00_15_3(15, W5, Wtmp0); .align 8 .Loop: addq $64, RDATA; /* Transform 0-15 + Precalc 16-31. 
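The W_PRECALC_16_31_* and W_PRECALC_32_79_* groups above expand the message schedule four words at a time and add the round constant in the same step, storing W[i]+K into the stack slots later read back through WK(i). The 32-79 form works on the equivalent recurrence w[i] = rol(w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32], 2), which avoids the w[i-3] term that would fall inside the vector currently being computed. The underlying scalar schedule is the textbook one (sketch):

  #include <stdint.h>

  static inline uint32_t rol32 (uint32_t x, unsigned int n)
  {
    return (x << n) | (x >> (32 - n));
  }

  /* Expand one 64-byte block (w[0..15] already loaded big-endian, as
   * done here by vpshufb with .Lbswap_shufb_ctl) into the full 80-word
   * SHA-1 schedule. */
  static void sha1_expand (uint32_t w[80])
  {
    for (int i = 16; i < 80; i++)
      w[i] = rol32 (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16], 1);
  }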
*/ R( a, b, c, d, e, F1, 0 ); W_PRECALC_16_31_0(16, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); R( e, a, b, c, d, F1, 1 ); W_PRECALC_16_31_1(17, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); R( d, e, a, b, c, F1, 2 ); W_PRECALC_16_31_2(18, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); R( c, d, e, a, b, F1, 3 ); W_PRECALC_16_31_3(19, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); R( b, c, d, e, a, F1, 4 ); W_PRECALC_16_31_0(20, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); R( a, b, c, d, e, F1, 5 ); W_PRECALC_16_31_1(21, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); R( e, a, b, c, d, F1, 6 ); W_PRECALC_16_31_2(22, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); R( d, e, a, b, c, F1, 7 ); W_PRECALC_16_31_3(23, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); R( c, d, e, a, b, F1, 8 ); W_PRECALC_16_31_0(24, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); R( b, c, d, e, a, F1, 9 ); W_PRECALC_16_31_1(25, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); R( a, b, c, d, e, F1, 10 ); W_PRECALC_16_31_2(26, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); R( e, a, b, c, d, F1, 11 ); W_PRECALC_16_31_3(27, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); R( d, e, a, b, c, F1, 12 ); W_PRECALC_16_31_0(28, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); R( c, d, e, a, b, F1, 13 ); W_PRECALC_16_31_1(29, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); R( b, c, d, e, a, F1, 14 ); W_PRECALC_16_31_2(30, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); R( a, b, c, d, e, F1, 15 ); W_PRECALC_16_31_3(31, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); /* Transform 16-63 + Precalc 32-79. */ R( e, a, b, c, d, F1, 16 ); W_PRECALC_32_79_0(32, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); R( d, e, a, b, c, F1, 17 ); W_PRECALC_32_79_1(33, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); R( c, d, e, a, b, F1, 18 ); W_PRECALC_32_79_2(34, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); R( b, c, d, e, a, F1, 19 ); W_PRECALC_32_79_3(35, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); R( a, b, c, d, e, F2, 20 ); W_PRECALC_32_79_0(36, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); R( e, a, b, c, d, F2, 21 ); W_PRECALC_32_79_1(37, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); R( d, e, a, b, c, F2, 22 ); W_PRECALC_32_79_2(38, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); R( c, d, e, a, b, F2, 23 ); W_PRECALC_32_79_3(39, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); R( b, c, d, e, a, F2, 24 ); W_PRECALC_32_79_0(40, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); R( a, b, c, d, e, F2, 25 ); W_PRECALC_32_79_1(41, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); R( e, a, b, c, d, F2, 26 ); W_PRECALC_32_79_2(42, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); R( d, e, a, b, c, F2, 27 ); W_PRECALC_32_79_3(43, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); R( c, d, e, a, b, F2, 28 ); W_PRECALC_32_79_0(44, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); R( b, c, d, e, a, F2, 29 ); W_PRECALC_32_79_1(45, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); R( a, b, c, d, e, F2, 30 ); W_PRECALC_32_79_2(46, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); R( e, a, b, c, d, F2, 31 ); W_PRECALC_32_79_3(47, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); R( d, e, a, b, c, F2, 32 ); W_PRECALC_32_79_0(48, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0); R( c, d, e, a, b, F2, 33 ); W_PRECALC_32_79_1(49, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0); R( b, c, d, e, a, F2, 34 ); W_PRECALC_32_79_2(50, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0); R( a, b, c, d, e, F2, 35 ); W_PRECALC_32_79_3(51, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0); R( e, a, b, c, d, F2, 36 ); W_PRECALC_32_79_0(52, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0); R( d, e, a, b, c, F2, 37 ); W_PRECALC_32_79_1(53, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0); R( c, d, e, a, b, F2, 38 ); W_PRECALC_32_79_2(54, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0); R( b, c, d, e, a, F2, 39 ); W_PRECALC_32_79_3(55, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0); R( 
a, b, c, d, e, F3, 40 ); W_PRECALC_32_79_0(56, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0); R( e, a, b, c, d, F3, 41 ); W_PRECALC_32_79_1(57, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0); R( d, e, a, b, c, F3, 42 ); W_PRECALC_32_79_2(58, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0); R( c, d, e, a, b, F3, 43 ); W_PRECALC_32_79_3(59, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0); R( b, c, d, e, a, F3, 44 ); W_PRECALC_32_79_0(60, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0); R( a, b, c, d, e, F3, 45 ); W_PRECALC_32_79_1(61, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0); R( e, a, b, c, d, F3, 46 ); W_PRECALC_32_79_2(62, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0); R( d, e, a, b, c, F3, 47 ); W_PRECALC_32_79_3(63, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0); R( c, d, e, a, b, F3, 48 ); W_PRECALC_32_79_0(64, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); R( b, c, d, e, a, F3, 49 ); W_PRECALC_32_79_1(65, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); R( a, b, c, d, e, F3, 50 ); W_PRECALC_32_79_2(66, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); R( e, a, b, c, d, F3, 51 ); W_PRECALC_32_79_3(67, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); R( d, e, a, b, c, F3, 52 ); W_PRECALC_32_79_0(68, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); R( c, d, e, a, b, F3, 53 ); W_PRECALC_32_79_1(69, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); R( b, c, d, e, a, F3, 54 ); W_PRECALC_32_79_2(70, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); R( a, b, c, d, e, F3, 55 ); W_PRECALC_32_79_3(71, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); R( e, a, b, c, d, F3, 56 ); W_PRECALC_32_79_0(72, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); R( d, e, a, b, c, F3, 57 ); W_PRECALC_32_79_1(73, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); R( c, d, e, a, b, F3, 58 ); W_PRECALC_32_79_2(74, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); R( b, c, d, e, a, F3, 59 ); W_PRECALC_32_79_3(75, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); R( a, b, c, d, e, F4, 60 ); W_PRECALC_32_79_0(76, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); R( e, a, b, c, d, F4, 61 ); W_PRECALC_32_79_1(77, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); R( d, e, a, b, c, F4, 62 ); W_PRECALC_32_79_2(78, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); R( c, d, e, a, b, F4, 63 ); W_PRECALC_32_79_3(79, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); decq RNBLKS; jz .Lend; /* Transform 64-79 + Precalc 0-15 of next block. */ R( b, c, d, e, a, F4, 64 ); W_PRECALC_00_15_0(0, W0, Wtmp0); R( a, b, c, d, e, F4, 65 ); W_PRECALC_00_15_1(1, W0, Wtmp0); R( e, a, b, c, d, F4, 66 ); W_PRECALC_00_15_2(2, W0, Wtmp0); R( d, e, a, b, c, F4, 67 ); W_PRECALC_00_15_3(3, W0, Wtmp0); R( c, d, e, a, b, F4, 68 ); W_PRECALC_00_15_0(4, W7, Wtmp0); R( b, c, d, e, a, F4, 69 ); W_PRECALC_00_15_1(5, W7, Wtmp0); R( a, b, c, d, e, F4, 70 ); W_PRECALC_00_15_2(6, W7, Wtmp0); R( e, a, b, c, d, F4, 71 ); W_PRECALC_00_15_3(7, W7, Wtmp0); R( d, e, a, b, c, F4, 72 ); W_PRECALC_00_15_0(8, W6, Wtmp0); R( c, d, e, a, b, F4, 73 ); W_PRECALC_00_15_1(9, W6, Wtmp0); R( b, c, d, e, a, F4, 74 ); W_PRECALC_00_15_2(10, W6, Wtmp0); R( a, b, c, d, e, F4, 75 ); W_PRECALC_00_15_3(11, W6, Wtmp0); R( e, a, b, c, d, F4, 76 ); W_PRECALC_00_15_0(12, W5, Wtmp0); R( d, e, a, b, c, F4, 77 ); W_PRECALC_00_15_1(13, W5, Wtmp0); R( c, d, e, a, b, F4, 78 ); addl state_h0(RSTATE), a; W_PRECALC_00_15_2(14, W5, Wtmp0); R( b, c, d, e, a, F4, 79 ); W_PRECALC_00_15_3(15, W5, Wtmp0); /* Update the chaining variables. 
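Once rounds 64-79 of the current block have been interleaved with the precalculation of the next block's schedule, the working variables are folded back into the hash state by the adds just below (the usual Davies-Meyer style feed-forward). A scalar sketch mirroring the state_h0..state_h4 layout declared at the top of the file (the struct name is illustrative):

  #include <stdint.h>

  /* Mirrors the state_h0..state_h4 offsets into the context. */
  struct sha1_state { uint32_t h0, h1, h2, h3, h4; };

  /* Per-block feed-forward performed around R(..., 79). */
  static void sha1_feed_forward (struct sha1_state *st, uint32_t a,
                                 uint32_t b, uint32_t c, uint32_t d,
                                 uint32_t e)
  {
    st->h0 += a;
    st->h1 += b;
    st->h2 += c;
    st->h3 += d;
    st->h4 += e;
  }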
*/ addl state_h3(RSTATE), d; addl state_h2(RSTATE), c; addl state_h1(RSTATE), b; addl state_h4(RSTATE), e; movl d, state_h3(RSTATE); movl c, state_h2(RSTATE); movl b, state_h1(RSTATE); movl a, state_h0(RSTATE); movl e, state_h4(RSTATE); jmp .Loop; .align 16 .Lend: vzeroall; /* Transform 64-79 + burn stack */ R( b, c, d, e, a, F4, 64 ); R( a, b, c, d, e, F4, 65 ); R( e, a, b, c, d, F4, 66 ); R( d, e, a, b, c, F4, 67 ); R( c, d, e, a, b, F4, 68 ); R( b, c, d, e, a, F4, 69 ); R( a, b, c, d, e, F4, 70 ); R( e, a, b, c, d, F4, 71 ); R( d, e, a, b, c, F4, 72 ); R( c, d, e, a, b, F4, 73 ); R( b, c, d, e, a, F4, 74 ); R( a, b, c, d, e, F4, 75 ); R( e, a, b, c, d, F4, 76 ); vmovdqa %xmm0, (0*16)(%rsp); R( d, e, a, b, c, F4, 77 ); vmovdqa %xmm0, (1*16)(%rsp); R( c, d, e, a, b, F4, 78 ); vmovdqa %xmm0, (2*16)(%rsp); addl state_h0(RSTATE), a; R( b, c, d, e, a, F4, 79 ); /* 16*4/16-1 = 3 */ vmovdqa %xmm0, (3*16)(%rsp); /* Update the chaining variables. */ addl state_h3(RSTATE), d; addl state_h2(RSTATE), c; addl state_h1(RSTATE), b; addl state_h4(RSTATE), e; movl d, state_h3(RSTATE); movl c, state_h2(RSTATE); movl b, state_h1(RSTATE); movl a, state_h0(RSTATE); movl e, state_h4(RSTATE); movq ROLDSTACK, %rsp; CFI_REGISTER(ROLDSTACK, %rsp); CFI_DEF_CFA_REGISTER(%rsp); popq %rbp; CFI_POP(%rbp); popq %rbx; CFI_POP(%rbx); /* stack already burned */ xorl %eax, %eax; .Lret: - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_sha1_transform_amd64_avx, .-_gcry_sha1_transform_amd64_avx;) #endif #endif diff --git a/cipher/sha1-avx-bmi2-amd64.S b/cipher/sha1-avx-bmi2-amd64.S index 5dfcdca9..5f4b9e69 100644 --- a/cipher/sha1-avx-bmi2-amd64.S +++ b/cipher/sha1-avx-bmi2-amd64.S @@ -1,441 +1,441 @@ /* sha1-avx-bmi2-amd64.S - Intel AVX/BMI2 accelerated SHA-1 transform function * Copyright (C) 2013 Jussi Kivilinna * * Based on sha1.c: * Copyright (C) 1998, 2001, 2002, 2003, 2008 Free Software Foundation, Inc. * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . 
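The .Lend path above finishes each call by overwriting the 64-byte on-stack W+K area with the %xmm0 register that vzeroall has just cleared (the "burn stack" stores), then returns zero in %eax, presumably telling the generic caller that no additional stack burning is needed. The C-level equivalent is simply wiping the sensitive buffer before it goes out of scope; a sketch (libgcrypt's own wipememory-style helpers take more care to keep the compiler from eliding the stores):

  #include <stddef.h>

  /* Clear a buffer that held key- or message-dependent data.  The
   * volatile-qualified pointer discourages, but does not guarantee
   * against, the compiler dropping the "dead" stores. */
  static void burn_buffer (void *buf, size_t len)
  {
    volatile unsigned char *p = buf;
    while (len--)
      *p++ = 0;
  }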
*/ /* * Intel SSSE3 accelerated SHA-1 implementation based on white paper: * "Improving the Performance of the Secure Hash Algorithm (SHA-1)" * http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1 */ #ifdef __x86_64__ #include #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ defined(HAVE_GCC_INLINE_ASM_BMI2) && \ defined(HAVE_GCC_INLINE_ASM_AVX) && defined(USE_SHA1) #include "asm-common-amd64.h" /* Context structure */ #define state_h0 0 #define state_h1 4 #define state_h2 8 #define state_h3 12 #define state_h4 16 /* Constants */ .text .align 16 .Lbswap_shufb_ctl: .long 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f .LK1: .long 0x5A827999 .LK2: .long 0x6ED9EBA1 .LK3: .long 0x8F1BBCDC .LK4: .long 0xCA62C1D6 /* Register macros */ #define RSTATE %r8 #define RDATA %r9 #define ROLDSTACK %r10 #define RNBLKS %r11 #define a %esi #define b %edi #define c %ebp #define d %edx #define e %ecx #define ne %ebx #define RT0 %eax #define RT1 %r12d #define Wtmp0 %xmm0 #define Wtmp1 %xmm1 #define W0 %xmm2 #define W1 %xmm3 #define W2 %xmm4 #define W3 %xmm5 #define W4 %xmm6 #define W5 %xmm7 #define W6 %xmm8 #define W7 %xmm9 #define BSWAP_REG %xmm10 #define K1 %xmm11 #define K2 %xmm12 #define K3 %xmm13 #define K4 %xmm14 /* Round function macros. */ #define WK(i) (((i) & 15) * 4)(%rsp) #define R_F1(a,b,c,d,e,i) \ movl c, RT0; \ andn d, b, RT1; \ addl WK(i), e; \ andl b, RT0; \ rorxl $2, b, b; \ addl RT1, e; \ addl ne, a; \ leal (RT0,e), ne; \ rorxl $27, a, e; #define R_F2(a,b,c,d,e,i) \ movl c, RT0; \ addl WK(i), e; \ xorl b, RT0; \ rorxl $2, b, b; \ xorl d, RT0; \ addl ne, a; \ leal (RT0,e), ne; \ rorxl $27, a, e; #define R_F3(a,b,c,d,e,i) \ movl c, RT0; \ movl b, RT1; \ addl WK(i), e; \ xorl b, RT0; \ andl c, RT1; \ andl d, RT0; \ addl RT1, e; \ rorxl $2, b, b; \ addl ne, a; \ leal (RT0,e), ne; \ rorxl $27, a, e; #define R_F4(a,b,c,d,e,i) R_F2(a,b,c,d,e,i) #define R(a,b,c,d,e,f,i) \ R_##f(a,b,c,d,e,i) /* Input expansion macros. 
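Relative to the plain AVX version, the BMI2 round macros above lean on andn and the non-destructive, flag-preserving rorx (rorxl $27, a, e writes rol(a, 5) straight into e; rorxl $2, b, b is rol(b, 30)), and they park the partially accumulated sum in the spare ne register so the dependency chain through e stays short. Note how the Ch function is added in two pieces, (b & c) and then the andn result (~b & d): the two terms can never have a set bit in the same position, so summing them equals ORing them. A one-line C sketch of that trick:

  #include <stdint.h>

  /* Ch(b,c,d) computed the way R_F1 does it with BMI2: two bitwise
   * disjoint AND terms added instead of ORed ('~b & d' maps directly
   * onto the andn instruction). */
  static inline uint32_t sha1_ch_bmi2 (uint32_t b, uint32_t c, uint32_t d)
  {
    return (b & c) + (~b & d);
  }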
*/ #define W_PRECALC_00_15_0(i, W, tmp0) \ vmovdqu (4*(i))(RDATA), tmp0; #define W_PRECALC_00_15_1(i, W, tmp0) \ vpshufb BSWAP_REG, tmp0, W; #define W_PRECALC_00_15_2(i, W, tmp0, K) \ vpaddd K, W, tmp0; #define W_PRECALC_00_15_3(i, W, tmp0) \ vmovdqa tmp0, WK(i&~3); #define W_PRECALC_16_31_0(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \ vpalignr $8, W_m16, W_m12, W; \ vpsrldq $4, W_m04, tmp0; \ vpxor W_m08, W, W; #define W_PRECALC_16_31_1(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \ vpxor W_m16, tmp0, tmp0; \ vpxor tmp0, W, W; \ vpslld $1, W, tmp0; \ vpslldq $12, W, tmp1; \ vpsrld $31, W, W; #define W_PRECALC_16_31_2(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \ vpor W, tmp0, tmp0; \ vpsrld $30, tmp1, W; \ vpslld $2, tmp1, tmp1; #define W_PRECALC_16_31_3(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1, K) \ vpxor W, tmp0, tmp0; \ vpxor tmp1, tmp0, W; \ vpaddd K, W, tmp0; \ vmovdqa tmp0, WK((i)&~3); #define W_PRECALC_32_79_0(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \ vpxor W_m28, W, W; \ vpalignr $8, W_m08, W_m04, tmp0; #define W_PRECALC_32_79_1(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \ vpxor W_m16, W, W; \ vpxor tmp0, W, W; #define W_PRECALC_32_79_2(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \ vpsrld $30, W, tmp0; \ vpslld $2, W, W; #define W_PRECALC_32_79_3(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0, K) \ vpor W, tmp0, W; \ vpaddd K, W, tmp0; \ vmovdqa tmp0, WK((i)&~3); /* * Transform nblks*64 bytes (nblks*16 32-bit words) at DATA. * * unsigned int * _gcry_sha1_transform_amd64_avx_bmi2 (void *ctx, const unsigned char *data, * size_t nblks) */ .globl _gcry_sha1_transform_amd64_avx_bmi2 ELF(.type _gcry_sha1_transform_amd64_avx_bmi2,@function) .align 16 _gcry_sha1_transform_amd64_avx_bmi2: /* input: * %rdi: ctx, CTX * %rsi: data (64*nblks bytes) * %rdx: nblks */ CFI_STARTPROC(); xorl %eax, %eax; cmpq $0, %rdx; jz .Lret; vzeroupper; movq %rdx, RNBLKS; movq %rdi, RSTATE; movq %rsi, RDATA; pushq %rbx; CFI_PUSH(%rbx); pushq %rbp; CFI_PUSH(%rbp); pushq %r12; CFI_PUSH(%r12); movq %rsp, ROLDSTACK; CFI_DEF_CFA_REGISTER(ROLDSTACK); subq $(16*4), %rsp; andq $(~31), %rsp; /* Get the values of the chaining variables. */ movl state_h0(RSTATE), a; movl state_h1(RSTATE), b; movl state_h2(RSTATE), c; movl state_h3(RSTATE), d; movl state_h4(RSTATE), e; xorl ne, ne; vmovdqa .Lbswap_shufb_ctl rRIP, BSWAP_REG; vpbroadcastd .LK1 rRIP, K1; vpbroadcastd .LK2 rRIP, K2; vpbroadcastd .LK3 rRIP, K3; vpbroadcastd .LK4 rRIP, K4; /* Precalc 0-15. */ W_PRECALC_00_15_0(0, W0, Wtmp0); W_PRECALC_00_15_1(1, W0, Wtmp0); W_PRECALC_00_15_2(2, W0, Wtmp0, K1); W_PRECALC_00_15_3(3, W0, Wtmp0); W_PRECALC_00_15_0(4, W7, Wtmp0); W_PRECALC_00_15_1(5, W7, Wtmp0); W_PRECALC_00_15_2(6, W7, Wtmp0, K1); W_PRECALC_00_15_3(7, W7, Wtmp0); W_PRECALC_00_15_0(8, W6, Wtmp0); W_PRECALC_00_15_1(9, W6, Wtmp0); W_PRECALC_00_15_2(10, W6, Wtmp0, K1); W_PRECALC_00_15_3(11, W6, Wtmp0); W_PRECALC_00_15_0(12, W5, Wtmp0); W_PRECALC_00_15_1(13, W5, Wtmp0); W_PRECALC_00_15_2(14, W5, Wtmp0, K1); W_PRECALC_00_15_3(15, W5, Wtmp0); .align 8 .Loop: addq $64, RDATA; /* Transform 0-15 + Precalc 16-31. 
*/ R( a, b, c, d, e, F1, 0 ); W_PRECALC_16_31_0(16, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); R( e, a, b, c, d, F1, 1 ); W_PRECALC_16_31_1(17, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); R( d, e, a, b, c, F1, 2 ); W_PRECALC_16_31_2(18, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); R( c, d, e, a, b, F1, 3 ); W_PRECALC_16_31_3(19, W4, W5, W6, W7, W0, Wtmp0, Wtmp1, K1); R( b, c, d, e, a, F1, 4 ); W_PRECALC_16_31_0(20, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); R( a, b, c, d, e, F1, 5 ); W_PRECALC_16_31_1(21, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); R( e, a, b, c, d, F1, 6 ); W_PRECALC_16_31_2(22, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); R( d, e, a, b, c, F1, 7 ); W_PRECALC_16_31_3(23, W3, W4, W5, W6, W7, Wtmp0, Wtmp1, K2); R( c, d, e, a, b, F1, 8 ); W_PRECALC_16_31_0(24, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); R( b, c, d, e, a, F1, 9 ); W_PRECALC_16_31_1(25, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); R( a, b, c, d, e, F1, 10 ); W_PRECALC_16_31_2(26, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); R( e, a, b, c, d, F1, 11 ); W_PRECALC_16_31_3(27, W2, W3, W4, W5, W6, Wtmp0, Wtmp1, K2); R( d, e, a, b, c, F1, 12 ); W_PRECALC_16_31_0(28, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); R( c, d, e, a, b, F1, 13 ); W_PRECALC_16_31_1(29, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); R( b, c, d, e, a, F1, 14 ); W_PRECALC_16_31_2(30, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); R( a, b, c, d, e, F1, 15 ); W_PRECALC_16_31_3(31, W1, W2, W3, W4, W5, Wtmp0, Wtmp1, K2); /* Transform 16-63 + Precalc 32-79. */ R( e, a, b, c, d, F1, 16 ); W_PRECALC_32_79_0(32, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); R( d, e, a, b, c, F1, 17 ); W_PRECALC_32_79_1(33, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); R( c, d, e, a, b, F1, 18 ); W_PRECALC_32_79_2(34, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); R( b, c, d, e, a, F1, 19 ); W_PRECALC_32_79_3(35, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0, K2); R( a, b, c, d, e, F2, 20 ); W_PRECALC_32_79_0(36, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); R( e, a, b, c, d, F2, 21 ); W_PRECALC_32_79_1(37, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); R( d, e, a, b, c, F2, 22 ); W_PRECALC_32_79_2(38, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); R( c, d, e, a, b, F2, 23 ); W_PRECALC_32_79_3(39, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0, K2); R( b, c, d, e, a, F2, 24 ); W_PRECALC_32_79_0(40, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); R( a, b, c, d, e, F2, 25 ); W_PRECALC_32_79_1(41, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); R( e, a, b, c, d, F2, 26 ); W_PRECALC_32_79_2(42, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); R( d, e, a, b, c, F2, 27 ); W_PRECALC_32_79_3(43, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0, K3); R( c, d, e, a, b, F2, 28 ); W_PRECALC_32_79_0(44, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); R( b, c, d, e, a, F2, 29 ); W_PRECALC_32_79_1(45, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); R( a, b, c, d, e, F2, 30 ); W_PRECALC_32_79_2(46, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); R( e, a, b, c, d, F2, 31 ); W_PRECALC_32_79_3(47, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0, K3); R( d, e, a, b, c, F2, 32 ); W_PRECALC_32_79_0(48, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0); R( c, d, e, a, b, F2, 33 ); W_PRECALC_32_79_1(49, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0); R( b, c, d, e, a, F2, 34 ); W_PRECALC_32_79_2(50, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0); R( a, b, c, d, e, F2, 35 ); W_PRECALC_32_79_3(51, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0, K3); R( e, a, b, c, d, F2, 36 ); W_PRECALC_32_79_0(52, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0); R( d, e, a, b, c, F2, 37 ); W_PRECALC_32_79_1(53, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0); R( c, d, e, a, b, F2, 38 ); W_PRECALC_32_79_2(54, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0); R( b, c, d, e, a, F2, 39 ); W_PRECALC_32_79_3(55, W3, 
W4, W5, W6, W7, W0, W1, W2, Wtmp0, K3); R( a, b, c, d, e, F3, 40 ); W_PRECALC_32_79_0(56, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0); R( e, a, b, c, d, F3, 41 ); W_PRECALC_32_79_1(57, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0); R( d, e, a, b, c, F3, 42 ); W_PRECALC_32_79_2(58, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0); R( c, d, e, a, b, F3, 43 ); W_PRECALC_32_79_3(59, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0, K3); R( b, c, d, e, a, F3, 44 ); W_PRECALC_32_79_0(60, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0); R( a, b, c, d, e, F3, 45 ); W_PRECALC_32_79_1(61, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0); R( e, a, b, c, d, F3, 46 ); W_PRECALC_32_79_2(62, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0); R( d, e, a, b, c, F3, 47 ); W_PRECALC_32_79_3(63, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0, K4); R( c, d, e, a, b, F3, 48 ); W_PRECALC_32_79_0(64, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); R( b, c, d, e, a, F3, 49 ); W_PRECALC_32_79_1(65, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); R( a, b, c, d, e, F3, 50 ); W_PRECALC_32_79_2(66, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); R( e, a, b, c, d, F3, 51 ); W_PRECALC_32_79_3(67, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0, K4); R( d, e, a, b, c, F3, 52 ); W_PRECALC_32_79_0(68, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); R( c, d, e, a, b, F3, 53 ); W_PRECALC_32_79_1(69, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); R( b, c, d, e, a, F3, 54 ); W_PRECALC_32_79_2(70, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); R( a, b, c, d, e, F3, 55 ); W_PRECALC_32_79_3(71, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0, K4); R( e, a, b, c, d, F3, 56 ); W_PRECALC_32_79_0(72, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); R( d, e, a, b, c, F3, 57 ); W_PRECALC_32_79_1(73, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); R( c, d, e, a, b, F3, 58 ); W_PRECALC_32_79_2(74, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); R( b, c, d, e, a, F3, 59 ); W_PRECALC_32_79_3(75, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0, K4); R( a, b, c, d, e, F4, 60 ); W_PRECALC_32_79_0(76, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); R( e, a, b, c, d, F4, 61 ); W_PRECALC_32_79_1(77, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); R( d, e, a, b, c, F4, 62 ); W_PRECALC_32_79_2(78, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); R( c, d, e, a, b, F4, 63 ); W_PRECALC_32_79_3(79, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0, K4); decq RNBLKS; jz .Lend; /* Transform 64-79 + Precalc 0-15 of next block. */ R( b, c, d, e, a, F4, 64 ); W_PRECALC_00_15_0(0, W0, Wtmp0); R( a, b, c, d, e, F4, 65 ); W_PRECALC_00_15_1(1, W0, Wtmp0); R( e, a, b, c, d, F4, 66 ); W_PRECALC_00_15_2(2, W0, Wtmp0, K1); R( d, e, a, b, c, F4, 67 ); W_PRECALC_00_15_3(3, W0, Wtmp0); R( c, d, e, a, b, F4, 68 ); W_PRECALC_00_15_0(4, W7, Wtmp0); R( b, c, d, e, a, F4, 69 ); W_PRECALC_00_15_1(5, W7, Wtmp0); R( a, b, c, d, e, F4, 70 ); W_PRECALC_00_15_2(6, W7, Wtmp0, K1); R( e, a, b, c, d, F4, 71 ); W_PRECALC_00_15_3(7, W7, Wtmp0); R( d, e, a, b, c, F4, 72 ); W_PRECALC_00_15_0(8, W6, Wtmp0); R( c, d, e, a, b, F4, 73 ); W_PRECALC_00_15_1(9, W6, Wtmp0); R( b, c, d, e, a, F4, 74 ); W_PRECALC_00_15_2(10, W6, Wtmp0, K1); R( a, b, c, d, e, F4, 75 ); W_PRECALC_00_15_3(11, W6, Wtmp0); R( e, a, b, c, d, F4, 76 ); W_PRECALC_00_15_0(12, W5, Wtmp0); R( d, e, a, b, c, F4, 77 ); W_PRECALC_00_15_1(13, W5, Wtmp0); R( c, d, e, a, b, F4, 78 ); addl state_h0(RSTATE), a; W_PRECALC_00_15_2(14, W5, Wtmp0, K1); R( b, c, d, e, a, F4, 79 ); W_PRECALC_00_15_3(15, W5, Wtmp0); addl ne, a; xorl ne, ne; /* Update the chaining variables. 
*/ addl state_h3(RSTATE), d; addl state_h2(RSTATE), c; addl state_h1(RSTATE), b; addl state_h4(RSTATE), e; movl d, state_h3(RSTATE); movl c, state_h2(RSTATE); movl b, state_h1(RSTATE); movl a, state_h0(RSTATE); movl e, state_h4(RSTATE); jmp .Loop; .align 16 .Lend: vzeroall; /* Transform 64-79 + burn stack */ R( b, c, d, e, a, F4, 64 ); R( a, b, c, d, e, F4, 65 ); R( e, a, b, c, d, F4, 66 ); R( d, e, a, b, c, F4, 67 ); R( c, d, e, a, b, F4, 68 ); R( b, c, d, e, a, F4, 69 ); R( a, b, c, d, e, F4, 70 ); R( e, a, b, c, d, F4, 71 ); R( d, e, a, b, c, F4, 72 ); R( c, d, e, a, b, F4, 73 ); R( b, c, d, e, a, F4, 74 ); R( a, b, c, d, e, F4, 75 ); R( e, a, b, c, d, F4, 76 ); vmovdqa %xmm0, (0*16)(%rsp); R( d, e, a, b, c, F4, 77 ); vmovdqa %xmm0, (1*16)(%rsp); R( c, d, e, a, b, F4, 78 ); vmovdqa %xmm0, (2*16)(%rsp); addl state_h0(RSTATE), a; R( b, c, d, e, a, F4, 79 ); addl ne, a; xorl ne, ne; /* 16*4/16-1 = 3 */ vmovdqa %xmm0, (3*16)(%rsp); /* Update the chaining variables. */ addl state_h3(RSTATE), d; addl state_h2(RSTATE), c; addl state_h1(RSTATE), b; addl state_h4(RSTATE), e; movl d, state_h3(RSTATE); movl c, state_h2(RSTATE); movl b, state_h1(RSTATE); movl a, state_h0(RSTATE); movl e, state_h4(RSTATE); movq ROLDSTACK, %rsp; CFI_REGISTER(ROLDSTACK, %rsp); CFI_DEF_CFA_REGISTER(%rsp); popq %r12; CFI_POP(%r12); popq %rbp; CFI_POP(%rbp); popq %rbx; CFI_POP(%rbx); /* stack already burned */ xorl %eax, %eax; .Lret: - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_sha1_transform_amd64_avx_bmi2, .-_gcry_sha1_transform_amd64_avx_bmi2;) #endif #endif diff --git a/cipher/sha1-avx2-bmi2-amd64.S b/cipher/sha1-avx2-bmi2-amd64.S index 93863230..ed52761b 100644 --- a/cipher/sha1-avx2-bmi2-amd64.S +++ b/cipher/sha1-avx2-bmi2-amd64.S @@ -1,573 +1,573 @@ /* sha1-avx2-bmi2-amd64.S - Intel AVX2/BMI2 accelerated SHA-1 transform function * Copyright (C) 2019 Jussi Kivilinna * * Based on sha1.c: * Copyright (C) 1998, 2001, 2002, 2003, 2008 Free Software Foundation, Inc. * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . 
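The only functional change in this hunk, as in the rest of the patch, is the final return: the bare ret becomes ret_spec_stop. The macro lives in asm-common-amd64.h and its definition is not visible in this excerpt; presumably it expands to a ret followed by a speculation-stopping instruction, along these lines:

/* Assumed shape of the macro in asm-common-amd64.h (not shown in this hunk);
 * the trailing int3 (or an equivalent trap/lfence) keeps the CPU from
 * speculating straight past the return into whatever bytes follow. */
#define ret_spec_stop \
        ret; int3;

With an expansion like that, the byte after every return in these routines is a trap, which is the straight-line-speculation hardening the patch is applying uniformly across the assembly files.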
*/ /* * Intel SSSE3 accelerated SHA-1 implementation based on white paper: * "Improving the Performance of the Secure Hash Algorithm (SHA-1)" * http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1 */ #ifdef __x86_64__ #include #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ defined(HAVE_GCC_INLINE_ASM_BMI2) && defined(HAVE_GCC_INLINE_ASM_AVX) && \ defined(HAVE_GCC_INLINE_ASM_AVX2) && defined(USE_SHA1) #include "asm-common-amd64.h" /* Context structure */ #define state_h0 0 #define state_h1 4 #define state_h2 8 #define state_h3 12 #define state_h4 16 /* Constants */ #define WK_STACK_WORDS (80 * 2) .text .align 16 .Lbswap_shufb_ctl: .long 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f .LK1: .long 0x5A827999 .LK2: .long 0x6ED9EBA1 .LK3: .long 0x8F1BBCDC .LK4: .long 0xCA62C1D6 /* Register macros */ #define RSTATE %r8 #define RDATA %r9 #define ROLDSTACK %r10 #define RNBLKS %r11 #define a %eax #define b %ebx #define c %ecx #define d %edx #define e %edi #define ne %r12d #define RT0 %esi #define RT1 %ebp #define Wtmp0 %ymm0 #define Wtmp1 %ymm1 #define Wtmp0x %xmm0 #define Wtmp1x %xmm1 #define W0 %ymm2 #define W1 %ymm3 #define W2 %ymm4 #define W3 %ymm5 #define W4 %ymm6 #define W5 %ymm7 #define W6 %ymm8 #define W7 %ymm9 #define BSWAP_REG %ymm10 #define K1 %ymm11 #define K2 %ymm12 #define K3 %ymm13 #define K4 %ymm14 /* Round function macros. */ #define WK(i,block) ((block) * 16 + ((i) / 4) * 32 + ((i) % 4) * 4)(%rsp) #define PRE_WK(i) ((i) * 4 * 2)(%rsp) #define R_F1(a,b,c,d,e,i,block) \ movl c, RT0; \ andn d, b, RT1; \ addl WK(i,block), e; \ andl b, RT0; \ leal (a,ne), a; \ rorxl $2, b, b; \ addl RT1, e; \ rorxl $27, a, ne; \ addl RT0, e; #define R_F2(a,b,c,d,e,i,block) \ addl WK(i,block), e; \ movl c, RT0; \ xorl b, RT0; \ leal (a,ne), a; \ rorxl $2, b, b; \ xorl d, RT0; \ addl RT0, e; \ rorxl $27, a, ne; #define R_F3(a,b,c,d,e,i,block) \ movl c, RT0; \ addl WK(i,block), e; \ movl b, RT1; \ xorl b, RT0; \ leal (a,ne), a; \ rorxl $2, b, b; \ andl c, RT1; \ addl RT1, e; \ andl d, RT0; \ rorxl $27, a, ne; \ addl RT0, e; #define R_F4(a,b,c,d,e,i,block) R_F2(a,b,c,d,e,i,block) #define R(a,b,c,d,e,f,i,block) \ R_##f(a,b,c,d,e,i,block) /* Input expansion macros. 
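The R_F1/R_F2/R_F3 macros above are the BMI2 flavour of the SHA-1 round: andn computes ~b & d for the Ch function without a separate not, and rorx rotates without touching the flags, which is what allows the dense interleaving with the schedule code. A scalar C reference sketch of the round they implement (standard SHA-1, not code from the patch):

#include <stdint.h>

static inline uint32_t rol32(uint32_t x, int n)
{
  return (x << n) | (x >> (32 - n));
}

/* F1 = Ch, F2 = parity, F3 = Maj, F4 aliases F2 -- same selection as R_F*. */
static uint32_t sha1_f(int t, uint32_t b, uint32_t c, uint32_t d)
{
  if (t < 20)  return (b & c) | (~b & d);           /* R_F1: and + andn   */
  if (t < 40)  return b ^ c ^ d;                    /* R_F2               */
  if (t < 60)  return (b & c) | (b & d) | (c & d);  /* R_F3: majority     */
  return b ^ c ^ d;                                 /* R_F4 == R_F2       */
}

/* One scalar round; wk = W[t] + K[t/20], as staged in the WK stack slots.
 * The asm computes rol32(a,5) with rorx $27 and defers adding it until the
 * next round, carrying it in the 'ne' ("next e") register. */
static void sha1_round(int t, uint32_t *a, uint32_t *b, uint32_t *c,
                       uint32_t *d, uint32_t *e, uint32_t wk)
{
  *e += rol32(*a, 5) + sha1_f(t, *b, *c, *d) + wk;
  *b  = rol32(*b, 30);
  /* callers then rotate the roles: (a,b,c,d,e) <- (e,a,b,c,d) */
}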
*/ #define W_PRECALC_00_15_0(i, W, tmp0) \ vmovdqu (4*(i))(RDATA), tmp0##x; \ vinserti128 $1, (4*(i) + 64)(RDATA), tmp0, tmp0; #define W_PRECALC_00_15_1(i, W, tmp0) \ vpshufb BSWAP_REG, tmp0, W; #define W_PRECALC_00_15_2(i, W, tmp0, K) \ vpaddd K, W, tmp0; #define W_PRECALC_00_15_3(i, W, tmp0) \ vmovdqa tmp0, PRE_WK((i)&~3); #define W_PRECALC_16_31_0(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \ vpalignr $8, W_m16, W_m12, W; \ vpsrldq $4, W_m04, tmp0; \ vpxor W_m08, W, W; #define W_PRECALC_16_31_1(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \ vpxor W_m16, tmp0, tmp0; \ vpxor tmp0, W, W; \ vpslld $1, W, tmp0; \ vpslldq $12, W, tmp1; \ vpsrld $31, W, W; #define W_PRECALC_16_31_2(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \ vpor W, tmp0, tmp0; \ vpsrld $30, tmp1, W; \ vpslld $2, tmp1, tmp1; #define W_PRECALC_16_31_3(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1, K) \ vpxor W, tmp0, tmp0; \ vpxor tmp1, tmp0, W; \ vpaddd K, W, tmp0; \ vmovdqa tmp0, PRE_WK((i)&~3); #define W_PRECALC_32_79_0(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \ vpxor W_m28, W, W; \ vpalignr $8, W_m08, W_m04, tmp0; #define W_PRECALC_32_79_1(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \ vpxor W_m16, W, W; \ vpxor tmp0, W, W; #define W_PRECALC_32_79_2(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \ vpsrld $30, W, tmp0; \ vpslld $2, W, W; #define W_PRECALC_32_79_3(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0, K) \ vpor W, tmp0, W; \ vpaddd K, W, tmp0; \ vmovdqa tmp0, PRE_WK((i)&~3); /* * Transform 2*nblks*64 bytes (2*nblks*16 32-bit words) at DATA. * * unsigned int * _gcry_sha1_transform_amd64_avx2_bmi2 (void *ctx, const unsigned char *data, * size_t nblks) */ .globl _gcry_sha1_transform_amd64_avx2_bmi2 ELF(.type _gcry_sha1_transform_amd64_avx2_bmi2,@function) .align 16 _gcry_sha1_transform_amd64_avx2_bmi2: /* input: * %rdi: ctx, CTX * %rsi: data (64*nblks bytes) * %rdx: nblks (multiple of 2, larger than 0) */ CFI_STARTPROC(); vzeroupper; movq %rdx, RNBLKS; movq %rdi, RSTATE; movq %rsi, RDATA; pushq %rbx; CFI_PUSH(%rbx); pushq %rbp; CFI_PUSH(%rbp); pushq %r12; CFI_PUSH(%r12); movq %rsp, ROLDSTACK; CFI_DEF_CFA_REGISTER(ROLDSTACK); subq $(WK_STACK_WORDS*4), %rsp; andq $(~63), %rsp; /* Get the values of the chaining variables. */ movl state_h0(RSTATE), a; movl state_h1(RSTATE), b; movl state_h2(RSTATE), c; movl state_h3(RSTATE), d; movl state_h4(RSTATE), e; xorl ne, ne; vbroadcasti128 .Lbswap_shufb_ctl rRIP, BSWAP_REG; vpbroadcastd .LK1 rRIP, K1; vpbroadcastd .LK2 rRIP, K2; vpbroadcastd .LK3 rRIP, K3; vpbroadcastd .LK4 rRIP, K4; /* Precalc 0-31 for block 1 & 2. 
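The AVX2 variant works on two blocks per iteration: W_PRECALC_00_15_0 above loads the i-th 16 bytes of block 1 into the low ymm lane and the corresponding bytes of block 2 (offset +64) into the high lane, so every PRE_WK store puts block 1's four W+K words in the low half of a 32-byte stack slot and block 2's in the high half. A small, self-contained C check of that addressing; the helper names are hypothetical and simply mirror the PRE_WK()/WK() macros:

#include <assert.h>
#include <stddef.h>

/* Byte offset written by W_PRECALC_*_3 for schedule index i: PRE_WK((i)&~3). */
static size_t pre_wk_offset(int i)
{
  return (size_t)((i & ~3) * 4 * 2);
}

/* Byte offset read by R(..., i, block) for word i of a given block: WK(i,block). */
static size_t wk_offset(int i, int block)
{
  return (size_t)(block * 16 + (i / 4) * 32 + (i % 4) * 4);
}

int main(void)
{
  for (int i = 0; i < 80; i++)
    {
      size_t base = pre_wk_offset(i);
      /* Block 1 words land in the low 16 bytes of each 32-byte store,
       * block 2 words (from the high ymm lane) in the upper 16 bytes. */
      assert (wk_offset(i, 0) == base + (size_t)(i % 4) * 4);
      assert (wk_offset(i, 1) == base + 16 + (size_t)(i % 4) * 4);
    }
  return 0;
}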
*/ W_PRECALC_00_15_0(0, W0, Wtmp0); W_PRECALC_00_15_1(1, W0, Wtmp0); W_PRECALC_00_15_2(2, W0, Wtmp0, K1); W_PRECALC_00_15_3(3, W0, Wtmp0); W_PRECALC_00_15_0(4, W7, Wtmp0); W_PRECALC_00_15_1(5, W7, Wtmp0); W_PRECALC_00_15_2(6, W7, Wtmp0, K1); W_PRECALC_00_15_3(7, W7, Wtmp0); W_PRECALC_00_15_0(8, W6, Wtmp0); W_PRECALC_00_15_1(9, W6, Wtmp0); W_PRECALC_00_15_2(10, W6, Wtmp0, K1); W_PRECALC_00_15_3(11, W6, Wtmp0); W_PRECALC_00_15_0(12, W5, Wtmp0); W_PRECALC_00_15_1(13, W5, Wtmp0); W_PRECALC_00_15_2(14, W5, Wtmp0, K1); W_PRECALC_00_15_3(15, W5, Wtmp0); W_PRECALC_16_31_0(16, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); W_PRECALC_16_31_1(17, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); W_PRECALC_16_31_2(18, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); W_PRECALC_16_31_3(19, W4, W5, W6, W7, W0, Wtmp0, Wtmp1, K1); W_PRECALC_16_31_0(20, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); W_PRECALC_16_31_1(21, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); W_PRECALC_16_31_2(22, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); W_PRECALC_16_31_3(23, W3, W4, W5, W6, W7, Wtmp0, Wtmp1, K2); W_PRECALC_16_31_0(24, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); W_PRECALC_16_31_1(25, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); W_PRECALC_16_31_2(26, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); W_PRECALC_16_31_3(27, W2, W3, W4, W5, W6, Wtmp0, Wtmp1, K2); W_PRECALC_16_31_0(28, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); W_PRECALC_16_31_1(29, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); W_PRECALC_16_31_2(30, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); W_PRECALC_16_31_3(31, W1, W2, W3, W4, W5, Wtmp0, Wtmp1, K2); .align 8 .Loop: addq $(2 * 64), RDATA; /* Transform 0-15 for block 1 + Precalc 32-47 for block 1 & 2. */ R( a, b, c, d, e, F1, 0, 0 ); W_PRECALC_32_79_0(32, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); R( e, a, b, c, d, F1, 1, 0 ); W_PRECALC_32_79_1(33, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); R( d, e, a, b, c, F1, 2, 0 ); W_PRECALC_32_79_2(34, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); R( c, d, e, a, b, F1, 3, 0 ); W_PRECALC_32_79_3(35, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0, K2); R( b, c, d, e, a, F1, 4, 0 ); W_PRECALC_32_79_0(36, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); R( a, b, c, d, e, F1, 5, 0 ); W_PRECALC_32_79_1(37, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); R( e, a, b, c, d, F1, 6, 0 ); W_PRECALC_32_79_2(38, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); R( d, e, a, b, c, F1, 7, 0 ); W_PRECALC_32_79_3(39, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0, K2); R( c, d, e, a, b, F1, 8, 0 ); W_PRECALC_32_79_0(40, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); R( b, c, d, e, a, F1, 9, 0 ); W_PRECALC_32_79_1(41, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); R( a, b, c, d, e, F1, 10, 0 ); W_PRECALC_32_79_2(42, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); R( e, a, b, c, d, F1, 11, 0 ); W_PRECALC_32_79_3(43, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0, K3); R( d, e, a, b, c, F1, 12, 0 ); W_PRECALC_32_79_0(44, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); R( c, d, e, a, b, F1, 13, 0 ); W_PRECALC_32_79_1(45, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); R( b, c, d, e, a, F1, 14, 0 ); W_PRECALC_32_79_2(46, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); R( a, b, c, d, e, F1, 15, 0 ); W_PRECALC_32_79_3(47, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0, K3); /* Transform 16-47 for block 1 + Precalc 48-79 for block 1 & 2. 
*/ R( e, a, b, c, d, F1, 16, 0 ); W_PRECALC_32_79_0(48, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0); R( d, e, a, b, c, F1, 17, 0 ); W_PRECALC_32_79_1(49, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0); R( c, d, e, a, b, F1, 18, 0 ); W_PRECALC_32_79_2(50, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0); R( b, c, d, e, a, F1, 19, 0 ); W_PRECALC_32_79_3(51, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0, K3); R( a, b, c, d, e, F2, 20, 0 ); W_PRECALC_32_79_0(52, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0); R( e, a, b, c, d, F2, 21, 0 ); W_PRECALC_32_79_1(53, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0); R( d, e, a, b, c, F2, 22, 0 ); W_PRECALC_32_79_2(54, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0); R( c, d, e, a, b, F2, 23, 0 ); W_PRECALC_32_79_3(55, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0, K3); R( b, c, d, e, a, F2, 24, 0 ); W_PRECALC_32_79_0(56, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0); R( a, b, c, d, e, F2, 25, 0 ); W_PRECALC_32_79_1(57, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0); R( e, a, b, c, d, F2, 26, 0 ); W_PRECALC_32_79_2(58, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0); R( d, e, a, b, c, F2, 27, 0 ); W_PRECALC_32_79_3(59, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0, K3); R( c, d, e, a, b, F2, 28, 0 ); W_PRECALC_32_79_0(60, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0); R( b, c, d, e, a, F2, 29, 0 ); W_PRECALC_32_79_1(61, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0); R( a, b, c, d, e, F2, 30, 0 ); W_PRECALC_32_79_2(62, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0); R( e, a, b, c, d, F2, 31, 0 ); W_PRECALC_32_79_3(63, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0, K4); R( d, e, a, b, c, F2, 32, 0 ); W_PRECALC_32_79_0(64, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); R( c, d, e, a, b, F2, 33, 0 ); W_PRECALC_32_79_1(65, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); R( b, c, d, e, a, F2, 34, 0 ); W_PRECALC_32_79_2(66, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); R( a, b, c, d, e, F2, 35, 0 ); W_PRECALC_32_79_3(67, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0, K4); R( e, a, b, c, d, F2, 36, 0 ); W_PRECALC_32_79_0(68, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); R( d, e, a, b, c, F2, 37, 0 ); W_PRECALC_32_79_1(69, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); R( c, d, e, a, b, F2, 38, 0 ); W_PRECALC_32_79_2(70, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); R( b, c, d, e, a, F2, 39, 0 ); W_PRECALC_32_79_3(71, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0, K4); R( a, b, c, d, e, F3, 40, 0 ); W_PRECALC_32_79_0(72, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); R( e, a, b, c, d, F3, 41, 0 ); W_PRECALC_32_79_1(73, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); R( d, e, a, b, c, F3, 42, 0 ); W_PRECALC_32_79_2(74, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); R( c, d, e, a, b, F3, 43, 0 ); W_PRECALC_32_79_3(75, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0, K4); R( b, c, d, e, a, F3, 44, 0 ); W_PRECALC_32_79_0(76, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); R( a, b, c, d, e, F3, 45, 0 ); W_PRECALC_32_79_1(77, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); R( e, a, b, c, d, F3, 46, 0 ); W_PRECALC_32_79_2(78, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); R( d, e, a, b, c, F3, 47, 0 ); W_PRECALC_32_79_3(79, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0, K4); /* Transform 48-79 for block 1. 
*/ R( c, d, e, a, b, F3, 48, 0 ); R( b, c, d, e, a, F3, 49, 0 ); R( a, b, c, d, e, F3, 50, 0 ); R( e, a, b, c, d, F3, 51, 0 ); R( d, e, a, b, c, F3, 52, 0 ); R( c, d, e, a, b, F3, 53, 0 ); R( b, c, d, e, a, F3, 54, 0 ); R( a, b, c, d, e, F3, 55, 0 ); R( e, a, b, c, d, F3, 56, 0 ); R( d, e, a, b, c, F3, 57, 0 ); R( c, d, e, a, b, F3, 58, 0 ); R( b, c, d, e, a, F3, 59, 0 ); R( a, b, c, d, e, F4, 60, 0 ); R( e, a, b, c, d, F4, 61, 0 ); R( d, e, a, b, c, F4, 62, 0 ); R( c, d, e, a, b, F4, 63, 0 ); R( b, c, d, e, a, F4, 64, 0 ); R( a, b, c, d, e, F4, 65, 0 ); R( e, a, b, c, d, F4, 66, 0 ); R( d, e, a, b, c, F4, 67, 0 ); R( c, d, e, a, b, F4, 68, 0 ); R( b, c, d, e, a, F4, 69, 0 ); R( a, b, c, d, e, F4, 70, 0 ); R( e, a, b, c, d, F4, 71, 0 ); R( d, e, a, b, c, F4, 72, 0 ); R( c, d, e, a, b, F4, 73, 0 ); R( b, c, d, e, a, F4, 74, 0 ); R( a, b, c, d, e, F4, 75, 0 ); R( e, a, b, c, d, F4, 76, 0 ); R( d, e, a, b, c, F4, 77, 0 ); R( c, d, e, a, b, F4, 78, 0 ); addl state_h0(RSTATE), a; R( b, c, d, e, a, F4, 79, 0 ); addl ne, a; xorl ne, ne; /* Update the chaining variables. */ addl state_h3(RSTATE), d; addl state_h2(RSTATE), c; addl state_h1(RSTATE), b; addl state_h4(RSTATE), e; movl d, state_h3(RSTATE); movl c, state_h2(RSTATE); movl b, state_h1(RSTATE); movl a, state_h0(RSTATE); movl e, state_h4(RSTATE); /* Transform 0-47 for block 2. */ R( a, b, c, d, e, F1, 0, 1 ); R( e, a, b, c, d, F1, 1, 1 ); R( d, e, a, b, c, F1, 2, 1 ); R( c, d, e, a, b, F1, 3, 1 ); R( b, c, d, e, a, F1, 4, 1 ); R( a, b, c, d, e, F1, 5, 1 ); R( e, a, b, c, d, F1, 6, 1 ); R( d, e, a, b, c, F1, 7, 1 ); R( c, d, e, a, b, F1, 8, 1 ); R( b, c, d, e, a, F1, 9, 1 ); R( a, b, c, d, e, F1, 10, 1 ); R( e, a, b, c, d, F1, 11, 1 ); R( d, e, a, b, c, F1, 12, 1 ); R( c, d, e, a, b, F1, 13, 1 ); R( b, c, d, e, a, F1, 14, 1 ); R( a, b, c, d, e, F1, 15, 1 ); R( e, a, b, c, d, F1, 16, 1 ); R( d, e, a, b, c, F1, 17, 1 ); R( c, d, e, a, b, F1, 18, 1 ); R( b, c, d, e, a, F1, 19, 1 ); R( a, b, c, d, e, F2, 20, 1 ); R( e, a, b, c, d, F2, 21, 1 ); R( d, e, a, b, c, F2, 22, 1 ); R( c, d, e, a, b, F2, 23, 1 ); R( b, c, d, e, a, F2, 24, 1 ); R( a, b, c, d, e, F2, 25, 1 ); R( e, a, b, c, d, F2, 26, 1 ); R( d, e, a, b, c, F2, 27, 1 ); R( c, d, e, a, b, F2, 28, 1 ); R( b, c, d, e, a, F2, 29, 1 ); R( a, b, c, d, e, F2, 30, 1 ); R( e, a, b, c, d, F2, 31, 1 ); R( d, e, a, b, c, F2, 32, 1 ); R( c, d, e, a, b, F2, 33, 1 ); R( b, c, d, e, a, F2, 34, 1 ); R( a, b, c, d, e, F2, 35, 1 ); R( e, a, b, c, d, F2, 36, 1 ); R( d, e, a, b, c, F2, 37, 1 ); R( c, d, e, a, b, F2, 38, 1 ); R( b, c, d, e, a, F2, 39, 1 ); R( a, b, c, d, e, F3, 40, 1 ); R( e, a, b, c, d, F3, 41, 1 ); R( d, e, a, b, c, F3, 42, 1 ); R( c, d, e, a, b, F3, 43, 1 ); R( b, c, d, e, a, F3, 44, 1 ); R( a, b, c, d, e, F3, 45, 1 ); R( e, a, b, c, d, F3, 46, 1 ); R( d, e, a, b, c, F3, 47, 1 ); addq $-2, RNBLKS; jz .Lend; /* Transform 48-79 for block 2 + Precalc 0-31 for next two blocks. 
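Putting the pieces together, each pass over .Loop consumes two blocks: block 1's rounds run interleaved with the shared precalc, the state is folded in, block 2's rounds 0-47 run from the already-stored WK values, and block 2's tail overlaps either the precalc for the next pair or, on the last pair, the stack wipe. A rough C outline of that control flow; the stub names are illustrative only, not libgcrypt functions:

#include <stddef.h>

static void precalc_w(int first, int last)          { (void)first; (void)last; }
static void rounds(int block, int first, int last)  { (void)block; (void)first; (void)last; }
static void add_state_and_store(void)               { }
static void burn_stack(void)                        { }

static void sha1_avx2_outline(size_t nblks)   /* multiple of 2, larger than 0 */
{
  precalc_w(0, 31);                 /* done once, before .Loop                    */
  for (;;)
    {
      rounds(0, 0, 47);             /* interleaved with precalc_w(32, 79)         */
      rounds(0, 48, 79);
      add_state_and_store();        /* fold block 1 into the chaining variables   */
      rounds(1, 0, 47);
      nblks -= 2;
      if (nblks)
        {
          rounds(1, 48, 79);        /* interleaved with precalc_w(0, 31) of the next pair */
          add_state_and_store();
          continue;                 /* jmp .Loop                                  */
        }
      rounds(1, 48, 79);            /* .Lend: interleaved with the stack wipe     */
      burn_stack();
      add_state_and_store();
      return;
    }
}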
*/ R( c, d, e, a, b, F3, 48, 1 ); W_PRECALC_00_15_0(0, W0, Wtmp0); R( b, c, d, e, a, F3, 49, 1 ); W_PRECALC_00_15_1(1, W0, Wtmp0); R( a, b, c, d, e, F3, 50, 1 ); W_PRECALC_00_15_2(2, W0, Wtmp0, K1); R( e, a, b, c, d, F3, 51, 1 ); W_PRECALC_00_15_3(3, W0, Wtmp0); R( d, e, a, b, c, F3, 52, 1 ); W_PRECALC_00_15_0(4, W7, Wtmp0); R( c, d, e, a, b, F3, 53, 1 ); W_PRECALC_00_15_1(5, W7, Wtmp0); R( b, c, d, e, a, F3, 54, 1 ); W_PRECALC_00_15_2(6, W7, Wtmp0, K1); R( a, b, c, d, e, F3, 55, 1 ); W_PRECALC_00_15_3(7, W7, Wtmp0); R( e, a, b, c, d, F3, 56, 1 ); W_PRECALC_00_15_0(8, W6, Wtmp0); R( d, e, a, b, c, F3, 57, 1 ); W_PRECALC_00_15_1(9, W6, Wtmp0); R( c, d, e, a, b, F3, 58, 1 ); W_PRECALC_00_15_2(10, W6, Wtmp0, K1); R( b, c, d, e, a, F3, 59, 1 ); W_PRECALC_00_15_3(11, W6, Wtmp0); R( a, b, c, d, e, F4, 60, 1 ); W_PRECALC_00_15_0(12, W5, Wtmp0); R( e, a, b, c, d, F4, 61, 1 ); W_PRECALC_00_15_1(13, W5, Wtmp0); R( d, e, a, b, c, F4, 62, 1 ); W_PRECALC_00_15_2(14, W5, Wtmp0, K1); R( c, d, e, a, b, F4, 63, 1 ); W_PRECALC_00_15_3(15, W5, Wtmp0); R( b, c, d, e, a, F4, 64, 1 ); W_PRECALC_16_31_0(16, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); R( a, b, c, d, e, F4, 65, 1 ); W_PRECALC_16_31_1(17, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); R( e, a, b, c, d, F4, 66, 1 ); W_PRECALC_16_31_2(18, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); R( d, e, a, b, c, F4, 67, 1 ); W_PRECALC_16_31_3(19, W4, W5, W6, W7, W0, Wtmp0, Wtmp1, K1); R( c, d, e, a, b, F4, 68, 1 ); W_PRECALC_16_31_0(20, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); R( b, c, d, e, a, F4, 69, 1 ); W_PRECALC_16_31_1(21, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); R( a, b, c, d, e, F4, 70, 1 ); W_PRECALC_16_31_2(22, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); R( e, a, b, c, d, F4, 71, 1 ); W_PRECALC_16_31_3(23, W3, W4, W5, W6, W7, Wtmp0, Wtmp1, K2); R( d, e, a, b, c, F4, 72, 1 ); W_PRECALC_16_31_0(24, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); R( c, d, e, a, b, F4, 73, 1 ); W_PRECALC_16_31_1(25, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); R( b, c, d, e, a, F4, 74, 1 ); W_PRECALC_16_31_2(26, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); R( a, b, c, d, e, F4, 75, 1 ); W_PRECALC_16_31_3(27, W2, W3, W4, W5, W6, Wtmp0, Wtmp1, K2); R( e, a, b, c, d, F4, 76, 1 ); W_PRECALC_16_31_0(28, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); R( d, e, a, b, c, F4, 77, 1 ); W_PRECALC_16_31_1(29, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); R( c, d, e, a, b, F4, 78, 1 ); W_PRECALC_16_31_2(30, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); addl state_h0(RSTATE), a; W_PRECALC_16_31_3(31, W1, W2, W3, W4, W5, Wtmp0, Wtmp1, K2); R( b, c, d, e, a, F4, 79, 1 ); addl ne, a; xorl ne, ne; /* Update the chaining variables. 
*/ addl state_h3(RSTATE), d; addl state_h2(RSTATE), c; addl state_h1(RSTATE), b; addl state_h4(RSTATE), e; movl d, state_h3(RSTATE); movl c, state_h2(RSTATE); movl b, state_h1(RSTATE); movl a, state_h0(RSTATE); movl e, state_h4(RSTATE); jmp .Loop; .align 16 .Lend: vzeroall; /* Transform 48-79 for block 2 + burn stack */ R( c, d, e, a, b, F3, 48, 1 ); R( b, c, d, e, a, F3, 49, 1 ); R( a, b, c, d, e, F3, 50, 1 ); R( e, a, b, c, d, F3, 51, 1 ); R( d, e, a, b, c, F3, 52, 1 ); R( c, d, e, a, b, F3, 53, 1 ); R( b, c, d, e, a, F3, 54, 1 ); R( a, b, c, d, e, F3, 55, 1 ); R( e, a, b, c, d, F3, 56, 1 ); R( d, e, a, b, c, F3, 57, 1 ); R( c, d, e, a, b, F3, 58, 1 ); R( b, c, d, e, a, F3, 59, 1 ); R( a, b, c, d, e, F4, 60, 1 ); vmovdqa %ymm0, (0*32)(%rsp); R( e, a, b, c, d, F4, 61, 1 ); vmovdqa %ymm0, (1*32)(%rsp); R( d, e, a, b, c, F4, 62, 1 ); vmovdqa %ymm0, (2*32)(%rsp); R( c, d, e, a, b, F4, 63, 1 ); vmovdqa %ymm0, (3*32)(%rsp); R( b, c, d, e, a, F4, 64, 1 ); vmovdqa %ymm0, (4*32)(%rsp); R( a, b, c, d, e, F4, 65, 1 ); vmovdqa %ymm0, (5*32)(%rsp); R( e, a, b, c, d, F4, 66, 1 ); vmovdqa %ymm0, (6*32)(%rsp); R( d, e, a, b, c, F4, 67, 1 ); vmovdqa %ymm0, (7*32)(%rsp); R( c, d, e, a, b, F4, 68, 1 ); vmovdqa %ymm0, (8*32)(%rsp); R( b, c, d, e, a, F4, 69, 1 ); vmovdqa %ymm0, (9*32)(%rsp); R( a, b, c, d, e, F4, 70, 1 ); vmovdqa %ymm0, (10*32)(%rsp); R( e, a, b, c, d, F4, 71, 1 ); vmovdqa %ymm0, (11*32)(%rsp); R( d, e, a, b, c, F4, 72, 1 ); vmovdqa %ymm0, (12*32)(%rsp); R( c, d, e, a, b, F4, 73, 1 ); vmovdqa %ymm0, (13*32)(%rsp); R( b, c, d, e, a, F4, 74, 1 ); vmovdqa %ymm0, (14*32)(%rsp); R( a, b, c, d, e, F4, 75, 1 ); vmovdqa %ymm0, (15*32)(%rsp); R( e, a, b, c, d, F4, 76, 1 ); vmovdqa %ymm0, (16*32)(%rsp); R( d, e, a, b, c, F4, 77, 1 ); vmovdqa %ymm0, (17*32)(%rsp); R( c, d, e, a, b, F4, 78, 1 ); vmovdqa %ymm0, (18*32)(%rsp); addl state_h0(RSTATE), a; R( b, c, d, e, a, F4, 79, 1 ); addl ne, a; xorl ne, ne; /* WK_STACK_WORDS*4/32-1 = 19 */ vmovdqa %ymm0, (19*32)(%rsp); /* Update the chaining variables. */ addl state_h3(RSTATE), d; addl state_h2(RSTATE), c; addl state_h1(RSTATE), b; addl state_h4(RSTATE), e; movl d, state_h3(RSTATE); movl c, state_h2(RSTATE); movl b, state_h1(RSTATE); movl a, state_h0(RSTATE); movl e, state_h4(RSTATE); movq ROLDSTACK, %rsp; CFI_REGISTER(ROLDSTACK, %rsp); CFI_DEF_CFA_REGISTER(%rsp); popq %r12; CFI_POP(%r12); popq %rbp; CFI_POP(%rbp); popq %rbx; CFI_POP(%rbx); /* stack already burned */ xorl %eax, %eax; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_sha1_transform_amd64_avx2_bmi2, .-_gcry_sha1_transform_amd64_avx2_bmi2;) #endif #endif diff --git a/cipher/sha1-ssse3-amd64.S b/cipher/sha1-ssse3-amd64.S index db62928a..f09b1de1 100644 --- a/cipher/sha1-ssse3-amd64.S +++ b/cipher/sha1-ssse3-amd64.S @@ -1,437 +1,437 @@ /* sha1-ssse3-amd64.S - Intel SSSE3 accelerated SHA-1 transform function * Copyright (C) 2013 Jussi Kivilinna * * Based on sha1.c: * Copyright (C) 1998, 2001, 2002, 2003, 2008 Free Software Foundation, Inc. * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. 
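The .Lend path above finishes by overwriting the whole WK area: WK_STACK_WORDS*4 = 640 bytes, i.e. 32-byte stores at offsets 0*32 through 19*32 (hence the "19" in the comment), plus vzeroall to clear the vector registers, since the expanded schedule is derived from the data being hashed and should not linger on the stack. A hedged C equivalent of that wipe, for illustration only (this is not the code libgcrypt uses):

#include <stddef.h>
#include <stdint.h>

#define WK_STACK_WORDS (80 * 2)   /* two interleaved 80-entry W+K schedules */

/* Overwrite the W+K scratch area; 'volatile' keeps the stores from being
 * optimized away, the same effect the explicit vmovdqa stores have in asm. */
static void burn_wk_stack(uint32_t *wk)
{
  volatile uint32_t *p = wk;
  for (size_t i = 0; i < WK_STACK_WORDS; i++)
    p[i] = 0;
}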
* * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . */ /* * Intel SSSE3 accelerated SHA-1 implementation based on white paper: * "Improving the Performance of the Secure Hash Algorithm (SHA-1)" * http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1 */ #ifdef __x86_64__ #include #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ defined(HAVE_GCC_INLINE_ASM_SSSE3) && defined(USE_SHA1) #include "asm-common-amd64.h" /* Context structure */ #define state_h0 0 #define state_h1 4 #define state_h2 8 #define state_h3 12 #define state_h4 16 /* Constants */ .text #define K1 0x5A827999 #define K2 0x6ED9EBA1 #define K3 0x8F1BBCDC #define K4 0xCA62C1D6 .align 16 .LK_XMM: .LK1: .long K1, K1, K1, K1 .LK2: .long K2, K2, K2, K2 .LK3: .long K3, K3, K3, K3 .LK4: .long K4, K4, K4, K4 .Lbswap_shufb_ctl: .long 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f /* Register macros */ #define RSTATE %r8 #define RDATA %r9 #define ROLDSTACK %r10 #define RNBLKS %r11 #define a %eax #define b %ebx #define c %ecx #define d %edx #define e %edi #define RT0 %esi #define RT1 %ebp #define Wtmp0 %xmm0 #define Wtmp1 %xmm1 #define W0 %xmm2 #define W1 %xmm3 #define W2 %xmm4 #define W3 %xmm5 #define W4 %xmm6 #define W5 %xmm7 #define W6 %xmm8 #define W7 %xmm9 #define BSWAP_REG %xmm10 /* Round function macros. */ #define WK(i) (((i) & 15) * 4)(%rsp) #define R_F1(a,b,c,d,e,i) \ movl c, RT0; \ addl WK(i), e; \ xorl d, RT0; \ movl a, RT1; \ andl b, RT0; \ roll $30, b; \ xorl d, RT0; \ leal (RT0,e), e; \ roll $5, RT1; \ addl RT1, e; #define R_F2(a,b,c,d,e,i) \ movl c, RT0; \ addl WK(i), e; \ xorl b, RT0; \ roll $30, b; \ xorl d, RT0; \ movl a, RT1; \ leal (RT0,e), e; \ roll $5, RT1; \ addl RT1, e; #define R_F3(a,b,c,d,e,i) \ movl c, RT0; \ movl b, RT1; \ xorl b, RT0; \ andl c, RT1; \ andl d, RT0; \ addl RT1, e; \ addl WK(i), e; \ roll $30, b; \ movl a, RT1; \ leal (RT0,e), e; \ roll $5, RT1; \ addl RT1, e; #define R_F4(a,b,c,d,e,i) R_F2(a,b,c,d,e,i) #define R(a,b,c,d,e,f,i) \ R_##f(a,b,c,d,e,i) /* Input expansion macros. 
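Unlike the AVX2 code, the SSSE3 variant above keeps only a 16-entry window of W+K values on the stack: WK(i) masks the index with 15, so slot i & 15 is rewritten as soon as W[i-16] has been consumed, and the round constant is selected as K[i/20] through the .LK_XMM table. A scalar C sketch of that rolling schedule (reference only; the asm keeps the raw W words in xmm registers and stores W+K into the WK ring):

#include <stdint.h>

static inline uint32_t rol32(uint32_t x, int n)
{
  return (x << n) | (x >> (32 - n));
}

static const uint32_t K[4] = { 0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6 };

/* Returns the value the asm would store into the WK(t & 15) stack slot. */
static uint32_t next_wk(uint32_t w[16], int t)
{
  uint32_t x = rol32(w[(t - 3) & 15] ^ w[(t - 8) & 15] ^
                     w[(t - 14) & 15] ^ w[(t - 16) & 15], 1);
  w[t & 15] = x;                 /* overwrite the slot that held W[t-16]      */
  return x + K[t / 20];          /* paddd (.LK_XMM + ((i)/20)*16) in the asm  */
}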
*/ #define W_PRECALC_00_15_0(i, W, tmp0) \ movdqu (4*(i))(RDATA), tmp0; #define W_PRECALC_00_15_1(i, W, tmp0) \ pshufb BSWAP_REG, tmp0; \ movdqa tmp0, W; #define W_PRECALC_00_15_2(i, W, tmp0) \ paddd (.LK_XMM + ((i)/20)*16) rRIP, tmp0; #define W_PRECALC_00_15_3(i, W, tmp0) \ movdqa tmp0, WK(i&~3); #define W_PRECALC_16_31_0(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \ movdqa W_m12, W; \ palignr $8, W_m16, W; \ movdqa W_m04, tmp0; \ psrldq $4, tmp0; \ pxor W_m08, W; #define W_PRECALC_16_31_1(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \ pxor W_m16, tmp0; \ pxor tmp0, W; \ movdqa W, tmp1; \ movdqa W, tmp0; \ pslldq $12, tmp1; #define W_PRECALC_16_31_2(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \ psrld $31, W; \ pslld $1, tmp0; \ por W, tmp0; \ movdqa tmp1, W; \ psrld $30, tmp1; \ pslld $2, W; #define W_PRECALC_16_31_3(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \ pxor W, tmp0; \ pxor tmp1, tmp0; \ movdqa tmp0, W; \ paddd (.LK_XMM + ((i)/20)*16) rRIP, tmp0; \ movdqa tmp0, WK((i)&~3); #define W_PRECALC_32_79_0(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \ movdqa W_m04, tmp0; \ pxor W_m28, W; \ palignr $8, W_m08, tmp0; #define W_PRECALC_32_79_1(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \ pxor W_m16, W; \ pxor tmp0, W; \ movdqa W, tmp0; #define W_PRECALC_32_79_2(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \ psrld $30, W; \ pslld $2, tmp0; \ por W, tmp0; #define W_PRECALC_32_79_3(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \ movdqa tmp0, W; \ paddd (.LK_XMM + ((i)/20)*16) rRIP, tmp0; \ movdqa tmp0, WK((i)&~3); #define CLEAR_REG(reg) pxor reg, reg; /* * Transform nblks*64 bytes (nblks*16 32-bit words) at DATA. * * unsigned int * _gcry_sha1_transform_amd64_ssse3 (void *ctx, const unsigned char *data, * size_t nblks) */ .globl _gcry_sha1_transform_amd64_ssse3 ELF(.type _gcry_sha1_transform_amd64_ssse3,@function) .align 16 _gcry_sha1_transform_amd64_ssse3: /* input: * %rdi: ctx, CTX * %rsi: data (64*nblks bytes) * %rdx: nblks */ CFI_STARTPROC(); xorl %eax, %eax; cmpq $0, %rdx; jz .Lret; movq %rdx, RNBLKS; movq %rdi, RSTATE; movq %rsi, RDATA; pushq %rbx; CFI_PUSH(%rbx); pushq %rbp; CFI_PUSH(%rbp); movq %rsp, ROLDSTACK; CFI_DEF_CFA_REGISTER(ROLDSTACK); subq $(16*4), %rsp; andq $(~31), %rsp; /* Get the values of the chaining variables. */ movl state_h0(RSTATE), a; movl state_h1(RSTATE), b; movl state_h2(RSTATE), c; movl state_h3(RSTATE), d; movl state_h4(RSTATE), e; movdqa .Lbswap_shufb_ctl rRIP, BSWAP_REG; /* Precalc 0-15. */ W_PRECALC_00_15_0(0, W0, Wtmp0); W_PRECALC_00_15_1(1, W0, Wtmp0); W_PRECALC_00_15_2(2, W0, Wtmp0); W_PRECALC_00_15_3(3, W0, Wtmp0); W_PRECALC_00_15_0(4, W7, Wtmp0); W_PRECALC_00_15_1(5, W7, Wtmp0); W_PRECALC_00_15_2(6, W7, Wtmp0); W_PRECALC_00_15_3(7, W7, Wtmp0); W_PRECALC_00_15_0(8, W6, Wtmp0); W_PRECALC_00_15_1(9, W6, Wtmp0); W_PRECALC_00_15_2(10, W6, Wtmp0); W_PRECALC_00_15_3(11, W6, Wtmp0); W_PRECALC_00_15_0(12, W5, Wtmp0); W_PRECALC_00_15_1(13, W5, Wtmp0); W_PRECALC_00_15_2(14, W5, Wtmp0); W_PRECALC_00_15_3(15, W5, Wtmp0); .align 8 .Loop: addq $64, RDATA; /* Transform 0-15 + Precalc 16-31. 
*/ R( a, b, c, d, e, F1, 0 ); W_PRECALC_16_31_0(16, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); R( e, a, b, c, d, F1, 1 ); W_PRECALC_16_31_1(17, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); R( d, e, a, b, c, F1, 2 ); W_PRECALC_16_31_2(18, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); R( c, d, e, a, b, F1, 3 ); W_PRECALC_16_31_3(19, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); R( b, c, d, e, a, F1, 4 ); W_PRECALC_16_31_0(20, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); R( a, b, c, d, e, F1, 5 ); W_PRECALC_16_31_1(21, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); R( e, a, b, c, d, F1, 6 ); W_PRECALC_16_31_2(22, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); R( d, e, a, b, c, F1, 7 ); W_PRECALC_16_31_3(23, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); R( c, d, e, a, b, F1, 8 ); W_PRECALC_16_31_0(24, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); R( b, c, d, e, a, F1, 9 ); W_PRECALC_16_31_1(25, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); R( a, b, c, d, e, F1, 10 ); W_PRECALC_16_31_2(26, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); R( e, a, b, c, d, F1, 11 ); W_PRECALC_16_31_3(27, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); R( d, e, a, b, c, F1, 12 ); W_PRECALC_16_31_0(28, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); R( c, d, e, a, b, F1, 13 ); W_PRECALC_16_31_1(29, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); R( b, c, d, e, a, F1, 14 ); W_PRECALC_16_31_2(30, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); R( a, b, c, d, e, F1, 15 ); W_PRECALC_16_31_3(31, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); /* Transform 16-63 + Precalc 32-79. */ R( e, a, b, c, d, F1, 16 ); W_PRECALC_32_79_0(32, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); R( d, e, a, b, c, F1, 17 ); W_PRECALC_32_79_1(33, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); R( c, d, e, a, b, F1, 18 ); W_PRECALC_32_79_2(34, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); R( b, c, d, e, a, F1, 19 ); W_PRECALC_32_79_3(35, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); R( a, b, c, d, e, F2, 20 ); W_PRECALC_32_79_0(36, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); R( e, a, b, c, d, F2, 21 ); W_PRECALC_32_79_1(37, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); R( d, e, a, b, c, F2, 22 ); W_PRECALC_32_79_2(38, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); R( c, d, e, a, b, F2, 23 ); W_PRECALC_32_79_3(39, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); R( b, c, d, e, a, F2, 24 ); W_PRECALC_32_79_0(40, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); R( a, b, c, d, e, F2, 25 ); W_PRECALC_32_79_1(41, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); R( e, a, b, c, d, F2, 26 ); W_PRECALC_32_79_2(42, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); R( d, e, a, b, c, F2, 27 ); W_PRECALC_32_79_3(43, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); R( c, d, e, a, b, F2, 28 ); W_PRECALC_32_79_0(44, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); R( b, c, d, e, a, F2, 29 ); W_PRECALC_32_79_1(45, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); R( a, b, c, d, e, F2, 30 ); W_PRECALC_32_79_2(46, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); R( e, a, b, c, d, F2, 31 ); W_PRECALC_32_79_3(47, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); R( d, e, a, b, c, F2, 32 ); W_PRECALC_32_79_0(48, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0); R( c, d, e, a, b, F2, 33 ); W_PRECALC_32_79_1(49, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0); R( b, c, d, e, a, F2, 34 ); W_PRECALC_32_79_2(50, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0); R( a, b, c, d, e, F2, 35 ); W_PRECALC_32_79_3(51, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0); R( e, a, b, c, d, F2, 36 ); W_PRECALC_32_79_0(52, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0); R( d, e, a, b, c, F2, 37 ); W_PRECALC_32_79_1(53, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0); R( c, d, e, a, b, F2, 38 ); W_PRECALC_32_79_2(54, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0); R( b, c, d, e, a, F2, 39 ); W_PRECALC_32_79_3(55, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0); R( 
a, b, c, d, e, F3, 40 ); W_PRECALC_32_79_0(56, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0); R( e, a, b, c, d, F3, 41 ); W_PRECALC_32_79_1(57, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0); R( d, e, a, b, c, F3, 42 ); W_PRECALC_32_79_2(58, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0); R( c, d, e, a, b, F3, 43 ); W_PRECALC_32_79_3(59, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0); R( b, c, d, e, a, F3, 44 ); W_PRECALC_32_79_0(60, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0); R( a, b, c, d, e, F3, 45 ); W_PRECALC_32_79_1(61, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0); R( e, a, b, c, d, F3, 46 ); W_PRECALC_32_79_2(62, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0); R( d, e, a, b, c, F3, 47 ); W_PRECALC_32_79_3(63, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0); R( c, d, e, a, b, F3, 48 ); W_PRECALC_32_79_0(64, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); R( b, c, d, e, a, F3, 49 ); W_PRECALC_32_79_1(65, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); R( a, b, c, d, e, F3, 50 ); W_PRECALC_32_79_2(66, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); R( e, a, b, c, d, F3, 51 ); W_PRECALC_32_79_3(67, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); R( d, e, a, b, c, F3, 52 ); W_PRECALC_32_79_0(68, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); R( c, d, e, a, b, F3, 53 ); W_PRECALC_32_79_1(69, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); R( b, c, d, e, a, F3, 54 ); W_PRECALC_32_79_2(70, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); R( a, b, c, d, e, F3, 55 ); W_PRECALC_32_79_3(71, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); R( e, a, b, c, d, F3, 56 ); W_PRECALC_32_79_0(72, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); R( d, e, a, b, c, F3, 57 ); W_PRECALC_32_79_1(73, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); R( c, d, e, a, b, F3, 58 ); W_PRECALC_32_79_2(74, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); R( b, c, d, e, a, F3, 59 ); W_PRECALC_32_79_3(75, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); R( a, b, c, d, e, F4, 60 ); W_PRECALC_32_79_0(76, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); R( e, a, b, c, d, F4, 61 ); W_PRECALC_32_79_1(77, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); R( d, e, a, b, c, F4, 62 ); W_PRECALC_32_79_2(78, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); R( c, d, e, a, b, F4, 63 ); W_PRECALC_32_79_3(79, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); decq RNBLKS; jz .Lend; /* Transform 64-79 + Precalc 0-15 of next block. */ R( b, c, d, e, a, F4, 64 ); W_PRECALC_00_15_0(0, W0, Wtmp0); R( a, b, c, d, e, F4, 65 ); W_PRECALC_00_15_1(1, W0, Wtmp0); R( e, a, b, c, d, F4, 66 ); W_PRECALC_00_15_2(2, W0, Wtmp0); R( d, e, a, b, c, F4, 67 ); W_PRECALC_00_15_3(3, W0, Wtmp0); R( c, d, e, a, b, F4, 68 ); W_PRECALC_00_15_0(4, W7, Wtmp0); R( b, c, d, e, a, F4, 69 ); W_PRECALC_00_15_1(5, W7, Wtmp0); R( a, b, c, d, e, F4, 70 ); W_PRECALC_00_15_2(6, W7, Wtmp0); R( e, a, b, c, d, F4, 71 ); W_PRECALC_00_15_3(7, W7, Wtmp0); R( d, e, a, b, c, F4, 72 ); W_PRECALC_00_15_0(8, W6, Wtmp0); R( c, d, e, a, b, F4, 73 ); W_PRECALC_00_15_1(9, W6, Wtmp0); R( b, c, d, e, a, F4, 74 ); W_PRECALC_00_15_2(10, W6, Wtmp0); R( a, b, c, d, e, F4, 75 ); W_PRECALC_00_15_3(11, W6, Wtmp0); R( e, a, b, c, d, F4, 76 ); W_PRECALC_00_15_0(12, W5, Wtmp0); R( d, e, a, b, c, F4, 77 ); W_PRECALC_00_15_1(13, W5, Wtmp0); R( c, d, e, a, b, F4, 78 ); addl state_h0(RSTATE), a; W_PRECALC_00_15_2(14, W5, Wtmp0); R( b, c, d, e, a, F4, 79 ); W_PRECALC_00_15_3(15, W5, Wtmp0); /* Update the chaining variables. 
*/ addl state_h3(RSTATE), d; addl state_h2(RSTATE), c; addl state_h1(RSTATE), b; addl state_h4(RSTATE), e; movl d, state_h3(RSTATE); movl c, state_h2(RSTATE); movl b, state_h1(RSTATE); movl a, state_h0(RSTATE); movl e, state_h4(RSTATE); jmp .Loop; .align 16 .Lend: /* Transform 64-79 + Clear XMM registers + Burn stack. */ R( b, c, d, e, a, F4, 64 ); CLEAR_REG(BSWAP_REG); R( a, b, c, d, e, F4, 65 ); CLEAR_REG(Wtmp0); R( e, a, b, c, d, F4, 66 ); CLEAR_REG(Wtmp1); R( d, e, a, b, c, F4, 67 ); CLEAR_REG(W0); R( c, d, e, a, b, F4, 68 ); CLEAR_REG(W1); R( b, c, d, e, a, F4, 69 ); CLEAR_REG(W2); R( a, b, c, d, e, F4, 70 ); CLEAR_REG(W3); R( e, a, b, c, d, F4, 71 ); CLEAR_REG(W4); R( d, e, a, b, c, F4, 72 ); CLEAR_REG(W5); R( c, d, e, a, b, F4, 73 ); CLEAR_REG(W6); R( b, c, d, e, a, F4, 74 ); CLEAR_REG(W7); R( a, b, c, d, e, F4, 75 ); R( e, a, b, c, d, F4, 76 ); movdqa Wtmp0, (0*16)(%rsp); R( d, e, a, b, c, F4, 77 ); movdqa Wtmp0, (1*16)(%rsp); R( c, d, e, a, b, F4, 78 ); movdqa Wtmp0, (2*16)(%rsp); addl state_h0(RSTATE), a; R( b, c, d, e, a, F4, 79 ); /* 16*4/16-1 = 3 */ movdqa Wtmp0, (3*16)(%rsp); /* Update the chaining variables. */ addl state_h3(RSTATE), d; addl state_h2(RSTATE), c; addl state_h1(RSTATE), b; addl state_h4(RSTATE), e; movl d, state_h3(RSTATE); movl c, state_h2(RSTATE); movl b, state_h1(RSTATE); movl a, state_h0(RSTATE); movl e, state_h4(RSTATE); movq ROLDSTACK, %rsp; CFI_REGISTER(ROLDSTACK, %rsp); CFI_DEF_CFA_REGISTER(%rsp); popq %rbp; CFI_POP(%rbp); popq %rbx; CFI_POP(%rbx); /* stack already burned */ xorl %eax, %eax; .Lret: - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_sha1_transform_amd64_ssse3, .-_gcry_sha1_transform_amd64_ssse3;) #endif #endif diff --git a/cipher/sha256-avx-amd64.S b/cipher/sha256-avx-amd64.S index ec945f84..be8a799d 100644 --- a/cipher/sha256-avx-amd64.S +++ b/cipher/sha256-avx-amd64.S @@ -1,506 +1,506 @@ /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; Copyright (c) 2012, Intel Corporation ; ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are ; met: ; ; * Redistributions of source code must retain the above copyright ; notice, this list of conditions and the following disclaimer. ; ; * Redistributions in binary form must reproduce the above copyright ; notice, this list of conditions and the following disclaimer in the ; documentation and/or other materials provided with the ; distribution. ; ; * Neither the name of the Intel Corporation nor the names of its ; contributors may be used to endorse or promote products derived from ; this software without specific prior written permission. ; ; ; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY ; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR ; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR ; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
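The SHA-256 code that follows (sha256-avx-amd64.S here, the AVX2/BMI2 variant further down) is built the same way: a ROR helper implemented with shld (a double shift of the register against itself by 32-n is a rotate right by n, which the original comment notes was faster than ror on Sandy Bridge), an addm reg-to-mem accumulate, and round macros whose comments spell out S0/S1/CH/MAJ. As a reference for what those macros compute, a scalar C sketch of one standard FIPS-180 SHA-256 round (not code from the patch):

#include <stdint.h>

static inline uint32_t ror32(uint32_t x, int n)
{
  return (x >> n) | (x << (32 - n));
}

/* One SHA-256 round; wk = K[t] + W[t], as staged in the _XFER stack slot. */
static void sha256_round(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d,
                         uint32_t *e, uint32_t *f, uint32_t *g, uint32_t *h,
                         uint32_t wk)
{
  uint32_t s1  = ror32(*e, 6) ^ ror32(*e, 11) ^ ror32(*e, 25);   /* y0 / S1  */
  uint32_t ch  = (*e & *f) ^ (~*e & *g);                         /* y2 / CH  */
  uint32_t s0  = ror32(*a, 2) ^ ror32(*a, 13) ^ ror32(*a, 22);   /* y1 / S0  */
  uint32_t maj = (*a & *b) ^ (*a & *c) ^ (*b & *c);              /* y0 / MAJ */
  uint32_t t1  = *h + s1 + ch + wk;

  *d += t1;              /* "add d, h" in the asm      */
  *h  = t1 + s0 + maj;   /* the "lea h, [h + ...]" chain */
  /* callers then rotate the roles: (a..h) <- (h, a, b, c, d, e, f, g) */
}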
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; ; This code is described in an Intel White-Paper: ; "Fast SHA-256 Implementations on Intel Architecture Processors" ; ; To find it, surf to http://www.intel.com/p/en_US/embedded ; and search for that title. ; The paper is expected to be released roughly at the end of April, 2012 ; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; This code schedules 1 blocks at a time, with 4 lanes per block ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ /* * Conversion to GAS assembly and integration to libgcrypt * by Jussi Kivilinna * * Note: Based on the SSSE3 implementation. */ #ifdef __x86_64 #include #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \ defined(HAVE_GCC_INLINE_ASM_AVX) && defined(USE_SHA256) #include "asm-common-amd64.h" .intel_syntax noprefix #define VMOVDQ vmovdqu /* assume buffers not aligned */ #define ROR(p1, p2) \ /* shld is faster than ror on Intel Sandybridge */ \ shld p1, p1, (32 - p2); /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros*/ /* addm [mem], reg * Add reg to mem using reg-mem add and store */ #define addm(p1, p2) \ add p2, p1; \ mov p1, p2; /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/ /* COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask * Load xmm with mem and byte swap each dword */ #define COPY_XMM_AND_BSWAP(p1, p2, p3) \ VMOVDQ p1, p2; \ vpshufb p1, p1, p3; /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/ #define X0 xmm4 #define X1 xmm5 #define X2 xmm6 #define X3 xmm7 #define XTMP0 xmm0 #define XTMP1 xmm1 #define XTMP2 xmm2 #define XTMP3 xmm3 #define XTMP4 xmm8 #define XFER xmm9 #define SHUF_00BA xmm10 /* shuffle xBxA -> 00BA */ #define SHUF_DC00 xmm11 /* shuffle xDxC -> DC00 */ #define BYTE_FLIP_MASK xmm12 #define NUM_BLKS rdx /* 3rd arg */ #define CTX rsi /* 2nd arg */ #define INP rdi /* 1st arg */ #define SRND rdi /* clobbers INP */ #define c ecx #define d r8d #define e edx #define TBL rbp #define a eax #define b ebx #define f r9d #define g r10d #define h r11d #define y0 r13d #define y1 r14d #define y2 r15d #define _INP_END_SIZE 8 #define _INP_SIZE 8 #define _XFER_SIZE 8 #define _XMM_SAVE_SIZE 0 /* STACK_SIZE plus pushes must be an odd multiple of 8 */ #define _ALIGN_SIZE 8 #define _INP_END 0 #define _INP (_INP_END + _INP_END_SIZE) #define _XFER (_INP + _INP_SIZE) #define _XMM_SAVE (_XFER + _XFER_SIZE + _ALIGN_SIZE) #define STACK_SIZE (_XMM_SAVE + _XMM_SAVE_SIZE) #define FOUR_ROUNDS_AND_SCHED_0(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \ /* compute s0 four at a time and s1 two at a time */; \ /* compute W[-16] + W[-7] 4 at a time */; \ mov y0, e /* y0 = e */; \ ROR( y0, (25-11)) /* y0 = e >> (25-11) */; \ mov y1, a /* y1 = a */; \ vpalignr XTMP0, X3, X2, 4 /* XTMP0 = W[-7] */; \ ROR( y1, (22-13)) /* y1 = a >> (22-13) */; \ xor y0, e /* y0 = e ^ (e >> (25-11)) */; \ mov y2, f /* y2 = f */; \ ROR( y0, (11-6)) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \ xor y1, a /* y1 = a ^ (a >> (22-13) */; \ xor y2, g /* y2 = f^g */; \ vpaddd XTMP0, XTMP0, X0 /* XTMP0 = W[-7] + W[-16] */; \ xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \ and y2, e /* y2 = (f^g)&e */; \ ROR( y1, (13-2)) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \ /* compute s0 */; \ vpalignr XTMP1, X1, X0, 4 /* XTMP1 = W[-15] */; \ xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \ ROR( y0, 6) /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */; \ xor y2, g /* y2 = CH = ((f^g)&e)^g */; \ ROR( y1, 2) /* 
y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \ add y2, y0 /* y2 = S1 + CH */; \ add y2, [rsp + _XFER + 0*4] /* y2 = k + w + S1 + CH */; \ mov y0, a /* y0 = a */; \ add h, y2 /* h = h + S1 + CH + k + w */; \ mov y2, a /* y2 = a */; \ vpslld XTMP2, XTMP1, (32-7); \ or y0, c /* y0 = a|c */; \ add d, h /* d = d + h + S1 + CH + k + w */; \ and y2, c /* y2 = a&c */; \ vpsrld XTMP3, XTMP1, 7; \ and y0, b /* y0 = (a|c)&b */; \ add h, y1 /* h = h + S1 + CH + k + w + S0 */; \ vpor XTMP3, XTMP3, XTMP2 /* XTMP1 = W[-15] ror 7 */; \ or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */; \ lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */ #define FOUR_ROUNDS_AND_SCHED_1(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \ mov y0, e /* y0 = e */; \ mov y1, a /* y1 = a */; \ ROR( y0, (25-11)) /* y0 = e >> (25-11) */; \ xor y0, e /* y0 = e ^ (e >> (25-11)) */; \ mov y2, f /* y2 = f */; \ ROR( y1, (22-13)) /* y1 = a >> (22-13) */; \ vpslld XTMP2, XTMP1, (32-18); \ xor y1, a /* y1 = a ^ (a >> (22-13) */; \ ROR( y0, (11-6)) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \ xor y2, g /* y2 = f^g */; \ vpsrld XTMP4, XTMP1, 18; \ ROR( y1, (13-2)) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \ xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \ and y2, e /* y2 = (f^g)&e */; \ ROR( y0, 6) /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */; \ vpxor XTMP4, XTMP4, XTMP3; \ xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \ xor y2, g /* y2 = CH = ((f^g)&e)^g */; \ vpsrld XTMP1, XTMP1, 3 /* XTMP4 = W[-15] >> 3 */; \ add y2, y0 /* y2 = S1 + CH */; \ add y2, [rsp + _XFER + 1*4] /* y2 = k + w + S1 + CH */; \ ROR( y1, 2) /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \ vpxor XTMP1, XTMP1, XTMP2 /* XTMP1 = W[-15] ror 7 ^ W[-15] ror 18 */; \ mov y0, a /* y0 = a */; \ add h, y2 /* h = h + S1 + CH + k + w */; \ mov y2, a /* y2 = a */; \ vpxor XTMP1, XTMP1, XTMP4 /* XTMP1 = s0 */; \ or y0, c /* y0 = a|c */; \ add d, h /* d = d + h + S1 + CH + k + w */; \ and y2, c /* y2 = a&c */; \ /* compute low s1 */; \ vpshufd XTMP2, X3, 0b11111010 /* XTMP2 = W[-2] {BBAA} */; \ and y0, b /* y0 = (a|c)&b */; \ add h, y1 /* h = h + S1 + CH + k + w + S0 */; \ vpaddd XTMP0, XTMP0, XTMP1 /* XTMP0 = W[-16] + W[-7] + s0 */; \ or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */; \ lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */ #define FOUR_ROUNDS_AND_SCHED_2(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \ mov y0, e /* y0 = e */; \ mov y1, a /* y1 = a */; \ ROR( y0, (25-11)) /* y0 = e >> (25-11) */; \ xor y0, e /* y0 = e ^ (e >> (25-11)) */; \ ROR( y1, (22-13)) /* y1 = a >> (22-13) */; \ mov y2, f /* y2 = f */; \ xor y1, a /* y1 = a ^ (a >> (22-13) */; \ ROR( y0, (11-6)) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \ vpsrlq XTMP3, XTMP2, 17 /* XTMP2 = W[-2] ror 17 {xBxA} */; \ xor y2, g /* y2 = f^g */; \ vpsrlq XTMP4, XTMP2, 19 /* XTMP3 = W[-2] ror 19 {xBxA} */; \ xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \ and y2, e /* y2 = (f^g)&e */; \ vpsrld XTMP2, XTMP2, 10 /* XTMP4 = W[-2] >> 10 {BBAA} */; \ ROR( y1, (13-2)) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \ xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \ xor y2, g /* y2 = CH = ((f^g)&e)^g */; \ ROR( y0, 6) /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */; \ vpxor XTMP2, XTMP2, XTMP3; \ add y2, y0 /* y2 = S1 + CH */; \ ROR( y1, 2) /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \ add y2, [rsp + _XFER + 2*4] /* y2 = k + w + S1 + CH */; \ vpxor XTMP4, XTMP4, XTMP2 /* XTMP4 = s1 {xBxA} */; \ mov y0, a /* y0 = a */; \ add h, y2 /* h = h + S1 + CH + k + w */; \ mov y2, a /* y2 = a */; \ vpshufb XTMP4, XTMP4, SHUF_00BA /* XTMP4 
= s1 {00BA} */; \ or y0, c /* y0 = a|c */; \ add d, h /* d = d + h + S1 + CH + k + w */; \ and y2, c /* y2 = a&c */; \ vpaddd XTMP0, XTMP0, XTMP4 /* XTMP0 = {..., ..., W[1], W[0]} */; \ and y0, b /* y0 = (a|c)&b */; \ add h, y1 /* h = h + S1 + CH + k + w + S0 */; \ /* compute high s1 */; \ vpshufd XTMP2, XTMP0, 0b01010000 /* XTMP2 = W[-2] {DDCC} */; \ or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */; \ lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */ #define FOUR_ROUNDS_AND_SCHED_3(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \ mov y0, e /* y0 = e */; \ ROR( y0, (25-11)) /* y0 = e >> (25-11) */; \ mov y1, a /* y1 = a */; \ ROR( y1, (22-13)) /* y1 = a >> (22-13) */; \ xor y0, e /* y0 = e ^ (e >> (25-11)) */; \ mov y2, f /* y2 = f */; \ ROR( y0, (11-6)) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \ vpsrlq XTMP3, XTMP2, 17 /* XTMP2 = W[-2] ror 17 {xDxC} */; \ xor y1, a /* y1 = a ^ (a >> (22-13) */; \ xor y2, g /* y2 = f^g */; \ vpsrlq X0, XTMP2, 19 /* XTMP3 = W[-2] ror 19 {xDxC} */; \ xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \ and y2, e /* y2 = (f^g)&e */; \ ROR( y1, (13-2)) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \ vpsrld XTMP2, XTMP2, 10 /* X0 = W[-2] >> 10 {DDCC} */; \ xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \ ROR( y0, 6) /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */; \ xor y2, g /* y2 = CH = ((f^g)&e)^g */; \ vpxor XTMP2, XTMP2, XTMP3; \ ROR( y1, 2) /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \ add y2, y0 /* y2 = S1 + CH */; \ add y2, [rsp + _XFER + 3*4] /* y2 = k + w + S1 + CH */; \ vpxor X0, X0, XTMP2 /* X0 = s1 {xDxC} */; \ mov y0, a /* y0 = a */; \ add h, y2 /* h = h + S1 + CH + k + w */; \ mov y2, a /* y2 = a */; \ vpshufb X0, X0, SHUF_DC00 /* X0 = s1 {DC00} */; \ or y0, c /* y0 = a|c */; \ add d, h /* d = d + h + S1 + CH + k + w */; \ and y2, c /* y2 = a&c */; \ vpaddd X0, X0, XTMP0 /* X0 = {W[3], W[2], W[1], W[0]} */; \ and y0, b /* y0 = (a|c)&b */; \ add h, y1 /* h = h + S1 + CH + k + w + S0 */; \ or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */; \ lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */ #define FOUR_ROUNDS_AND_SCHED(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \ FOUR_ROUNDS_AND_SCHED_0(X0, X1, X2, X3, a, b, c, d, e, f, g, h); \ FOUR_ROUNDS_AND_SCHED_1(X0, X1, X2, X3, h, a, b, c, d, e, f, g); \ FOUR_ROUNDS_AND_SCHED_2(X0, X1, X2, X3, g, h, a, b, c, d, e, f); \ FOUR_ROUNDS_AND_SCHED_3(X0, X1, X2, X3, f, g, h, a, b, c, d, e); /* input is [rsp + _XFER + %1 * 4] */ #define DO_ROUND(i1, a, b, c, d, e, f, g, h) \ mov y0, e /* y0 = e */; \ ROR( y0, (25-11)) /* y0 = e >> (25-11) */; \ mov y1, a /* y1 = a */; \ xor y0, e /* y0 = e ^ (e >> (25-11)) */; \ ROR( y1, (22-13)) /* y1 = a >> (22-13) */; \ mov y2, f /* y2 = f */; \ xor y1, a /* y1 = a ^ (a >> (22-13) */; \ ROR( y0, (11-6)) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \ xor y2, g /* y2 = f^g */; \ xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \ ROR( y1, (13-2)) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \ and y2, e /* y2 = (f^g)&e */; \ xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \ ROR( y0, 6) /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */; \ xor y2, g /* y2 = CH = ((f^g)&e)^g */; \ add y2, y0 /* y2 = S1 + CH */; \ ROR( y1, 2) /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \ add y2, [rsp + _XFER + i1 * 4] /* y2 = k + w + S1 + CH */; \ mov y0, a /* y0 = a */; \ add h, y2 /* h = h + S1 + CH + k + w */; \ mov y2, a /* y2 = a */; \ or y0, c /* y0 = a|c */; \ add d, h /* d = d + h + S1 + CH + k + w */; \ and y2, c /* y2 = a&c */; \ and y0, b /* y0 = (a|c)&b */; \ add h, y1 /* h = h + S1 + CH + k + 
w + S0 */; \ or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */; \ lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */ /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; void sha256_avx(void *input_data, UINT32 digest[8], UINT64 num_blks) ;; arg 1 : pointer to input data ;; arg 2 : pointer to digest ;; arg 3 : Num blocks */ .text .globl _gcry_sha256_transform_amd64_avx ELF(.type _gcry_sha256_transform_amd64_avx,@function;) .align 16 _gcry_sha256_transform_amd64_avx: CFI_STARTPROC() vzeroupper push rbx CFI_PUSH(rbx) push rbp CFI_PUSH(rbp) push r13 CFI_PUSH(r13) push r14 CFI_PUSH(r14) push r15 CFI_PUSH(r15) sub rsp, STACK_SIZE CFI_ADJUST_CFA_OFFSET(STACK_SIZE); shl NUM_BLKS, 6 /* convert to bytes */ jz .Ldone_hash add NUM_BLKS, INP /* pointer to end of data */ mov [rsp + _INP_END], NUM_BLKS /* load initial digest */ mov a,[4*0 + CTX] mov b,[4*1 + CTX] mov c,[4*2 + CTX] mov d,[4*3 + CTX] mov e,[4*4 + CTX] mov f,[4*5 + CTX] mov g,[4*6 + CTX] mov h,[4*7 + CTX] vmovdqa BYTE_FLIP_MASK, [.LPSHUFFLE_BYTE_FLIP_MASK ADD_RIP] vmovdqa SHUF_00BA, [.L_SHUF_00BA ADD_RIP] vmovdqa SHUF_DC00, [.L_SHUF_DC00 ADD_RIP] .Loop0: lea TBL, [.LK256 ADD_RIP] /* byte swap first 16 dwords */ COPY_XMM_AND_BSWAP(X0, [INP + 0*16], BYTE_FLIP_MASK) COPY_XMM_AND_BSWAP(X1, [INP + 1*16], BYTE_FLIP_MASK) COPY_XMM_AND_BSWAP(X2, [INP + 2*16], BYTE_FLIP_MASK) COPY_XMM_AND_BSWAP(X3, [INP + 3*16], BYTE_FLIP_MASK) mov [rsp + _INP], INP /* schedule 48 input dwords, by doing 3 rounds of 16 each */ mov SRND, 3 .align 16 .Loop1: vpaddd XFER, X0, [TBL + 0*16] vmovdqa [rsp + _XFER], XFER FOUR_ROUNDS_AND_SCHED(X0, X1, X2, X3, a, b, c, d, e, f, g, h) vpaddd XFER, X1, [TBL + 1*16] vmovdqa [rsp + _XFER], XFER FOUR_ROUNDS_AND_SCHED(X1, X2, X3, X0, e, f, g, h, a, b, c, d) vpaddd XFER, X2, [TBL + 2*16] vmovdqa [rsp + _XFER], XFER FOUR_ROUNDS_AND_SCHED(X2, X3, X0, X1, a, b, c, d, e, f, g, h) vpaddd XFER, X3, [TBL + 3*16] vmovdqa [rsp + _XFER], XFER add TBL, 4*16 FOUR_ROUNDS_AND_SCHED(X3, X0, X1, X2, e, f, g, h, a, b, c, d) sub SRND, 1 jne .Loop1 mov SRND, 2 .Loop2: vpaddd X0, X0, [TBL + 0*16] vmovdqa [rsp + _XFER], X0 DO_ROUND(0, a, b, c, d, e, f, g, h) DO_ROUND(1, h, a, b, c, d, e, f, g) DO_ROUND(2, g, h, a, b, c, d, e, f) DO_ROUND(3, f, g, h, a, b, c, d, e) vpaddd X1, X1, [TBL + 1*16] vmovdqa [rsp + _XFER], X1 add TBL, 2*16 DO_ROUND(0, e, f, g, h, a, b, c, d) DO_ROUND(1, d, e, f, g, h, a, b, c) DO_ROUND(2, c, d, e, f, g, h, a, b) DO_ROUND(3, b, c, d, e, f, g, h, a) vmovdqa X0, X2 vmovdqa X1, X3 sub SRND, 1 jne .Loop2 addm([4*0 + CTX],a) addm([4*1 + CTX],b) addm([4*2 + CTX],c) addm([4*3 + CTX],d) addm([4*4 + CTX],e) addm([4*5 + CTX],f) addm([4*6 + CTX],g) addm([4*7 + CTX],h) mov INP, [rsp + _INP] add INP, 64 cmp INP, [rsp + _INP_END] jne .Loop0 .Ldone_hash: vzeroall vmovdqa [rsp + _XFER], XFER xor eax, eax add rsp, STACK_SIZE CFI_ADJUST_CFA_OFFSET(-STACK_SIZE); pop r15 CFI_POP(r15) pop r14 CFI_POP(r14) pop r13 CFI_POP(r13) pop rbp CFI_POP(rbp) pop rbx CFI_POP(rbx) - ret + ret_spec_stop CFI_ENDPROC() .align 16 .LK256: .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 .long 
0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .LPSHUFFLE_BYTE_FLIP_MASK: .octa 0x0c0d0e0f08090a0b0405060700010203 /* shuffle xBxA -> 00BA */ .L_SHUF_00BA: .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100 /* shuffle xDxC -> DC00 */ .L_SHUF_DC00: .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF #endif #endif diff --git a/cipher/sha256-avx2-bmi2-amd64.S b/cipher/sha256-avx2-bmi2-amd64.S index d130dd4a..60ad442c 100644 --- a/cipher/sha256-avx2-bmi2-amd64.S +++ b/cipher/sha256-avx2-bmi2-amd64.S @@ -1,527 +1,527 @@ /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; Copyright (c) 2012, Intel Corporation ; ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are ; met: ; ; * Redistributions of source code must retain the above copyright ; notice, this list of conditions and the following disclaimer. ; ; * Redistributions in binary form must reproduce the above copyright ; notice, this list of conditions and the following disclaimer in the ; documentation and/or other materials provided with the ; distribution. ; ; * Neither the name of the Intel Corporation nor the names of its ; contributors may be used to endorse or promote products derived from ; this software without specific prior written permission. ; ; ; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY ; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR ; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR ; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; ; This code is described in an Intel White-Paper: ; "Fast SHA-256 Implementations on Intel Architecture Processors" ; ; To find it, surf to http://www.intel.com/p/en_US/embedded ; and search for that title. 
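The FOUR_ROUNDS_AND_SCHED macros in both SHA-256 files interleave the rounds above with the message schedule: four new W words are produced per macro group, with the sigma-1 term handled two words at a time through the {xBxA}/{xDxC} shuffles (SHUF_00BA / SHUF_DC00), because W[t-2] for the upper pair depends on the pair just produced. The scalar recurrence being vectorized, as a reference C sketch (not code from the patch):

#include <stdint.h>

static inline uint32_t ror32(uint32_t x, int n)
{
  return (x >> n) | (x << (32 - n));
}

static uint32_t sigma0(uint32_t x)   /* s0: W[-15] ror 7 ^ ror 18 ^ >> 3  */
{
  return ror32(x, 7) ^ ror32(x, 18) ^ (x >> 3);
}

static uint32_t sigma1(uint32_t x)   /* s1: W[-2] ror 17 ^ ror 19 ^ >> 10 */
{
  return ror32(x, 17) ^ ror32(x, 19) ^ (x >> 10);
}

/* W[t] for t >= 16; four of these are computed per FOUR_ROUNDS_AND_SCHED. */
static uint32_t sha256_w(const uint32_t *w, int t)
{
  return sigma1(w[t - 2]) + w[t - 7] + sigma0(w[t - 15]) + w[t - 16];
}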
; The paper is expected to be released roughly at the end of April, 2012 ; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; This code schedules 2 blocks at a time, with 4 lanes per block ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ /* * Conversion to GAS assembly and integration to libgcrypt * by Jussi Kivilinna */ #ifdef __x86_64 #include #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \ defined(HAVE_GCC_INLINE_ASM_AVX2) && defined(HAVE_GCC_INLINE_ASM_BMI2) && \ defined(USE_SHA256) #include "asm-common-amd64.h" .intel_syntax noprefix #define VMOVDQ vmovdqu /* ; assume buffers not aligned */ /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros */ /* addm [mem], reg */ /* Add reg to mem using reg-mem add and store */ #define addm(p1, p2) \ add p2, p1; \ mov p1, p2; /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ #define X0 ymm4 #define X1 ymm5 #define X2 ymm6 #define X3 ymm7 /* XMM versions of above */ #define XWORD0 xmm4 #define XWORD1 xmm5 #define XWORD2 xmm6 #define XWORD3 xmm7 #define XTMP0 ymm0 #define XTMP1 ymm1 #define XTMP2 ymm2 #define XTMP3 ymm3 #define XTMP4 ymm8 #define XFER ymm9 #define XTMP5 ymm11 #define SHUF_00BA ymm10 /* shuffle xBxA -> 00BA */ #define SHUF_DC00 ymm12 /* shuffle xDxC -> DC00 */ #define BYTE_FLIP_MASK ymm13 #define X_BYTE_FLIP_MASK xmm13 /* XMM version of BYTE_FLIP_MASK */ #define NUM_BLKS rdx /* 3rd arg */ #define CTX rsi /* 2nd arg */ #define INP rdi /* 1st arg */ #define c ecx #define d r8d #define e edx /* clobbers NUM_BLKS */ #define y3 edi /* clobbers INP */ #define TBL rbp #define SRND CTX /* SRND is same register as CTX */ #define a eax #define b ebx #define f r9d #define g r10d #define h r11d #define old_h r11d #define T1 r12d #define y0 r13d #define y1 r14d #define y2 r15d #define _XFER_SIZE 2*64*4 /* 2 blocks, 64 rounds, 4 bytes/round */ #define _XMM_SAVE_SIZE 0 #define _INP_END_SIZE 8 #define _INP_SIZE 8 #define _CTX_SIZE 8 #define _RSP_SIZE 8 #define _XFER 0 #define _XMM_SAVE _XFER + _XFER_SIZE #define _INP_END _XMM_SAVE + _XMM_SAVE_SIZE #define _INP _INP_END + _INP_END_SIZE #define _CTX _INP + _INP_SIZE #define _RSP _CTX + _CTX_SIZE #define STACK_SIZE _RSP + _RSP_SIZE #define ONE_ROUND_PART1(XFERIN, a, b, c, d, e, f, g, h) \ /* h += Sum1 (e) + Ch (e, f, g) + (k[t] + w[0]); */ \ /* d += h; */ \ /* h += Sum0 (a) + Maj (a, b, c); */ \ \ /* Ch(x, y, z) => ((x & y) + (~x & z)) */ \ /* Maj(x, y, z) => ((x & y) + (z & (x ^ y))) */ \ \ mov y3, e; \ add h, [XFERIN]; \ and y3, f; \ rorx y0, e, 25; \ rorx y1, e, 11; \ lea h, [h + y3]; \ andn y3, e, g; \ rorx T1, a, 13; \ xor y0, y1; \ lea h, [h + y3] #define ONE_ROUND_PART2(a, b, c, d, e, f, g, h) \ rorx y2, a, 22; \ rorx y1, e, 6; \ mov y3, a; \ xor T1, y2; \ xor y0, y1; \ xor y3, b; \ lea h, [h + y0]; \ mov y0, a; \ rorx y2, a, 2; \ add d, h; \ and y3, c; \ xor T1, y2; \ lea h, [h + y3]; \ lea h, [h + T1]; \ and y0, b; \ lea h, [h + y0] #define ONE_ROUND(XFER, a, b, c, d, e, f, g, h) \ ONE_ROUND_PART1(XFER, a, b, c, d, e, f, g, h); \ ONE_ROUND_PART2(a, b, c, d, e, f, g, h) #define FOUR_ROUNDS_AND_SCHED(XFERIN, XFEROUT, X0, X1, X2, X3, a, b, c, d, e, f, g, h) \ /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;; */; \ vpalignr XTMP0, X3, X2, 4 /* XTMP0 = W[-7] */; \ vpaddd XTMP0, XTMP0, X0 /* XTMP0 = W[-7] + W[-16]; y1 = (e >> 6); S1 */; \ vpalignr XTMP1, X1, X0, 4 /* XTMP1 = W[-15] */; \ vpsrld XTMP2, XTMP1, 7; \ vpslld XTMP3, XTMP1, 
(32-7); \ vpor XTMP3, XTMP3, XTMP2 /* XTMP3 = W[-15] ror 7 */; \ vpsrld XTMP2, XTMP1,18; \ \ ONE_ROUND(0*4+XFERIN, a, b, c, d, e, f, g, h); \ \ /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;; */; \ vpsrld XTMP4, XTMP1, 3 /* XTMP4 = W[-15] >> 3 */; \ vpslld XTMP1, XTMP1, (32-18); \ vpxor XTMP3, XTMP3, XTMP1; \ vpxor XTMP3, XTMP3, XTMP2 /* XTMP3 = W[-15] ror 7 ^ W[-15] ror 18 */; \ vpxor XTMP1, XTMP3, XTMP4 /* XTMP1 = s0 */; \ vpshufd XTMP2, X3, 0b11111010 /* XTMP2 = W[-2] {BBAA} */; \ vpaddd XTMP0, XTMP0, XTMP1 /* XTMP0 = W[-16] + W[-7] + s0 */; \ vpsrld XTMP4, XTMP2, 10 /* XTMP4 = W[-2] >> 10 {BBAA} */; \ \ ONE_ROUND(1*4+XFERIN, h, a, b, c, d, e, f, g); \ \ /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;; */; \ vpsrlq XTMP3, XTMP2, 19 /* XTMP3 = W[-2] ror 19 {xBxA} */; \ vpsrlq XTMP2, XTMP2, 17 /* XTMP2 = W[-2] ror 17 {xBxA} */; \ vpxor XTMP2, XTMP2, XTMP3; \ vpxor XTMP4, XTMP4, XTMP2 /* XTMP4 = s1 {xBxA} */; \ vpshufb XTMP4, XTMP4, SHUF_00BA /* XTMP4 = s1 {00BA} */; \ vpaddd XTMP0, XTMP0, XTMP4 /* XTMP0 = {..., ..., W[1], W[0]} */; \ vpshufd XTMP2, XTMP0, 0b1010000 /* XTMP2 = W[-2] {DDCC} */; \ \ ONE_ROUND(2*4+XFERIN, g, h, a, b, c, d, e, f); \ \ /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;;; */; \ vpsrld XTMP5, XTMP2, 10 /* XTMP5 = W[-2] >> 10 {DDCC} */; \ vpsrlq XTMP3, XTMP2, 19 /* XTMP3 = W[-2] ror 19 {xDxC} */; \ vpsrlq XTMP2, XTMP2, 17 /* XTMP2 = W[-2] ror 17 {xDxC} */; \ vpxor XTMP2, XTMP2, XTMP3; \ vpxor XTMP5, XTMP5, XTMP2 /* XTMP5 = s1 {xDxC} */; \ vpshufb XTMP5, XTMP5, SHUF_DC00 /* XTMP5 = s1 {DC00} */; \ vpaddd X0, XTMP5, XTMP0 /* X0 = {W[3], W[2], W[1], W[0]} */; \ vpaddd XFER, X0, [TBL + XFEROUT]; \ \ ONE_ROUND_PART1(3*4+XFERIN, f, g, h, a, b, c, d, e); \ vmovdqa [rsp + _XFER + XFEROUT], XFER; \ ONE_ROUND_PART2(f, g, h, a, b, c, d, e); #define DO_4ROUNDS(XFERIN, a, b, c, d, e, f, g, h) \ ONE_ROUND(0*4+XFERIN, a, b, c, d, e, f, g, h); \ ONE_ROUND(1*4+XFERIN, h, a, b, c, d, e, f, g); \ ONE_ROUND(2*4+XFERIN, g, h, a, b, c, d, e, f); \ ONE_ROUND(3*4+XFERIN, f, g, h, a, b, c, d, e) /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; void sha256_rorx(void *input_data, UINT32 digest[8], UINT64 num_blks) ;; arg 1 : pointer to input data ;; arg 2 : pointer to digest ;; arg 3 : Num blocks */ .text .globl _gcry_sha256_transform_amd64_avx2 ELF(.type _gcry_sha256_transform_amd64_avx2,@function) .align 32 _gcry_sha256_transform_amd64_avx2: CFI_STARTPROC() xor eax, eax cmp rdx, 0 je .Lnowork push rbx CFI_PUSH(rbx) push rbp CFI_PUSH(rbp) push r12 CFI_PUSH(r12) push r13 CFI_PUSH(r13) push r14 CFI_PUSH(r14) push r15 CFI_PUSH(r15) vzeroupper vmovdqa BYTE_FLIP_MASK, [.LPSHUFFLE_BYTE_FLIP_MASK ADD_RIP] vmovdqa SHUF_00BA, [.L_SHUF_00BA ADD_RIP] vmovdqa SHUF_DC00, [.L_SHUF_DC00 ADD_RIP] mov rax, rsp CFI_DEF_CFA_REGISTER(rax); sub rsp, STACK_SIZE and rsp, ~63 mov [rsp + _RSP], rax CFI_CFA_ON_STACK(_RSP, 6 * 8) shl NUM_BLKS, 6 /* convert to bytes */ lea NUM_BLKS, [NUM_BLKS + INP - 64] /* pointer to last block */ mov [rsp + _INP_END], NUM_BLKS /* Check if only one block of input. Note: Loading initial digest * only uses 'mov' instruction and does not change condition * flags. 
*/ cmp NUM_BLKS, INP /* ; load initial digest */ mov a,[4*0 + CTX] mov b,[4*1 + CTX] mov c,[4*2 + CTX] mov d,[4*3 + CTX] mov e,[4*4 + CTX] mov f,[4*5 + CTX] mov g,[4*6 + CTX] mov h,[4*7 + CTX] mov [rsp + _CTX], CTX je .Ldo_last_block .Loop0: lea TBL, [.LK256 ADD_RIP] /* ; Load first 16 dwords from two blocks */ VMOVDQ XTMP0, [INP + 0*32] VMOVDQ XTMP1, [INP + 1*32] VMOVDQ XTMP2, [INP + 2*32] VMOVDQ XTMP3, [INP + 3*32] /* ; byte swap data */ vpshufb XTMP0, XTMP0, BYTE_FLIP_MASK vpshufb XTMP1, XTMP1, BYTE_FLIP_MASK vpshufb XTMP2, XTMP2, BYTE_FLIP_MASK vpshufb XTMP3, XTMP3, BYTE_FLIP_MASK /* ; transpose data into high/low halves */ vperm2i128 X0, XTMP0, XTMP2, 0x20 vperm2i128 X1, XTMP0, XTMP2, 0x31 vperm2i128 X2, XTMP1, XTMP3, 0x20 vperm2i128 X3, XTMP1, XTMP3, 0x31 .Last_block_enter: add INP, 64 mov [rsp + _INP], INP /* ; schedule 48 input dwords, by doing 3 rounds of 12 each */ xor SRND, SRND vpaddd XFER, X0, [TBL + 0*32] vmovdqa [rsp + _XFER + 0*32], XFER vpaddd XFER, X1, [TBL + 1*32] vmovdqa [rsp + _XFER + 1*32], XFER vpaddd XFER, X2, [TBL + 2*32] vmovdqa [rsp + _XFER + 2*32], XFER vpaddd XFER, X3, [TBL + 3*32] vmovdqa [rsp + _XFER + 3*32], XFER .align 16 .Loop1: FOUR_ROUNDS_AND_SCHED(rsp + _XFER + SRND + 0*32, SRND + 4*32, X0, X1, X2, X3, a, b, c, d, e, f, g, h) FOUR_ROUNDS_AND_SCHED(rsp + _XFER + SRND + 1*32, SRND + 5*32, X1, X2, X3, X0, e, f, g, h, a, b, c, d) FOUR_ROUNDS_AND_SCHED(rsp + _XFER + SRND + 2*32, SRND + 6*32, X2, X3, X0, X1, a, b, c, d, e, f, g, h) FOUR_ROUNDS_AND_SCHED(rsp + _XFER + SRND + 3*32, SRND + 7*32, X3, X0, X1, X2, e, f, g, h, a, b, c, d) add SRND, 4*32 cmp SRND, 3 * 4*32 jb .Loop1 /* ; Do last 16 rounds with no scheduling */ DO_4ROUNDS(rsp + _XFER + (3*4*32 + 0*32), a, b, c, d, e, f, g, h) DO_4ROUNDS(rsp + _XFER + (3*4*32 + 1*32), e, f, g, h, a, b, c, d) DO_4ROUNDS(rsp + _XFER + (3*4*32 + 2*32), a, b, c, d, e, f, g, h) DO_4ROUNDS(rsp + _XFER + (3*4*32 + 3*32), e, f, g, h, a, b, c, d) mov CTX, [rsp + _CTX] mov INP, [rsp + _INP] addm([4*0 + CTX],a) addm([4*1 + CTX],b) addm([4*2 + CTX],c) addm([4*3 + CTX],d) addm([4*4 + CTX],e) addm([4*5 + CTX],f) addm([4*6 + CTX],g) addm([4*7 + CTX],h) cmp INP, [rsp + _INP_END] ja .Ldone_hash /* ;;; Do second block using previously scheduled results */ xor SRND, SRND .align 16 .Loop3: DO_4ROUNDS(rsp + _XFER + SRND + 0*32 + 16, a, b, c, d, e, f, g, h) DO_4ROUNDS(rsp + _XFER + SRND + 1*32 + 16, e, f, g, h, a, b, c, d) add SRND, 2*32 cmp SRND, 4 * 4*32 jb .Loop3 mov CTX, [rsp + _CTX] mov INP, [rsp + _INP] add INP, 64 addm([4*0 + CTX],a) addm([4*1 + CTX],b) addm([4*2 + CTX],c) addm([4*3 + CTX],d) addm([4*4 + CTX],e) addm([4*5 + CTX],f) addm([4*6 + CTX],g) addm([4*7 + CTX],h) cmp INP, [rsp + _INP_END] jb .Loop0 ja .Ldone_hash .Ldo_last_block: /* ;;; do last block */ lea TBL, [.LK256 ADD_RIP] VMOVDQ XWORD0, [INP + 0*16] VMOVDQ XWORD1, [INP + 1*16] VMOVDQ XWORD2, [INP + 2*16] VMOVDQ XWORD3, [INP + 3*16] vpshufb XWORD0, XWORD0, X_BYTE_FLIP_MASK vpshufb XWORD1, XWORD1, X_BYTE_FLIP_MASK vpshufb XWORD2, XWORD2, X_BYTE_FLIP_MASK vpshufb XWORD3, XWORD3, X_BYTE_FLIP_MASK jmp .Last_block_enter .Lonly_one_block: /* ; load initial digest */ mov a,[4*0 + CTX] mov b,[4*1 + CTX] mov c,[4*2 + CTX] mov d,[4*3 + CTX] mov e,[4*4 + CTX] mov f,[4*5 + CTX] mov g,[4*6 + CTX] mov h,[4*7 + CTX] vmovdqa BYTE_FLIP_MASK, [.LPSHUFFLE_BYTE_FLIP_MASK ADD_RIP] vmovdqa SHUF_00BA, [.L_SHUF_00BA ADD_RIP] vmovdqa SHUF_DC00, [.L_SHUF_DC00 ADD_RIP] mov [rsp + _CTX], CTX jmp .Ldo_last_block .Ldone_hash: vzeroall /* burn stack */ vmovdqa [rsp + _XFER + 0 * 32], ymm0 vmovdqa 
[rsp + _XFER + 1 * 32], ymm0 vmovdqa [rsp + _XFER + 2 * 32], ymm0 vmovdqa [rsp + _XFER + 3 * 32], ymm0 vmovdqa [rsp + _XFER + 4 * 32], ymm0 vmovdqa [rsp + _XFER + 5 * 32], ymm0 vmovdqa [rsp + _XFER + 6 * 32], ymm0 vmovdqa [rsp + _XFER + 7 * 32], ymm0 vmovdqa [rsp + _XFER + 8 * 32], ymm0 vmovdqa [rsp + _XFER + 9 * 32], ymm0 vmovdqa [rsp + _XFER + 10 * 32], ymm0 vmovdqa [rsp + _XFER + 11 * 32], ymm0 vmovdqa [rsp + _XFER + 12 * 32], ymm0 vmovdqa [rsp + _XFER + 13 * 32], ymm0 vmovdqa [rsp + _XFER + 14 * 32], ymm0 vmovdqa [rsp + _XFER + 15 * 32], ymm0 xor eax, eax mov rsp, [rsp + _RSP] CFI_DEF_CFA_REGISTER(rsp) pop r15 CFI_POP(r15) pop r14 CFI_POP(r14) pop r13 CFI_POP(r13) pop r12 CFI_POP(r12) pop rbp CFI_POP(rbp) pop rbx CFI_POP(rbx) .Lnowork: - ret + ret_spec_stop CFI_ENDPROC() .align 64 .LK256: .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .LPSHUFFLE_BYTE_FLIP_MASK: .octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203 /* shuffle xBxA -> 00BA */ .L_SHUF_00BA: .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100 /* shuffle xDxC -> DC00 */ .L_SHUF_DC00: .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF #endif #endif diff --git a/cipher/sha256-ssse3-amd64.S b/cipher/sha256-ssse3-amd64.S index 098b0eb6..401ff6f4 100644 --- a/cipher/sha256-ssse3-amd64.S +++ b/cipher/sha256-ssse3-amd64.S @@ -1,528 +1,528 @@ /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; Copyright (c) 2012, Intel Corporation ; ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are ; met: ; ; * Redistributions of source code must retain the above copyright ; notice, this list of conditions and the following disclaimer. 
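Aside on the one functional change in each of these hunks: the trailing ret of every transform function becomes ret_spec_stop, a macro provided by the shared asm-common-amd64.h header that all of these files include. Its definition is not visible in this part of the patch; a plausible expansion, stated here as an assumption rather than quoted from the patch, is "ret; int3;", a normal return followed by a trap byte that is never reached architecturally but stops straight-line speculation past the return.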
; ; * Redistributions in binary form must reproduce the above copyright ; notice, this list of conditions and the following disclaimer in the ; documentation and/or other materials provided with the ; distribution. ; ; * Neither the name of the Intel Corporation nor the names of its ; contributors may be used to endorse or promote products derived from ; this software without specific prior written permission. ; ; ; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY ; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR ; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR ; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; ; This code is described in an Intel White-Paper: ; "Fast SHA-256 Implementations on Intel Architecture Processors" ; ; To find it, surf to http://www.intel.com/p/en_US/embedded ; and search for that title. ; The paper is expected to be released roughly at the end of April, 2012 ; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; This code schedules 1 blocks at a time, with 4 lanes per block ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ /* * Conversion to GAS assembly and integration to libgcrypt * by Jussi Kivilinna * * Note: original implementation was named as SHA256-SSE4. However, only SSSE3 * is required. 
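The note above ("only SSSE3 is required") comes down to pshufb: everything else in this file is SSE2, but the endianness flip of the message words (COPY_XMM_AND_BSWAP with .LPSHUFFLE_BYTE_FLIP_MASK) needs a byte shuffle. A C intrinsics sketch of that one step, using the same mask values as the data section below; the helper name is illustrative:

#include <emmintrin.h>   // SSE2: unaligned load
#include <tmmintrin.h>   // SSSE3: _mm_shuffle_epi8 (pshufb)

// Load 16 message bytes and byte-swap each 32-bit word, as COPY_XMM_AND_BSWAP does.
static __m128i load_bswap32x4(const void *p)
{
    const __m128i flip = _mm_set_epi8(12, 13, 14, 15,  8,  9, 10, 11,
                                       4,  5,  6,  7,  0,  1,  2,  3);
    return _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)p), flip);
}

pshufb with that mask reverses each 4-byte group, so the big-endian message words arrive in host order before they enter the schedule.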
*/ #ifdef __x86_64 #include #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \ defined(HAVE_GCC_INLINE_ASM_SSSE3) && defined(USE_SHA256) #include "asm-common-amd64.h" .intel_syntax noprefix #define MOVDQ movdqu /* assume buffers not aligned */ /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros*/ /* addm [mem], reg * Add reg to mem using reg-mem add and store */ #define addm(p1, p2) \ add p2, p1; \ mov p1, p2; /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/ /* COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask * Load xmm with mem and byte swap each dword */ #define COPY_XMM_AND_BSWAP(p1, p2, p3) \ MOVDQ p1, p2; \ pshufb p1, p3; /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/ #define X0 xmm4 #define X1 xmm5 #define X2 xmm6 #define X3 xmm7 #define XTMP0 xmm0 #define XTMP1 xmm1 #define XTMP2 xmm2 #define XTMP3 xmm3 #define XTMP4 xmm8 #define XFER xmm9 #define SHUF_00BA xmm10 /* shuffle xBxA -> 00BA */ #define SHUF_DC00 xmm11 /* shuffle xDxC -> DC00 */ #define BYTE_FLIP_MASK xmm12 #define NUM_BLKS rdx /* 3rd arg */ #define CTX rsi /* 2nd arg */ #define INP rdi /* 1st arg */ #define SRND rdi /* clobbers INP */ #define c ecx #define d r8d #define e edx #define TBL rbp #define a eax #define b ebx #define f r9d #define g r10d #define h r11d #define y0 r13d #define y1 r14d #define y2 r15d #define _INP_END_SIZE 8 #define _INP_SIZE 8 #define _XFER_SIZE 8 #define _XMM_SAVE_SIZE 0 /* STACK_SIZE plus pushes must be an odd multiple of 8 */ #define _ALIGN_SIZE 8 #define _INP_END 0 #define _INP (_INP_END + _INP_END_SIZE) #define _XFER (_INP + _INP_SIZE) #define _XMM_SAVE (_XFER + _XFER_SIZE + _ALIGN_SIZE) #define STACK_SIZE (_XMM_SAVE + _XMM_SAVE_SIZE) #define FOUR_ROUNDS_AND_SCHED_0(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \ /* compute s0 four at a time and s1 two at a time */; \ /* compute W[-16] + W[-7] 4 at a time */; \ movdqa XTMP0, X3; \ mov y0, e /* y0 = e */; \ ror y0, (25-11) /* y0 = e >> (25-11) */; \ mov y1, a /* y1 = a */; \ palignr XTMP0, X2, 4 /* XTMP0 = W[-7] */; \ ror y1, (22-13) /* y1 = a >> (22-13) */; \ xor y0, e /* y0 = e ^ (e >> (25-11)) */; \ mov y2, f /* y2 = f */; \ ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \ movdqa XTMP1, X1; \ xor y1, a /* y1 = a ^ (a >> (22-13) */; \ xor y2, g /* y2 = f^g */; \ paddd XTMP0, X0 /* XTMP0 = W[-7] + W[-16] */; \ xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \ and y2, e /* y2 = (f^g)&e */; \ ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \ /* compute s0 */; \ palignr XTMP1, X0, 4 /* XTMP1 = W[-15] */; \ xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \ ror y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */; \ xor y2, g /* y2 = CH = ((f^g)&e)^g */; \ movdqa XTMP2, XTMP1 /* XTMP2 = W[-15] */; \ ror y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \ add y2, y0 /* y2 = S1 + CH */; \ add y2, [rsp + _XFER + 0*4] /* y2 = k + w + S1 + CH */; \ movdqa XTMP3, XTMP1 /* XTMP3 = W[-15] */; \ mov y0, a /* y0 = a */; \ add h, y2 /* h = h + S1 + CH + k + w */; \ mov y2, a /* y2 = a */; \ pslld XTMP1, (32-7); \ or y0, c /* y0 = a|c */; \ add d, h /* d = d + h + S1 + CH + k + w */; \ and y2, c /* y2 = a&c */; \ psrld XTMP2, 7; \ and y0, b /* y0 = (a|c)&b */; \ add h, y1 /* h = h + S1 + CH + k + w + S0 */; \ por XTMP1, XTMP2 /* XTMP1 = W[-15] ror 7 */; \ or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */; \ lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */ #define FOUR_ROUNDS_AND_SCHED_1(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \ movdqa XTMP2, XTMP3 /* XTMP2 = 
W[-15] */; \ mov y0, e /* y0 = e */; \ mov y1, a /* y1 = a */; \ movdqa XTMP4, XTMP3 /* XTMP4 = W[-15] */; \ ror y0, (25-11) /* y0 = e >> (25-11) */; \ xor y0, e /* y0 = e ^ (e >> (25-11)) */; \ mov y2, f /* y2 = f */; \ ror y1, (22-13) /* y1 = a >> (22-13) */; \ pslld XTMP3, (32-18); \ xor y1, a /* y1 = a ^ (a >> (22-13) */; \ ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \ xor y2, g /* y2 = f^g */; \ psrld XTMP2, 18; \ ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \ xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \ and y2, e /* y2 = (f^g)&e */; \ ror y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */; \ pxor XTMP1, XTMP3; \ xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \ xor y2, g /* y2 = CH = ((f^g)&e)^g */; \ psrld XTMP4, 3 /* XTMP4 = W[-15] >> 3 */; \ add y2, y0 /* y2 = S1 + CH */; \ add y2, [rsp + _XFER + 1*4] /* y2 = k + w + S1 + CH */; \ ror y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \ pxor XTMP1, XTMP2 /* XTMP1 = W[-15] ror 7 ^ W[-15] ror 18 */; \ mov y0, a /* y0 = a */; \ add h, y2 /* h = h + S1 + CH + k + w */; \ mov y2, a /* y2 = a */; \ pxor XTMP1, XTMP4 /* XTMP1 = s0 */; \ or y0, c /* y0 = a|c */; \ add d, h /* d = d + h + S1 + CH + k + w */; \ and y2, c /* y2 = a&c */; \ /* compute low s1 */; \ pshufd XTMP2, X3, 0b11111010 /* XTMP2 = W[-2] {BBAA} */; \ and y0, b /* y0 = (a|c)&b */; \ add h, y1 /* h = h + S1 + CH + k + w + S0 */; \ paddd XTMP0, XTMP1 /* XTMP0 = W[-16] + W[-7] + s0 */; \ or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */; \ lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */ #define FOUR_ROUNDS_AND_SCHED_2(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \ movdqa XTMP3, XTMP2 /* XTMP3 = W[-2] {BBAA} */; \ mov y0, e /* y0 = e */; \ mov y1, a /* y1 = a */; \ ror y0, (25-11) /* y0 = e >> (25-11) */; \ movdqa XTMP4, XTMP2 /* XTMP4 = W[-2] {BBAA} */; \ xor y0, e /* y0 = e ^ (e >> (25-11)) */; \ ror y1, (22-13) /* y1 = a >> (22-13) */; \ mov y2, f /* y2 = f */; \ xor y1, a /* y1 = a ^ (a >> (22-13) */; \ ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \ psrlq XTMP2, 17 /* XTMP2 = W[-2] ror 17 {xBxA} */; \ xor y2, g /* y2 = f^g */; \ psrlq XTMP3, 19 /* XTMP3 = W[-2] ror 19 {xBxA} */; \ xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \ and y2, e /* y2 = (f^g)&e */; \ psrld XTMP4, 10 /* XTMP4 = W[-2] >> 10 {BBAA} */; \ ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \ xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \ xor y2, g /* y2 = CH = ((f^g)&e)^g */; \ ror y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */; \ pxor XTMP2, XTMP3; \ add y2, y0 /* y2 = S1 + CH */; \ ror y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \ add y2, [rsp + _XFER + 2*4] /* y2 = k + w + S1 + CH */; \ pxor XTMP4, XTMP2 /* XTMP4 = s1 {xBxA} */; \ mov y0, a /* y0 = a */; \ add h, y2 /* h = h + S1 + CH + k + w */; \ mov y2, a /* y2 = a */; \ pshufb XTMP4, SHUF_00BA /* XTMP4 = s1 {00BA} */; \ or y0, c /* y0 = a|c */; \ add d, h /* d = d + h + S1 + CH + k + w */; \ and y2, c /* y2 = a&c */; \ paddd XTMP0, XTMP4 /* XTMP0 = {..., ..., W[1], W[0]} */; \ and y0, b /* y0 = (a|c)&b */; \ add h, y1 /* h = h + S1 + CH + k + w + S0 */; \ /* compute high s1 */; \ pshufd XTMP2, XTMP0, 0b01010000 /* XTMP2 = W[-2] {DDCC} */; \ or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */; \ lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */ #define FOUR_ROUNDS_AND_SCHED_3(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \ movdqa XTMP3, XTMP2 /* XTMP3 = W[-2] {DDCC} */; \ mov y0, e /* y0 = e */; \ ror y0, (25-11) /* y0 = e >> (25-11) */; \ mov y1, a /* y1 = a */; \ 
movdqa X0, XTMP2 /* X0 = W[-2] {DDCC} */; \ ror y1, (22-13) /* y1 = a >> (22-13) */; \ xor y0, e /* y0 = e ^ (e >> (25-11)) */; \ mov y2, f /* y2 = f */; \ ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \ psrlq XTMP2, 17 /* XTMP2 = W[-2] ror 17 {xDxC} */; \ xor y1, a /* y1 = a ^ (a >> (22-13) */; \ xor y2, g /* y2 = f^g */; \ psrlq XTMP3, 19 /* XTMP3 = W[-2] ror 19 {xDxC} */; \ xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \ and y2, e /* y2 = (f^g)&e */; \ ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \ psrld X0, 10 /* X0 = W[-2] >> 10 {DDCC} */; \ xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \ ror y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */; \ xor y2, g /* y2 = CH = ((f^g)&e)^g */; \ pxor XTMP2, XTMP3; \ ror y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \ add y2, y0 /* y2 = S1 + CH */; \ add y2, [rsp + _XFER + 3*4] /* y2 = k + w + S1 + CH */; \ pxor X0, XTMP2 /* X0 = s1 {xDxC} */; \ mov y0, a /* y0 = a */; \ add h, y2 /* h = h + S1 + CH + k + w */; \ mov y2, a /* y2 = a */; \ pshufb X0, SHUF_DC00 /* X0 = s1 {DC00} */; \ or y0, c /* y0 = a|c */; \ add d, h /* d = d + h + S1 + CH + k + w */; \ and y2, c /* y2 = a&c */; \ paddd X0, XTMP0 /* X0 = {W[3], W[2], W[1], W[0]} */; \ and y0, b /* y0 = (a|c)&b */; \ add h, y1 /* h = h + S1 + CH + k + w + S0 */; \ or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */; \ lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */ #define FOUR_ROUNDS_AND_SCHED(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \ FOUR_ROUNDS_AND_SCHED_0(X0, X1, X2, X3, a, b, c, d, e, f, g, h); \ FOUR_ROUNDS_AND_SCHED_1(X0, X1, X2, X3, h, a, b, c, d, e, f, g); \ FOUR_ROUNDS_AND_SCHED_2(X0, X1, X2, X3, g, h, a, b, c, d, e, f); \ FOUR_ROUNDS_AND_SCHED_3(X0, X1, X2, X3, f, g, h, a, b, c, d, e); /* input is [rsp + _XFER + %1 * 4] */ #define DO_ROUND(i1, a, b, c, d, e, f, g, h) \ mov y0, e /* y0 = e */; \ ror y0, (25-11) /* y0 = e >> (25-11) */; \ mov y1, a /* y1 = a */; \ xor y0, e /* y0 = e ^ (e >> (25-11)) */; \ ror y1, (22-13) /* y1 = a >> (22-13) */; \ mov y2, f /* y2 = f */; \ xor y1, a /* y1 = a ^ (a >> (22-13) */; \ ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \ xor y2, g /* y2 = f^g */; \ xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \ ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \ and y2, e /* y2 = (f^g)&e */; \ xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \ ror y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */; \ xor y2, g /* y2 = CH = ((f^g)&e)^g */; \ add y2, y0 /* y2 = S1 + CH */; \ ror y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \ add y2, [rsp + _XFER + i1 * 4] /* y2 = k + w + S1 + CH */; \ mov y0, a /* y0 = a */; \ add h, y2 /* h = h + S1 + CH + k + w */; \ mov y2, a /* y2 = a */; \ or y0, c /* y0 = a|c */; \ add d, h /* d = d + h + S1 + CH + k + w */; \ and y2, c /* y2 = a&c */; \ and y0, b /* y0 = (a|c)&b */; \ add h, y1 /* h = h + S1 + CH + k + w + S0 */; \ or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */; \ lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */ /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; void sha256_sse4(void *input_data, UINT32 digest[8], UINT64 num_blks) ;; arg 1 : pointer to input data ;; arg 2 : pointer to digest ;; arg 3 : Num blocks */ .text .globl _gcry_sha256_transform_amd64_ssse3 ELF(.type _gcry_sha256_transform_amd64_ssse3,@function;) .align 16 _gcry_sha256_transform_amd64_ssse3: CFI_STARTPROC() push rbx CFI_PUSH(rbx) push rbp CFI_PUSH(rbp) push r13 
CFI_PUSH(r13) push r14 CFI_PUSH(r14) push r15 CFI_PUSH(r15) sub rsp, STACK_SIZE CFI_ADJUST_CFA_OFFSET(STACK_SIZE); shl NUM_BLKS, 6 /* convert to bytes */ jz .Ldone_hash add NUM_BLKS, INP /* pointer to end of data */ mov [rsp + _INP_END], NUM_BLKS /* load initial digest */ mov a,[4*0 + CTX] mov b,[4*1 + CTX] mov c,[4*2 + CTX] mov d,[4*3 + CTX] mov e,[4*4 + CTX] mov f,[4*5 + CTX] mov g,[4*6 + CTX] mov h,[4*7 + CTX] movdqa BYTE_FLIP_MASK, [.LPSHUFFLE_BYTE_FLIP_MASK ADD_RIP] movdqa SHUF_00BA, [.L_SHUF_00BA ADD_RIP] movdqa SHUF_DC00, [.L_SHUF_DC00 ADD_RIP] .Loop0: lea TBL, [.LK256 ADD_RIP] /* byte swap first 16 dwords */ COPY_XMM_AND_BSWAP(X0, [INP + 0*16], BYTE_FLIP_MASK) COPY_XMM_AND_BSWAP(X1, [INP + 1*16], BYTE_FLIP_MASK) COPY_XMM_AND_BSWAP(X2, [INP + 2*16], BYTE_FLIP_MASK) COPY_XMM_AND_BSWAP(X3, [INP + 3*16], BYTE_FLIP_MASK) mov [rsp + _INP], INP /* schedule 48 input dwords, by doing 3 rounds of 16 each */ mov SRND, 3 .align 16 .Loop1: movdqa XFER, [TBL + 0*16] paddd XFER, X0 movdqa [rsp + _XFER], XFER FOUR_ROUNDS_AND_SCHED(X0, X1, X2, X3, a, b, c, d, e, f, g, h) movdqa XFER, [TBL + 1*16] paddd XFER, X1 movdqa [rsp + _XFER], XFER FOUR_ROUNDS_AND_SCHED(X1, X2, X3, X0, e, f, g, h, a, b, c, d) movdqa XFER, [TBL + 2*16] paddd XFER, X2 movdqa [rsp + _XFER], XFER FOUR_ROUNDS_AND_SCHED(X2, X3, X0, X1, a, b, c, d, e, f, g, h) movdqa XFER, [TBL + 3*16] paddd XFER, X3 movdqa [rsp + _XFER], XFER add TBL, 4*16 FOUR_ROUNDS_AND_SCHED(X3, X0, X1, X2, e, f, g, h, a, b, c, d) sub SRND, 1 jne .Loop1 mov SRND, 2 .Loop2: paddd X0, [TBL + 0*16] movdqa [rsp + _XFER], X0 DO_ROUND(0, a, b, c, d, e, f, g, h) DO_ROUND(1, h, a, b, c, d, e, f, g) DO_ROUND(2, g, h, a, b, c, d, e, f) DO_ROUND(3, f, g, h, a, b, c, d, e) paddd X1, [TBL + 1*16] movdqa [rsp + _XFER], X1 add TBL, 2*16 DO_ROUND(0, e, f, g, h, a, b, c, d) DO_ROUND(1, d, e, f, g, h, a, b, c) DO_ROUND(2, c, d, e, f, g, h, a, b) DO_ROUND(3, b, c, d, e, f, g, h, a) movdqa X0, X2 movdqa X1, X3 sub SRND, 1 jne .Loop2 addm([4*0 + CTX],a) addm([4*1 + CTX],b) addm([4*2 + CTX],c) addm([4*3 + CTX],d) addm([4*4 + CTX],e) addm([4*5 + CTX],f) addm([4*6 + CTX],g) addm([4*7 + CTX],h) mov INP, [rsp + _INP] add INP, 64 cmp INP, [rsp + _INP_END] jne .Loop0 pxor xmm0, xmm0 pxor xmm1, xmm1 pxor xmm2, xmm2 pxor xmm3, xmm3 pxor xmm4, xmm4 pxor xmm5, xmm5 pxor xmm6, xmm6 pxor xmm7, xmm7 pxor xmm8, xmm8 pxor xmm9, xmm9 pxor xmm10, xmm10 pxor xmm11, xmm11 pxor xmm12, xmm12 .Ldone_hash: pxor XFER, XFER movdqa [rsp + _XFER], XFER xor eax, eax add rsp, STACK_SIZE CFI_ADJUST_CFA_OFFSET(-STACK_SIZE); pop r15 CFI_POP(r15) pop r14 CFI_POP(r14) pop r13 CFI_POP(r13) pop rbp CFI_POP(rbp) pop rbx CFI_POP(rbx) - ret + ret_spec_stop CFI_ENDPROC() .align 16 .LK256: .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .LPSHUFFLE_BYTE_FLIP_MASK: .octa 
0x0c0d0e0f08090a0b0405060700010203 /* shuffle xBxA -> 00BA */ .L_SHUF_00BA: .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100 /* shuffle xDxC -> DC00 */ .L_SHUF_DC00: .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF #endif #endif diff --git a/cipher/sha512-avx-amd64.S b/cipher/sha512-avx-amd64.S index 75f7b070..bfc4435d 100644 --- a/cipher/sha512-avx-amd64.S +++ b/cipher/sha512-avx-amd64.S @@ -1,461 +1,461 @@ /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; Copyright (c) 2012, Intel Corporation ; ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are ; met: ; ; * Redistributions of source code must retain the above copyright ; notice, this list of conditions and the following disclaimer. ; ; * Redistributions in binary form must reproduce the above copyright ; notice, this list of conditions and the following disclaimer in the ; documentation and/or other materials provided with the ; distribution. ; ; * Neither the name of the Intel Corporation nor the names of its ; contributors may be used to endorse or promote products derived from ; this software without specific prior written permission. ; ; ; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY ; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR ; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR ; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
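Reading aid for the SHA512_Round macro defined below: the RORQ/xor chains annotated with 41, 18, 14 (and 39, 34, 28) build the big-sigma functions out of three nested rotates so that a single temporary suffices. RORQ itself is shld p1, p1, 64-n, a rotate-left by 64-n and therefore a rotate-right by n; the macro comment notes shld was faster than ror on Sandy Bridge. A small C check of the identity; rotr64 and the function names are illustrative:

#include <stdint.h>

static inline uint64_t rotr64(uint64_t x, unsigned n) { return (x >> n) | (x << (64 - n)); }

// Sigma1(e) = rotr14 ^ rotr18 ^ rotr41, computed the way the asm chains it:
static uint64_t Sigma1_chained(uint64_t e)
{
    uint64_t t = rotr64(e, 23);   // contributes the eventual rotr41 term (23 + 4 + 14 = 41)
    t = rotr64(t ^ e, 4);         // adds the rotr18 term (4 + 14 = 18)
    return rotr64(t ^ e, 14);     // result == rotr64(e,14) ^ rotr64(e,18) ^ rotr64(e,41)
}

// Sigma0(a) = rotr28 ^ rotr34 ^ rotr39 follows the same pattern with 5, 6, 28.
static uint64_t Sigma0_chained(uint64_t a)
{
    uint64_t t = rotr64(a, 5);
    t = rotr64(t ^ a, 6);
    return rotr64(t ^ a, 28);
}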
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ /* * Conversion to GAS assembly and integration to libgcrypt * by Jussi Kivilinna */ #ifdef __x86_64 #include #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \ defined(HAVE_GCC_INLINE_ASM_AVX) && defined(USE_SHA512) #include "asm-common-amd64.h" .intel_syntax noprefix .text /* Virtual Registers */ #define msg rdi /* ARG1 */ #define digest rsi /* ARG2 */ #define msglen rdx /* ARG3 */ #define T1 rcx #define T2 r8 #define a_64 r9 #define b_64 r10 #define c_64 r11 #define d_64 r12 #define e_64 r13 #define f_64 r14 #define g_64 r15 #define h_64 rbx #define tmp0 rax /* ; Local variables (stack frame) ; Note: frame_size must be an odd multiple of 8 bytes to XMM align RSP */ #define frame_W 0 /* Message Schedule */ #define frame_W_size (80 * 8) #define frame_WK ((frame_W) + (frame_W_size)) /* W[t] + K[t] | W[t+1] + K[t+1] */ #define frame_WK_size (2 * 8) #define frame_GPRSAVE ((frame_WK) + (frame_WK_size)) #define frame_GPRSAVE_size (5 * 8) #define frame_size ((frame_GPRSAVE) + (frame_GPRSAVE_size)) /* Useful QWORD "arrays" for simpler memory references */ #define MSG(i) msg + 8*(i) /* Input message (arg1) */ #define DIGEST(i) digest + 8*(i) /* Output Digest (arg2) */ #define K_t(i) .LK512 + 8*(i) ADD_RIP /* SHA Constants (static mem) */ #define W_t(i) rsp + frame_W + 8*(i) /* Message Schedule (stack frame) */ #define WK_2(i) rsp + frame_WK + 8*((i) % 2) /* W[t]+K[t] (stack frame) */ /* MSG, DIGEST, K_t, W_t are arrays */ /* WK_2(t) points to 1 of 2 qwords at frame.WK depdending on t being odd/even */ #define RORQ(p1, p2) \ /* shld is faster than ror on Intel Sandybridge */ \ shld p1, p1, (64 - p2) #define SHA512_Round(t, a, b, c, d, e, f, g, h) \ /* Compute Round %%t */; \ mov T1, f /* T1 = f */; \ mov tmp0, e /* tmp = e */; \ xor T1, g /* T1 = f ^ g */; \ RORQ( tmp0, 23) /* 41 ; tmp = e ror 23 */; \ and T1, e /* T1 = (f ^ g) & e */; \ xor tmp0, e /* tmp = (e ror 23) ^ e */; \ xor T1, g /* T1 = ((f ^ g) & e) ^ g = CH(e,f,g) */; \ add T1, [WK_2(t)] /* W[t] + K[t] from message scheduler */; \ RORQ( tmp0, 4) /* 18 ; tmp = ((e ror 23) ^ e) ror 4 */; \ xor tmp0, e /* tmp = (((e ror 23) ^ e) ror 4) ^ e */; \ mov T2, a /* T2 = a */; \ add T1, h /* T1 = CH(e,f,g) + W[t] + K[t] + h */; \ RORQ( tmp0, 14) /* 14 ; tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e) */; \ add T1, tmp0 /* T1 = CH(e,f,g) + W[t] + K[t] + S1(e) */; \ mov tmp0, a /* tmp = a */; \ xor T2, c /* T2 = a ^ c */; \ and tmp0, c /* tmp = a & c */; \ and T2, b /* T2 = (a ^ c) & b */; \ xor T2, tmp0 /* T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c) */; \ mov tmp0, a /* tmp = a */; \ RORQ( tmp0, 5) /* 39 ; tmp = a ror 5 */; \ xor tmp0, a /* tmp = (a ror 5) ^ a */; \ add d, T1 /* e(next_state) = d + T1 */; \ RORQ( tmp0, 6) /* 34 ; tmp = ((a ror 5) ^ a) ror 6 */; \ xor tmp0, a /* tmp = (((a ror 5) ^ a) ror 6) ^ a */; \ lea h, [T1 + T2] /* a(next_state) = T1 + Maj(a,b,c) */; \ RORQ( tmp0, 28) /* 28 ; tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a) */; \ add h, tmp0 /* a(next_state) = T1 + Maj(a,b,c) S0(a) */ #define SHA512_2Sched_2Round_avx_PART1(t, a, b, c, d, e, f, g, h) \ /* \ ; Compute rounds %%t-2 and %%t-1 \ ; Compute message schedule QWORDS %%t and %%t+1 \ ; \ ; Two rounds are computed based on the values for K[t-2]+W[t-2] and \ ; K[t-1]+W[t-1] which were previously stored at WK_2 by the message \ ; scheduler. 
\ ; The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)]. \ ; They are then added to their respective SHA512 constants at \ ; [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)] \ ; For brievity, the comments following vectored instructions only refer to \ ; the first of a pair of QWORDS. \ ; Eg. XMM4=W[t-2] really means XMM4={W[t-2]|W[t-1]} \ ; The computation of the message schedule and the rounds are tightly \ ; stitched to take advantage of instruction-level parallelism. \ ; For clarity, integer instructions (for the rounds calculation) are indented \ ; by one tab. Vectored instructions (for the message scheduler) are indented \ ; by two tabs. \ */ \ \ vmovdqa xmm4, [W_t(t-2)] /* XMM4 = W[t-2] */; \ vmovdqu xmm5, [W_t(t-15)] /* XMM5 = W[t-15] */; \ mov T1, f; \ vpsrlq xmm0, xmm4, 61 /* XMM0 = W[t-2]>>61 */; \ mov tmp0, e; \ vpsrlq xmm6, xmm5, 1 /* XMM6 = W[t-15]>>1 */; \ xor T1, g; \ RORQ( tmp0, 23) /* 41 */; \ vpsrlq xmm1, xmm4, 19 /* XMM1 = W[t-2]>>19 */; \ and T1, e; \ xor tmp0, e; \ vpxor xmm0, xmm0, xmm1 /* XMM0 = W[t-2]>>61 ^ W[t-2]>>19 */; \ xor T1, g; \ add T1, [WK_2(t)]; \ vpsrlq xmm7, xmm5, 8 /* XMM7 = W[t-15]>>8 */; \ RORQ( tmp0, 4) /* 18 */; \ vpsrlq xmm2, xmm4, 6 /* XMM2 = W[t-2]>>6 */; \ xor tmp0, e; \ mov T2, a; \ add T1, h; \ vpxor xmm6, xmm6, xmm7 /* XMM6 = W[t-15]>>1 ^ W[t-15]>>8 */; \ RORQ( tmp0, 14) /* 14 */; \ add T1, tmp0; \ vpsrlq xmm8, xmm5, 7 /* XMM8 = W[t-15]>>7 */; \ mov tmp0, a; \ xor T2, c; \ vpsllq xmm3, xmm4, (64-61) /* XMM3 = W[t-2]<<3 */; \ and tmp0, c; \ and T2, b; \ vpxor xmm2, xmm2, xmm3 /* XMM2 = W[t-2]>>6 ^ W[t-2]<<3 */; \ xor T2, tmp0; \ mov tmp0, a; \ vpsllq xmm9, xmm5, (64-1) /* XMM9 = W[t-15]<<63 */; \ RORQ( tmp0, 5) /* 39 */; \ vpxor xmm8, xmm8, xmm9 /* XMM8 = W[t-15]>>7 ^ W[t-15]<<63 */; \ xor tmp0, a; \ add d, T1; \ RORQ( tmp0, 6) /* 34 */; \ xor tmp0, a; \ vpxor xmm6, xmm6, xmm8 /* XMM6 = W[t-15]>>1 ^ W[t-15]>>8 ^ W[t-15]>>7 ^ W[t-15]<<63 */; \ lea h, [T1 + T2]; \ RORQ( tmp0, 28) /* 28 */; \ vpsllq xmm4, xmm4, (64-19) /* XMM4 = W[t-2]<<25 */; \ add h, tmp0 #define SHA512_2Sched_2Round_avx_PART2(t, a, b, c, d, e, f, g, h) \ vpxor xmm0, xmm0, xmm4 /* XMM0 = W[t-2]>>61 ^ W[t-2]>>19 ^ W[t-2]<<25 */; \ mov T1, f; \ vpxor xmm0, xmm0, xmm2 /* XMM0 = s1(W[t-2]) */; \ mov tmp0, e; \ xor T1, g; \ vpaddq xmm0, xmm0, [W_t(t-16)] /* XMM0 = s1(W[t-2]) + W[t-16] */; \ vmovdqu xmm1, [W_t(t- 7)] /* XMM1 = W[t-7] */; \ RORQ( tmp0, 23) /* 41 */; \ and T1, e; \ xor tmp0, e; \ xor T1, g; \ vpsllq xmm5, xmm5, (64-8) /* XMM5 = W[t-15]<<56 */; \ add T1, [WK_2(t+1)]; \ vpxor xmm6, xmm6, xmm5 /* XMM6 = s0(W[t-15]) */; \ RORQ( tmp0, 4) /* 18 */; \ vpaddq xmm0, xmm0, xmm6 /* XMM0 = s1(W[t-2]) + W[t-16] + s0(W[t-15]) */; \ xor tmp0, e; \ vpaddq xmm0, xmm0, xmm1 /* XMM0 = W[t] = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16] */; \ mov T2, a; \ add T1, h; \ RORQ( tmp0, 14) /* 14 */; \ add T1, tmp0; \ vmovdqa [W_t(t)], xmm0 /* Store W[t] */; \ vpaddq xmm0, xmm0, [K_t(t)] /* Compute W[t]+K[t] */; \ vmovdqa [WK_2(t)], xmm0 /* Store W[t]+K[t] for next rounds */; \ mov tmp0, a; \ xor T2, c; \ and tmp0, c; \ and T2, b; \ xor T2, tmp0; \ mov tmp0, a; \ RORQ( tmp0, 5) /* 39 */; \ xor tmp0, a; \ add d, T1; \ RORQ( tmp0, 6) /* 34 */; \ xor tmp0, a; \ lea h, [T1 + T2]; \ RORQ( tmp0, 28) /* 28 */; \ add h, tmp0 #define SHA512_2Sched_2Round_avx(t, a, b, c, d, e, f, g, h) \ SHA512_2Sched_2Round_avx_PART1(t, a, b, c, d, e, f, g, h); \ SHA512_2Sched_2Round_avx_PART2(t, h, a, b, c, d, e, f, g) /* 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; void sha512_avx(const void* M, void* D, uint64_t L); ; Purpose: Updates the SHA512 digest stored at D with the message stored in M. ; The size of the message pointed to by M must be an integer multiple of SHA512 ; message blocks. ; L is the message length in SHA512 blocks */ .globl _gcry_sha512_transform_amd64_avx ELF(.type _gcry_sha512_transform_amd64_avx,@function;) .align 16 _gcry_sha512_transform_amd64_avx: CFI_STARTPROC() xor eax, eax cmp msglen, 0 je .Lnowork vzeroupper /* Allocate Stack Space */ sub rsp, frame_size CFI_ADJUST_CFA_OFFSET(frame_size); /* Save GPRs */ mov [rsp + frame_GPRSAVE + 8 * 0], rbx mov [rsp + frame_GPRSAVE + 8 * 1], r12 mov [rsp + frame_GPRSAVE + 8 * 2], r13 mov [rsp + frame_GPRSAVE + 8 * 3], r14 mov [rsp + frame_GPRSAVE + 8 * 4], r15 CFI_REL_OFFSET(rbx, frame_GPRSAVE + 8 * 0); CFI_REL_OFFSET(r12, frame_GPRSAVE + 8 * 1); CFI_REL_OFFSET(r13, frame_GPRSAVE + 8 * 2); CFI_REL_OFFSET(r14, frame_GPRSAVE + 8 * 3); CFI_REL_OFFSET(r15, frame_GPRSAVE + 8 * 4); .Lupdateblock: /* Load state variables */ mov a_64, [DIGEST(0)] mov b_64, [DIGEST(1)] mov c_64, [DIGEST(2)] mov d_64, [DIGEST(3)] mov e_64, [DIGEST(4)] mov f_64, [DIGEST(5)] mov g_64, [DIGEST(6)] mov h_64, [DIGEST(7)] /* BSWAP 2 QWORDS */ vmovdqa xmm1, [.LXMM_QWORD_BSWAP ADD_RIP] vmovdqu xmm0, [MSG(0)] vpshufb xmm0, xmm0, xmm1 /* BSWAP */ vmovdqa [W_t(0)], xmm0 /* Store Scheduled Pair */ vpaddq xmm0, xmm0, [K_t(0)] /* Compute W[t]+K[t] */ vmovdqa [WK_2(0)], xmm0 /* Store into WK for rounds */ #define T_2_14(t, a, b, c, d, e, f, g, h) \ /* BSWAP 2 QWORDS, Compute 2 Rounds */; \ vmovdqu xmm0, [MSG(t)]; \ vpshufb xmm0, xmm0, xmm1 /* BSWAP */; \ SHA512_Round(((t) - 2), a##_64, b##_64, c##_64, d##_64, \ e##_64, f##_64, g##_64, h##_64); \ vmovdqa [W_t(t)], xmm0 /* Store Scheduled Pair */; \ vpaddq xmm0, xmm0, [K_t(t)] /* Compute W[t]+K[t] */; \ SHA512_Round(((t) - 1), h##_64, a##_64, b##_64, c##_64, \ d##_64, e##_64, f##_64, g##_64); \ vmovdqa [WK_2(t)], xmm0 /* W[t]+K[t] into WK */ #define T_16_78(t, a, b, c, d, e, f, g, h) \ SHA512_2Sched_2Round_avx((t), a##_64, b##_64, c##_64, d##_64, \ e##_64, f##_64, g##_64, h##_64) #define T_80(t, a, b, c, d, e, f, g, h) \ /* Compute 2 Rounds */; \ SHA512_Round((t - 2), a##_64, b##_64, c##_64, d##_64, \ e##_64, f##_64, g##_64, h##_64); \ SHA512_Round((t - 1), h##_64, a##_64, b##_64, c##_64, \ d##_64, e##_64, f##_64, g##_64) T_2_14(2, a, b, c, d, e, f, g, h) T_2_14(4, g, h, a, b, c, d, e, f) T_2_14(6, e, f, g, h, a, b, c, d) T_2_14(8, c, d, e, f, g, h, a, b) T_2_14(10, a, b, c, d, e, f, g, h) T_2_14(12, g, h, a, b, c, d, e, f) T_2_14(14, e, f, g, h, a, b, c, d) T_16_78(16, c, d, e, f, g, h, a, b) T_16_78(18, a, b, c, d, e, f, g, h) T_16_78(20, g, h, a, b, c, d, e, f) T_16_78(22, e, f, g, h, a, b, c, d) T_16_78(24, c, d, e, f, g, h, a, b) T_16_78(26, a, b, c, d, e, f, g, h) T_16_78(28, g, h, a, b, c, d, e, f) T_16_78(30, e, f, g, h, a, b, c, d) T_16_78(32, c, d, e, f, g, h, a, b) T_16_78(34, a, b, c, d, e, f, g, h) T_16_78(36, g, h, a, b, c, d, e, f) T_16_78(38, e, f, g, h, a, b, c, d) T_16_78(40, c, d, e, f, g, h, a, b) T_16_78(42, a, b, c, d, e, f, g, h) T_16_78(44, g, h, a, b, c, d, e, f) T_16_78(46, e, f, g, h, a, b, c, d) T_16_78(48, c, d, e, f, g, h, a, b) T_16_78(50, a, b, c, d, e, f, g, h) T_16_78(52, g, h, a, b, c, d, e, f) T_16_78(54, e, f, g, h, a, b, c, d) T_16_78(56, c, d, e, f, g, h, a, b) T_16_78(58, a, b, c, d, e, f, g, h) T_16_78(60, g, h, a, b, c, d, e, f) T_16_78(62, e, f, g, h, a, b, c, d) 
T_16_78(64, c, d, e, f, g, h, a, b) T_16_78(66, a, b, c, d, e, f, g, h) T_16_78(68, g, h, a, b, c, d, e, f) T_16_78(70, e, f, g, h, a, b, c, d) T_16_78(72, c, d, e, f, g, h, a, b) T_16_78(74, a, b, c, d, e, f, g, h) T_16_78(76, g, h, a, b, c, d, e, f) T_16_78(78, e, f, g, h, a, b, c, d) T_80(80, c, d, e, f, g, h, a, b) /* Update digest */ add [DIGEST(0)], a_64 add [DIGEST(1)], b_64 add [DIGEST(2)], c_64 add [DIGEST(3)], d_64 add [DIGEST(4)], e_64 add [DIGEST(5)], f_64 add [DIGEST(6)], g_64 add [DIGEST(7)], h_64 /* Advance to next message block */ add msg, 16*8 dec msglen jnz .Lupdateblock /* Restore GPRs */ mov rbx, [rsp + frame_GPRSAVE + 8 * 0] mov r12, [rsp + frame_GPRSAVE + 8 * 1] mov r13, [rsp + frame_GPRSAVE + 8 * 2] mov r14, [rsp + frame_GPRSAVE + 8 * 3] mov r15, [rsp + frame_GPRSAVE + 8 * 4] CFI_RESTORE(rbx) CFI_RESTORE(r12) CFI_RESTORE(r13) CFI_RESTORE(r14) CFI_RESTORE(r15) vzeroall /* Burn stack */ mov eax, 0 .Lerase_stack: vmovdqu [rsp + rax], ymm0 add eax, 32 cmp eax, frame_W_size jne .Lerase_stack vmovdqu [rsp + frame_WK], xmm0 xor eax, eax /* Restore Stack Pointer */ add rsp, frame_size CFI_ADJUST_CFA_OFFSET(-frame_size); .Lnowork: - ret + ret_spec_stop CFI_ENDPROC() /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; Binary Data */ .align 16 /* Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb. */ .LXMM_QWORD_BSWAP: .octa 0x08090a0b0c0d0e0f0001020304050607 /* K[t] used in SHA512 hashing */ .LK512: .quad 0x428a2f98d728ae22,0x7137449123ef65cd .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc .quad 0x3956c25bf348b538,0x59f111f1b605d019 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 .quad 0xd807aa98a3030242,0x12835b0145706fbe .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 .quad 0x9bdc06a725c71235,0xc19bf174cf692694 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 .quad 0x983e5152ee66dfab,0xa831c66d2db43210 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 .quad 0x06ca6351e003826f,0x142929670a0e6e70 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 .quad 0x81c2c92e47edaee6,0x92722c851482353b .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 .quad 0xd192e819d6ef5218,0xd69906245565a910 .quad 0xf40e35855771202a,0x106aa07032bbd1b8 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec .quad 0x90befffa23631e28,0xa4506cebde82bde9 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b .quad 0xca273eceea26619c,0xd186b8c721c0c207 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 .quad 0x113f9804bef90dae,0x1b710b35131c471b .quad 0x28db77f523047d84,0x32caab7b40c72493 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 #endif #endif diff --git a/cipher/sha512-avx2-bmi2-amd64.S b/cipher/sha512-avx2-bmi2-amd64.S index 7f119e6c..a431e196 100644 --- a/cipher/sha512-avx2-bmi2-amd64.S +++ b/cipher/sha512-avx2-bmi2-amd64.S @@ -1,502 +1,502 @@ /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; 
Copyright (c) 2012, Intel Corporation ; ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are ; met: ; ; * Redistributions of source code must retain the above copyright ; notice, this list of conditions and the following disclaimer. ; ; * Redistributions in binary form must reproduce the above copyright ; notice, this list of conditions and the following disclaimer in the ; documentation and/or other materials provided with the ; distribution. ; ; * Neither the name of the Intel Corporation nor the names of its ; contributors may be used to endorse or promote products derived from ; this software without specific prior written permission. ; ; ; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY ; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR ; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR ; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; This code schedules 1 blocks at a time, with 4 lanes per block ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ /* * Conversion to GAS assembly and integration to libgcrypt * by Jussi Kivilinna */ #ifdef __x86_64 #include #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \ defined(HAVE_GCC_INLINE_ASM_AVX2) && defined(HAVE_GCC_INLINE_ASM_BMI2) && \ defined(USE_SHA512) #include "asm-common-amd64.h" .intel_syntax noprefix .text /* Virtual Registers */ #define Y_0 ymm4 #define Y_1 ymm5 #define Y_2 ymm6 #define Y_3 ymm7 #define YTMP0 ymm0 #define YTMP1 ymm1 #define YTMP2 ymm2 #define YTMP3 ymm3 #define YTMP4 ymm8 #define XFER YTMP0 #define BYTE_FLIP_MASK ymm9 #define MASK_YMM_LO ymm10 #define MASK_YMM_LOx xmm10 #define INP rdi /* 1st arg */ #define CTX rsi /* 2nd arg */ #define NUM_BLKS rdx /* 3rd arg */ #define c rcx #define d r8 #define e rdx #define y3 rdi #define TBL rbp #define a rax #define b rbx #define f r9 #define g r10 #define h r11 #define T1 r12 #define y0 r13 #define y1 r14 #define y2 r15 #define y4 r12 /* Local variables (stack frame) */ #define frame_XFER 0 #define frame_XFER_size (4*4*8) #define frame_SRND (frame_XFER + frame_XFER_size) #define frame_SRND_size (1*8) #define frame_INP (frame_SRND + frame_SRND_size) #define frame_INP_size (1*8) #define frame_NBLKS (frame_INP + frame_INP_size) #define frame_NBLKS_size (1*8) #define frame_RSPSAVE (frame_NBLKS + frame_NBLKS_size) #define frame_RSPSAVE_size (1*8) #define frame_GPRSAVE (frame_RSPSAVE + frame_RSPSAVE_size) #define frame_GPRSAVE_size (6*8) #define frame_size (frame_GPRSAVE + frame_GPRSAVE_size) #define VMOVDQ vmovdqu /*; assume buffers not aligned */ /* addm [mem], reg */ /* Add reg to mem using reg-mem add and store */ #define addm(p1, p2) \ add p2, p1; \ mov p1, p2; /* COPY_YMM_AND_BSWAP ymm, [mem], byte_flip_mask */ /* Load ymm 
with mem and byte swap each dword */ #define COPY_YMM_AND_BSWAP(p1, p2, p3) \ VMOVDQ p1, p2; \ vpshufb p1, p1, p3 /* %macro MY_VPALIGNR YDST, YSRC1, YSRC2, RVAL */ /* YDST = {YSRC1, YSRC2} >> RVAL*8 */ #define MY_VPALIGNR(YDST, YSRC1, YSRC2, RVAL) \ vperm2i128 YDST, YSRC1, YSRC2, 0x3 /* YDST = {YS1_LO, YS2_HI} */; \ vpalignr YDST, YDST, YSRC2, RVAL /* YDST = {YDS1, YS2} >> RVAL*8 */ #define ONE_ROUND_PART1(XFERIN, a, b, c, d, e, f, g, h) \ /* h += Sum1 (e) + Ch (e, f, g) + (k[t] + w[0]); \ * d += h; \ * h += Sum0 (a) + Maj (a, b, c); \ * \ * Ch(x, y, z) => ((x & y) + (~x & z)) \ * Maj(x, y, z) => ((x & y) + (z & (x ^ y))) \ */ \ \ mov y3, e; \ add h, [XFERIN]; \ and y3, f; \ rorx y0, e, 41; \ rorx y1, e, 18; \ lea h, [h + y3]; \ andn y3, e, g; \ rorx T1, a, 34; \ xor y0, y1; \ lea h, [h + y3] #define ONE_ROUND_PART2(a, b, c, d, e, f, g, h) \ rorx y2, a, 39; \ rorx y1, e, 14; \ mov y3, a; \ xor T1, y2; \ xor y0, y1; \ xor y3, b; \ lea h, [h + y0]; \ mov y0, a; \ rorx y2, a, 28; \ add d, h; \ and y3, c; \ xor T1, y2; \ lea h, [h + y3]; \ lea h, [h + T1]; \ and y0, b; \ lea h, [h + y0] #define ONE_ROUND(XFERIN, a, b, c, d, e, f, g, h) \ ONE_ROUND_PART1(XFERIN, a, b, c, d, e, f, g, h); \ ONE_ROUND_PART2(a, b, c, d, e, f, g, h) #define FOUR_ROUNDS_AND_SCHED(X, Y_0, Y_1, Y_2, Y_3, a, b, c, d, e, f, g, h) \ /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */; \ /* Extract w[t-7] */; \ MY_VPALIGNR( YTMP0, Y_3, Y_2, 8) /* YTMP0 = W[-7] */; \ /* Calculate w[t-16] + w[t-7] */; \ vpaddq YTMP0, YTMP0, Y_0 /* YTMP0 = W[-7] + W[-16] */; \ /* Extract w[t-15] */; \ MY_VPALIGNR( YTMP1, Y_1, Y_0, 8) /* YTMP1 = W[-15] */; \ \ /* Calculate sigma0 */; \ \ /* Calculate w[t-15] ror 1 */; \ vpsrlq YTMP2, YTMP1, 1; \ vpsllq YTMP3, YTMP1, (64-1); \ vpor YTMP3, YTMP3, YTMP2 /* YTMP3 = W[-15] ror 1 */; \ /* Calculate w[t-15] shr 7 */; \ vpsrlq YTMP4, YTMP1, 7 /* YTMP4 = W[-15] >> 7 */; \ \ ONE_ROUND(rsp+frame_XFER+0*8+X*32, a, b, c, d, e, f, g, h); \ \ /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */; \ /* Calculate w[t-15] ror 8 */; \ vpsrlq YTMP2, YTMP1, 8; \ vpsllq YTMP1, YTMP1, (64-8); \ vpor YTMP1, YTMP1, YTMP2 /* YTMP1 = W[-15] ror 8 */; \ /* XOR the three components */; \ vpxor YTMP3, YTMP3, YTMP4 /* YTMP3 = W[-15] ror 1 ^ W[-15] >> 7 */; \ vpxor YTMP1, YTMP3, YTMP1 /* YTMP1 = s0 */; \ \ /* Add three components, w[t-16], w[t-7] and sigma0 */; \ vpaddq YTMP0, YTMP0, YTMP1 /* YTMP0 = W[-16] + W[-7] + s0 */; \ /* Move to appropriate lanes for calculating w[16] and w[17] */; \ vperm2i128 Y_0, YTMP0, YTMP0, 0x0 /* Y_0 = W[-16] + W[-7] + s0 {BABA} */; \ /* Move to appropriate lanes for calculating w[18] and w[19] */; \ vpand YTMP0, YTMP0, MASK_YMM_LO /* YTMP0 = W[-16] + W[-7] + s0 {DC00} */; \ \ /* Calculate w[16] and w[17] in both 128 bit lanes */; \ \ /* Calculate sigma1 for w[16] and w[17] on both 128 bit lanes */; \ vperm2i128 YTMP2, Y_3, Y_3, 0x11 /* YTMP2 = W[-2] {BABA} */; \ vpsrlq YTMP4, YTMP2, 6 /* YTMP4 = W[-2] >> 6 {BABA} */; \ \ ONE_ROUND(rsp+frame_XFER+1*8+X*32, h, a, b, c, d, e, f, g); \ \ /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */; \ vpsrlq YTMP3, YTMP2, 19 /* YTMP3 = W[-2] >> 19 {BABA} */; \ vpsllq YTMP1, YTMP2, (64-19) /* YTMP1 = W[-2] << 19 {BABA} */; \ vpor YTMP3, YTMP3, YTMP1 /* YTMP3 = W[-2] ror 19 {BABA} */; \ vpxor YTMP4, YTMP4, YTMP3 /* YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {BABA} */; \ vpsrlq YTMP3, YTMP2, 61 /* YTMP3 = W[-2] >> 61 {BABA} */; \ vpsllq YTMP1, YTMP2, 
(64-61) /* YTMP1 = W[-2] << 61 {BABA} */; \ vpor YTMP3, YTMP3, YTMP1 /* YTMP3 = W[-2] ror 61 {BABA} */; \ vpxor YTMP4, YTMP4, YTMP3 /* YTMP4 = s1 = (W[-2] ror 19) ^ (W[-2] ror 61) ^ (W[-2] >> 6) {BABA} */; \ \ /* Add sigma1 to the other compunents to get w[16] and w[17] */; \ vpaddq Y_0, Y_0, YTMP4 /* Y_0 = {W[1], W[0], W[1], W[0]} */; \ \ /* Calculate sigma1 for w[18] and w[19] for upper 128 bit lane */; \ vpsrlq YTMP4, Y_0, 6 /* YTMP4 = W[-2] >> 6 {DC--} */; \ \ ONE_ROUND(rsp+frame_XFER+2*8+X*32, g, h, a, b, c, d, e, f); \ \ /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */; \ vpsrlq YTMP3, Y_0, 19 /* YTMP3 = W[-2] >> 19 {DC--} */; \ vpsllq YTMP1, Y_0, (64-19) /* YTMP1 = W[-2] << 19 {DC--} */; \ vpor YTMP3, YTMP3, YTMP1 /* YTMP3 = W[-2] ror 19 {DC--} */; \ vpxor YTMP4, YTMP4, YTMP3 /* YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {DC--} */; \ vpsrlq YTMP3, Y_0, 61 /* YTMP3 = W[-2] >> 61 {DC--} */; \ vpsllq YTMP1, Y_0, (64-61) /* YTMP1 = W[-2] << 61 {DC--} */; \ vpor YTMP3, YTMP3, YTMP1 /* YTMP3 = W[-2] ror 61 {DC--} */; \ vpxor YTMP4, YTMP4, YTMP3 /* YTMP4 = s1 = (W[-2] ror 19) ^ (W[-2] ror 61) ^ (W[-2] >> 6) {DC--} */; \ \ /* Add the sigma0 + w[t-7] + w[t-16] for w[18] and w[19] to newly calculated sigma1 to get w[18] and w[19] */; \ vpaddq YTMP2, YTMP0, YTMP4 /* YTMP2 = {W[3], W[2], --, --} */; \ \ /* Form w[19, w[18], w17], w[16] */; \ vpblendd Y_0, Y_0, YTMP2, 0xF0 /* Y_0 = {W[3], W[2], W[1], W[0]} */; \ \ ONE_ROUND_PART1(rsp+frame_XFER+3*8+X*32, f, g, h, a, b, c, d, e); \ vpaddq XFER, Y_0, [TBL + (4+X)*32]; \ vmovdqa [rsp + frame_XFER + X*32], XFER; \ ONE_ROUND_PART2(f, g, h, a, b, c, d, e) #define DO_4ROUNDS(X, a, b, c, d, e, f, g, h) \ ONE_ROUND(rsp+frame_XFER+0*8+X*32, a, b, c, d, e, f, g, h); \ ONE_ROUND(rsp+frame_XFER+1*8+X*32, h, a, b, c, d, e, f, g); \ ONE_ROUND(rsp+frame_XFER+2*8+X*32, g, h, a, b, c, d, e, f); \ ONE_ROUND(rsp+frame_XFER+3*8+X*32, f, g, h, a, b, c, d, e) /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; void sha512_rorx(const void* M, void* D, uint64_t L); ; Purpose: Updates the SHA512 digest stored at D with the message stored in M. ; The size of the message pointed to by M must be an integer multiple of SHA512 ; message blocks. 
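A caller-side sketch of this interface; only the symbol name, the argument meanings, and the block-count semantics are taken from the surrounding comments and the .globl line, while the C-level declaration itself is an assumption for illustration:

#include <stdint.h>

// Assumed declaration; the documented interface is
// "void sha512_rorx(const void* M, void* D, uint64_t L)" with L in whole blocks.
extern void _gcry_sha512_transform_amd64_avx2(const void *msg,
                                              void *digest,
                                              uint64_t num_blks);

// Compress 'nblks' whole 128-byte blocks into the eight 64-bit state words.
static void compress_blocks(uint64_t state[8], const unsigned char *data,
                            uint64_t nblks)
{
    if (nblks)  // the asm also returns immediately when the block count is zero
        _gcry_sha512_transform_amd64_avx2(data, state, nblks);
}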
; L is the message length in SHA512 blocks */ .globl _gcry_sha512_transform_amd64_avx2 ELF(.type _gcry_sha512_transform_amd64_avx2,@function;) .align 16 _gcry_sha512_transform_amd64_avx2: CFI_STARTPROC() xor eax, eax cmp rdx, 0 je .Lnowork vzeroupper /* Allocate Stack Space */ mov rax, rsp CFI_DEF_CFA_REGISTER(rax); sub rsp, frame_size and rsp, ~(0x40 - 1) mov [rsp + frame_RSPSAVE], rax CFI_CFA_ON_STACK(frame_RSPSAVE, 0) /* Save GPRs */ mov [rsp + frame_GPRSAVE + 8 * 0], rbp mov [rsp + frame_GPRSAVE + 8 * 1], rbx mov [rsp + frame_GPRSAVE + 8 * 2], r12 mov [rsp + frame_GPRSAVE + 8 * 3], r13 mov [rsp + frame_GPRSAVE + 8 * 4], r14 mov [rsp + frame_GPRSAVE + 8 * 5], r15 CFI_REG_ON_STACK(rbp, frame_GPRSAVE + 8 * 0) CFI_REG_ON_STACK(rbx, frame_GPRSAVE + 8 * 1) CFI_REG_ON_STACK(r12, frame_GPRSAVE + 8 * 2) CFI_REG_ON_STACK(r13, frame_GPRSAVE + 8 * 3) CFI_REG_ON_STACK(r14, frame_GPRSAVE + 8 * 4) CFI_REG_ON_STACK(r15, frame_GPRSAVE + 8 * 5) mov [rsp + frame_NBLKS], NUM_BLKS /*; load initial digest */ mov a,[8*0 + CTX] mov b,[8*1 + CTX] mov c,[8*2 + CTX] mov d,[8*3 + CTX] mov e,[8*4 + CTX] mov f,[8*5 + CTX] mov g,[8*6 + CTX] mov h,[8*7 + CTX] vmovdqa BYTE_FLIP_MASK, [.LPSHUFFLE_BYTE_FLIP_MASK ADD_RIP] vmovdqa MASK_YMM_LO, [.LMASK_YMM_LO ADD_RIP] lea TBL,[.LK512 ADD_RIP] /*; byte swap first 16 dwords */ COPY_YMM_AND_BSWAP(Y_0, [INP + 0*32], BYTE_FLIP_MASK) COPY_YMM_AND_BSWAP(Y_1, [INP + 1*32], BYTE_FLIP_MASK) COPY_YMM_AND_BSWAP(Y_2, [INP + 2*32], BYTE_FLIP_MASK) COPY_YMM_AND_BSWAP(Y_3, [INP + 3*32], BYTE_FLIP_MASK) add INP, 128 mov [rsp + frame_INP], INP vpaddq XFER, Y_0, [TBL + 0*32] vmovdqa [rsp + frame_XFER + 0*32], XFER vpaddq XFER, Y_1, [TBL + 1*32] vmovdqa [rsp + frame_XFER + 1*32], XFER vpaddq XFER, Y_2, [TBL + 2*32] vmovdqa [rsp + frame_XFER + 2*32], XFER vpaddq XFER, Y_3, [TBL + 3*32] vmovdqa [rsp + frame_XFER + 3*32], XFER /*; schedule 64 input dwords, by doing 12 rounds of 4 each */ mov qword ptr [rsp + frame_SRND], 4 .align 16 .Loop0: FOUR_ROUNDS_AND_SCHED(0, Y_0, Y_1, Y_2, Y_3, a, b, c, d, e, f, g, h) FOUR_ROUNDS_AND_SCHED(1, Y_1, Y_2, Y_3, Y_0, e, f, g, h, a, b, c, d) FOUR_ROUNDS_AND_SCHED(2, Y_2, Y_3, Y_0, Y_1, a, b, c, d, e, f, g, h) FOUR_ROUNDS_AND_SCHED(3, Y_3, Y_0, Y_1, Y_2, e, f, g, h, a, b, c, d) add TBL, 4*32 sub qword ptr [rsp + frame_SRND], 1 jne .Loop0 sub qword ptr [rsp + frame_NBLKS], 1 je .Ldone_hash mov INP, [rsp + frame_INP] lea TBL,[.LK512 ADD_RIP] /* load next block and byte swap */ COPY_YMM_AND_BSWAP(Y_0, [INP + 0*32], BYTE_FLIP_MASK) COPY_YMM_AND_BSWAP(Y_1, [INP + 1*32], BYTE_FLIP_MASK) COPY_YMM_AND_BSWAP(Y_2, [INP + 2*32], BYTE_FLIP_MASK) COPY_YMM_AND_BSWAP(Y_3, [INP + 3*32], BYTE_FLIP_MASK) add INP, 128 mov [rsp + frame_INP], INP DO_4ROUNDS(0, a, b, c, d, e, f, g, h) vpaddq XFER, Y_0, [TBL + 0*32] vmovdqa [rsp + frame_XFER + 0*32], XFER DO_4ROUNDS(1, e, f, g, h, a, b, c, d) vpaddq XFER, Y_1, [TBL + 1*32] vmovdqa [rsp + frame_XFER + 1*32], XFER DO_4ROUNDS(2, a, b, c, d, e, f, g, h) vpaddq XFER, Y_2, [TBL + 2*32] vmovdqa [rsp + frame_XFER + 2*32], XFER DO_4ROUNDS(3, e, f, g, h, a, b, c, d) vpaddq XFER, Y_3, [TBL + 3*32] vmovdqa [rsp + frame_XFER + 3*32], XFER addm([8*0 + CTX],a) addm([8*1 + CTX],b) addm([8*2 + CTX],c) addm([8*3 + CTX],d) addm([8*4 + CTX],e) addm([8*5 + CTX],f) addm([8*6 + CTX],g) addm([8*7 + CTX],h) /*; schedule 64 input dwords, by doing 12 rounds of 4 each */ mov qword ptr [rsp + frame_SRND],4 jmp .Loop0 .Ldone_hash: vzeroall DO_4ROUNDS(0, a, b, c, d, e, f, g, h) vmovdqa [rsp + frame_XFER + 0*32], ymm0 /* burn stack */ DO_4ROUNDS(1, e, f, g, h, a, b, 
c, d) vmovdqa [rsp + frame_XFER + 1*32], ymm0 /* burn stack */ DO_4ROUNDS(2, a, b, c, d, e, f, g, h) vmovdqa [rsp + frame_XFER + 2*32], ymm0 /* burn stack */ DO_4ROUNDS(3, e, f, g, h, a, b, c, d) vmovdqa [rsp + frame_XFER + 3*32], ymm0 /* burn stack */ addm([8*0 + CTX],a) xor eax, eax /* burn stack */ addm([8*1 + CTX],b) addm([8*2 + CTX],c) addm([8*3 + CTX],d) addm([8*4 + CTX],e) addm([8*5 + CTX],f) addm([8*6 + CTX],g) addm([8*7 + CTX],h) /* Restore GPRs */ mov rbp, [rsp + frame_GPRSAVE + 8 * 0] mov rbx, [rsp + frame_GPRSAVE + 8 * 1] mov r12, [rsp + frame_GPRSAVE + 8 * 2] mov r13, [rsp + frame_GPRSAVE + 8 * 3] mov r14, [rsp + frame_GPRSAVE + 8 * 4] mov r15, [rsp + frame_GPRSAVE + 8 * 5] CFI_RESTORE(rbp) CFI_RESTORE(rbx) CFI_RESTORE(r12) CFI_RESTORE(r13) CFI_RESTORE(r14) CFI_RESTORE(r15) /* Restore Stack Pointer */ mov rsp, [rsp + frame_RSPSAVE] CFI_DEF_CFA_REGISTER(rsp) .Lnowork: - ret + ret_spec_stop CFI_ENDPROC() /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ /*;; Binary Data */ .align 64 /* K[t] used in SHA512 hashing */ .LK512: .quad 0x428a2f98d728ae22,0x7137449123ef65cd .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc .quad 0x3956c25bf348b538,0x59f111f1b605d019 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 .quad 0xd807aa98a3030242,0x12835b0145706fbe .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 .quad 0x9bdc06a725c71235,0xc19bf174cf692694 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 .quad 0x983e5152ee66dfab,0xa831c66d2db43210 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 .quad 0x06ca6351e003826f,0x142929670a0e6e70 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 .quad 0x81c2c92e47edaee6,0x92722c851482353b .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 .quad 0xd192e819d6ef5218,0xd69906245565a910 .quad 0xf40e35855771202a,0x106aa07032bbd1b8 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec .quad 0x90befffa23631e28,0xa4506cebde82bde9 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b .quad 0xca273eceea26619c,0xd186b8c721c0c207 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 .quad 0x113f9804bef90dae,0x1b710b35131c471b .quad 0x28db77f523047d84,0x32caab7b40c72493 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 .align 32 /* Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb. */ .LPSHUFFLE_BYTE_FLIP_MASK: .octa 0x08090a0b0c0d0e0f0001020304050607 .octa 0x18191a1b1c1d1e1f1011121314151617 .LMASK_YMM_LO: .octa 0x00000000000000000000000000000000 .octa 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF #endif #endif diff --git a/cipher/sha512-ssse3-amd64.S b/cipher/sha512-ssse3-amd64.S index 6a1328a6..9cc30892 100644 --- a/cipher/sha512-ssse3-amd64.S +++ b/cipher/sha512-ssse3-amd64.S @@ -1,467 +1,467 @@ /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; Copyright (c) 2012, Intel Corporation ; ; All rights reserved. 
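For reference, the BYTE_FLIP_MASK / COPY_YMM_AND_BSWAP loads in the AVX2 code above exist because SHA-512 message words are stored big endian. A scalar sketch of the same load (the name load_be64 is illustrative, not something defined by this patch):

    #include <stdint.h>

    /* Read one big-endian 64-bit message word, as the vpshufb byte flip does
     * for several words at a time. */
    static uint64_t load_be64(const uint8_t *p)
    {
      return ((uint64_t)p[0] << 56) | ((uint64_t)p[1] << 48) |
             ((uint64_t)p[2] << 40) | ((uint64_t)p[3] << 32) |
             ((uint64_t)p[4] << 24) | ((uint64_t)p[5] << 16) |
             ((uint64_t)p[6] <<  8) |  (uint64_t)p[7];
    }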
; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are ; met: ; ; * Redistributions of source code must retain the above copyright ; notice, this list of conditions and the following disclaimer. ; ; * Redistributions in binary form must reproduce the above copyright ; notice, this list of conditions and the following disclaimer in the ; documentation and/or other materials provided with the ; distribution. ; ; * Neither the name of the Intel Corporation nor the names of its ; contributors may be used to endorse or promote products derived from ; this software without specific prior written permission. ; ; ; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY ; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR ; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR ; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ /* * Conversion to GAS assembly and integration to libgcrypt * by Jussi Kivilinna * * Note: original implementation was named as SHA512-SSE4. However, only SSSE3 * is required. */ #ifdef __x86_64 #include #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \ defined(HAVE_GCC_INLINE_ASM_SSSE3) && defined(USE_SHA512) #include "asm-common-amd64.h" .intel_syntax noprefix .text /* Virtual Registers */ #define msg rdi /* ARG1 */ #define digest rsi /* ARG2 */ #define msglen rdx /* ARG3 */ #define T1 rcx #define T2 r8 #define a_64 r9 #define b_64 r10 #define c_64 r11 #define d_64 r12 #define e_64 r13 #define f_64 r14 #define g_64 r15 #define h_64 rbx #define tmp0 rax /* ; Local variables (stack frame) ; Note: frame_size must be an odd multiple of 8 bytes to XMM align RSP */ #define frame_W 0 /* Message Schedule */ #define frame_W_size (80 * 8) #define frame_WK ((frame_W) + (frame_W_size)) /* W[t] + K[t] | W[t+1] + K[t+1] */ #define frame_WK_size (2 * 8) #define frame_GPRSAVE ((frame_WK) + (frame_WK_size)) #define frame_GPRSAVE_size (5 * 8) #define frame_size ((frame_GPRSAVE) + (frame_GPRSAVE_size)) /* Useful QWORD "arrays" for simpler memory references */ #define MSG(i) msg + 8*(i) /* Input message (arg1) */ #define DIGEST(i) digest + 8*(i) /* Output Digest (arg2) */ #define K_t(i) .LK512 + 8*(i) ADD_RIP /* SHA Constants (static mem) */ #define W_t(i) rsp + frame_W + 8*(i) /* Message Schedule (stack frame) */ #define WK_2(i) rsp + frame_WK + 8*((i) % 2) /* W[t]+K[t] (stack frame) */ /* MSG, DIGEST, K_t, W_t are arrays */ /* WK_2(t) points to 1 of 2 qwords at frame.WK depdending on t being odd/even */ #define SHA512_Round(t, a, b, c, d, e, f, g, h) \ /* Compute Round %%t */; \ mov T1, f /* T1 = f */; \ mov tmp0, e /* tmp = e */; \ xor T1, g /* T1 = f ^ g */; \ ror tmp0, 23 /* 41 ; tmp = e ror 23 */; \ and T1, e /* T1 = (f ^ g) & e */; \ xor tmp0, e /* tmp = (e ror 23) ^ e */; \ xor T1, g /* T1 = 
((f ^ g) & e) ^ g = CH(e,f,g) */; \ add T1, [WK_2(t)] /* W[t] + K[t] from message scheduler */; \ ror tmp0, 4 /* 18 ; tmp = ((e ror 23) ^ e) ror 4 */; \ xor tmp0, e /* tmp = (((e ror 23) ^ e) ror 4) ^ e */; \ mov T2, a /* T2 = a */; \ add T1, h /* T1 = CH(e,f,g) + W[t] + K[t] + h */; \ ror tmp0, 14 /* 14 ; tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e) */; \ add T1, tmp0 /* T1 = CH(e,f,g) + W[t] + K[t] + S1(e) */; \ mov tmp0, a /* tmp = a */; \ xor T2, c /* T2 = a ^ c */; \ and tmp0, c /* tmp = a & c */; \ and T2, b /* T2 = (a ^ c) & b */; \ xor T2, tmp0 /* T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c) */; \ mov tmp0, a /* tmp = a */; \ ror tmp0, 5 /* 39 ; tmp = a ror 5 */; \ xor tmp0, a /* tmp = (a ror 5) ^ a */; \ add d, T1 /* e(next_state) = d + T1 */; \ ror tmp0, 6 /* 34 ; tmp = ((a ror 5) ^ a) ror 6 */; \ xor tmp0, a /* tmp = (((a ror 5) ^ a) ror 6) ^ a */; \ lea h, [T1 + T2] /* a(next_state) = T1 + Maj(a,b,c) */; \ ror tmp0, 28 /* 28 ; tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a) */; \ add h, tmp0 /* a(next_state) = T1 + Maj(a,b,c) S0(a) */ #define SHA512_2Sched_2Round_sse_PART1(t, a, b, c, d, e, f, g, h) \ /* \ ; Compute rounds %%t-2 and %%t-1 \ ; Compute message schedule QWORDS %%t and %%t+1 \ ; \ ; Two rounds are computed based on the values for K[t-2]+W[t-2] and \ ; K[t-1]+W[t-1] which were previously stored at WK_2 by the message \ ; scheduler. \ ; The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)]. \ ; They are then added to their respective SHA512 constants at \ ; [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)] \ ; For brievity, the comments following vectored instructions only refer to \ ; the first of a pair of QWORDS. \ ; Eg. XMM2=W[t-2] really means XMM2={W[t-2]|W[t-1]} \ ; The computation of the message schedule and the rounds are tightly \ ; stitched to take advantage of instruction-level parallelism. \ ; For clarity, integer instructions (for the rounds calculation) are indented \ ; by one tab. Vectored instructions (for the message scheduler) are indented \ ; by two tabs. 
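The SHA512_Round macro above (and the integer half of the PART1/PART2 macros) is the standard SHA-512 round; the ror 23/4/14 and ror 5/6/28 chains are simply a three-rotate factoring of S1(e) and S0(a). A scalar C sketch under illustrative names, not part of the patch:

    #include <stdint.h>

    static inline uint64_t ror64(uint64_t x, unsigned n)
    {
      return (x >> n) | (x << (64 - n));
    }

    /* One SHA-512 round; w_plus_k is the W[t]+K[t] value kept at WK_2(t). */
    static void sha512_round(uint64_t s[8], uint64_t w_plus_k)
    {
      uint64_t a = s[0], b = s[1], c = s[2], d = s[3];
      uint64_t e = s[4], f = s[5], g = s[6], h = s[7];

      uint64_t ch  = (e & f) ^ (~e & g);            /* ((f ^ g) & e) ^ g in the macro */
      uint64_t maj = (a & b) ^ (a & c) ^ (b & c);   /* ((a ^ c) & b) ^ (a & c) */
      uint64_t s1  = ror64(e, 14) ^ ror64(e, 18) ^ ror64(e, 41);  /* ror 23, 4, 14 chain */
      uint64_t s0  = ror64(a, 28) ^ ror64(a, 34) ^ ror64(a, 39);  /* ror 5, 6, 28 chain */

      uint64_t t1 = h + s1 + ch + w_plus_k;
      uint64_t t2 = s0 + maj;

      s[7] = g; s[6] = f; s[5] = e; s[4] = d + t1;
      s[3] = c; s[2] = b; s[1] = a; s[0] = t1 + t2;
    }

The assembly avoids the register shuffle at the end by rotating the macro arguments from one round to the next.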
\ */ \ \ mov T1, f; \ movdqa xmm2, [W_t(t-2)] /* XMM2 = W[t-2] */; \ xor T1, g; \ and T1, e; \ movdqa xmm0, xmm2 /* XMM0 = W[t-2] */; \ xor T1, g; \ add T1, [WK_2(t)]; \ movdqu xmm5, [W_t(t-15)] /* XMM5 = W[t-15] */; \ mov tmp0, e; \ ror tmp0, 23 /* 41 */; \ movdqa xmm3, xmm5 /* XMM3 = W[t-15] */; \ xor tmp0, e; \ ror tmp0, 4 /* 18 */; \ psrlq xmm0, 61 - 19 /* XMM0 = W[t-2] >> 42 */; \ xor tmp0, e; \ ror tmp0, 14 /* 14 */; \ psrlq xmm3, (8 - 7) /* XMM3 = W[t-15] >> 1 */; \ add T1, tmp0; \ add T1, h; \ pxor xmm0, xmm2 /* XMM0 = (W[t-2] >> 42) ^ W[t-2] */; \ mov T2, a; \ xor T2, c; \ pxor xmm3, xmm5 /* XMM3 = (W[t-15] >> 1) ^ W[t-15] */; \ and T2, b; \ mov tmp0, a; \ psrlq xmm0, 19 - 6 /* XMM0 = ((W[t-2]>>42)^W[t-2])>>13 */; \ and tmp0, c; \ xor T2, tmp0; \ psrlq xmm3, (7 - 1) /* XMM3 = ((W[t-15]>>1)^W[t-15])>>6 */; \ mov tmp0, a; \ ror tmp0, 5 /* 39 */; \ pxor xmm0, xmm2 /* XMM0 = (((W[t-2]>>42)^W[t-2])>>13)^W[t-2] */; \ xor tmp0, a; \ ror tmp0, 6 /* 34 */; \ pxor xmm3, xmm5 /* XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15] */; \ xor tmp0, a; \ ror tmp0, 28 /* 28 */; \ psrlq xmm0, 6 /* XMM0 = ((((W[t-2]>>42)^W[t-2])>>13)^W[t-2])>>6 */; \ add T2, tmp0; \ add d, T1; \ psrlq xmm3, 1 /* XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]>>1 */; \ lea h, [T1 + T2] #define SHA512_2Sched_2Round_sse_PART2(t, a, b, c, d, e, f, g, h) \ movdqa xmm1, xmm2 /* XMM1 = W[t-2] */; \ mov T1, f; \ xor T1, g; \ movdqa xmm4, xmm5 /* XMM4 = W[t-15] */; \ and T1, e; \ xor T1, g; \ psllq xmm1, (64 - 19) - (64 - 61) /* XMM1 = W[t-2] << 42 */; \ add T1, [WK_2(t+1)]; \ mov tmp0, e; \ psllq xmm4, (64 - 1) - (64 - 8) /* XMM4 = W[t-15] << 7 */; \ ror tmp0, 23 /* 41 */; \ xor tmp0, e; \ pxor xmm1, xmm2 /* XMM1 = (W[t-2] << 42)^W[t-2] */; \ ror tmp0, 4 /* 18 */; \ xor tmp0, e; \ pxor xmm4, xmm5 /* XMM4 = (W[t-15]<<7)^W[t-15] */; \ ror tmp0, 14 /* 14 */; \ add T1, tmp0; \ psllq xmm1, (64 - 61) /* XMM1 = ((W[t-2] << 42)^W[t-2])<<3 */; \ add T1, h; \ mov T2, a; \ psllq xmm4, (64 - 8) /* XMM4 = ((W[t-15]<<7)^W[t-15])<<56 */; \ xor T2, c; \ and T2, b; \ pxor xmm0, xmm1 /* XMM0 = s1(W[t-2]) */; \ mov tmp0, a; \ and tmp0, c; \ movdqu xmm1, [W_t(t- 7)] /* XMM1 = W[t-7] */; \ xor T2, tmp0; \ pxor xmm3, xmm4 /* XMM3 = s0(W[t-15]) */; \ mov tmp0, a; \ paddq xmm0, xmm3 /* XMM0 = s1(W[t-2]) + s0(W[t-15]) */; \ ror tmp0, 5 /* 39 */; \ paddq xmm0, [W_t(t-16)] /* XMM0 = s1(W[t-2]) + s0(W[t-15]) + W[t-16] */; \ xor tmp0, a; \ paddq xmm0, xmm1 /* XMM0 = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16] */; \ ror tmp0, 6 /* 34 */; \ movdqa [W_t(t)], xmm0 /* Store scheduled qwords */; \ xor tmp0, a; \ paddq xmm0, [K_t(t)] /* Compute W[t]+K[t] */; \ ror tmp0, 28 /* 28 */; \ movdqa [WK_2(t)], xmm0 /* Store W[t]+K[t] for next rounds */; \ add T2, tmp0; \ add d, T1; \ lea h, [T1 + T2] #define SHA512_2Sched_2Round_sse(t, a, b, c, d, e, f, g, h) \ SHA512_2Sched_2Round_sse_PART1(t, a, b, c, d, e, f, g, h); \ SHA512_2Sched_2Round_sse_PART2(t, h, a, b, c, d, e, f, g) /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; void sha512_sse4(const void* M, void* D, uint64_t L); ; Purpose: Updates the SHA512 digest stored at D with the message stored in M. ; The size of the message pointed to by M must be an integer multiple of SHA512 ; message blocks. ; L is the message length in SHA512 blocks. 
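The vector halves of PART1 and PART2 above evaluate the SHA-512 message-schedule recurrence two words per iteration. The scalar equivalent, again under illustrative names:

    #include <stdint.h>

    static inline uint64_t ror64(uint64_t x, unsigned n)
    {
      return (x >> n) | (x << (64 - n));
    }

    static inline uint64_t sigma0(uint64_t x)   /* s0(W[t-15]) in the comments */
    {
      return ror64(x, 1) ^ ror64(x, 8) ^ (x >> 7);
    }

    static inline uint64_t sigma1(uint64_t x)   /* s1(W[t-2]) in the comments */
    {
      return ror64(x, 19) ^ ror64(x, 61) ^ (x >> 6);
    }

    static void sha512_schedule(uint64_t w[80])
    {
      for (int t = 16; t < 80; t++)
        w[t] = sigma1(w[t - 2]) + w[t - 7] + sigma0(w[t - 15]) + w[t - 16];
    }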
*/ .globl _gcry_sha512_transform_amd64_ssse3 ELF(.type _gcry_sha512_transform_amd64_ssse3,@function;) .align 16 _gcry_sha512_transform_amd64_ssse3: CFI_STARTPROC() xor eax, eax cmp msglen, 0 je .Lnowork /* Allocate Stack Space */ sub rsp, frame_size CFI_ADJUST_CFA_OFFSET(frame_size); /* Save GPRs */ mov [rsp + frame_GPRSAVE + 8 * 0], rbx mov [rsp + frame_GPRSAVE + 8 * 1], r12 mov [rsp + frame_GPRSAVE + 8 * 2], r13 mov [rsp + frame_GPRSAVE + 8 * 3], r14 mov [rsp + frame_GPRSAVE + 8 * 4], r15 CFI_REL_OFFSET(rbx, frame_GPRSAVE + 8 * 0); CFI_REL_OFFSET(r12, frame_GPRSAVE + 8 * 1); CFI_REL_OFFSET(r13, frame_GPRSAVE + 8 * 2); CFI_REL_OFFSET(r14, frame_GPRSAVE + 8 * 3); CFI_REL_OFFSET(r15, frame_GPRSAVE + 8 * 4); .Lupdateblock: /* Load state variables */ mov a_64, [DIGEST(0)] mov b_64, [DIGEST(1)] mov c_64, [DIGEST(2)] mov d_64, [DIGEST(3)] mov e_64, [DIGEST(4)] mov f_64, [DIGEST(5)] mov g_64, [DIGEST(6)] mov h_64, [DIGEST(7)] /* BSWAP 2 QWORDS */ movdqa xmm1, [.LXMM_QWORD_BSWAP ADD_RIP] movdqu xmm0, [MSG(0)] pshufb xmm0, xmm1 /* BSWAP */ movdqa [W_t(0)], xmm0 /* Store Scheduled Pair */ paddq xmm0, [K_t(0)] /* Compute W[t]+K[t] */ movdqa [WK_2(0)], xmm0 /* Store into WK for rounds */ #define T_2_14(t, a, b, c, d, e, f, g, h) \ /* BSWAP 2 QWORDS; Compute 2 Rounds */; \ movdqu xmm0, [MSG(t)]; \ pshufb xmm0, xmm1 /* BSWAP */; \ SHA512_Round(((t) - 2), a##_64, b##_64, c##_64, d##_64, \ e##_64, f##_64, g##_64, h##_64); \ movdqa [W_t(t)], xmm0 /* Store Scheduled Pair */; \ paddq xmm0, [K_t(t)] /* Compute W[t]+K[t] */; \ SHA512_Round(((t) - 1), h##_64, a##_64, b##_64, c##_64, \ d##_64, e##_64, f##_64, g##_64); \ movdqa [WK_2(t)], xmm0 /* Store W[t]+K[t] into WK */ #define T_16_78(t, a, b, c, d, e, f, g, h) \ SHA512_2Sched_2Round_sse((t), a##_64, b##_64, c##_64, d##_64, \ e##_64, f##_64, g##_64, h##_64) #define T_80(t, a, b, c, d, e, f, g, h) \ /* Compute 2 Rounds */; \ SHA512_Round((t - 2), a##_64, b##_64, c##_64, d##_64, \ e##_64, f##_64, g##_64, h##_64); \ SHA512_Round((t - 1), h##_64, a##_64, b##_64, c##_64, \ d##_64, e##_64, f##_64, g##_64) T_2_14(2, a, b, c, d, e, f, g, h) T_2_14(4, g, h, a, b, c, d, e, f) T_2_14(6, e, f, g, h, a, b, c, d) T_2_14(8, c, d, e, f, g, h, a, b) T_2_14(10, a, b, c, d, e, f, g, h) T_2_14(12, g, h, a, b, c, d, e, f) T_2_14(14, e, f, g, h, a, b, c, d) T_16_78(16, c, d, e, f, g, h, a, b) T_16_78(18, a, b, c, d, e, f, g, h) T_16_78(20, g, h, a, b, c, d, e, f) T_16_78(22, e, f, g, h, a, b, c, d) T_16_78(24, c, d, e, f, g, h, a, b) T_16_78(26, a, b, c, d, e, f, g, h) T_16_78(28, g, h, a, b, c, d, e, f) T_16_78(30, e, f, g, h, a, b, c, d) T_16_78(32, c, d, e, f, g, h, a, b) T_16_78(34, a, b, c, d, e, f, g, h) T_16_78(36, g, h, a, b, c, d, e, f) T_16_78(38, e, f, g, h, a, b, c, d) T_16_78(40, c, d, e, f, g, h, a, b) T_16_78(42, a, b, c, d, e, f, g, h) T_16_78(44, g, h, a, b, c, d, e, f) T_16_78(46, e, f, g, h, a, b, c, d) T_16_78(48, c, d, e, f, g, h, a, b) T_16_78(50, a, b, c, d, e, f, g, h) T_16_78(52, g, h, a, b, c, d, e, f) T_16_78(54, e, f, g, h, a, b, c, d) T_16_78(56, c, d, e, f, g, h, a, b) T_16_78(58, a, b, c, d, e, f, g, h) T_16_78(60, g, h, a, b, c, d, e, f) T_16_78(62, e, f, g, h, a, b, c, d) T_16_78(64, c, d, e, f, g, h, a, b) T_16_78(66, a, b, c, d, e, f, g, h) T_16_78(68, g, h, a, b, c, d, e, f) T_16_78(70, e, f, g, h, a, b, c, d) T_16_78(72, c, d, e, f, g, h, a, b) T_16_78(74, a, b, c, d, e, f, g, h) T_16_78(76, g, h, a, b, c, d, e, f) T_16_78(78, e, f, g, h, a, b, c, d) T_80(80, c, d, e, f, g, h, a, b) /* Update digest */ add [DIGEST(0)], a_64 add [DIGEST(1)], 
b_64 add [DIGEST(2)], c_64 add [DIGEST(3)], d_64 add [DIGEST(4)], e_64 add [DIGEST(5)], f_64 add [DIGEST(6)], g_64 add [DIGEST(7)], h_64 /* Advance to next message block */ add msg, 16*8 dec msglen jnz .Lupdateblock /* Restore GPRs */ mov rbx, [rsp + frame_GPRSAVE + 8 * 0] mov r12, [rsp + frame_GPRSAVE + 8 * 1] mov r13, [rsp + frame_GPRSAVE + 8 * 2] mov r14, [rsp + frame_GPRSAVE + 8 * 3] mov r15, [rsp + frame_GPRSAVE + 8 * 4] CFI_RESTORE(rbx) CFI_RESTORE(r12) CFI_RESTORE(r13) CFI_RESTORE(r14) CFI_RESTORE(r15) pxor xmm0, xmm0 pxor xmm1, xmm1 pxor xmm2, xmm2 pxor xmm3, xmm3 pxor xmm4, xmm4 pxor xmm5, xmm5 /* Burn stack */ mov eax, 0 .Lerase_stack: movdqu [rsp + rax], xmm0 add eax, 16 cmp eax, frame_W_size jne .Lerase_stack movdqu [rsp + frame_WK], xmm0 xor eax, eax /* Restore Stack Pointer */ add rsp, frame_size CFI_ADJUST_CFA_OFFSET(-frame_size); .Lnowork: - ret + ret_spec_stop CFI_ENDPROC() /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; Binary Data */ .align 16 /* Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb. */ .LXMM_QWORD_BSWAP: .octa 0x08090a0b0c0d0e0f0001020304050607 /* K[t] used in SHA512 hashing */ .LK512: .quad 0x428a2f98d728ae22,0x7137449123ef65cd .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc .quad 0x3956c25bf348b538,0x59f111f1b605d019 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 .quad 0xd807aa98a3030242,0x12835b0145706fbe .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 .quad 0x9bdc06a725c71235,0xc19bf174cf692694 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 .quad 0x983e5152ee66dfab,0xa831c66d2db43210 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 .quad 0x06ca6351e003826f,0x142929670a0e6e70 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 .quad 0x81c2c92e47edaee6,0x92722c851482353b .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 .quad 0xd192e819d6ef5218,0xd69906245565a910 .quad 0xf40e35855771202a,0x106aa07032bbd1b8 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec .quad 0x90befffa23631e28,0xa4506cebde82bde9 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b .quad 0xca273eceea26619c,0xd186b8c721c0c207 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 .quad 0x113f9804bef90dae,0x1b710b35131c471b .quad 0x28db77f523047d84,0x32caab7b40c72493 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 #endif #endif diff --git a/cipher/sm3-avx-bmi2-amd64.S b/cipher/sm3-avx-bmi2-amd64.S index 46226ae6..d9b6206a 100644 --- a/cipher/sm3-avx-bmi2-amd64.S +++ b/cipher/sm3-avx-bmi2-amd64.S @@ -1,553 +1,553 @@ /* sm3-avx-bmi2-amd64.S - Intel AVX/BMI2 accelerated SM3 transform function * Copyright (C) 2021 Jussi Kivilinna * * This file is part of Libgcrypt. 
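Stepping back from the SSSE3 file above, the .Lupdateblock loop has the shape below. This is a sketch that reuses sha512_round, sha512_schedule and load_be64 from the earlier sketches and assumes a K512[] array holding the .LK512 constants; the assembly interleaves scheduling with the rounds rather than running two passes:

    #include <stddef.h>
    #include <stdint.h>

    extern const uint64_t K512[80];   /* contents of the .LK512 table, assumed */

    static void sha512_blocks(uint64_t state[8], const uint8_t *msg, size_t nblks)
    {
      while (nblks--) {
        uint64_t w[80], s[8];
        for (int t = 0; t < 16; t++)
          w[t] = load_be64(msg + 8 * t);     /* the pshufb byte swap */
        sha512_schedule(w);
        for (int i = 0; i < 8; i++)
          s[i] = state[i];                   /* mov a_64, [DIGEST(0)], ... */
        for (int t = 0; t < 80; t++)
          sha512_round(s, w[t] + K512[t]);   /* WK_2 holds W[t]+K[t] */
        for (int i = 0; i < 8; i++)
          state[i] += s[i];                  /* add [DIGEST(i)], ... */
        msg += 128;
      }
    }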
* * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . */ #ifdef __x86_64__ #include #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ defined(HAVE_GCC_INLINE_ASM_AVX) && defined(HAVE_GCC_INLINE_ASM_BMI2) && \ defined(USE_SM3) #include "asm-common-amd64.h" /* Context structure */ #define state_h0 0 #define state_h1 4 #define state_h2 8 #define state_h3 12 #define state_h4 16 #define state_h5 20 #define state_h6 24 #define state_h7 28 /* Constants */ .text .align 16 ELF(.type _gcry_sm3_avx2_consts,@object) _gcry_sm3_avx2_consts: .Lbe32mask: .long 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f ELF(.size _gcry_sm3_avx2_consts,.-_gcry_sm3_avx2_consts) /* Round constant macros */ #define K0 2043430169 /* 0x79cc4519 */ #define K1 -208106958 /* 0xf3988a32 */ #define K2 -416213915 /* 0xe7311465 */ #define K3 -832427829 /* 0xce6228cb */ #define K4 -1664855657 /* 0x9cc45197 */ #define K5 965255983 /* 0x3988a32f */ #define K6 1930511966 /* 0x7311465e */ #define K7 -433943364 /* 0xe6228cbc */ #define K8 -867886727 /* 0xcc451979 */ #define K9 -1735773453 /* 0x988a32f3 */ #define K10 823420391 /* 0x311465e7 */ #define K11 1646840782 /* 0x6228cbce */ #define K12 -1001285732 /* 0xc451979c */ #define K13 -2002571463 /* 0x88a32f39 */ #define K14 289824371 /* 0x11465e73 */ #define K15 579648742 /* 0x228cbce6 */ #define K16 -1651869049 /* 0x9d8a7a87 */ #define K17 991229199 /* 0x3b14f50f */ #define K18 1982458398 /* 0x7629ea1e */ #define K19 -330050500 /* 0xec53d43c */ #define K20 -660100999 /* 0xd8a7a879 */ #define K21 -1320201997 /* 0xb14f50f3 */ #define K22 1654563303 /* 0x629ea1e7 */ #define K23 -985840690 /* 0xc53d43ce */ #define K24 -1971681379 /* 0x8a7a879d */ #define K25 351604539 /* 0x14f50f3b */ #define K26 703209078 /* 0x29ea1e76 */ #define K27 1406418156 /* 0x53d43cec */ #define K28 -1482130984 /* 0xa7a879d8 */ #define K29 1330705329 /* 0x4f50f3b1 */ #define K30 -1633556638 /* 0x9ea1e762 */ #define K31 1027854021 /* 0x3d43cec5 */ #define K32 2055708042 /* 0x7a879d8a */ #define K33 -183551212 /* 0xf50f3b14 */ #define K34 -367102423 /* 0xea1e7629 */ #define K35 -734204845 /* 0xd43cec53 */ #define K36 -1468409689 /* 0xa879d8a7 */ #define K37 1358147919 /* 0x50f3b14f */ #define K38 -1578671458 /* 0xa1e7629e */ #define K39 1137624381 /* 0x43cec53d */ #define K40 -2019718534 /* 0x879d8a7a */ #define K41 255530229 /* 0x0f3b14f5 */ #define K42 511060458 /* 0x1e7629ea */ #define K43 1022120916 /* 0x3cec53d4 */ #define K44 2044241832 /* 0x79d8a7a8 */ #define K45 -206483632 /* 0xf3b14f50 */ #define K46 -412967263 /* 0xe7629ea1 */ #define K47 -825934525 /* 0xcec53d43 */ #define K48 -1651869049 /* 0x9d8a7a87 */ #define K49 991229199 /* 0x3b14f50f */ #define K50 1982458398 /* 0x7629ea1e */ #define K51 -330050500 /* 0xec53d43c */ #define K52 -660100999 /* 0xd8a7a879 */ #define K53 -1320201997 /* 0xb14f50f3 */ #define K54 1654563303 /* 0x629ea1e7 */ #define K55 -985840690 /* 0xc53d43ce */ #define 
K56 -1971681379 /* 0x8a7a879d */ #define K57 351604539 /* 0x14f50f3b */ #define K58 703209078 /* 0x29ea1e76 */ #define K59 1406418156 /* 0x53d43cec */ #define K60 -1482130984 /* 0xa7a879d8 */ #define K61 1330705329 /* 0x4f50f3b1 */ #define K62 -1633556638 /* 0x9ea1e762 */ #define K63 1027854021 /* 0x3d43cec5 */ /* Register macros */ #define RSTATE %rdi #define RDATA %rsi #define RNBLKS %rdx #define t0 %eax #define t1 %ebx #define t2 %ecx #define a %r8d #define b %r9d #define c %r10d #define d %r11d #define e %r12d #define f %r13d #define g %r14d #define h %r15d #define W0 %xmm0 #define W1 %xmm1 #define W2 %xmm2 #define W3 %xmm3 #define W4 %xmm4 #define W5 %xmm5 #define XTMP0 %xmm6 #define XTMP1 %xmm7 #define XTMP2 %xmm8 #define XTMP3 %xmm9 #define XTMP4 %xmm10 #define XTMP5 %xmm11 #define XTMP6 %xmm12 #define BSWAP_REG %xmm15 /* Stack structure */ #define STACK_W_SIZE (32 * 2 * 3) #define STACK_REG_SAVE_SIZE (64) #define STACK_W (0) #define STACK_REG_SAVE (STACK_W + STACK_W_SIZE) #define STACK_SIZE (STACK_REG_SAVE + STACK_REG_SAVE_SIZE) /* Instruction helpers. */ #define roll2(v, reg) \ roll $(v), reg; #define roll3mov(v, src, dst) \ movl src, dst; \ roll $(v), dst; #define roll3(v, src, dst) \ rorxl $(32-(v)), src, dst; #define addl2(a, out) \ leal (a, out), out; /* Round function macros. */ #define GG1(x, y, z, o, t) \ movl x, o; \ xorl y, o; \ xorl z, o; #define FF1(x, y, z, o, t) GG1(x, y, z, o, t) #define GG2(x, y, z, o, t) \ andnl z, x, o; \ movl y, t; \ andl x, t; \ addl2(t, o); #define FF2(x, y, z, o, t) \ movl y, o; \ xorl x, o; \ movl y, t; \ andl x, t; \ andl z, o; \ xorl t, o; #define R(i, a, b, c, d, e, f, g, h, round, widx, wtype) \ /* rol(a, 12) => t0 */ \ roll3mov(12, a, t0); /* rorxl here would reduce perf by 6% on zen3 */ \ /* rol (t0 + e + t), 7) => t1 */ \ leal K##round(t0, e, 1), t1; \ roll2(7, t1); \ /* h + w1 => h */ \ addl wtype##_W1_ADDR(round, widx), h; \ /* h + t1 => h */ \ addl2(t1, h); \ /* t1 ^ t0 => t0 */ \ xorl t1, t0; \ /* w1w2 + d => d */ \ addl wtype##_W1W2_ADDR(round, widx), d; \ /* FF##i(a,b,c) => t1 */ \ FF##i(a, b, c, t1, t2); \ /* d + t1 => d */ \ addl2(t1, d); \ /* GG#i(e,f,g) => t2 */ \ GG##i(e, f, g, t2, t1); \ /* h + t2 => h */ \ addl2(t2, h); \ /* rol (f, 19) => f */ \ roll2(19, f); \ /* d + t0 => d */ \ addl2(t0, d); \ /* rol (b, 9) => b */ \ roll2(9, b); \ /* P0(h) => h */ \ roll3(9, h, t2); \ roll3(17, h, t1); \ xorl t2, h; \ xorl t1, h; #define R1(a, b, c, d, e, f, g, h, round, widx, wtype) \ R(1, a, b, c, d, e, f, g, h, round, widx, wtype) #define R2(a, b, c, d, e, f, g, h, round, widx, wtype) \ R(2, a, b, c, d, e, f, g, h, round, widx, wtype) /* Input expansion macros. */ /* Byte-swapped input address. */ #define IW_W_ADDR(round, widx, offs) \ (STACK_W + ((round) / 4) * 64 + (offs) + ((widx) * 4))(%rsp) /* Expanded input address. */ #define XW_W_ADDR(round, widx, offs) \ (STACK_W + ((((round) / 3) - 4) % 2) * 64 + (offs) + ((widx) * 4))(%rsp) /* Rounds 1-12, byte-swapped input block addresses. */ #define IW_W1_ADDR(round, widx) IW_W_ADDR(round, widx, 0) #define IW_W1W2_ADDR(round, widx) IW_W_ADDR(round, widx, 32) /* Rounds 1-12, expanded input block addresses. */ #define XW_W1_ADDR(round, widx) XW_W_ADDR(round, widx, 0) #define XW_W1W2_ADDR(round, widx) XW_W_ADDR(round, widx, 32) /* Input block loading. 
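The R/R1/R2 macros above are one SM3 compression round with the state rotation folded into the argument order of successive calls; GG2 adds (~e & g) and (e & f) instead of ORing them, which is valid because the two terms are bit-disjoint. A scalar sketch with illustrative names (W[j] and W[j] ^ W[j+4] are the values read from the W1/W1W2 stack slots):

    #include <stdint.h>

    static inline uint32_t rol32(uint32_t x, unsigned n)
    {
      return (x << n) | (x >> (32 - n));
    }

    static inline uint32_t p0(uint32_t x)   /* the rol 9 / rol 17 pair in "P0(h) => h" */
    {
      return x ^ rol32(x, 9) ^ rol32(x, 17);
    }

    /* One SM3 round: j is the round number, w = W[j], ww = W[j] ^ W[j+4],
     * k = K[j] (the K0..K63 constants above). */
    static void sm3_round(uint32_t s[8], int j, uint32_t w, uint32_t ww, uint32_t k)
    {
      uint32_t a = s[0], b = s[1], c = s[2], d = s[3];
      uint32_t e = s[4], f = s[5], g = s[6], h = s[7];

      uint32_t ss1 = rol32(rol32(a, 12) + e + k, 7);
      uint32_t ss2 = ss1 ^ rol32(a, 12);
      uint32_t ff  = (j < 16) ? (a ^ b ^ c) : ((a & b) | (a & c) | (b & c));
      uint32_t gg  = (j < 16) ? (e ^ f ^ g) : ((e & f) | (~e & g));
      uint32_t tt1 = ff + d + ss2 + ww;
      uint32_t tt2 = gg + h + ss1 + w;

      s[3] = c; s[2] = rol32(b, 9);  s[1] = a; s[0] = tt1;
      s[7] = g; s[6] = rol32(f, 19); s[5] = e; s[4] = p0(tt2);
    }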
*/ #define LOAD_W_XMM_1() \ vmovdqu 0*16(RDATA), XTMP0; /* XTMP0: w3, w2, w1, w0 */ \ vmovdqu 1*16(RDATA), XTMP1; /* XTMP1: w7, w6, w5, w4 */ \ vmovdqu 2*16(RDATA), XTMP2; /* XTMP2: w11, w10, w9, w8 */ \ vmovdqu 3*16(RDATA), XTMP3; /* XTMP3: w15, w14, w13, w12 */\ vpshufb BSWAP_REG, XTMP0, XTMP0; \ vpshufb BSWAP_REG, XTMP1, XTMP1; \ vpshufb BSWAP_REG, XTMP2, XTMP2; \ vpshufb BSWAP_REG, XTMP3, XTMP3; \ vpxor XTMP0, XTMP1, XTMP4; \ vpxor XTMP1, XTMP2, XTMP5; \ vpxor XTMP2, XTMP3, XTMP6; \ leaq 64(RDATA), RDATA; \ vmovdqa XTMP0, IW_W1_ADDR(0, 0); \ vmovdqa XTMP4, IW_W1W2_ADDR(0, 0); \ vmovdqa XTMP1, IW_W1_ADDR(4, 0); \ vmovdqa XTMP5, IW_W1W2_ADDR(4, 0); #define LOAD_W_XMM_2() \ vmovdqa XTMP2, IW_W1_ADDR(8, 0); \ vmovdqa XTMP6, IW_W1W2_ADDR(8, 0); #define LOAD_W_XMM_3() \ vpshufd $0b00000000, XTMP0, W0; /* W0: xx, w0, xx, xx */ \ vpshufd $0b11111001, XTMP0, W1; /* W1: xx, w3, w2, w1 */ \ vmovdqa XTMP1, W2; /* W2: xx, w6, w5, w4 */ \ vpalignr $12, XTMP1, XTMP2, W3; /* W3: xx, w9, w8, w7 */ \ vpalignr $8, XTMP2, XTMP3, W4; /* W4: xx, w12, w11, w10 */ \ vpshufd $0b11111001, XTMP3, W5; /* W5: xx, w15, w14, w13 */ /* Message scheduling. Note: 3 words per XMM register. */ #define SCHED_W_0(round, w0, w1, w2, w3, w4, w5) \ /* Load (w[i - 16]) => XTMP0 */ \ vpshufd $0b10111111, w0, XTMP0; \ vpalignr $12, XTMP0, w1, XTMP0; /* XTMP0: xx, w2, w1, w0 */ \ /* Load (w[i - 13]) => XTMP1 */ \ vpshufd $0b10111111, w1, XTMP1; \ vpalignr $12, XTMP1, w2, XTMP1; \ /* w[i - 9] == w3 */ \ /* XMM3 ^ XTMP0 => XTMP0 */ \ vpxor w3, XTMP0, XTMP0; #define SCHED_W_1(round, w0, w1, w2, w3, w4, w5) \ /* w[i - 3] == w5 */ \ /* rol(XMM5, 15) ^ XTMP0 => XTMP0 */ \ vpslld $15, w5, XTMP2; \ vpsrld $(32-15), w5, XTMP3; \ vpxor XTMP2, XTMP3, XTMP3; \ vpxor XTMP3, XTMP0, XTMP0; \ /* rol(XTMP1, 7) => XTMP1 */ \ vpslld $7, XTMP1, XTMP5; \ vpsrld $(32-7), XTMP1, XTMP1; \ vpxor XTMP5, XTMP1, XTMP1; \ /* XMM4 ^ XTMP1 => XTMP1 */ \ vpxor w4, XTMP1, XTMP1; \ /* w[i - 6] == XMM4 */ \ /* P1(XTMP0) ^ XTMP1 => XMM0 */ \ vpslld $15, XTMP0, XTMP5; \ vpsrld $(32-15), XTMP0, XTMP6; \ vpslld $23, XTMP0, XTMP2; \ vpsrld $(32-23), XTMP0, XTMP3; \ vpxor XTMP0, XTMP1, XTMP1; \ vpxor XTMP6, XTMP5, XTMP5; \ vpxor XTMP3, XTMP2, XTMP2; \ vpxor XTMP2, XTMP5, XTMP5; \ vpxor XTMP5, XTMP1, w0; #define SCHED_W_2(round, w0, w1, w2, w3, w4, w5) \ /* W1 in XMM12 */ \ vpshufd $0b10111111, w4, XTMP4; \ vpalignr $12, XTMP4, w5, XTMP4; \ vmovdqa XTMP4, XW_W1_ADDR((round), 0); \ /* W1 ^ W2 => XTMP1 */ \ vpxor w0, XTMP4, XTMP1; \ vmovdqa XTMP1, XW_W1W2_ADDR((round), 0); /* * Transform nblks*64 bytes (nblks*16 32-bit words) at DATA. 
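The SCHED_W_0/1/2 macros above compute the SM3 message expansion three words per step, and also store W[j] ^ W[j+4] into the XW_W1W2 slots for use by the rounds. In scalar form, with illustrative names:

    #include <stdint.h>

    static inline uint32_t rol32(uint32_t x, unsigned n)
    {
      return (x << n) | (x >> (32 - n));
    }

    static inline uint32_t p1(uint32_t x)   /* the rol 15 / rol 23 pair in SCHED_W_1 */
    {
      return x ^ rol32(x, 15) ^ rol32(x, 23);
    }

    static void sm3_expand(uint32_t w[68])
    {
      for (int j = 16; j < 68; j++)
        w[j] = p1(w[j - 16] ^ w[j - 9] ^ rol32(w[j - 3], 15))
               ^ rol32(w[j - 13], 7) ^ w[j - 6];
    }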
* * unsigned int * _gcry_sm3_transform_amd64_avx_bmi2 (void *ctx, const unsigned char *data, * size_t nblks) */ .globl _gcry_sm3_transform_amd64_avx_bmi2 ELF(.type _gcry_sm3_transform_amd64_avx_bmi2,@function) .align 16 _gcry_sm3_transform_amd64_avx_bmi2: /* input: * %rdi: ctx, CTX * %rsi: data (64*nblks bytes) * %rdx: nblks */ CFI_STARTPROC(); vzeroupper; pushq %rbp; CFI_PUSH(%rbp); movq %rsp, %rbp; CFI_DEF_CFA_REGISTER(%rbp); movq %rdx, RNBLKS; subq $STACK_SIZE, %rsp; andq $(~63), %rsp; movq %rbx, (STACK_REG_SAVE + 0 * 8)(%rsp); CFI_REL_OFFSET(%rbx, STACK_REG_SAVE + 0 * 8); movq %r15, (STACK_REG_SAVE + 1 * 8)(%rsp); CFI_REL_OFFSET(%r15, STACK_REG_SAVE + 1 * 8); movq %r14, (STACK_REG_SAVE + 2 * 8)(%rsp); CFI_REL_OFFSET(%r14, STACK_REG_SAVE + 2 * 8); movq %r13, (STACK_REG_SAVE + 3 * 8)(%rsp); CFI_REL_OFFSET(%r13, STACK_REG_SAVE + 3 * 8); movq %r12, (STACK_REG_SAVE + 4 * 8)(%rsp); CFI_REL_OFFSET(%r12, STACK_REG_SAVE + 4 * 8); vmovdqa .Lbe32mask rRIP, BSWAP_REG; /* Get the values of the chaining variables. */ movl state_h0(RSTATE), a; movl state_h1(RSTATE), b; movl state_h2(RSTATE), c; movl state_h3(RSTATE), d; movl state_h4(RSTATE), e; movl state_h5(RSTATE), f; movl state_h6(RSTATE), g; movl state_h7(RSTATE), h; .align 16 .Loop: /* Load data part1. */ LOAD_W_XMM_1(); leaq -1(RNBLKS), RNBLKS; /* Transform 0-3 + Load data part2. */ R1(a, b, c, d, e, f, g, h, 0, 0, IW); LOAD_W_XMM_2(); R1(d, a, b, c, h, e, f, g, 1, 1, IW); R1(c, d, a, b, g, h, e, f, 2, 2, IW); R1(b, c, d, a, f, g, h, e, 3, 3, IW); LOAD_W_XMM_3(); /* Transform 4-7 + Precalc 12-14. */ R1(a, b, c, d, e, f, g, h, 4, 0, IW); R1(d, a, b, c, h, e, f, g, 5, 1, IW); R1(c, d, a, b, g, h, e, f, 6, 2, IW); SCHED_W_0(12, W0, W1, W2, W3, W4, W5); R1(b, c, d, a, f, g, h, e, 7, 3, IW); SCHED_W_1(12, W0, W1, W2, W3, W4, W5); /* Transform 8-11 + Precalc 12-17. 
*/ R1(a, b, c, d, e, f, g, h, 8, 0, IW); SCHED_W_2(12, W0, W1, W2, W3, W4, W5); R1(d, a, b, c, h, e, f, g, 9, 1, IW); SCHED_W_0(15, W1, W2, W3, W4, W5, W0); R1(c, d, a, b, g, h, e, f, 10, 2, IW); SCHED_W_1(15, W1, W2, W3, W4, W5, W0); R1(b, c, d, a, f, g, h, e, 11, 3, IW); SCHED_W_2(15, W1, W2, W3, W4, W5, W0); /* Transform 12-14 + Precalc 18-20 */ R1(a, b, c, d, e, f, g, h, 12, 0, XW); SCHED_W_0(18, W2, W3, W4, W5, W0, W1); R1(d, a, b, c, h, e, f, g, 13, 1, XW); SCHED_W_1(18, W2, W3, W4, W5, W0, W1); R1(c, d, a, b, g, h, e, f, 14, 2, XW); SCHED_W_2(18, W2, W3, W4, W5, W0, W1); /* Transform 15-17 + Precalc 21-23 */ R1(b, c, d, a, f, g, h, e, 15, 0, XW); SCHED_W_0(21, W3, W4, W5, W0, W1, W2); R2(a, b, c, d, e, f, g, h, 16, 1, XW); SCHED_W_1(21, W3, W4, W5, W0, W1, W2); R2(d, a, b, c, h, e, f, g, 17, 2, XW); SCHED_W_2(21, W3, W4, W5, W0, W1, W2); /* Transform 18-20 + Precalc 24-26 */ R2(c, d, a, b, g, h, e, f, 18, 0, XW); SCHED_W_0(24, W4, W5, W0, W1, W2, W3); R2(b, c, d, a, f, g, h, e, 19, 1, XW); SCHED_W_1(24, W4, W5, W0, W1, W2, W3); R2(a, b, c, d, e, f, g, h, 20, 2, XW); SCHED_W_2(24, W4, W5, W0, W1, W2, W3); /* Transform 21-23 + Precalc 27-29 */ R2(d, a, b, c, h, e, f, g, 21, 0, XW); SCHED_W_0(27, W5, W0, W1, W2, W3, W4); R2(c, d, a, b, g, h, e, f, 22, 1, XW); SCHED_W_1(27, W5, W0, W1, W2, W3, W4); R2(b, c, d, a, f, g, h, e, 23, 2, XW); SCHED_W_2(27, W5, W0, W1, W2, W3, W4); /* Transform 24-26 + Precalc 30-32 */ R2(a, b, c, d, e, f, g, h, 24, 0, XW); SCHED_W_0(30, W0, W1, W2, W3, W4, W5); R2(d, a, b, c, h, e, f, g, 25, 1, XW); SCHED_W_1(30, W0, W1, W2, W3, W4, W5); R2(c, d, a, b, g, h, e, f, 26, 2, XW); SCHED_W_2(30, W0, W1, W2, W3, W4, W5); /* Transform 27-29 + Precalc 33-35 */ R2(b, c, d, a, f, g, h, e, 27, 0, XW); SCHED_W_0(33, W1, W2, W3, W4, W5, W0); R2(a, b, c, d, e, f, g, h, 28, 1, XW); SCHED_W_1(33, W1, W2, W3, W4, W5, W0); R2(d, a, b, c, h, e, f, g, 29, 2, XW); SCHED_W_2(33, W1, W2, W3, W4, W5, W0); /* Transform 30-32 + Precalc 36-38 */ R2(c, d, a, b, g, h, e, f, 30, 0, XW); SCHED_W_0(36, W2, W3, W4, W5, W0, W1); R2(b, c, d, a, f, g, h, e, 31, 1, XW); SCHED_W_1(36, W2, W3, W4, W5, W0, W1); R2(a, b, c, d, e, f, g, h, 32, 2, XW); SCHED_W_2(36, W2, W3, W4, W5, W0, W1); /* Transform 33-35 + Precalc 39-41 */ R2(d, a, b, c, h, e, f, g, 33, 0, XW); SCHED_W_0(39, W3, W4, W5, W0, W1, W2); R2(c, d, a, b, g, h, e, f, 34, 1, XW); SCHED_W_1(39, W3, W4, W5, W0, W1, W2); R2(b, c, d, a, f, g, h, e, 35, 2, XW); SCHED_W_2(39, W3, W4, W5, W0, W1, W2); /* Transform 36-38 + Precalc 42-44 */ R2(a, b, c, d, e, f, g, h, 36, 0, XW); SCHED_W_0(42, W4, W5, W0, W1, W2, W3); R2(d, a, b, c, h, e, f, g, 37, 1, XW); SCHED_W_1(42, W4, W5, W0, W1, W2, W3); R2(c, d, a, b, g, h, e, f, 38, 2, XW); SCHED_W_2(42, W4, W5, W0, W1, W2, W3); /* Transform 39-41 + Precalc 45-47 */ R2(b, c, d, a, f, g, h, e, 39, 0, XW); SCHED_W_0(45, W5, W0, W1, W2, W3, W4); R2(a, b, c, d, e, f, g, h, 40, 1, XW); SCHED_W_1(45, W5, W0, W1, W2, W3, W4); R2(d, a, b, c, h, e, f, g, 41, 2, XW); SCHED_W_2(45, W5, W0, W1, W2, W3, W4); /* Transform 42-44 + Precalc 48-50 */ R2(c, d, a, b, g, h, e, f, 42, 0, XW); SCHED_W_0(48, W0, W1, W2, W3, W4, W5); R2(b, c, d, a, f, g, h, e, 43, 1, XW); SCHED_W_1(48, W0, W1, W2, W3, W4, W5); R2(a, b, c, d, e, f, g, h, 44, 2, XW); SCHED_W_2(48, W0, W1, W2, W3, W4, W5); /* Transform 45-47 + Precalc 51-53 */ R2(d, a, b, c, h, e, f, g, 45, 0, XW); SCHED_W_0(51, W1, W2, W3, W4, W5, W0); R2(c, d, a, b, g, h, e, f, 46, 1, XW); SCHED_W_1(51, W1, W2, W3, W4, W5, W0); R2(b, c, d, a, f, g, h, e, 47, 2, XW); SCHED_W_2(51, 
W1, W2, W3, W4, W5, W0); /* Transform 48-50 + Precalc 54-56 */ R2(a, b, c, d, e, f, g, h, 48, 0, XW); SCHED_W_0(54, W2, W3, W4, W5, W0, W1); R2(d, a, b, c, h, e, f, g, 49, 1, XW); SCHED_W_1(54, W2, W3, W4, W5, W0, W1); R2(c, d, a, b, g, h, e, f, 50, 2, XW); SCHED_W_2(54, W2, W3, W4, W5, W0, W1); /* Transform 51-53 + Precalc 57-59 */ R2(b, c, d, a, f, g, h, e, 51, 0, XW); SCHED_W_0(57, W3, W4, W5, W0, W1, W2); R2(a, b, c, d, e, f, g, h, 52, 1, XW); SCHED_W_1(57, W3, W4, W5, W0, W1, W2); R2(d, a, b, c, h, e, f, g, 53, 2, XW); SCHED_W_2(57, W3, W4, W5, W0, W1, W2); /* Transform 54-56 + Precalc 60-62 */ R2(c, d, a, b, g, h, e, f, 54, 0, XW); SCHED_W_0(60, W4, W5, W0, W1, W2, W3); R2(b, c, d, a, f, g, h, e, 55, 1, XW); SCHED_W_1(60, W4, W5, W0, W1, W2, W3); R2(a, b, c, d, e, f, g, h, 56, 2, XW); SCHED_W_2(60, W4, W5, W0, W1, W2, W3); /* Transform 57-59 + Precalc 63 */ R2(d, a, b, c, h, e, f, g, 57, 0, XW); SCHED_W_0(63, W5, W0, W1, W2, W3, W4); R2(c, d, a, b, g, h, e, f, 58, 1, XW); R2(b, c, d, a, f, g, h, e, 59, 2, XW); SCHED_W_1(63, W5, W0, W1, W2, W3, W4); /* Transform 60-62 + Precalc 63 */ R2(a, b, c, d, e, f, g, h, 60, 0, XW); R2(d, a, b, c, h, e, f, g, 61, 1, XW); SCHED_W_2(63, W5, W0, W1, W2, W3, W4); R2(c, d, a, b, g, h, e, f, 62, 2, XW); /* Transform 63 */ R2(b, c, d, a, f, g, h, e, 63, 0, XW); /* Update the chaining variables. */ xorl state_h0(RSTATE), a; xorl state_h1(RSTATE), b; xorl state_h2(RSTATE), c; xorl state_h3(RSTATE), d; movl a, state_h0(RSTATE); movl b, state_h1(RSTATE); movl c, state_h2(RSTATE); movl d, state_h3(RSTATE); xorl state_h4(RSTATE), e; xorl state_h5(RSTATE), f; xorl state_h6(RSTATE), g; xorl state_h7(RSTATE), h; movl e, state_h4(RSTATE); movl f, state_h5(RSTATE); movl g, state_h6(RSTATE); movl h, state_h7(RSTATE); cmpq $0, RNBLKS; jne .Loop; vzeroall; movq (STACK_REG_SAVE + 0 * 8)(%rsp), %rbx; CFI_RESTORE(%rbx); movq (STACK_REG_SAVE + 1 * 8)(%rsp), %r15; CFI_RESTORE(%r15); movq (STACK_REG_SAVE + 2 * 8)(%rsp), %r14; CFI_RESTORE(%r14); movq (STACK_REG_SAVE + 3 * 8)(%rsp), %r13; CFI_RESTORE(%r13); movq (STACK_REG_SAVE + 4 * 8)(%rsp), %r12; CFI_RESTORE(%r12); vmovdqa %xmm0, IW_W1_ADDR(0, 0); vmovdqa %xmm0, IW_W1W2_ADDR(0, 0); vmovdqa %xmm0, IW_W1_ADDR(4, 0); vmovdqa %xmm0, IW_W1W2_ADDR(4, 0); vmovdqa %xmm0, IW_W1_ADDR(8, 0); vmovdqa %xmm0, IW_W1W2_ADDR(8, 0); xorl %eax, %eax; /* stack burned */ leave; CFI_LEAVE(); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_sm3_transform_amd64_avx_bmi2, .-_gcry_sm3_transform_amd64_avx_bmi2;) #endif #endif diff --git a/cipher/sm4-aesni-avx-amd64.S b/cipher/sm4-aesni-avx-amd64.S index 3610b98c..7a99e070 100644 --- a/cipher/sm4-aesni-avx-amd64.S +++ b/cipher/sm4-aesni-avx-amd64.S @@ -1,987 +1,987 @@ /* sm4-avx-aesni-amd64.S - AES-NI/AVX implementation of SM4 cipher * * Copyright (C) 2020 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . */ /* Based on SM4 AES-NI work by Markku-Juhani O. 
Saarinen at: * https://github.com/mjosaarinen/sm4ni */ #include #ifdef __x86_64 #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX_SUPPORT) #include "asm-common-amd64.h" /* vector registers */ #define RX0 %xmm0 #define RX1 %xmm1 #define MASK_4BIT %xmm2 #define RTMP0 %xmm3 #define RTMP1 %xmm4 #define RTMP2 %xmm5 #define RTMP3 %xmm6 #define RTMP4 %xmm7 #define RA0 %xmm8 #define RA1 %xmm9 #define RA2 %xmm10 #define RA3 %xmm11 #define RB0 %xmm12 #define RB1 %xmm13 #define RB2 %xmm14 #define RB3 %xmm15 #define RNOT %xmm0 #define RBSWAP %xmm1 /********************************************************************** helper macros **********************************************************************/ /* Transpose four 32-bit words between 128-bit vectors. */ #define transpose_4x4(x0, x1, x2, x3, t1, t2) \ vpunpckhdq x1, x0, t2; \ vpunpckldq x1, x0, x0; \ \ vpunpckldq x3, x2, t1; \ vpunpckhdq x3, x2, x2; \ \ vpunpckhqdq t1, x0, x1; \ vpunpcklqdq t1, x0, x0; \ \ vpunpckhqdq x2, t2, x3; \ vpunpcklqdq x2, t2, x2; /* post-SubByte transform. */ #define transform_pre(x, lo_t, hi_t, mask4bit, tmp0) \ vpand x, mask4bit, tmp0; \ vpandn x, mask4bit, x; \ vpsrld $4, x, x; \ \ vpshufb tmp0, lo_t, tmp0; \ vpshufb x, hi_t, x; \ vpxor tmp0, x, x; /* post-SubByte transform. Note: x has been XOR'ed with mask4bit by * 'vaeslastenc' instruction. */ #define transform_post(x, lo_t, hi_t, mask4bit, tmp0) \ vpandn mask4bit, x, tmp0; \ vpsrld $4, x, x; \ vpand x, mask4bit, x; \ \ vpshufb tmp0, lo_t, tmp0; \ vpshufb x, hi_t, x; \ vpxor tmp0, x, x; /********************************************************************** 4-way && 8-way SM4 with AES-NI and AVX **********************************************************************/ .text .align 16 /* * Following four affine transform look-up tables are from work by * Markku-Juhani O. Saarinen, at https://github.com/mjosaarinen/sm4ni * * These allow exposing SM4 S-Box from AES SubByte. */ /* pre-SubByte affine transform, from SM4 field to AES field. */ .Lpre_tf_lo_s: .quad 0x9197E2E474720701, 0xC7C1B4B222245157 .Lpre_tf_hi_s: .quad 0xE240AB09EB49A200, 0xF052B91BF95BB012 /* post-SubByte affine transform, from AES field to SM4 field. 
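The affine transform tables here let the ROUND macros further down obtain the SM4 S-box output from AES AESENCLAST; apart from that vectorized S-box trick, each round is the standard SM4 step sketched below. The sm4_sbox table is assumed (the standard SM4 S-box, not shown) and the names are illustrative:

    #include <stdint.h>

    extern const uint8_t sm4_sbox[256];   /* standard SM4 S-box, assumed */

    static inline uint32_t rol32(uint32_t x, unsigned n)
    {
      return (x << n) | (x >> (32 - n));
    }

    static uint32_t sm4_tau(uint32_t x)   /* S-box applied to each byte */
    {
      return ((uint32_t)sm4_sbox[(x >> 24) & 0xff] << 24) |
             ((uint32_t)sm4_sbox[(x >> 16) & 0xff] << 16) |
             ((uint32_t)sm4_sbox[(x >>  8) & 0xff] <<  8) |
              (uint32_t)sm4_sbox[x & 0xff];
    }

    /* One encryption round: s0 ^= L(tau(s1 ^ s2 ^ s3 ^ rk)), with
     * L(x) = x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24).
     * The key-expansion ROUND uses x ^ rol(x,13) ^ rol(x,23) instead. */
    static void sm4_round(uint32_t s[4], uint32_t rk)
    {
      uint32_t x = sm4_tau(s[1] ^ s[2] ^ s[3] ^ rk);
      s[0] ^= x ^ rol32(x, 2) ^ rol32(x, 10) ^ rol32(x, 18) ^ rol32(x, 24);
    }

As with the SHA-512 and SM3 code, the word rotation between rounds is done by rotating the macro arguments rather than moving registers.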
*/ .Lpost_tf_lo_s: .quad 0x5B67F2CEA19D0834, 0xEDD14478172BBE82 .Lpost_tf_hi_s: .quad 0xAE7201DD73AFDC00, 0x11CDBE62CC1063BF /* For isolating SubBytes from AESENCLAST, inverse shift row */ .Linv_shift_row: .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03 /* Inverse shift row + Rotate left by 8 bits on 32-bit words with vpshufb */ .Linv_shift_row_rol_8: .byte 0x07, 0x00, 0x0d, 0x0a, 0x0b, 0x04, 0x01, 0x0e .byte 0x0f, 0x08, 0x05, 0x02, 0x03, 0x0c, 0x09, 0x06 /* Inverse shift row + Rotate left by 16 bits on 32-bit words with vpshufb */ .Linv_shift_row_rol_16: .byte 0x0a, 0x07, 0x00, 0x0d, 0x0e, 0x0b, 0x04, 0x01 .byte 0x02, 0x0f, 0x08, 0x05, 0x06, 0x03, 0x0c, 0x09 /* Inverse shift row + Rotate left by 24 bits on 32-bit words with vpshufb */ .Linv_shift_row_rol_24: .byte 0x0d, 0x0a, 0x07, 0x00, 0x01, 0x0e, 0x0b, 0x04 .byte 0x05, 0x02, 0x0f, 0x08, 0x09, 0x06, 0x03, 0x0c /* For CTR-mode IV byteswap */ .Lbswap128_mask: .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 /* For input word byte-swap */ .Lbswap32_mask: .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 .align 4 /* 4-bit mask */ .L0f0f0f0f: .long 0x0f0f0f0f .align 8 .globl _gcry_sm4_aesni_avx_expand_key ELF(.type _gcry_sm4_aesni_avx_expand_key,@function;) _gcry_sm4_aesni_avx_expand_key: /* input: * %rdi: 128-bit key * %rsi: rkey_enc * %rdx: rkey_dec * %rcx: fk array * %r8: ck array */ CFI_STARTPROC(); vmovd 0*4(%rdi), RA0; vmovd 1*4(%rdi), RA1; vmovd 2*4(%rdi), RA2; vmovd 3*4(%rdi), RA3; vmovdqa .Lbswap32_mask rRIP, RTMP2; vpshufb RTMP2, RA0, RA0; vpshufb RTMP2, RA1, RA1; vpshufb RTMP2, RA2, RA2; vpshufb RTMP2, RA3, RA3; vmovd 0*4(%rcx), RB0; vmovd 1*4(%rcx), RB1; vmovd 2*4(%rcx), RB2; vmovd 3*4(%rcx), RB3; vpxor RB0, RA0, RA0; vpxor RB1, RA1, RA1; vpxor RB2, RA2, RA2; vpxor RB3, RA3, RA3; vbroadcastss .L0f0f0f0f rRIP, MASK_4BIT; vmovdqa .Lpre_tf_lo_s rRIP, RTMP4; vmovdqa .Lpre_tf_hi_s rRIP, RB0; vmovdqa .Lpost_tf_lo_s rRIP, RB1; vmovdqa .Lpost_tf_hi_s rRIP, RB2; vmovdqa .Linv_shift_row rRIP, RB3; #define ROUND(round, s0, s1, s2, s3) \ vbroadcastss (4*(round))(%r8), RX0; \ vpxor s1, RX0, RX0; \ vpxor s2, RX0, RX0; \ vpxor s3, RX0, RX0; /* s1 ^ s2 ^ s3 ^ rk */ \ \ /* sbox, non-linear part */ \ transform_pre(RX0, RTMP4, RB0, MASK_4BIT, RTMP0); \ vaesenclast MASK_4BIT, RX0, RX0; \ transform_post(RX0, RB1, RB2, MASK_4BIT, RTMP0); \ \ /* linear part */ \ vpshufb RB3, RX0, RX0; \ vpxor RX0, s0, s0; /* s0 ^ x */ \ vpslld $13, RX0, RTMP0; \ vpsrld $19, RX0, RTMP1; \ vpslld $23, RX0, RTMP2; \ vpsrld $9, RX0, RTMP3; \ vpxor RTMP0, RTMP1, RTMP1; \ vpxor RTMP2, RTMP3, RTMP3; \ vpxor RTMP1, s0, s0; /* s0 ^ x ^ rol(x,13) */ \ vpxor RTMP3, s0, s0; /* s0 ^ x ^ rol(x,13) ^ rol(x,23) */ leaq (32*4)(%r8), %rax; leaq (32*4)(%rdx), %rdx; .align 16 .Lroundloop_expand_key: leaq (-4*4)(%rdx), %rdx; ROUND(0, RA0, RA1, RA2, RA3); ROUND(1, RA1, RA2, RA3, RA0); ROUND(2, RA2, RA3, RA0, RA1); ROUND(3, RA3, RA0, RA1, RA2); leaq (4*4)(%r8), %r8; vmovd RA0, (0*4)(%rsi); vmovd RA1, (1*4)(%rsi); vmovd RA2, (2*4)(%rsi); vmovd RA3, (3*4)(%rsi); vmovd RA0, (3*4)(%rdx); vmovd RA1, (2*4)(%rdx); vmovd RA2, (1*4)(%rdx); vmovd RA3, (0*4)(%rdx); leaq (4*4)(%rsi), %rsi; cmpq %rax, %r8; jne .Lroundloop_expand_key; #undef ROUND vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_sm4_aesni_avx_expand_key,.-_gcry_sm4_aesni_avx_expand_key;) .align 8 ELF(.type sm4_aesni_avx_crypt_blk1_4,@function;) sm4_aesni_avx_crypt_blk1_4: /* input: * %rdi: round key array, CTX * %rsi: dst (1..4 blocks) * %rdx: src 
(1..4 blocks) * %rcx: num blocks (1..4) */ CFI_STARTPROC(); vmovdqu 0*16(%rdx), RA0; vmovdqa RA0, RA1; vmovdqa RA0, RA2; vmovdqa RA0, RA3; cmpq $2, %rcx; jb .Lblk4_load_input_done; vmovdqu 1*16(%rdx), RA1; je .Lblk4_load_input_done; vmovdqu 2*16(%rdx), RA2; cmpq $3, %rcx; je .Lblk4_load_input_done; vmovdqu 3*16(%rdx), RA3; .Lblk4_load_input_done: vmovdqa .Lbswap32_mask rRIP, RTMP2; vpshufb RTMP2, RA0, RA0; vpshufb RTMP2, RA1, RA1; vpshufb RTMP2, RA2, RA2; vpshufb RTMP2, RA3, RA3; vbroadcastss .L0f0f0f0f rRIP, MASK_4BIT; vmovdqa .Lpre_tf_lo_s rRIP, RTMP4; vmovdqa .Lpre_tf_hi_s rRIP, RB0; vmovdqa .Lpost_tf_lo_s rRIP, RB1; vmovdqa .Lpost_tf_hi_s rRIP, RB2; vmovdqa .Linv_shift_row rRIP, RB3; vmovdqa .Linv_shift_row_rol_8 rRIP, RTMP2; vmovdqa .Linv_shift_row_rol_16 rRIP, RTMP3; transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1); #define ROUND(round, s0, s1, s2, s3) \ vbroadcastss (4*(round))(%rdi), RX0; \ vpxor s1, RX0, RX0; \ vpxor s2, RX0, RX0; \ vpxor s3, RX0, RX0; /* s1 ^ s2 ^ s3 ^ rk */ \ \ /* sbox, non-linear part */ \ transform_pre(RX0, RTMP4, RB0, MASK_4BIT, RTMP0); \ vaesenclast MASK_4BIT, RX0, RX0; \ transform_post(RX0, RB1, RB2, MASK_4BIT, RTMP0); \ \ /* linear part */ \ vpshufb RB3, RX0, RTMP0; \ vpxor RTMP0, s0, s0; /* s0 ^ x */ \ vpshufb RTMP2, RX0, RTMP1; \ vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) */ \ vpshufb RTMP3, RX0, RTMP1; \ vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) ^ rol(x,16) */ \ vpshufb .Linv_shift_row_rol_24 rRIP, RX0, RTMP1; \ vpxor RTMP1, s0, s0; /* s0 ^ x ^ rol(x,24) */ \ vpslld $2, RTMP0, RTMP1; \ vpsrld $30, RTMP0, RTMP0; \ vpxor RTMP0, s0, s0; \ vpxor RTMP1, s0, s0; /* s0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ leaq (32*4)(%rdi), %rax; .align 16 .Lroundloop_blk4: ROUND(0, RA0, RA1, RA2, RA3); ROUND(1, RA1, RA2, RA3, RA0); ROUND(2, RA2, RA3, RA0, RA1); ROUND(3, RA3, RA0, RA1, RA2); leaq (4*4)(%rdi), %rdi; cmpq %rax, %rdi; jne .Lroundloop_blk4; #undef ROUND vmovdqa .Lbswap128_mask rRIP, RTMP2; transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1); vpshufb RTMP2, RA0, RA0; vpshufb RTMP2, RA1, RA1; vpshufb RTMP2, RA2, RA2; vpshufb RTMP2, RA3, RA3; vmovdqu RA0, 0*16(%rsi); cmpq $2, %rcx; jb .Lblk4_store_output_done; vmovdqu RA1, 1*16(%rsi); je .Lblk4_store_output_done; vmovdqu RA2, 2*16(%rsi); cmpq $3, %rcx; je .Lblk4_store_output_done; vmovdqu RA3, 3*16(%rsi); .Lblk4_store_output_done: vzeroall; xorl %eax, %eax; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size sm4_aesni_avx_crypt_blk1_4,.-sm4_aesni_avx_crypt_blk1_4;) .align 8 ELF(.type __sm4_crypt_blk8,@function;) __sm4_crypt_blk8: /* input: * %rdi: round key array, CTX * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel * ciphertext blocks * output: * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel plaintext * blocks */ CFI_STARTPROC(); vmovdqa .Lbswap32_mask rRIP, RTMP2; vpshufb RTMP2, RA0, RA0; vpshufb RTMP2, RA1, RA1; vpshufb RTMP2, RA2, RA2; vpshufb RTMP2, RA3, RA3; vpshufb RTMP2, RB0, RB0; vpshufb RTMP2, RB1, RB1; vpshufb RTMP2, RB2, RB2; vpshufb RTMP2, RB3, RB3; vbroadcastss .L0f0f0f0f rRIP, MASK_4BIT; transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1); transpose_4x4(RB0, RB1, RB2, RB3, RTMP0, RTMP1); #define ROUND(round, s0, s1, s2, s3, r0, r1, r2, r3) \ vbroadcastss (4*(round))(%rdi), RX0; \ vmovdqa .Lpre_tf_lo_s rRIP, RTMP4; \ vmovdqa .Lpre_tf_hi_s rRIP, RTMP1; \ vmovdqa RX0, RX1; \ vpxor s1, RX0, RX0; \ vpxor s2, RX0, RX0; \ vpxor s3, RX0, RX0; /* s1 ^ s2 ^ s3 ^ rk */ \ vmovdqa .Lpost_tf_lo_s rRIP, RTMP2; \ vmovdqa .Lpost_tf_hi_s rRIP, RTMP3; \ vpxor r1, RX1, RX1; \ vpxor r2, RX1, RX1; \ 
vpxor r3, RX1, RX1; /* r1 ^ r2 ^ r3 ^ rk */ \ \ /* sbox, non-linear part */ \ transform_pre(RX0, RTMP4, RTMP1, MASK_4BIT, RTMP0); \ transform_pre(RX1, RTMP4, RTMP1, MASK_4BIT, RTMP0); \ vmovdqa .Linv_shift_row rRIP, RTMP4; \ vaesenclast MASK_4BIT, RX0, RX0; \ vaesenclast MASK_4BIT, RX1, RX1; \ transform_post(RX0, RTMP2, RTMP3, MASK_4BIT, RTMP0); \ transform_post(RX1, RTMP2, RTMP3, MASK_4BIT, RTMP0); \ \ /* linear part */ \ vpshufb RTMP4, RX0, RTMP0; \ vpxor RTMP0, s0, s0; /* s0 ^ x */ \ vpshufb RTMP4, RX1, RTMP2; \ vmovdqa .Linv_shift_row_rol_8 rRIP, RTMP4; \ vpxor RTMP2, r0, r0; /* r0 ^ x */ \ vpshufb RTMP4, RX0, RTMP1; \ vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) */ \ vpshufb RTMP4, RX1, RTMP3; \ vmovdqa .Linv_shift_row_rol_16 rRIP, RTMP4; \ vpxor RTMP3, RTMP2, RTMP2; /* x ^ rol(x,8) */ \ vpshufb RTMP4, RX0, RTMP1; \ vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) ^ rol(x,16) */ \ vpshufb RTMP4, RX1, RTMP3; \ vmovdqa .Linv_shift_row_rol_24 rRIP, RTMP4; \ vpxor RTMP3, RTMP2, RTMP2; /* x ^ rol(x,8) ^ rol(x,16) */ \ vpshufb RTMP4, RX0, RTMP1; \ vpxor RTMP1, s0, s0; /* s0 ^ x ^ rol(x,24) */ \ vpslld $2, RTMP0, RTMP1; \ vpsrld $30, RTMP0, RTMP0; \ vpxor RTMP0, s0, s0; \ vpxor RTMP1, s0, s0; /* s0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \ vpshufb RTMP4, RX1, RTMP3; \ vpxor RTMP3, r0, r0; /* r0 ^ x ^ rol(x,24) */ \ vpslld $2, RTMP2, RTMP3; \ vpsrld $30, RTMP2, RTMP2; \ vpxor RTMP2, r0, r0; \ vpxor RTMP3, r0, r0; /* r0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ leaq (32*4)(%rdi), %rax; .align 16 .Lroundloop_blk8: ROUND(0, RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3); ROUND(1, RA1, RA2, RA3, RA0, RB1, RB2, RB3, RB0); ROUND(2, RA2, RA3, RA0, RA1, RB2, RB3, RB0, RB1); ROUND(3, RA3, RA0, RA1, RA2, RB3, RB0, RB1, RB2); leaq (4*4)(%rdi), %rdi; cmpq %rax, %rdi; jne .Lroundloop_blk8; #undef ROUND vmovdqa .Lbswap128_mask rRIP, RTMP2; transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1); transpose_4x4(RB0, RB1, RB2, RB3, RTMP0, RTMP1); vpshufb RTMP2, RA0, RA0; vpshufb RTMP2, RA1, RA1; vpshufb RTMP2, RA2, RA2; vpshufb RTMP2, RA3, RA3; vpshufb RTMP2, RB0, RB0; vpshufb RTMP2, RB1, RB1; vpshufb RTMP2, RB2, RB2; vpshufb RTMP2, RB3, RB3; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size __sm4_crypt_blk8,.-__sm4_crypt_blk8;) .align 8 .globl _gcry_sm4_aesni_avx_crypt_blk1_8 ELF(.type _gcry_sm4_aesni_avx_crypt_blk1_8,@function;) _gcry_sm4_aesni_avx_crypt_blk1_8: /* input: * %rdi: round key array, CTX * %rsi: dst (1..8 blocks) * %rdx: src (1..8 blocks) * %rcx: num blocks (1..8) */ CFI_STARTPROC(); cmpq $5, %rcx; jb sm4_aesni_avx_crypt_blk1_4; vmovdqu (0 * 16)(%rdx), RA0; vmovdqu (1 * 16)(%rdx), RA1; vmovdqu (2 * 16)(%rdx), RA2; vmovdqu (3 * 16)(%rdx), RA3; vmovdqu (4 * 16)(%rdx), RB0; vmovdqa RB0, RB1; vmovdqa RB0, RB2; vmovdqa RB0, RB3; je .Lblk8_load_input_done; vmovdqu (5 * 16)(%rdx), RB1; cmpq $7, %rcx; jb .Lblk8_load_input_done; vmovdqu (6 * 16)(%rdx), RB2; je .Lblk8_load_input_done; vmovdqu (7 * 16)(%rdx), RB3; .Lblk8_load_input_done: call __sm4_crypt_blk8; cmpq $6, %rcx; vmovdqu RA0, (0 * 16)(%rsi); vmovdqu RA1, (1 * 16)(%rsi); vmovdqu RA2, (2 * 16)(%rsi); vmovdqu RA3, (3 * 16)(%rsi); vmovdqu RB0, (4 * 16)(%rsi); jb .Lblk8_store_output_done; vmovdqu RB1, (5 * 16)(%rsi); je .Lblk8_store_output_done; vmovdqu RB2, (6 * 16)(%rsi); cmpq $7, %rcx; je .Lblk8_store_output_done; vmovdqu RB3, (7 * 16)(%rsi); .Lblk8_store_output_done: vzeroall; xorl %eax, %eax; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_sm4_aesni_avx_crypt_blk1_8,.-_gcry_sm4_aesni_avx_crypt_blk1_8;) .align 8 .globl 
_gcry_sm4_aesni_avx_ctr_enc ELF(.type _gcry_sm4_aesni_avx_ctr_enc,@function;) _gcry_sm4_aesni_avx_ctr_enc: /* input: * %rdi: round key array, CTX * %rsi: dst (8 blocks) * %rdx: src (8 blocks) * %rcx: iv (big endian, 128bit) */ CFI_STARTPROC(); /* load IV and byteswap */ vmovdqu (%rcx), RA0; vmovdqa .Lbswap128_mask rRIP, RBSWAP; vpshufb RBSWAP, RA0, RTMP0; /* be => le */ vpcmpeqd RNOT, RNOT, RNOT; vpsrldq $8, RNOT, RNOT; /* low: -1, high: 0 */ #define inc_le128(x, minus_one, tmp) \ vpcmpeqq minus_one, x, tmp; \ vpsubq minus_one, x, x; \ vpslldq $8, tmp, tmp; \ vpsubq tmp, x, x; /* construct IVs */ inc_le128(RTMP0, RNOT, RTMP2); /* +1 */ vpshufb RBSWAP, RTMP0, RA1; inc_le128(RTMP0, RNOT, RTMP2); /* +2 */ vpshufb RBSWAP, RTMP0, RA2; inc_le128(RTMP0, RNOT, RTMP2); /* +3 */ vpshufb RBSWAP, RTMP0, RA3; inc_le128(RTMP0, RNOT, RTMP2); /* +4 */ vpshufb RBSWAP, RTMP0, RB0; inc_le128(RTMP0, RNOT, RTMP2); /* +5 */ vpshufb RBSWAP, RTMP0, RB1; inc_le128(RTMP0, RNOT, RTMP2); /* +6 */ vpshufb RBSWAP, RTMP0, RB2; inc_le128(RTMP0, RNOT, RTMP2); /* +7 */ vpshufb RBSWAP, RTMP0, RB3; inc_le128(RTMP0, RNOT, RTMP2); /* +8 */ vpshufb RBSWAP, RTMP0, RTMP1; /* store new IV */ vmovdqu RTMP1, (%rcx); call __sm4_crypt_blk8; vpxor (0 * 16)(%rdx), RA0, RA0; vpxor (1 * 16)(%rdx), RA1, RA1; vpxor (2 * 16)(%rdx), RA2, RA2; vpxor (3 * 16)(%rdx), RA3, RA3; vpxor (4 * 16)(%rdx), RB0, RB0; vpxor (5 * 16)(%rdx), RB1, RB1; vpxor (6 * 16)(%rdx), RB2, RB2; vpxor (7 * 16)(%rdx), RB3, RB3; vmovdqu RA0, (0 * 16)(%rsi); vmovdqu RA1, (1 * 16)(%rsi); vmovdqu RA2, (2 * 16)(%rsi); vmovdqu RA3, (3 * 16)(%rsi); vmovdqu RB0, (4 * 16)(%rsi); vmovdqu RB1, (5 * 16)(%rsi); vmovdqu RB2, (6 * 16)(%rsi); vmovdqu RB3, (7 * 16)(%rsi); vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_sm4_aesni_avx_ctr_enc,.-_gcry_sm4_aesni_avx_ctr_enc;) .align 8 .globl _gcry_sm4_aesni_avx_cbc_dec ELF(.type _gcry_sm4_aesni_avx_cbc_dec,@function;) _gcry_sm4_aesni_avx_cbc_dec: /* input: * %rdi: round key array, CTX * %rsi: dst (8 blocks) * %rdx: src (8 blocks) * %rcx: iv */ CFI_STARTPROC(); vmovdqu (0 * 16)(%rdx), RA0; vmovdqu (1 * 16)(%rdx), RA1; vmovdqu (2 * 16)(%rdx), RA2; vmovdqu (3 * 16)(%rdx), RA3; vmovdqu (4 * 16)(%rdx), RB0; vmovdqu (5 * 16)(%rdx), RB1; vmovdqu (6 * 16)(%rdx), RB2; vmovdqu (7 * 16)(%rdx), RB3; call __sm4_crypt_blk8; vmovdqu (7 * 16)(%rdx), RNOT; vpxor (%rcx), RA0, RA0; vpxor (0 * 16)(%rdx), RA1, RA1; vpxor (1 * 16)(%rdx), RA2, RA2; vpxor (2 * 16)(%rdx), RA3, RA3; vpxor (3 * 16)(%rdx), RB0, RB0; vpxor (4 * 16)(%rdx), RB1, RB1; vpxor (5 * 16)(%rdx), RB2, RB2; vpxor (6 * 16)(%rdx), RB3, RB3; vmovdqu RNOT, (%rcx); /* store new IV */ vmovdqu RA0, (0 * 16)(%rsi); vmovdqu RA1, (1 * 16)(%rsi); vmovdqu RA2, (2 * 16)(%rsi); vmovdqu RA3, (3 * 16)(%rsi); vmovdqu RB0, (4 * 16)(%rsi); vmovdqu RB1, (5 * 16)(%rsi); vmovdqu RB2, (6 * 16)(%rsi); vmovdqu RB3, (7 * 16)(%rsi); vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_sm4_aesni_avx_cbc_dec,.-_gcry_sm4_aesni_avx_cbc_dec;) .align 8 .globl _gcry_sm4_aesni_avx_cfb_dec ELF(.type _gcry_sm4_aesni_avx_cfb_dec,@function;) _gcry_sm4_aesni_avx_cfb_dec: /* input: * %rdi: round key array, CTX * %rsi: dst (8 blocks) * %rdx: src (8 blocks) * %rcx: iv */ CFI_STARTPROC(); /* Load input */ vmovdqu (%rcx), RA0; vmovdqu 0 * 16(%rdx), RA1; vmovdqu 1 * 16(%rdx), RA2; vmovdqu 2 * 16(%rdx), RA3; vmovdqu 3 * 16(%rdx), RB0; vmovdqu 4 * 16(%rdx), RB1; vmovdqu 5 * 16(%rdx), RB2; vmovdqu 6 * 16(%rdx), RB3; /* Update IV */ vmovdqu 7 * 16(%rdx), RNOT; vmovdqu RNOT, (%rcx); call __sm4_crypt_blk8; vpxor (0 * 
16)(%rdx), RA0, RA0; vpxor (1 * 16)(%rdx), RA1, RA1; vpxor (2 * 16)(%rdx), RA2, RA2; vpxor (3 * 16)(%rdx), RA3, RA3; vpxor (4 * 16)(%rdx), RB0, RB0; vpxor (5 * 16)(%rdx), RB1, RB1; vpxor (6 * 16)(%rdx), RB2, RB2; vpxor (7 * 16)(%rdx), RB3, RB3; vmovdqu RA0, (0 * 16)(%rsi); vmovdqu RA1, (1 * 16)(%rsi); vmovdqu RA2, (2 * 16)(%rsi); vmovdqu RA3, (3 * 16)(%rsi); vmovdqu RB0, (4 * 16)(%rsi); vmovdqu RB1, (5 * 16)(%rsi); vmovdqu RB2, (6 * 16)(%rsi); vmovdqu RB3, (7 * 16)(%rsi); vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_sm4_aesni_avx_cfb_dec,.-_gcry_sm4_aesni_avx_cfb_dec;) .align 8 .globl _gcry_sm4_aesni_avx_ocb_enc ELF(.type _gcry_sm4_aesni_avx_ocb_enc,@function;) _gcry_sm4_aesni_avx_ocb_enc: /* input: * %rdi: round key array, CTX * %rsi: dst (8 blocks) * %rdx: src (8 blocks) * %rcx: offset * %r8 : checksum * %r9 : L pointers (void *L[8]) */ CFI_STARTPROC(); subq $(4 * 8), %rsp; CFI_ADJUST_CFA_OFFSET(4 * 8); movq %r10, (0 * 8)(%rsp); movq %r11, (1 * 8)(%rsp); movq %r12, (2 * 8)(%rsp); movq %r13, (3 * 8)(%rsp); CFI_REL_OFFSET(%r10, 0 * 8); CFI_REL_OFFSET(%r11, 1 * 8); CFI_REL_OFFSET(%r12, 2 * 8); CFI_REL_OFFSET(%r13, 3 * 8); vmovdqu (%rcx), RTMP0; vmovdqu (%r8), RTMP1; /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* Checksum_i = Checksum_{i-1} xor P_i */ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ #define OCB_INPUT(n, lreg, xreg) \ vmovdqu (n * 16)(%rdx), xreg; \ vpxor (lreg), RTMP0, RTMP0; \ vpxor xreg, RTMP1, RTMP1; \ vpxor RTMP0, xreg, xreg; \ vmovdqu RTMP0, (n * 16)(%rsi); movq (0 * 8)(%r9), %r10; movq (1 * 8)(%r9), %r11; movq (2 * 8)(%r9), %r12; movq (3 * 8)(%r9), %r13; OCB_INPUT(0, %r10, RA0); OCB_INPUT(1, %r11, RA1); OCB_INPUT(2, %r12, RA2); OCB_INPUT(3, %r13, RA3); movq (4 * 8)(%r9), %r10; movq (5 * 8)(%r9), %r11; movq (6 * 8)(%r9), %r12; movq (7 * 8)(%r9), %r13; OCB_INPUT(4, %r10, RB0); OCB_INPUT(5, %r11, RB1); OCB_INPUT(6, %r12, RB2); OCB_INPUT(7, %r13, RB3); #undef OCB_INPUT vmovdqu RTMP0, (%rcx); vmovdqu RTMP1, (%r8); movq (0 * 8)(%rsp), %r10; CFI_RESTORE(%r10); movq (1 * 8)(%rsp), %r11; CFI_RESTORE(%r11); movq (2 * 8)(%rsp), %r12; CFI_RESTORE(%r12); movq (3 * 8)(%rsp), %r13; CFI_RESTORE(%r13); call __sm4_crypt_blk8; addq $(4 * 8), %rsp; CFI_ADJUST_CFA_OFFSET(-4 * 8); vpxor (0 * 16)(%rsi), RA0, RA0; vpxor (1 * 16)(%rsi), RA1, RA1; vpxor (2 * 16)(%rsi), RA2, RA2; vpxor (3 * 16)(%rsi), RA3, RA3; vpxor (4 * 16)(%rsi), RB0, RB0; vpxor (5 * 16)(%rsi), RB1, RB1; vpxor (6 * 16)(%rsi), RB2, RB2; vpxor (7 * 16)(%rsi), RB3, RB3; vmovdqu RA0, (0 * 16)(%rsi); vmovdqu RA1, (1 * 16)(%rsi); vmovdqu RA2, (2 * 16)(%rsi); vmovdqu RA3, (3 * 16)(%rsi); vmovdqu RB0, (4 * 16)(%rsi); vmovdqu RB1, (5 * 16)(%rsi); vmovdqu RB2, (6 * 16)(%rsi); vmovdqu RB3, (7 * 16)(%rsi); vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_sm4_aesni_avx_ocb_enc,.-_gcry_sm4_aesni_avx_ocb_enc;) .align 8 .globl _gcry_sm4_aesni_avx_ocb_dec ELF(.type _gcry_sm4_aesni_avx_ocb_dec,@function;) _gcry_sm4_aesni_avx_ocb_dec: /* input: * %rdi: round key array, CTX * %rsi: dst (8 blocks) * %rdx: src (8 blocks) * %rcx: offset * %r8 : checksum * %r9 : L pointers (void *L[8]) */ CFI_STARTPROC(); subq $(4 * 8), %rsp; CFI_ADJUST_CFA_OFFSET(4 * 8); movq %r10, (0 * 8)(%rsp); movq %r11, (1 * 8)(%rsp); movq %r12, (2 * 8)(%rsp); movq %r13, (3 * 8)(%rsp); CFI_REL_OFFSET(%r10, 0 * 8); CFI_REL_OFFSET(%r11, 1 * 8); CFI_REL_OFFSET(%r12, 2 * 8); CFI_REL_OFFSET(%r13, 3 * 8); movdqu (%rcx), RTMP0; /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */ #define 
OCB_INPUT(n, lreg, xreg) \ vmovdqu (n * 16)(%rdx), xreg; \ vpxor (lreg), RTMP0, RTMP0; \ vpxor RTMP0, xreg, xreg; \ vmovdqu RTMP0, (n * 16)(%rsi); movq (0 * 8)(%r9), %r10; movq (1 * 8)(%r9), %r11; movq (2 * 8)(%r9), %r12; movq (3 * 8)(%r9), %r13; OCB_INPUT(0, %r10, RA0); OCB_INPUT(1, %r11, RA1); OCB_INPUT(2, %r12, RA2); OCB_INPUT(3, %r13, RA3); movq (4 * 8)(%r9), %r10; movq (5 * 8)(%r9), %r11; movq (6 * 8)(%r9), %r12; movq (7 * 8)(%r9), %r13; OCB_INPUT(4, %r10, RB0); OCB_INPUT(5, %r11, RB1); OCB_INPUT(6, %r12, RB2); OCB_INPUT(7, %r13, RB3); #undef OCB_INPUT vmovdqu RTMP0, (%rcx); movq (0 * 8)(%rsp), %r10; CFI_RESTORE(%r10); movq (1 * 8)(%rsp), %r11; CFI_RESTORE(%r11); movq (2 * 8)(%rsp), %r12; CFI_RESTORE(%r12); movq (3 * 8)(%rsp), %r13; CFI_RESTORE(%r13); call __sm4_crypt_blk8; addq $(4 * 8), %rsp; CFI_ADJUST_CFA_OFFSET(-4 * 8); vmovdqu (%r8), RTMP0; vpxor (0 * 16)(%rsi), RA0, RA0; vpxor (1 * 16)(%rsi), RA1, RA1; vpxor (2 * 16)(%rsi), RA2, RA2; vpxor (3 * 16)(%rsi), RA3, RA3; vpxor (4 * 16)(%rsi), RB0, RB0; vpxor (5 * 16)(%rsi), RB1, RB1; vpxor (6 * 16)(%rsi), RB2, RB2; vpxor (7 * 16)(%rsi), RB3, RB3; /* Checksum_i = Checksum_{i-1} xor P_i */ vmovdqu RA0, (0 * 16)(%rsi); vpxor RA0, RTMP0, RTMP0; vmovdqu RA1, (1 * 16)(%rsi); vpxor RA1, RTMP0, RTMP0; vmovdqu RA2, (2 * 16)(%rsi); vpxor RA2, RTMP0, RTMP0; vmovdqu RA3, (3 * 16)(%rsi); vpxor RA3, RTMP0, RTMP0; vmovdqu RB0, (4 * 16)(%rsi); vpxor RB0, RTMP0, RTMP0; vmovdqu RB1, (5 * 16)(%rsi); vpxor RB1, RTMP0, RTMP0; vmovdqu RB2, (6 * 16)(%rsi); vpxor RB2, RTMP0, RTMP0; vmovdqu RB3, (7 * 16)(%rsi); vpxor RB3, RTMP0, RTMP0; vmovdqu RTMP0, (%r8); vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_sm4_aesni_avx_ocb_dec,.-_gcry_sm4_aesni_avx_ocb_dec;) .align 8 .globl _gcry_sm4_aesni_avx_ocb_auth ELF(.type _gcry_sm4_aesni_avx_ocb_auth,@function;) _gcry_sm4_aesni_avx_ocb_auth: /* input: * %rdi: round key array, CTX * %rsi: abuf (8 blocks) * %rdx: offset * %rcx: checksum * %r8 : L pointers (void *L[8]) */ CFI_STARTPROC(); subq $(4 * 8), %rsp; CFI_ADJUST_CFA_OFFSET(4 * 8); movq %r10, (0 * 8)(%rsp); movq %r11, (1 * 8)(%rsp); movq %r12, (2 * 8)(%rsp); movq %r13, (3 * 8)(%rsp); CFI_REL_OFFSET(%r10, 0 * 8); CFI_REL_OFFSET(%r11, 1 * 8); CFI_REL_OFFSET(%r12, 2 * 8); CFI_REL_OFFSET(%r13, 3 * 8); vmovdqu (%rdx), RTMP0; /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ #define OCB_INPUT(n, lreg, xreg) \ vmovdqu (n * 16)(%rsi), xreg; \ vpxor (lreg), RTMP0, RTMP0; \ vpxor RTMP0, xreg, xreg; movq (0 * 8)(%r8), %r10; movq (1 * 8)(%r8), %r11; movq (2 * 8)(%r8), %r12; movq (3 * 8)(%r8), %r13; OCB_INPUT(0, %r10, RA0); OCB_INPUT(1, %r11, RA1); OCB_INPUT(2, %r12, RA2); OCB_INPUT(3, %r13, RA3); movq (4 * 8)(%r8), %r10; movq (5 * 8)(%r8), %r11; movq (6 * 8)(%r8), %r12; movq (7 * 8)(%r8), %r13; OCB_INPUT(4, %r10, RB0); OCB_INPUT(5, %r11, RB1); OCB_INPUT(6, %r12, RB2); OCB_INPUT(7, %r13, RB3); #undef OCB_INPUT vmovdqu RTMP0, (%rdx); movq (0 * 8)(%rsp), %r10; CFI_RESTORE(%r10); movq (1 * 8)(%rsp), %r11; CFI_RESTORE(%r11); movq (2 * 8)(%rsp), %r12; CFI_RESTORE(%r12); movq (3 * 8)(%rsp), %r13; CFI_RESTORE(%r13); call __sm4_crypt_blk8; addq $(4 * 8), %rsp; CFI_ADJUST_CFA_OFFSET(-4 * 8); vmovdqu (%rcx), RTMP0; vpxor RB0, RA0, RA0; vpxor RB1, RA1, RA1; vpxor RB2, RA2, RA2; vpxor RB3, RA3, RA3; vpxor RTMP0, RA3, RA3; vpxor RA2, RA0, RA0; vpxor RA3, RA1, RA1; vpxor RA1, RA0, RA0; vmovdqu RA0, (%rcx); vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size 
_gcry_sm4_aesni_avx_ocb_auth,.-_gcry_sm4_aesni_avx_ocb_auth;) #endif /*defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX_SUPPORT)*/ #endif /*__x86_64*/ diff --git a/cipher/sm4-aesni-avx2-amd64.S b/cipher/sm4-aesni-avx2-amd64.S index 6e46c0dc..7a8b9558 100644 --- a/cipher/sm4-aesni-avx2-amd64.S +++ b/cipher/sm4-aesni-avx2-amd64.S @@ -1,851 +1,851 @@ /* sm4-avx2-amd64.S - AVX2 implementation of SM4 cipher * * Copyright (C) 2020 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . */ /* Based on SM4 AES-NI work by Markku-Juhani O. Saarinen at: * https://github.com/mjosaarinen/sm4ni */ #include #ifdef __x86_64 #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT) #include "asm-common-amd64.h" /* vector registers */ #define RX0 %ymm0 #define RX1 %ymm1 #define MASK_4BIT %ymm2 #define RTMP0 %ymm3 #define RTMP1 %ymm4 #define RTMP2 %ymm5 #define RTMP3 %ymm6 #define RTMP4 %ymm7 #define RA0 %ymm8 #define RA1 %ymm9 #define RA2 %ymm10 #define RA3 %ymm11 #define RB0 %ymm12 #define RB1 %ymm13 #define RB2 %ymm14 #define RB3 %ymm15 #define RNOT %ymm0 #define RBSWAP %ymm1 #define RX0x %xmm0 #define RX1x %xmm1 #define MASK_4BITx %xmm2 #define RNOTx %xmm0 #define RBSWAPx %xmm1 #define RTMP0x %xmm3 #define RTMP1x %xmm4 #define RTMP2x %xmm5 #define RTMP3x %xmm6 #define RTMP4x %xmm7 /********************************************************************** helper macros **********************************************************************/ /* Transpose four 32-bit words between 128-bit vector lanes. */ #define transpose_4x4(x0, x1, x2, x3, t1, t2) \ vpunpckhdq x1, x0, t2; \ vpunpckldq x1, x0, x0; \ \ vpunpckldq x3, x2, t1; \ vpunpckhdq x3, x2, x2; \ \ vpunpckhqdq t1, x0, x1; \ vpunpcklqdq t1, x0, x0; \ \ vpunpckhqdq x2, t2, x3; \ vpunpcklqdq x2, t2, x2; /* post-SubByte transform. */ #define transform_pre(x, lo_t, hi_t, mask4bit, tmp0) \ vpand x, mask4bit, tmp0; \ vpandn x, mask4bit, x; \ vpsrld $4, x, x; \ \ vpshufb tmp0, lo_t, tmp0; \ vpshufb x, hi_t, x; \ vpxor tmp0, x, x; /* post-SubByte transform. Note: x has been XOR'ed with mask4bit by * 'vaeslastenc' instruction. */ #define transform_post(x, lo_t, hi_t, mask4bit, tmp0) \ vpandn mask4bit, x, tmp0; \ vpsrld $4, x, x; \ vpand x, mask4bit, x; \ \ vpshufb tmp0, lo_t, tmp0; \ vpshufb x, hi_t, x; \ vpxor tmp0, x, x; /********************************************************************** 16-way SM4 with AES-NI and AVX **********************************************************************/ .text .align 16 /* * Following four affine transform look-up tables are from work by * Markku-Juhani O. Saarinen, at https://github.com/mjosaarinen/sm4ni * * These allow exposing SM4 S-Box from AES SubByte. */ /* pre-SubByte affine transform, from SM4 field to AES field. 
*/ .Lpre_tf_lo_s: .quad 0x9197E2E474720701, 0xC7C1B4B222245157 .Lpre_tf_hi_s: .quad 0xE240AB09EB49A200, 0xF052B91BF95BB012 /* post-SubByte affine transform, from AES field to SM4 field. */ .Lpost_tf_lo_s: .quad 0x5B67F2CEA19D0834, 0xEDD14478172BBE82 .Lpost_tf_hi_s: .quad 0xAE7201DD73AFDC00, 0x11CDBE62CC1063BF /* For isolating SubBytes from AESENCLAST, inverse shift row */ .Linv_shift_row: .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03 /* Inverse shift row + Rotate left by 8 bits on 32-bit words with vpshufb */ .Linv_shift_row_rol_8: .byte 0x07, 0x00, 0x0d, 0x0a, 0x0b, 0x04, 0x01, 0x0e .byte 0x0f, 0x08, 0x05, 0x02, 0x03, 0x0c, 0x09, 0x06 /* Inverse shift row + Rotate left by 16 bits on 32-bit words with vpshufb */ .Linv_shift_row_rol_16: .byte 0x0a, 0x07, 0x00, 0x0d, 0x0e, 0x0b, 0x04, 0x01 .byte 0x02, 0x0f, 0x08, 0x05, 0x06, 0x03, 0x0c, 0x09 /* Inverse shift row + Rotate left by 24 bits on 32-bit words with vpshufb */ .Linv_shift_row_rol_24: .byte 0x0d, 0x0a, 0x07, 0x00, 0x01, 0x0e, 0x0b, 0x04 .byte 0x05, 0x02, 0x0f, 0x08, 0x09, 0x06, 0x03, 0x0c /* For CTR-mode IV byteswap */ .Lbswap128_mask: .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 /* For input word byte-swap */ .Lbswap32_mask: .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 .align 4 /* 4-bit mask */ .L0f0f0f0f: .long 0x0f0f0f0f .align 8 ELF(.type __sm4_crypt_blk16,@function;) __sm4_crypt_blk16: /* input: * %rdi: ctx, CTX * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: sixteen parallel * plaintext blocks * output: * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: sixteen parallel * ciphertext blocks */ CFI_STARTPROC(); vbroadcasti128 .Lbswap32_mask rRIP, RTMP2; vpshufb RTMP2, RA0, RA0; vpshufb RTMP2, RA1, RA1; vpshufb RTMP2, RA2, RA2; vpshufb RTMP2, RA3, RA3; vpshufb RTMP2, RB0, RB0; vpshufb RTMP2, RB1, RB1; vpshufb RTMP2, RB2, RB2; vpshufb RTMP2, RB3, RB3; vpbroadcastd .L0f0f0f0f rRIP, MASK_4BIT; transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1); transpose_4x4(RB0, RB1, RB2, RB3, RTMP0, RTMP1); #define ROUND(round, s0, s1, s2, s3, r0, r1, r2, r3) \ vpbroadcastd (4*(round))(%rdi), RX0; \ vbroadcasti128 .Lpre_tf_lo_s rRIP, RTMP4; \ vbroadcasti128 .Lpre_tf_hi_s rRIP, RTMP1; \ vmovdqa RX0, RX1; \ vpxor s1, RX0, RX0; \ vpxor s2, RX0, RX0; \ vpxor s3, RX0, RX0; /* s1 ^ s2 ^ s3 ^ rk */ \ vbroadcasti128 .Lpost_tf_lo_s rRIP, RTMP2; \ vbroadcasti128 .Lpost_tf_hi_s rRIP, RTMP3; \ vpxor r1, RX1, RX1; \ vpxor r2, RX1, RX1; \ vpxor r3, RX1, RX1; /* r1 ^ r2 ^ r3 ^ rk */ \ \ /* sbox, non-linear part */ \ transform_pre(RX0, RTMP4, RTMP1, MASK_4BIT, RTMP0); \ transform_pre(RX1, RTMP4, RTMP1, MASK_4BIT, RTMP0); \ vextracti128 $1, RX0, RTMP4x; \ vextracti128 $1, RX1, RTMP0x; \ vaesenclast MASK_4BITx, RX0x, RX0x; \ vaesenclast MASK_4BITx, RTMP4x, RTMP4x; \ vaesenclast MASK_4BITx, RX1x, RX1x; \ vaesenclast MASK_4BITx, RTMP0x, RTMP0x; \ vinserti128 $1, RTMP4x, RX0, RX0; \ vbroadcasti128 .Linv_shift_row rRIP, RTMP4; \ vinserti128 $1, RTMP0x, RX1, RX1; \ transform_post(RX0, RTMP2, RTMP3, MASK_4BIT, RTMP0); \ transform_post(RX1, RTMP2, RTMP3, MASK_4BIT, RTMP0); \ \ /* linear part */ \ vpshufb RTMP4, RX0, RTMP0; \ vpxor RTMP0, s0, s0; /* s0 ^ x */ \ vpshufb RTMP4, RX1, RTMP2; \ vbroadcasti128 .Linv_shift_row_rol_8 rRIP, RTMP4; \ vpxor RTMP2, r0, r0; /* r0 ^ x */ \ vpshufb RTMP4, RX0, RTMP1; \ vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) */ \ vpshufb RTMP4, RX1, RTMP3; \ vbroadcasti128 .Linv_shift_row_rol_16 rRIP, RTMP4; \ vpxor RTMP3, RTMP2, RTMP2; /* x ^ rol(x,8) */ \ vpshufb RTMP4, RX0, RTMP1; 
\ vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) ^ rol(x,16) */ \ vpshufb RTMP4, RX1, RTMP3; \ vbroadcasti128 .Linv_shift_row_rol_24 rRIP, RTMP4; \ vpxor RTMP3, RTMP2, RTMP2; /* x ^ rol(x,8) ^ rol(x,16) */ \ vpshufb RTMP4, RX0, RTMP1; \ vpxor RTMP1, s0, s0; /* s0 ^ x ^ rol(x,24) */ \ vpslld $2, RTMP0, RTMP1; \ vpsrld $30, RTMP0, RTMP0; \ vpxor RTMP0, s0, s0; \ vpxor RTMP1, s0, s0; /* s0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \ vpshufb RTMP4, RX1, RTMP3; \ vpxor RTMP3, r0, r0; /* r0 ^ x ^ rol(x,24) */ \ vpslld $2, RTMP2, RTMP3; \ vpsrld $30, RTMP2, RTMP2; \ vpxor RTMP2, r0, r0; \ vpxor RTMP3, r0, r0; /* r0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ leaq (32*4)(%rdi), %rax; .align 16 .Lroundloop_blk8: ROUND(0, RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3); ROUND(1, RA1, RA2, RA3, RA0, RB1, RB2, RB3, RB0); ROUND(2, RA2, RA3, RA0, RA1, RB2, RB3, RB0, RB1); ROUND(3, RA3, RA0, RA1, RA2, RB3, RB0, RB1, RB2); leaq (4*4)(%rdi), %rdi; cmpq %rax, %rdi; jne .Lroundloop_blk8; #undef ROUND vbroadcasti128 .Lbswap128_mask rRIP, RTMP2; transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1); transpose_4x4(RB0, RB1, RB2, RB3, RTMP0, RTMP1); vpshufb RTMP2, RA0, RA0; vpshufb RTMP2, RA1, RA1; vpshufb RTMP2, RA2, RA2; vpshufb RTMP2, RA3, RA3; vpshufb RTMP2, RB0, RB0; vpshufb RTMP2, RB1, RB1; vpshufb RTMP2, RB2, RB2; vpshufb RTMP2, RB3, RB3; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size __sm4_crypt_blk16,.-__sm4_crypt_blk16;) #define inc_le128(x, minus_one, tmp) \ vpcmpeqq minus_one, x, tmp; \ vpsubq minus_one, x, x; \ vpslldq $8, tmp, tmp; \ vpsubq tmp, x, x; .align 8 .globl _gcry_sm4_aesni_avx2_ctr_enc ELF(.type _gcry_sm4_aesni_avx2_ctr_enc,@function;) _gcry_sm4_aesni_avx2_ctr_enc: /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) * %rdx: src (16 blocks) * %rcx: iv (big endian, 128bit) */ CFI_STARTPROC(); movq 8(%rcx), %rax; bswapq %rax; vzeroupper; vbroadcasti128 .Lbswap128_mask rRIP, RTMP3; vpcmpeqd RNOT, RNOT, RNOT; vpsrldq $8, RNOT, RNOT; /* ab: -1:0 ; cd: -1:0 */ vpaddq RNOT, RNOT, RTMP2; /* ab: -2:0 ; cd: -2:0 */ /* load IV and byteswap */ vmovdqu (%rcx), RTMP4x; vpshufb RTMP3x, RTMP4x, RTMP4x; vmovdqa RTMP4x, RTMP0x; inc_le128(RTMP4x, RNOTx, RTMP1x); vinserti128 $1, RTMP4x, RTMP0, RTMP0; vpshufb RTMP3, RTMP0, RA0; /* +1 ; +0 */ /* check need for handling 64-bit overflow and carry */ cmpq $(0xffffffffffffffff - 16), %rax; ja .Lhandle_ctr_carry; /* construct IVs */ vpsubq RTMP2, RTMP0, RTMP0; /* +3 ; +2 */ vpshufb RTMP3, RTMP0, RA1; vpsubq RTMP2, RTMP0, RTMP0; /* +5 ; +4 */ vpshufb RTMP3, RTMP0, RA2; vpsubq RTMP2, RTMP0, RTMP0; /* +7 ; +6 */ vpshufb RTMP3, RTMP0, RA3; vpsubq RTMP2, RTMP0, RTMP0; /* +9 ; +8 */ vpshufb RTMP3, RTMP0, RB0; vpsubq RTMP2, RTMP0, RTMP0; /* +11 ; +10 */ vpshufb RTMP3, RTMP0, RB1; vpsubq RTMP2, RTMP0, RTMP0; /* +13 ; +12 */ vpshufb RTMP3, RTMP0, RB2; vpsubq RTMP2, RTMP0, RTMP0; /* +15 ; +14 */ vpshufb RTMP3, RTMP0, RB3; vpsubq RTMP2, RTMP0, RTMP0; /* +16 */ vpshufb RTMP3x, RTMP0x, RTMP0x; jmp .Lctr_carry_done; .Lhandle_ctr_carry: /* construct IVs */ inc_le128(RTMP0, RNOT, RTMP1); inc_le128(RTMP0, RNOT, RTMP1); vpshufb RTMP3, RTMP0, RA1; /* +3 ; +2 */ inc_le128(RTMP0, RNOT, RTMP1); inc_le128(RTMP0, RNOT, RTMP1); vpshufb RTMP3, RTMP0, RA2; /* +5 ; +4 */ inc_le128(RTMP0, RNOT, RTMP1); inc_le128(RTMP0, RNOT, RTMP1); vpshufb RTMP3, RTMP0, RA3; /* +7 ; +6 */ inc_le128(RTMP0, RNOT, RTMP1); inc_le128(RTMP0, RNOT, RTMP1); vpshufb RTMP3, RTMP0, RB0; /* +9 ; +8 */ inc_le128(RTMP0, RNOT, RTMP1); inc_le128(RTMP0, RNOT, RTMP1); vpshufb RTMP3, RTMP0, RB1; /* +11 ; +10 */ 
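	/* Note: inc_le128() adds one to the 128-bit little-endian counter held in
	 * each lane of RTMP0.  The vpsubq against the {low: -1, high: 0} constant
	 * increments the low qword, while the preceding vpcmpeqq flags a low qword
	 * of all ones so the final vpsubq can fold the carry into the high qword.
	 * Two invocations per vpshufb therefore advance both lanes by two, giving
	 * the "+n+1 ; +n" pairs noted in the comments; vpshufb then restores
	 * big-endian byte order for the generated IVs. */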
inc_le128(RTMP0, RNOT, RTMP1); inc_le128(RTMP0, RNOT, RTMP1); vpshufb RTMP3, RTMP0, RB2; /* +13 ; +12 */ inc_le128(RTMP0, RNOT, RTMP1); inc_le128(RTMP0, RNOT, RTMP1); vpshufb RTMP3, RTMP0, RB3; /* +15 ; +14 */ inc_le128(RTMP0, RNOT, RTMP1); vextracti128 $1, RTMP0, RTMP0x; vpshufb RTMP3x, RTMP0x, RTMP0x; /* +16 */ .align 4 .Lctr_carry_done: /* store new IV */ vmovdqu RTMP0x, (%rcx); call __sm4_crypt_blk16; vpxor (0 * 32)(%rdx), RA0, RA0; vpxor (1 * 32)(%rdx), RA1, RA1; vpxor (2 * 32)(%rdx), RA2, RA2; vpxor (3 * 32)(%rdx), RA3, RA3; vpxor (4 * 32)(%rdx), RB0, RB0; vpxor (5 * 32)(%rdx), RB1, RB1; vpxor (6 * 32)(%rdx), RB2, RB2; vpxor (7 * 32)(%rdx), RB3, RB3; vmovdqu RA0, (0 * 32)(%rsi); vmovdqu RA1, (1 * 32)(%rsi); vmovdqu RA2, (2 * 32)(%rsi); vmovdqu RA3, (3 * 32)(%rsi); vmovdqu RB0, (4 * 32)(%rsi); vmovdqu RB1, (5 * 32)(%rsi); vmovdqu RB2, (6 * 32)(%rsi); vmovdqu RB3, (7 * 32)(%rsi); vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_sm4_aesni_avx2_ctr_enc,.-_gcry_sm4_aesni_avx2_ctr_enc;) .align 8 .globl _gcry_sm4_aesni_avx2_cbc_dec ELF(.type _gcry_sm4_aesni_avx2_cbc_dec,@function;) _gcry_sm4_aesni_avx2_cbc_dec: /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) * %rdx: src (16 blocks) * %rcx: iv */ CFI_STARTPROC(); vzeroupper; vmovdqu (0 * 32)(%rdx), RA0; vmovdqu (1 * 32)(%rdx), RA1; vmovdqu (2 * 32)(%rdx), RA2; vmovdqu (3 * 32)(%rdx), RA3; vmovdqu (4 * 32)(%rdx), RB0; vmovdqu (5 * 32)(%rdx), RB1; vmovdqu (6 * 32)(%rdx), RB2; vmovdqu (7 * 32)(%rdx), RB3; call __sm4_crypt_blk16; vmovdqu (%rcx), RNOTx; vinserti128 $1, (%rdx), RNOT, RNOT; vpxor RNOT, RA0, RA0; vpxor (0 * 32 + 16)(%rdx), RA1, RA1; vpxor (1 * 32 + 16)(%rdx), RA2, RA2; vpxor (2 * 32 + 16)(%rdx), RA3, RA3; vpxor (3 * 32 + 16)(%rdx), RB0, RB0; vpxor (4 * 32 + 16)(%rdx), RB1, RB1; vpxor (5 * 32 + 16)(%rdx), RB2, RB2; vpxor (6 * 32 + 16)(%rdx), RB3, RB3; vmovdqu (7 * 32 + 16)(%rdx), RNOTx; vmovdqu RNOTx, (%rcx); /* store new IV */ vmovdqu RA0, (0 * 32)(%rsi); vmovdqu RA1, (1 * 32)(%rsi); vmovdqu RA2, (2 * 32)(%rsi); vmovdqu RA3, (3 * 32)(%rsi); vmovdqu RB0, (4 * 32)(%rsi); vmovdqu RB1, (5 * 32)(%rsi); vmovdqu RB2, (6 * 32)(%rsi); vmovdqu RB3, (7 * 32)(%rsi); vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_sm4_aesni_avx2_cbc_dec,.-_gcry_sm4_aesni_avx2_cbc_dec;) .align 8 .globl _gcry_sm4_aesni_avx2_cfb_dec ELF(.type _gcry_sm4_aesni_avx2_cfb_dec,@function;) _gcry_sm4_aesni_avx2_cfb_dec: /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) * %rdx: src (16 blocks) * %rcx: iv */ CFI_STARTPROC(); vzeroupper; /* Load input */ vmovdqu (%rcx), RNOTx; vinserti128 $1, (%rdx), RNOT, RA0; vmovdqu (0 * 32 + 16)(%rdx), RA1; vmovdqu (1 * 32 + 16)(%rdx), RA2; vmovdqu (2 * 32 + 16)(%rdx), RA3; vmovdqu (3 * 32 + 16)(%rdx), RB0; vmovdqu (4 * 32 + 16)(%rdx), RB1; vmovdqu (5 * 32 + 16)(%rdx), RB2; vmovdqu (6 * 32 + 16)(%rdx), RB3; /* Update IV */ vmovdqu (7 * 32 + 16)(%rdx), RNOTx; vmovdqu RNOTx, (%rcx); call __sm4_crypt_blk16; vpxor (0 * 32)(%rdx), RA0, RA0; vpxor (1 * 32)(%rdx), RA1, RA1; vpxor (2 * 32)(%rdx), RA2, RA2; vpxor (3 * 32)(%rdx), RA3, RA3; vpxor (4 * 32)(%rdx), RB0, RB0; vpxor (5 * 32)(%rdx), RB1, RB1; vpxor (6 * 32)(%rdx), RB2, RB2; vpxor (7 * 32)(%rdx), RB3, RB3; vmovdqu RA0, (0 * 32)(%rsi); vmovdqu RA1, (1 * 32)(%rsi); vmovdqu RA2, (2 * 32)(%rsi); vmovdqu RA3, (3 * 32)(%rsi); vmovdqu RB0, (4 * 32)(%rsi); vmovdqu RB1, (5 * 32)(%rsi); vmovdqu RB2, (6 * 32)(%rsi); vmovdqu RB3, (7 * 32)(%rsi); vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_sm4_aesni_avx2_cfb_dec,.-_gcry_sm4_aesni_avx2_cfb_dec;) 
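The CTR entry points above only fall back to full 128-bit counter arithmetic when the low 64 bits of the big-endian counter are about to wrap (the 'cmpq $(0xffffffffffffffff - 16), %rax' check); otherwise all sixteen IVs are built with plain 64-bit additions. A minimal C sketch of that decision, under the assumption of a 128-bit counter kept as two 64-bit halves and with local get_be64/put_be64 helpers written for the sketch (not libgcrypt's own routines):

    #include <stdint.h>

    /* Read a 64-bit big-endian value. */
    static uint64_t get_be64(const uint8_t *p)
    {
      uint64_t v = 0;
      for (int i = 0; i < 8; i++)
        v = (v << 8) | p[i];
      return v;
    }

    /* Write a 64-bit big-endian value. */
    static void put_be64(uint8_t *p, uint64_t v)
    {
      for (int i = 7; i >= 0; i--)
        {
          p[i] = (uint8_t)v;
          v >>= 8;
        }
    }

    /* Produce 'nblks' consecutive big-endian counter blocks and advance
     * ctr[16], mirroring the fast-path / carry-path split used above. */
    static void ctr_blocks(uint8_t *out, uint8_t ctr[16], unsigned int nblks)
    {
      uint64_t hi = get_be64(ctr), lo = get_be64(ctr + 8);
      unsigned int i;

      if (lo <= UINT64_MAX - nblks)
        {
          /* Fast path: the low half cannot overflow for these blocks. */
          for (i = 0; i < nblks; i++)
            {
              put_be64(out + 16 * i, hi);
              put_be64(out + 16 * i + 8, lo + i);
            }
          lo += nblks;
        }
      else
        {
          /* Carry path: propagate overflow into the high half per block. */
          for (i = 0; i < nblks; i++)
            {
              put_be64(out + 16 * i, hi);
              put_be64(out + 16 * i + 8, lo);
              if (++lo == 0)
                hi++;
            }
        }
      put_be64(ctr, hi);
      put_be64(ctr + 8, lo);
    }

The assembly then encrypts these IVs and XORs the result with the source data; the sketch only covers the counter bookkeeping.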
.align 8 .globl _gcry_sm4_aesni_avx2_ocb_enc ELF(.type _gcry_sm4_aesni_avx2_ocb_enc,@function;) _gcry_sm4_aesni_avx2_ocb_enc: /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) * %rdx: src (16 blocks) * %rcx: offset * %r8 : checksum * %r9 : L pointers (void *L[16]) */ CFI_STARTPROC(); vzeroupper; subq $(4 * 8), %rsp; CFI_ADJUST_CFA_OFFSET(4 * 8); movq %r10, (0 * 8)(%rsp); movq %r11, (1 * 8)(%rsp); movq %r12, (2 * 8)(%rsp); movq %r13, (3 * 8)(%rsp); CFI_REL_OFFSET(%r10, 0 * 8); CFI_REL_OFFSET(%r11, 1 * 8); CFI_REL_OFFSET(%r12, 2 * 8); CFI_REL_OFFSET(%r13, 3 * 8); vmovdqu (%rcx), RTMP0x; vmovdqu (%r8), RTMP1x; /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* Checksum_i = Checksum_{i-1} xor P_i */ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ #define OCB_INPUT(n, l0reg, l1reg, yreg) \ vmovdqu (n * 32)(%rdx), yreg; \ vpxor (l0reg), RTMP0x, RNOTx; \ vpxor (l1reg), RNOTx, RTMP0x; \ vinserti128 $1, RTMP0x, RNOT, RNOT; \ vpxor yreg, RTMP1, RTMP1; \ vpxor yreg, RNOT, yreg; \ vmovdqu RNOT, (n * 32)(%rsi); movq (0 * 8)(%r9), %r10; movq (1 * 8)(%r9), %r11; movq (2 * 8)(%r9), %r12; movq (3 * 8)(%r9), %r13; OCB_INPUT(0, %r10, %r11, RA0); OCB_INPUT(1, %r12, %r13, RA1); movq (4 * 8)(%r9), %r10; movq (5 * 8)(%r9), %r11; movq (6 * 8)(%r9), %r12; movq (7 * 8)(%r9), %r13; OCB_INPUT(2, %r10, %r11, RA2); OCB_INPUT(3, %r12, %r13, RA3); movq (8 * 8)(%r9), %r10; movq (9 * 8)(%r9), %r11; movq (10 * 8)(%r9), %r12; movq (11 * 8)(%r9), %r13; OCB_INPUT(4, %r10, %r11, RB0); OCB_INPUT(5, %r12, %r13, RB1); movq (12 * 8)(%r9), %r10; movq (13 * 8)(%r9), %r11; movq (14 * 8)(%r9), %r12; movq (15 * 8)(%r9), %r13; OCB_INPUT(6, %r10, %r11, RB2); OCB_INPUT(7, %r12, %r13, RB3); #undef OCB_INPUT vextracti128 $1, RTMP1, RNOTx; vmovdqu RTMP0x, (%rcx); vpxor RNOTx, RTMP1x, RTMP1x; vmovdqu RTMP1x, (%r8); movq (0 * 8)(%rsp), %r10; movq (1 * 8)(%rsp), %r11; movq (2 * 8)(%rsp), %r12; movq (3 * 8)(%rsp), %r13; CFI_RESTORE(%r10); CFI_RESTORE(%r11); CFI_RESTORE(%r12); CFI_RESTORE(%r13); call __sm4_crypt_blk16; addq $(4 * 8), %rsp; CFI_ADJUST_CFA_OFFSET(-4 * 8); vpxor (0 * 32)(%rsi), RA0, RA0; vpxor (1 * 32)(%rsi), RA1, RA1; vpxor (2 * 32)(%rsi), RA2, RA2; vpxor (3 * 32)(%rsi), RA3, RA3; vpxor (4 * 32)(%rsi), RB0, RB0; vpxor (5 * 32)(%rsi), RB1, RB1; vpxor (6 * 32)(%rsi), RB2, RB2; vpxor (7 * 32)(%rsi), RB3, RB3; vmovdqu RA0, (0 * 32)(%rsi); vmovdqu RA1, (1 * 32)(%rsi); vmovdqu RA2, (2 * 32)(%rsi); vmovdqu RA3, (3 * 32)(%rsi); vmovdqu RB0, (4 * 32)(%rsi); vmovdqu RB1, (5 * 32)(%rsi); vmovdqu RB2, (6 * 32)(%rsi); vmovdqu RB3, (7 * 32)(%rsi); vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_sm4_aesni_avx2_ocb_enc,.-_gcry_sm4_aesni_avx2_ocb_enc;) .align 8 .globl _gcry_sm4_aesni_avx2_ocb_dec ELF(.type _gcry_sm4_aesni_avx2_ocb_dec,@function;) _gcry_sm4_aesni_avx2_ocb_dec: /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) * %rdx: src (16 blocks) * %rcx: offset * %r8 : checksum * %r9 : L pointers (void *L[16]) */ CFI_STARTPROC(); vzeroupper; subq $(4 * 8), %rsp; CFI_ADJUST_CFA_OFFSET(4 * 8); movq %r10, (0 * 8)(%rsp); movq %r11, (1 * 8)(%rsp); movq %r12, (2 * 8)(%rsp); movq %r13, (3 * 8)(%rsp); CFI_REL_OFFSET(%r10, 0 * 8); CFI_REL_OFFSET(%r11, 1 * 8); CFI_REL_OFFSET(%r12, 2 * 8); CFI_REL_OFFSET(%r13, 3 * 8); vmovdqu (%rcx), RTMP0x; /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ #define OCB_INPUT(n, l0reg, l1reg, yreg) \ vmovdqu (n * 32)(%rdx), yreg; \ vpxor (l0reg), RTMP0x, RNOTx; \ vpxor (l1reg), RNOTx, RTMP0x; \ vinserti128 $1, RTMP0x, RNOT, RNOT; \ vpxor yreg, RNOT, 
yreg; \ vmovdqu RNOT, (n * 32)(%rsi); movq (0 * 8)(%r9), %r10; movq (1 * 8)(%r9), %r11; movq (2 * 8)(%r9), %r12; movq (3 * 8)(%r9), %r13; OCB_INPUT(0, %r10, %r11, RA0); OCB_INPUT(1, %r12, %r13, RA1); movq (4 * 8)(%r9), %r10; movq (5 * 8)(%r9), %r11; movq (6 * 8)(%r9), %r12; movq (7 * 8)(%r9), %r13; OCB_INPUT(2, %r10, %r11, RA2); OCB_INPUT(3, %r12, %r13, RA3); movq (8 * 8)(%r9), %r10; movq (9 * 8)(%r9), %r11; movq (10 * 8)(%r9), %r12; movq (11 * 8)(%r9), %r13; OCB_INPUT(4, %r10, %r11, RB0); OCB_INPUT(5, %r12, %r13, RB1); movq (12 * 8)(%r9), %r10; movq (13 * 8)(%r9), %r11; movq (14 * 8)(%r9), %r12; movq (15 * 8)(%r9), %r13; OCB_INPUT(6, %r10, %r11, RB2); OCB_INPUT(7, %r12, %r13, RB3); #undef OCB_INPUT vmovdqu RTMP0x, (%rcx); movq (0 * 8)(%rsp), %r10; movq (1 * 8)(%rsp), %r11; movq (2 * 8)(%rsp), %r12; movq (3 * 8)(%rsp), %r13; CFI_RESTORE(%r10); CFI_RESTORE(%r11); CFI_RESTORE(%r12); CFI_RESTORE(%r13); call __sm4_crypt_blk16; addq $(4 * 8), %rsp; CFI_ADJUST_CFA_OFFSET(-4 * 8); vmovdqu (%r8), RTMP1x; vpxor (0 * 32)(%rsi), RA0, RA0; vpxor (1 * 32)(%rsi), RA1, RA1; vpxor (2 * 32)(%rsi), RA2, RA2; vpxor (3 * 32)(%rsi), RA3, RA3; vpxor (4 * 32)(%rsi), RB0, RB0; vpxor (5 * 32)(%rsi), RB1, RB1; vpxor (6 * 32)(%rsi), RB2, RB2; vpxor (7 * 32)(%rsi), RB3, RB3; /* Checksum_i = Checksum_{i-1} xor P_i */ vmovdqu RA0, (0 * 32)(%rsi); vpxor RA0, RTMP1, RTMP1; vmovdqu RA1, (1 * 32)(%rsi); vpxor RA1, RTMP1, RTMP1; vmovdqu RA2, (2 * 32)(%rsi); vpxor RA2, RTMP1, RTMP1; vmovdqu RA3, (3 * 32)(%rsi); vpxor RA3, RTMP1, RTMP1; vmovdqu RB0, (4 * 32)(%rsi); vpxor RB0, RTMP1, RTMP1; vmovdqu RB1, (5 * 32)(%rsi); vpxor RB1, RTMP1, RTMP1; vmovdqu RB2, (6 * 32)(%rsi); vpxor RB2, RTMP1, RTMP1; vmovdqu RB3, (7 * 32)(%rsi); vpxor RB3, RTMP1, RTMP1; vextracti128 $1, RTMP1, RNOTx; vpxor RNOTx, RTMP1x, RTMP1x; vmovdqu RTMP1x, (%r8); vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_sm4_aesni_avx2_ocb_dec,.-_gcry_sm4_aesni_avx2_ocb_dec;) .align 8 .globl _gcry_sm4_aesni_avx2_ocb_auth ELF(.type _gcry_sm4_aesni_avx2_ocb_auth,@function;) _gcry_sm4_aesni_avx2_ocb_auth: /* input: * %rdi: ctx, CTX * %rsi: abuf (16 blocks) * %rdx: offset * %rcx: checksum * %r8 : L pointers (void *L[16]) */ CFI_STARTPROC(); vzeroupper; subq $(4 * 8), %rsp; CFI_ADJUST_CFA_OFFSET(4 * 8); movq %r10, (0 * 8)(%rsp); movq %r11, (1 * 8)(%rsp); movq %r12, (2 * 8)(%rsp); movq %r13, (3 * 8)(%rsp); CFI_REL_OFFSET(%r10, 0 * 8); CFI_REL_OFFSET(%r11, 1 * 8); CFI_REL_OFFSET(%r12, 2 * 8); CFI_REL_OFFSET(%r13, 3 * 8); vmovdqu (%rdx), RTMP0x; /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ #define OCB_INPUT(n, l0reg, l1reg, yreg) \ vmovdqu (n * 32)(%rsi), yreg; \ vpxor (l0reg), RTMP0x, RNOTx; \ vpxor (l1reg), RNOTx, RTMP0x; \ vinserti128 $1, RTMP0x, RNOT, RNOT; \ vpxor yreg, RNOT, yreg; movq (0 * 8)(%r8), %r10; movq (1 * 8)(%r8), %r11; movq (2 * 8)(%r8), %r12; movq (3 * 8)(%r8), %r13; OCB_INPUT(0, %r10, %r11, RA0); OCB_INPUT(1, %r12, %r13, RA1); movq (4 * 8)(%r8), %r10; movq (5 * 8)(%r8), %r11; movq (6 * 8)(%r8), %r12; movq (7 * 8)(%r8), %r13; OCB_INPUT(2, %r10, %r11, RA2); OCB_INPUT(3, %r12, %r13, RA3); movq (8 * 8)(%r8), %r10; movq (9 * 8)(%r8), %r11; movq (10 * 8)(%r8), %r12; movq (11 * 8)(%r8), %r13; OCB_INPUT(4, %r10, %r11, RB0); OCB_INPUT(5, %r12, %r13, RB1); movq (12 * 8)(%r8), %r10; movq (13 * 8)(%r8), %r11; movq (14 * 8)(%r8), %r12; movq (15 * 8)(%r8), %r13; OCB_INPUT(6, %r10, %r11, RB2); OCB_INPUT(7, %r12, %r13, RB3); #undef OCB_INPUT vmovdqu RTMP0x, (%rdx); movq (0 * 8)(%rsp), %r10; movq (1 * 
8)(%rsp), %r11; movq (2 * 8)(%rsp), %r12; movq (3 * 8)(%rsp), %r13; CFI_RESTORE(%r10); CFI_RESTORE(%r11); CFI_RESTORE(%r12); CFI_RESTORE(%r13); call __sm4_crypt_blk16; addq $(4 * 8), %rsp; CFI_ADJUST_CFA_OFFSET(-4 * 8); vpxor RA0, RB0, RA0; vpxor RA1, RB1, RA1; vpxor RA2, RB2, RA2; vpxor RA3, RB3, RA3; vpxor RA1, RA0, RA0; vpxor RA3, RA2, RA2; vpxor RA2, RA0, RTMP1; vextracti128 $1, RTMP1, RNOTx; vpxor (%rcx), RTMP1x, RTMP1x; vpxor RNOTx, RTMP1x, RTMP1x; vmovdqu RTMP1x, (%rcx); vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_sm4_aesni_avx2_ocb_auth,.-_gcry_sm4_aesni_avx2_ocb_auth;) #endif /*defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX_SUPPORT)*/ #endif /*__x86_64*/ diff --git a/cipher/twofish-amd64.S b/cipher/twofish-amd64.S index 3cb73431..a7a60553 100644 --- a/cipher/twofish-amd64.S +++ b/cipher/twofish-amd64.S @@ -1,1184 +1,1184 @@ /* twofish-amd64.S - AMD64 assembly implementation of Twofish cipher * * Copyright (C) 2013-2015 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . */ #ifdef __x86_64 #include #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && defined(USE_TWOFISH) #include "asm-common-amd64.h" .text /* structure of TWOFISH_context: */ #define s0 0 #define s1 ((s0) + 4 * 256) #define s2 ((s1) + 4 * 256) #define s3 ((s2) + 4 * 256) #define w ((s3) + 4 * 256) #define k ((w) + 4 * 8) /* register macros */ #define CTX %rdi #define RA %rax #define RB %rbx #define RC %rcx #define RD %rdx #define RAd %eax #define RBd %ebx #define RCd %ecx #define RDd %edx #define RAbl %al #define RBbl %bl #define RCbl %cl #define RDbl %dl #define RAbh %ah #define RBbh %bh #define RCbh %ch #define RDbh %dh #define RX %r8 #define RY %r9 #define RXd %r8d #define RYd %r9d #define RT0 %rsi #define RT1 %rbp #define RT2 %r10 #define RT3 %r11 #define RT0d %esi #define RT1d %ebp #define RT2d %r10d #define RT3d %r11d /*********************************************************************** * AMD64 assembly implementation of the Twofish cipher ***********************************************************************/ #define enc_g1_2(a, b, x, y) \ movzbl b ## bl, RT3d; \ movzbl b ## bh, RT1d; \ movzbl a ## bl, RT2d; \ movzbl a ## bh, RT0d; \ rorl $16, b ## d; \ rorl $16, a ## d; \ movl s1(CTX, RT3, 4), RYd; \ movzbl b ## bl, RT3d; \ movl s0(CTX, RT2, 4), RXd; \ movzbl a ## bl, RT2d; \ xorl s2(CTX, RT1, 4), RYd; \ movzbl b ## bh, RT1d; \ xorl s1(CTX, RT0, 4), RXd; \ movzbl a ## bh, RT0d; \ rorl $16, b ## d; \ rorl $16, a ## d; \ xorl s3(CTX, RT3, 4), RYd; \ xorl s2(CTX, RT2, 4), RXd; \ xorl s0(CTX, RT1, 4), RYd; \ xorl s3(CTX, RT0, 4), RXd; #define dec_g1_2(a, b, x, y) \ movzbl a ## bl, RT2d; \ movzbl a ## bh, RT0d; \ movzbl b ## bl, RT3d; \ movzbl b ## bh, RT1d; \ rorl $16, a ## d; \ rorl $16, b ## d; \ movl s0(CTX, RT2, 4), RXd; \ movzbl a ## bl, RT2d; \ movl s1(CTX, RT3, 4), RYd; \ movzbl b ## bl, RT3d; \ xorl s1(CTX, RT0, 
4), RXd; \ movzbl a ## bh, RT0d; \ xorl s2(CTX, RT1, 4), RYd; \ movzbl b ## bh, RT1d; \ rorl $16, a ## d; \ rorl $16, b ## d; \ xorl s2(CTX, RT2, 4), RXd; \ xorl s3(CTX, RT3, 4), RYd; \ xorl s3(CTX, RT0, 4), RXd; \ xorl s0(CTX, RT1, 4), RYd; #define encrypt_round(ra, rb, rc, rd, n) \ enc_g1_2(##ra, ##rb, RX, RY); \ \ leal (RXd, RYd, 2), RT0d; \ addl RYd, RXd; \ addl (k + 8 * (n) + 4)(CTX), RT0d; \ roll $1, rd ## d; \ addl (k + 8 * (n))(CTX), RXd; \ xorl RT0d, rd ## d; \ xorl RXd, rc ## d; \ rorl $1, rc ## d; #define decrypt_round(ra, rb, rc, rd, n) \ dec_g1_2(##ra, ##rb, RX, RY); \ \ leal (RXd, RYd, 2), RT0d; \ addl RYd, RXd; \ addl (k + 8 * (n) + 4)(CTX), RT0d; \ roll $1, rc ## d; \ addl (k + 8 * (n))(CTX), RXd; \ xorl RXd, rc ## d; \ xorl RT0d, rd ## d; \ rorl $1, rd ## d; #define encrypt_cycle(a, b, c, d, nc) \ encrypt_round(##a, ##b, ##c, ##d, (nc) * 2); \ encrypt_round(##c, ##d, ##a, ##b, (nc) * 2 + 1); #define decrypt_cycle(a, b, c, d, nc) \ decrypt_round(##c, ##d, ##a, ##b, (nc) * 2 + 1); \ decrypt_round(##a, ##b, ##c, ##d, (nc) * 2); #define inpack(in, n, x, m) \ movl (4 * (n))(in), x; \ xorl (w + 4 * (m))(CTX), x; #define outunpack(out, n, x, m) \ xorl (w + 4 * (m))(CTX), x; \ movl x, (4 * (n))(out); .align 8 .globl _gcry_twofish_amd64_encrypt_block ELF(.type _gcry_twofish_amd64_encrypt_block,@function;) _gcry_twofish_amd64_encrypt_block: /* input: * %rdi: context, CTX * %rsi: dst * %rdx: src */ CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 subq $(3 * 8), %rsp; CFI_ADJUST_CFA_OFFSET(3 * 8); movq %rsi, (0 * 8)(%rsp); movq %rbp, (1 * 8)(%rsp); movq %rbx, (2 * 8)(%rsp); CFI_REL_OFFSET(%rbp, 1 * 8); CFI_REL_OFFSET(%rbx, 2 * 8); movq %rdx, RX; inpack(RX, 0, RAd, 0); inpack(RX, 1, RBd, 1); inpack(RX, 2, RCd, 2); inpack(RX, 3, RDd, 3); encrypt_cycle(RA, RB, RC, RD, 0); encrypt_cycle(RA, RB, RC, RD, 1); encrypt_cycle(RA, RB, RC, RD, 2); encrypt_cycle(RA, RB, RC, RD, 3); encrypt_cycle(RA, RB, RC, RD, 4); encrypt_cycle(RA, RB, RC, RD, 5); encrypt_cycle(RA, RB, RC, RD, 6); encrypt_cycle(RA, RB, RC, RD, 7); movq (0 * 8)(%rsp), RX; /*dst*/ outunpack(RX, 0, RCd, 4); outunpack(RX, 1, RDd, 5); outunpack(RX, 2, RAd, 6); outunpack(RX, 3, RBd, 7); movq (2 * 8)(%rsp), %rbx; movq (1 * 8)(%rsp), %rbp; CFI_RESTORE(%rbx); CFI_RESTORE(%rbp); addq $(3 * 8), %rsp; CFI_ADJUST_CFA_OFFSET(-3 * 8); EXIT_SYSV_FUNC - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_twofish_amd64_encrypt_block,.-_gcry_twofish_amd64_encrypt_block;) .align 8 .globl _gcry_twofish_amd64_decrypt_block ELF(.type _gcry_twofish_amd64_decrypt_block,@function;) _gcry_twofish_amd64_decrypt_block: /* input: * %rdi: context, CTX * %rsi: dst * %rdx: src */ CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 subq $(3 * 8), %rsp; CFI_ADJUST_CFA_OFFSET(3 * 8); movq %rsi, (0 * 8)(%rsp); movq %rbp, (1 * 8)(%rsp); movq %rbx, (2 * 8)(%rsp); CFI_REL_OFFSET(%rbp, 1 * 8); CFI_REL_OFFSET(%rbx, 2 * 8); movq %rdx, RX; inpack(RX, 0, RCd, 4); inpack(RX, 1, RDd, 5); inpack(RX, 2, RAd, 6); inpack(RX, 3, RBd, 7); decrypt_cycle(RA, RB, RC, RD, 7); decrypt_cycle(RA, RB, RC, RD, 6); decrypt_cycle(RA, RB, RC, RD, 5); decrypt_cycle(RA, RB, RC, RD, 4); decrypt_cycle(RA, RB, RC, RD, 3); decrypt_cycle(RA, RB, RC, RD, 2); decrypt_cycle(RA, RB, RC, RD, 1); decrypt_cycle(RA, RB, RC, RD, 0); movq (0 * 8)(%rsp), RX; /*dst*/ outunpack(RX, 0, RAd, 0); outunpack(RX, 1, RBd, 1); outunpack(RX, 2, RCd, 2); outunpack(RX, 3, RDd, 3); movq (2 * 8)(%rsp), %rbx; movq (1 * 8)(%rsp), %rbp; CFI_RESTORE(%rbx); CFI_RESTORE(%rbp); addq $(3 * 8), %rsp; CFI_ADJUST_CFA_OFFSET(-3 * 8); EXIT_SYSV_FUNC 
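	/* ret_spec_stop: return with straight-line-speculation hardening.  The
	 * macro comes from asm-common-amd64.h and is expected to pair the return
	 * with an instruction that stops speculative execution past the end of
	 * the function (for instance an int3 trap), so nothing after the ret can
	 * run speculatively. */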
- ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_twofish_amd64_encrypt_block,.-_gcry_twofish_amd64_encrypt_block;) #undef CTX #undef RA #undef RB #undef RC #undef RD #undef RAd #undef RBd #undef RCd #undef RDd #undef RAbl #undef RBbl #undef RCbl #undef RDbl #undef RAbh #undef RBbh #undef RCbh #undef RDbh #undef RX #undef RY #undef RXd #undef RYd #undef RT0 #undef RT1 #undef RT2 #undef RT3 #undef RT0d #undef RT1d #undef RT2d #undef RT3d /*********************************************************************** * AMD64 assembly implementation of the Twofish cipher, 3-way parallel ***********************************************************************/ #define CTX %rdi #define RIO %rdx #define RAB0 %rax #define RAB1 %rbx #define RAB2 %rcx #define RAB0d %eax #define RAB1d %ebx #define RAB2d %ecx #define RAB0bh %ah #define RAB1bh %bh #define RAB2bh %ch #define RAB0bl %al #define RAB1bl %bl #define RAB2bl %cl #define RCD0 %r8 #define RCD1 %r9 #define RCD2 %r10 #define RCD0d %r8d #define RCD1d %r9d #define RCD2d %r10d #define RX0 %rbp #define RX1 %r11 #define RX2 %r12 #define RX0d %ebp #define RX1d %r11d #define RX2d %r12d #define RY0 %r13 #define RY1 %r14 #define RY2 %r15 #define RY0d %r13d #define RY1d %r14d #define RY2d %r15d #define RT0 %rdx #define RT1 %rsi #define RT0d %edx #define RT1d %esi #define do16bit_ror(rot, op1, op2, T0, T1, tmp1, tmp2, ab, dst) \ movzbl ab ## bl, tmp2 ## d; \ movzbl ab ## bh, tmp1 ## d; \ rorq $(rot), ab; \ op1##l T0(CTX, tmp2, 4), dst ## d; \ op2##l T1(CTX, tmp1, 4), dst ## d; /* * Combined G1 & G2 function. Reordered with help of rotates to have moves * at beginning. */ #define g1g2_3(ab, cd, Tx0, Tx1, Tx2, Tx3, Ty0, Ty1, Ty2, Ty3, x, y) \ /* G1,1 && G2,1 */ \ do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 0, ab ## 0, x ## 0); \ do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 0, ab ## 0, y ## 0); \ \ do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 1, ab ## 1, x ## 1); \ do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 1, ab ## 1, y ## 1); \ \ do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 2, ab ## 2, x ## 2); \ do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 2, ab ## 2, y ## 2); \ \ /* G1,2 && G2,2 */ \ do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 0, x ## 0); \ do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 0, y ## 0); \ movq ab ## 0, RT0; \ movq cd ## 0, ab ## 0; \ movq RT0, cd ## 0; \ \ do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 1, x ## 1); \ do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 1, y ## 1); \ movq ab ## 1, RT0; \ movq cd ## 1, ab ## 1; \ movq RT0, cd ## 1; \ \ do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 2, x ## 2); \ do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 2, y ## 2); \ movq ab ## 2, RT0; \ movq cd ## 2, ab ## 2; \ movq RT0, cd ## 2; #define enc_round_end(ab, x, y, n) \ addl y ## d, x ## d; \ addl x ## d, y ## d; \ addl k+4*(2*(n))(CTX), x ## d; \ xorl ab ## d, x ## d; \ addl k+4*(2*(n)+1)(CTX), y ## d; \ shrq $32, ab; \ roll $1, ab ## d; \ xorl y ## d, ab ## d; \ shlq $32, ab; \ rorl $1, x ## d; \ orq x, ab; #define dec_round_end(ba, x, y, n) \ addl y ## d, x ## d; \ addl x ## d, y ## d; \ addl k+4*(2*(n))(CTX), x ## d; \ addl k+4*(2*(n)+1)(CTX), y ## d; \ xorl ba ## d, y ## d; \ shrq $32, ba; \ roll $1, ba ## d; \ xorl x ## d, ba ## d; \ shlq $32, ba; \ rorl $1, y ## d; \ orq y, ba; #define encrypt_round3(ab, cd, n) \ g1g2_3(ab, cd, s0, s1, s2, s3, s0, s1, s2, s3, RX, RY); \ \ enc_round_end(ab ## 0, RX0, RY0, n); \ enc_round_end(ab ## 1, RX1, RY1, n); \ enc_round_end(ab ## 2, RX2, RY2, n); #define 
decrypt_round3(ba, dc, n) \ g1g2_3(ba, dc, s1, s2, s3, s0, s3, s0, s1, s2, RY, RX); \ \ dec_round_end(ba ## 0, RX0, RY0, n); \ dec_round_end(ba ## 1, RX1, RY1, n); \ dec_round_end(ba ## 2, RX2, RY2, n); #define encrypt_cycle3(ab, cd, n) \ encrypt_round3(ab, cd, n*2); \ encrypt_round3(ab, cd, (n*2)+1); #define decrypt_cycle3(ba, dc, n) \ decrypt_round3(ba, dc, (n*2)+1); \ decrypt_round3(ba, dc, (n*2)); #define inpack3(xy, m) \ xorq w+4*m(CTX), xy ## 0; \ xorq w+4*m(CTX), xy ## 1; \ xorq w+4*m(CTX), xy ## 2; #define outunpack3(xy, m) \ xorq w+4*m(CTX), xy ## 0; \ xorq w+4*m(CTX), xy ## 1; \ xorq w+4*m(CTX), xy ## 2; #define inpack_enc3() \ inpack3(RAB, 0); \ inpack3(RCD, 2); #define outunpack_enc3() \ outunpack3(RAB, 6); \ outunpack3(RCD, 4); #define inpack_dec3() \ inpack3(RAB, 4); \ rorq $32, RAB0; \ rorq $32, RAB1; \ rorq $32, RAB2; \ inpack3(RCD, 6); \ rorq $32, RCD0; \ rorq $32, RCD1; \ rorq $32, RCD2; #define outunpack_dec3() \ rorq $32, RCD0; \ rorq $32, RCD1; \ rorq $32, RCD2; \ outunpack3(RCD, 0); \ rorq $32, RAB0; \ rorq $32, RAB1; \ rorq $32, RAB2; \ outunpack3(RAB, 2); .align 8 ELF(.type __twofish_enc_blk3,@function;) __twofish_enc_blk3: /* input: * %rdi: ctx, CTX * RAB0,RCD0,RAB1,RCD1,RAB2,RCD2: three plaintext blocks * output: * RCD0,RAB0,RCD1,RAB1,RCD2,RAB2: three ciphertext blocks */ CFI_STARTPROC(); inpack_enc3(); encrypt_cycle3(RAB, RCD, 0); encrypt_cycle3(RAB, RCD, 1); encrypt_cycle3(RAB, RCD, 2); encrypt_cycle3(RAB, RCD, 3); encrypt_cycle3(RAB, RCD, 4); encrypt_cycle3(RAB, RCD, 5); encrypt_cycle3(RAB, RCD, 6); encrypt_cycle3(RAB, RCD, 7); outunpack_enc3(); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size __twofish_enc_blk3,.-__twofish_enc_blk3;) .align 8 ELF(.type __twofish_dec_blk3,@function;) __twofish_dec_blk3: /* input: * %rdi: ctx, CTX * RAB0,RCD0,RAB1,RCD1,RAB2,RCD2: three ciphertext blocks * output: * RCD0,RAB0,RCD1,RAB1,RCD2,RAB2: three plaintext blocks */ CFI_STARTPROC(); inpack_dec3(); decrypt_cycle3(RAB, RCD, 7); decrypt_cycle3(RAB, RCD, 6); decrypt_cycle3(RAB, RCD, 5); decrypt_cycle3(RAB, RCD, 4); decrypt_cycle3(RAB, RCD, 3); decrypt_cycle3(RAB, RCD, 2); decrypt_cycle3(RAB, RCD, 1); decrypt_cycle3(RAB, RCD, 0); outunpack_dec3(); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size __twofish_dec_blk3,.-__twofish_dec_blk3;) .align 8 .globl _gcry_twofish_amd64_ctr_enc ELF(.type _gcry_twofish_amd64_ctr_enc,@function;) _gcry_twofish_amd64_ctr_enc: /* input: * %rdi: ctx, CTX * %rsi: dst (3 blocks) * %rdx: src (3 blocks) * %rcx: iv (big endian, 128bit) */ CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 subq $(8 * 8), %rsp; CFI_ADJUST_CFA_OFFSET(8 * 8); movq %rbp, (0 * 8)(%rsp); movq %rbx, (1 * 8)(%rsp); movq %r12, (2 * 8)(%rsp); movq %r13, (3 * 8)(%rsp); movq %r14, (4 * 8)(%rsp); movq %r15, (5 * 8)(%rsp); CFI_REL_OFFSET(%rbp, 0 * 8); CFI_REL_OFFSET(%rbx, 1 * 8); CFI_REL_OFFSET(%r12, 2 * 8); CFI_REL_OFFSET(%r13, 3 * 8); CFI_REL_OFFSET(%r14, 4 * 8); CFI_REL_OFFSET(%r15, 5 * 8); movq %rsi, (6 * 8)(%rsp); movq %rdx, (7 * 8)(%rsp); movq %rcx, RX0; /* load IV and byteswap */ movq 8(RX0), RT0; movq 0(RX0), RT1; movq RT0, RCD0; movq RT1, RAB0; bswapq RT0; bswapq RT1; /* construct IVs */ movq RT0, RCD1; movq RT1, RAB1; movq RT0, RCD2; movq RT1, RAB2; addq $1, RCD1; adcq $0, RAB1; bswapq RCD1; bswapq RAB1; addq $2, RCD2; adcq $0, RAB2; bswapq RCD2; bswapq RAB2; addq $3, RT0; adcq $0, RT1; bswapq RT0; bswapq RT1; /* store new IV */ movq RT0, 8(RX0); movq RT1, 0(RX0); call __twofish_enc_blk3; movq (7 * 8)(%rsp), RX0; /*src*/ movq (6 * 8)(%rsp), RX1; /*dst*/ /* XOR key-stream with 
plaintext */ xorq (0 * 8)(RX0), RCD0; xorq (1 * 8)(RX0), RAB0; xorq (2 * 8)(RX0), RCD1; xorq (3 * 8)(RX0), RAB1; xorq (4 * 8)(RX0), RCD2; xorq (5 * 8)(RX0), RAB2; movq RCD0, (0 * 8)(RX1); movq RAB0, (1 * 8)(RX1); movq RCD1, (2 * 8)(RX1); movq RAB1, (3 * 8)(RX1); movq RCD2, (4 * 8)(RX1); movq RAB2, (5 * 8)(RX1); movq (0 * 8)(%rsp), %rbp; movq (1 * 8)(%rsp), %rbx; movq (2 * 8)(%rsp), %r12; movq (3 * 8)(%rsp), %r13; movq (4 * 8)(%rsp), %r14; movq (5 * 8)(%rsp), %r15; CFI_RESTORE(%rbp); CFI_RESTORE(%rbx); CFI_RESTORE(%r12); CFI_RESTORE(%r13); CFI_RESTORE(%r14); CFI_RESTORE(%r15); addq $(8 * 8), %rsp; CFI_ADJUST_CFA_OFFSET(-8 * 8); EXIT_SYSV_FUNC - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_twofish_amd64_ctr_enc,.-_gcry_twofish_amd64_ctr_enc;) .align 8 .globl _gcry_twofish_amd64_cbc_dec ELF(.type _gcry_twofish_amd64_cbc_dec,@function;) _gcry_twofish_amd64_cbc_dec: /* input: * %rdi: ctx, CTX * %rsi: dst (3 blocks) * %rdx: src (3 blocks) * %rcx: iv (128bit) */ CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 subq $(9 * 8), %rsp; CFI_ADJUST_CFA_OFFSET(9 * 8); movq %rbp, (0 * 8)(%rsp); movq %rbx, (1 * 8)(%rsp); movq %r12, (2 * 8)(%rsp); movq %r13, (3 * 8)(%rsp); movq %r14, (4 * 8)(%rsp); movq %r15, (5 * 8)(%rsp); CFI_REL_OFFSET(%rbp, 0 * 8); CFI_REL_OFFSET(%rbx, 1 * 8); CFI_REL_OFFSET(%r12, 2 * 8); CFI_REL_OFFSET(%r13, 3 * 8); CFI_REL_OFFSET(%r14, 4 * 8); CFI_REL_OFFSET(%r15, 5 * 8); movq %rsi, (6 * 8)(%rsp); movq %rdx, (7 * 8)(%rsp); movq %rcx, (8 * 8)(%rsp); movq %rdx, RX0; /* load input */ movq (0 * 8)(RX0), RAB0; movq (1 * 8)(RX0), RCD0; movq (2 * 8)(RX0), RAB1; movq (3 * 8)(RX0), RCD1; movq (4 * 8)(RX0), RAB2; movq (5 * 8)(RX0), RCD2; call __twofish_dec_blk3; movq (8 * 8)(%rsp), RT0; /*iv*/ movq (7 * 8)(%rsp), RX0; /*src*/ movq (6 * 8)(%rsp), RX1; /*dst*/ movq (4 * 8)(RX0), RY0; movq (5 * 8)(RX0), RY1; xorq (0 * 8)(RT0), RCD0; xorq (1 * 8)(RT0), RAB0; xorq (0 * 8)(RX0), RCD1; xorq (1 * 8)(RX0), RAB1; xorq (2 * 8)(RX0), RCD2; xorq (3 * 8)(RX0), RAB2; movq RY0, (0 * 8)(RT0); movq RY1, (1 * 8)(RT0); movq RCD0, (0 * 8)(RX1); movq RAB0, (1 * 8)(RX1); movq RCD1, (2 * 8)(RX1); movq RAB1, (3 * 8)(RX1); movq RCD2, (4 * 8)(RX1); movq RAB2, (5 * 8)(RX1); movq (0 * 8)(%rsp), %rbp; movq (1 * 8)(%rsp), %rbx; movq (2 * 8)(%rsp), %r12; movq (3 * 8)(%rsp), %r13; movq (4 * 8)(%rsp), %r14; movq (5 * 8)(%rsp), %r15; CFI_RESTORE(%rbp); CFI_RESTORE(%rbx); CFI_RESTORE(%r12); CFI_RESTORE(%r13); CFI_RESTORE(%r14); CFI_RESTORE(%r15); addq $(9 * 8), %rsp; CFI_ADJUST_CFA_OFFSET(-9 * 8); EXIT_SYSV_FUNC - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_twofish_amd64_cbc_dec,.-_gcry_twofish_amd64_cbc_dec;) .align 8 .globl _gcry_twofish_amd64_cfb_dec ELF(.type _gcry_twofish_amd64_cfb_dec,@function;) _gcry_twofish_amd64_cfb_dec: /* input: * %rdi: ctx, CTX * %rsi: dst (3 blocks) * %rdx: src (3 blocks) * %rcx: iv (128bit) */ CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 subq $(8 * 8), %rsp; CFI_ADJUST_CFA_OFFSET(8 * 8); movq %rbp, (0 * 8)(%rsp); movq %rbx, (1 * 8)(%rsp); movq %r12, (2 * 8)(%rsp); movq %r13, (3 * 8)(%rsp); movq %r14, (4 * 8)(%rsp); movq %r15, (5 * 8)(%rsp); CFI_REL_OFFSET(%rbp, 0 * 8); CFI_REL_OFFSET(%rbx, 1 * 8); CFI_REL_OFFSET(%r12, 2 * 8); CFI_REL_OFFSET(%r13, 3 * 8); CFI_REL_OFFSET(%r14, 4 * 8); CFI_REL_OFFSET(%r15, 5 * 8); movq %rsi, (6 * 8)(%rsp); movq %rdx, (7 * 8)(%rsp); movq %rdx, RX0; movq %rcx, RX1; /* load input */ movq (0 * 8)(RX1), RAB0; movq (1 * 8)(RX1), RCD0; movq (0 * 8)(RX0), RAB1; movq (1 * 8)(RX0), RCD1; movq (2 * 8)(RX0), RAB2; movq (3 * 8)(RX0), RCD2; /* Update IV */ movq (4 * 
8)(RX0), RY0; movq (5 * 8)(RX0), RY1; movq RY0, (0 * 8)(RX1); movq RY1, (1 * 8)(RX1); call __twofish_enc_blk3; movq (7 * 8)(%rsp), RX0; /*src*/ movq (6 * 8)(%rsp), RX1; /*dst*/ xorq (0 * 8)(RX0), RCD0; xorq (1 * 8)(RX0), RAB0; xorq (2 * 8)(RX0), RCD1; xorq (3 * 8)(RX0), RAB1; xorq (4 * 8)(RX0), RCD2; xorq (5 * 8)(RX0), RAB2; movq RCD0, (0 * 8)(RX1); movq RAB0, (1 * 8)(RX1); movq RCD1, (2 * 8)(RX1); movq RAB1, (3 * 8)(RX1); movq RCD2, (4 * 8)(RX1); movq RAB2, (5 * 8)(RX1); movq (0 * 8)(%rsp), %rbp; movq (1 * 8)(%rsp), %rbx; movq (2 * 8)(%rsp), %r12; movq (3 * 8)(%rsp), %r13; movq (4 * 8)(%rsp), %r14; movq (5 * 8)(%rsp), %r15; CFI_RESTORE(%rbp); CFI_RESTORE(%rbx); CFI_RESTORE(%r12); CFI_RESTORE(%r13); CFI_RESTORE(%r14); CFI_RESTORE(%r15); addq $(8 * 8), %rsp; CFI_ADJUST_CFA_OFFSET(-8 * 8); EXIT_SYSV_FUNC - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_twofish_amd64_cfb_dec,.-_gcry_twofish_amd64_cfb_dec;) .align 8 .globl _gcry_twofish_amd64_ocb_enc ELF(.type _gcry_twofish_amd64_ocb_enc,@function;) _gcry_twofish_amd64_ocb_enc: /* input: * %rdi: ctx, CTX * %rsi: dst (3 blocks) * %rdx: src (3 blocks) * %rcx: offset * %r8 : checksum * %r9 : L pointers (void *L[3]) */ CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_6 subq $(8 * 8), %rsp; CFI_ADJUST_CFA_OFFSET(8 * 8); movq %rbp, (0 * 8)(%rsp); movq %rbx, (1 * 8)(%rsp); movq %r12, (2 * 8)(%rsp); movq %r13, (3 * 8)(%rsp); movq %r14, (4 * 8)(%rsp); movq %r15, (5 * 8)(%rsp); CFI_REL_OFFSET(%rbp, 0 * 8); CFI_REL_OFFSET(%rbx, 1 * 8); CFI_REL_OFFSET(%r12, 2 * 8); CFI_REL_OFFSET(%r13, 3 * 8); CFI_REL_OFFSET(%r14, 4 * 8); CFI_REL_OFFSET(%r15, 5 * 8); movq %rsi, (6 * 8)(%rsp); movq %rdx, RX0; movq %rcx, RX1; movq %r8, RX2; movq %r9, RY0; movq %rsi, RY1; /* Load offset */ movq (0 * 8)(RX1), RT0; movq (1 * 8)(RX1), RT1; /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ movq (RY0), RY2; xorq (0 * 8)(RY2), RT0; xorq (1 * 8)(RY2), RT1; movq (0 * 8)(RX0), RAB0; movq (1 * 8)(RX0), RCD0; /* Store Offset_i */ movq RT0, (0 * 8)(RY1); movq RT1, (1 * 8)(RY1); /* Checksum_i = Checksum_{i-1} xor P_i */ xor RAB0, (0 * 8)(RX2); xor RCD0, (1 * 8)(RX2); /* PX_i = P_i xor Offset_i */ xorq RT0, RAB0; xorq RT1, RCD0; /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ movq 8(RY0), RY2; xorq (0 * 8)(RY2), RT0; xorq (1 * 8)(RY2), RT1; movq (2 * 8)(RX0), RAB1; movq (3 * 8)(RX0), RCD1; /* Store Offset_i */ movq RT0, (2 * 8)(RY1); movq RT1, (3 * 8)(RY1); /* Checksum_i = Checksum_{i-1} xor P_i */ xor RAB1, (0 * 8)(RX2); xor RCD1, (1 * 8)(RX2); /* PX_i = P_i xor Offset_i */ xorq RT0, RAB1; xorq RT1, RCD1; /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ movq 16(RY0), RY2; xorq (0 * 8)(RY2), RT0; xorq (1 * 8)(RY2), RT1; movq (4 * 8)(RX0), RAB2; movq (5 * 8)(RX0), RCD2; /* Store Offset_i */ movq RT0, (4 * 8)(RY1); movq RT1, (5 * 8)(RY1); /* Checksum_i = Checksum_{i-1} xor P_i */ xor RAB2, (0 * 8)(RX2); xor RCD2, (1 * 8)(RX2); /* PX_i = P_i xor Offset_i */ xorq RT0, RAB2; xorq RT1, RCD2; /* Store offset */ movq RT0, (0 * 8)(RX1); movq RT1, (1 * 8)(RX1); /* CX_i = ENCIPHER(K, PX_i) */ call __twofish_enc_blk3; movq (6 * 8)(%rsp), RX1; /*dst*/ /* C_i = CX_i xor Offset_i */ xorq RCD0, (0 * 8)(RX1); xorq RAB0, (1 * 8)(RX1); xorq RCD1, (2 * 8)(RX1); xorq RAB1, (3 * 8)(RX1); xorq RCD2, (4 * 8)(RX1); xorq RAB2, (5 * 8)(RX1); movq (0 * 8)(%rsp), %rbp; movq (1 * 8)(%rsp), %rbx; movq (2 * 8)(%rsp), %r12; movq (3 * 8)(%rsp), %r13; movq (4 * 8)(%rsp), %r14; movq (5 * 8)(%rsp), %r15; CFI_RESTORE(%rbp); CFI_RESTORE(%rbx); CFI_RESTORE(%r12); CFI_RESTORE(%r13); CFI_RESTORE(%r14); CFI_RESTORE(%r15); addq $(8 * 8), %rsp; 
CFI_ADJUST_CFA_OFFSET(-8 * 8); EXIT_SYSV_FUNC - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_twofish_amd64_ocb_enc,.-_gcry_twofish_amd64_ocb_enc;) .align 8 .globl _gcry_twofish_amd64_ocb_dec ELF(.type _gcry_twofish_amd64_ocb_dec,@function;) _gcry_twofish_amd64_ocb_dec: /* input: * %rdi: ctx, CTX * %rsi: dst (3 blocks) * %rdx: src (3 blocks) * %rcx: offset * %r8 : checksum * %r9 : L pointers (void *L[3]) */ CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_6 subq $(8 * 8), %rsp; CFI_ADJUST_CFA_OFFSET(8 * 8); movq %rbp, (0 * 8)(%rsp); movq %rbx, (1 * 8)(%rsp); movq %r12, (2 * 8)(%rsp); movq %r13, (3 * 8)(%rsp); movq %r14, (4 * 8)(%rsp); movq %r15, (5 * 8)(%rsp); CFI_REL_OFFSET(%rbp, 0 * 8); CFI_REL_OFFSET(%rbx, 1 * 8); CFI_REL_OFFSET(%r12, 2 * 8); CFI_REL_OFFSET(%r13, 3 * 8); CFI_REL_OFFSET(%r14, 4 * 8); CFI_REL_OFFSET(%r15, 5 * 8); movq %rsi, (6 * 8)(%rsp); movq %r8, (7 * 8)(%rsp); movq %rdx, RX0; movq %rcx, RX1; movq %r9, RY0; movq %rsi, RY1; /* Load offset */ movq (0 * 8)(RX1), RT0; movq (1 * 8)(RX1), RT1; /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ movq (RY0), RY2; xorq (0 * 8)(RY2), RT0; xorq (1 * 8)(RY2), RT1; movq (0 * 8)(RX0), RAB0; movq (1 * 8)(RX0), RCD0; /* Store Offset_i */ movq RT0, (0 * 8)(RY1); movq RT1, (1 * 8)(RY1); /* CX_i = C_i xor Offset_i */ xorq RT0, RAB0; xorq RT1, RCD0; /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ movq 8(RY0), RY2; xorq (0 * 8)(RY2), RT0; xorq (1 * 8)(RY2), RT1; movq (2 * 8)(RX0), RAB1; movq (3 * 8)(RX0), RCD1; /* Store Offset_i */ movq RT0, (2 * 8)(RY1); movq RT1, (3 * 8)(RY1); /* PX_i = P_i xor Offset_i */ xorq RT0, RAB1; xorq RT1, RCD1; /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ movq 16(RY0), RY2; xorq (0 * 8)(RY2), RT0; xorq (1 * 8)(RY2), RT1; movq (4 * 8)(RX0), RAB2; movq (5 * 8)(RX0), RCD2; /* Store Offset_i */ movq RT0, (4 * 8)(RY1); movq RT1, (5 * 8)(RY1); /* PX_i = P_i xor Offset_i */ xorq RT0, RAB2; xorq RT1, RCD2; /* Store offset */ movq RT0, (0 * 8)(RX1); movq RT1, (1 * 8)(RX1); /* PX_i = DECIPHER(K, CX_i) */ call __twofish_dec_blk3; movq (7 * 8)(%rsp), RX2; /*checksum*/ movq (6 * 8)(%rsp), RX1; /*dst*/ /* Load checksum */ movq (0 * 8)(RX2), RT0; movq (1 * 8)(RX2), RT1; /* P_i = PX_i xor Offset_i */ xorq RCD0, (0 * 8)(RX1); xorq RAB0, (1 * 8)(RX1); xorq RCD1, (2 * 8)(RX1); xorq RAB1, (3 * 8)(RX1); xorq RCD2, (4 * 8)(RX1); xorq RAB2, (5 * 8)(RX1); /* Checksum_i = Checksum_{i-1} xor P_i */ xorq (0 * 8)(RX1), RT0; xorq (1 * 8)(RX1), RT1; xorq (2 * 8)(RX1), RT0; xorq (3 * 8)(RX1), RT1; xorq (4 * 8)(RX1), RT0; xorq (5 * 8)(RX1), RT1; /* Store checksum */ movq RT0, (0 * 8)(RX2); movq RT1, (1 * 8)(RX2); movq (0 * 8)(%rsp), %rbp; movq (1 * 8)(%rsp), %rbx; movq (2 * 8)(%rsp), %r12; movq (3 * 8)(%rsp), %r13; movq (4 * 8)(%rsp), %r14; movq (5 * 8)(%rsp), %r15; CFI_RESTORE(%rbp); CFI_RESTORE(%rbx); CFI_RESTORE(%r12); CFI_RESTORE(%r13); CFI_RESTORE(%r14); CFI_RESTORE(%r15); addq $(8 * 8), %rsp; CFI_ADJUST_CFA_OFFSET(-8 * 8); EXIT_SYSV_FUNC - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_twofish_amd64_ocb_dec,.-_gcry_twofish_amd64_ocb_dec;) .align 8 .globl _gcry_twofish_amd64_ocb_auth ELF(.type _gcry_twofish_amd64_ocb_auth,@function;) _gcry_twofish_amd64_ocb_auth: /* input: * %rdi: ctx, CTX * %rsi: abuf (3 blocks) * %rdx: offset * %rcx: checksum * %r8 : L pointers (void *L[3]) */ CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_5 subq $(8 * 8), %rsp; CFI_ADJUST_CFA_OFFSET(8 * 8); movq %rbp, (0 * 8)(%rsp); movq %rbx, (1 * 8)(%rsp); movq %r12, (2 * 8)(%rsp); movq %r13, (3 * 8)(%rsp); movq %r14, (4 * 8)(%rsp); movq %r15, (5 * 8)(%rsp); CFI_REL_OFFSET(%rbp, 
0 * 8); CFI_REL_OFFSET(%rbx, 1 * 8); CFI_REL_OFFSET(%r12, 2 * 8); CFI_REL_OFFSET(%r13, 3 * 8); CFI_REL_OFFSET(%r14, 4 * 8); CFI_REL_OFFSET(%r15, 5 * 8); movq %rcx, (6 * 8)(%rsp); movq %rsi, RX0; movq %rdx, RX1; movq %r8, RY0; /* Load offset */ movq (0 * 8)(RX1), RT0; movq (1 * 8)(RX1), RT1; /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ movq (RY0), RY2; xorq (0 * 8)(RY2), RT0; xorq (1 * 8)(RY2), RT1; movq (0 * 8)(RX0), RAB0; movq (1 * 8)(RX0), RCD0; /* PX_i = P_i xor Offset_i */ xorq RT0, RAB0; xorq RT1, RCD0; /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ movq 8(RY0), RY2; xorq (0 * 8)(RY2), RT0; xorq (1 * 8)(RY2), RT1; movq (2 * 8)(RX0), RAB1; movq (3 * 8)(RX0), RCD1; /* PX_i = P_i xor Offset_i */ xorq RT0, RAB1; xorq RT1, RCD1; /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ movq 16(RY0), RY2; xorq (0 * 8)(RY2), RT0; xorq (1 * 8)(RY2), RT1; movq (4 * 8)(RX0), RAB2; movq (5 * 8)(RX0), RCD2; /* PX_i = P_i xor Offset_i */ xorq RT0, RAB2; xorq RT1, RCD2; /* Store offset */ movq RT0, (0 * 8)(RX1); movq RT1, (1 * 8)(RX1); /* C_i = ENCIPHER(K, PX_i) */ call __twofish_enc_blk3; movq (6 * 8)(%rsp), RX1; /*checksum*/ /* Checksum_i = C_i xor Checksum_i */ xorq RCD0, RCD1; xorq RAB0, RAB1; xorq RCD1, RCD2; xorq RAB1, RAB2; xorq RCD2, (0 * 8)(RX1); xorq RAB2, (1 * 8)(RX1); movq (0 * 8)(%rsp), %rbp; movq (1 * 8)(%rsp), %rbx; movq (2 * 8)(%rsp), %r12; movq (3 * 8)(%rsp), %r13; movq (4 * 8)(%rsp), %r14; movq (5 * 8)(%rsp), %r15; CFI_RESTORE(%rbp); CFI_RESTORE(%rbx); CFI_RESTORE(%r12); CFI_RESTORE(%r13); CFI_RESTORE(%r14); CFI_RESTORE(%r15); addq $(8 * 8), %rsp; CFI_ADJUST_CFA_OFFSET(-8 * 8); EXIT_SYSV_FUNC - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_twofish_amd64_ocb_auth,.-_gcry_twofish_amd64_ocb_auth;) #endif /*USE_TWOFISH*/ #endif /*__x86_64*/ diff --git a/cipher/twofish-avx2-amd64.S b/cipher/twofish-avx2-amd64.S index 74cad355..930ac792 100644 --- a/cipher/twofish-avx2-amd64.S +++ b/cipher/twofish-avx2-amd64.S @@ -1,1048 +1,1048 @@ /* twofish-avx2-amd64.S - AMD64/AVX2 assembly implementation of Twofish cipher * * Copyright (C) 2013-2017 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . 
*/ #ifdef __x86_64 #include #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && defined(USE_TWOFISH) && \ defined(ENABLE_AVX2_SUPPORT) #include "asm-common-amd64.h" .text /* structure of TWOFISH_context: */ #define s0 0 #define s1 ((s0) + 4 * 256) #define s2 ((s1) + 4 * 256) #define s3 ((s2) + 4 * 256) #define w ((s3) + 4 * 256) #define k ((w) + 4 * 8) /* register macros */ #define CTX %rdi #define RROUND %rbp #define RROUNDd %ebp #define RS0 CTX #define RS1 %r8 #define RS2 %r9 #define RS3 %r10 #define RK %r11 #define RW %rax #define RA0 %ymm8 #define RB0 %ymm9 #define RC0 %ymm10 #define RD0 %ymm11 #define RA1 %ymm12 #define RB1 %ymm13 #define RC1 %ymm14 #define RD1 %ymm15 /* temp regs */ #define RX0 %ymm0 #define RY0 %ymm1 #define RX1 %ymm2 #define RY1 %ymm3 #define RT0 %ymm4 #define RIDX %ymm5 #define RX0x %xmm0 #define RY0x %xmm1 #define RX1x %xmm2 #define RY1x %xmm3 #define RT0x %xmm4 #define RIDXx %xmm5 #define RTMP0 RX0 #define RTMP0x RX0x #define RTMP1 RX1 #define RTMP1x RX1x #define RTMP2 RY0 #define RTMP2x RY0x #define RTMP3 RY1 #define RTMP3x RY1x #define RTMP4 RIDX #define RTMP4x RIDXx /* vpgatherdd mask and '-1' */ #define RNOT %ymm6 #define RNOTx %xmm6 /* byte mask, (-1 >> 24) */ #define RBYTE %ymm7 /********************************************************************** 16-way AVX2 twofish **********************************************************************/ #define init_round_constants() \ vpcmpeqd RNOT, RNOT, RNOT; \ leaq k(CTX), RK; \ leaq w(CTX), RW; \ vpsrld $24, RNOT, RBYTE; \ leaq s1(CTX), RS1; \ leaq s2(CTX), RS2; \ leaq s3(CTX), RS3; \ #define g16(ab, rs0, rs1, rs2, rs3, xy) \ vpand RBYTE, ab ## 0, RIDX; \ vpgatherdd RNOT, (rs0, RIDX, 4), xy ## 0; \ vpcmpeqd RNOT, RNOT, RNOT; \ \ vpand RBYTE, ab ## 1, RIDX; \ vpgatherdd RNOT, (rs0, RIDX, 4), xy ## 1; \ vpcmpeqd RNOT, RNOT, RNOT; \ \ vpsrld $8, ab ## 0, RIDX; \ vpand RBYTE, RIDX, RIDX; \ vpgatherdd RNOT, (rs1, RIDX, 4), RT0; \ vpcmpeqd RNOT, RNOT, RNOT; \ vpxor RT0, xy ## 0, xy ## 0; \ \ vpsrld $8, ab ## 1, RIDX; \ vpand RBYTE, RIDX, RIDX; \ vpgatherdd RNOT, (rs1, RIDX, 4), RT0; \ vpcmpeqd RNOT, RNOT, RNOT; \ vpxor RT0, xy ## 1, xy ## 1; \ \ vpsrld $16, ab ## 0, RIDX; \ vpand RBYTE, RIDX, RIDX; \ vpgatherdd RNOT, (rs2, RIDX, 4), RT0; \ vpcmpeqd RNOT, RNOT, RNOT; \ vpxor RT0, xy ## 0, xy ## 0; \ \ vpsrld $16, ab ## 1, RIDX; \ vpand RBYTE, RIDX, RIDX; \ vpgatherdd RNOT, (rs2, RIDX, 4), RT0; \ vpcmpeqd RNOT, RNOT, RNOT; \ vpxor RT0, xy ## 1, xy ## 1; \ \ vpsrld $24, ab ## 0, RIDX; \ vpgatherdd RNOT, (rs3, RIDX, 4), RT0; \ vpcmpeqd RNOT, RNOT, RNOT; \ vpxor RT0, xy ## 0, xy ## 0; \ \ vpsrld $24, ab ## 1, RIDX; \ vpgatherdd RNOT, (rs3, RIDX, 4), RT0; \ vpcmpeqd RNOT, RNOT, RNOT; \ vpxor RT0, xy ## 1, xy ## 1; #define g1_16(a, x) \ g16(a, RS0, RS1, RS2, RS3, x); #define g2_16(b, y) \ g16(b, RS1, RS2, RS3, RS0, y); #define encrypt_round_end16(a, b, c, d, nk, r) \ vpaddd RY0, RX0, RX0; \ vpaddd RX0, RY0, RY0; \ vpbroadcastd ((nk)+((r)*8))(RK), RT0; \ vpaddd RT0, RX0, RX0; \ vpbroadcastd 4+((nk)+((r)*8))(RK), RT0; \ vpaddd RT0, RY0, RY0; \ \ vpxor RY0, d ## 0, d ## 0; \ \ vpxor RX0, c ## 0, c ## 0; \ vpsrld $1, c ## 0, RT0; \ vpslld $31, c ## 0, c ## 0; \ vpor RT0, c ## 0, c ## 0; \ \ vpaddd RY1, RX1, RX1; \ vpaddd RX1, RY1, RY1; \ vpbroadcastd ((nk)+((r)*8))(RK), RT0; \ vpaddd RT0, RX1, RX1; \ vpbroadcastd 4+((nk)+((r)*8))(RK), RT0; \ vpaddd RT0, RY1, RY1; \ \ vpxor RY1, d ## 1, d ## 1; \ \ vpxor RX1, c ## 1, c ## 1; \ vpsrld $1, c ## 1, RT0; \ vpslld $31, c ## 1, c ## 
1; \ vpor RT0, c ## 1, c ## 1; \ #define encrypt_round16(a, b, c, d, nk, r) \ g2_16(b, RY); \ \ vpslld $1, b ## 0, RT0; \ vpsrld $31, b ## 0, b ## 0; \ vpor RT0, b ## 0, b ## 0; \ \ vpslld $1, b ## 1, RT0; \ vpsrld $31, b ## 1, b ## 1; \ vpor RT0, b ## 1, b ## 1; \ \ g1_16(a, RX); \ \ encrypt_round_end16(a, b, c, d, nk, r); #define encrypt_round_first16(a, b, c, d, nk, r) \ vpslld $1, d ## 0, RT0; \ vpsrld $31, d ## 0, d ## 0; \ vpor RT0, d ## 0, d ## 0; \ \ vpslld $1, d ## 1, RT0; \ vpsrld $31, d ## 1, d ## 1; \ vpor RT0, d ## 1, d ## 1; \ \ encrypt_round16(a, b, c, d, nk, r); #define encrypt_round_last16(a, b, c, d, nk, r) \ g2_16(b, RY); \ \ g1_16(a, RX); \ \ encrypt_round_end16(a, b, c, d, nk, r); #define decrypt_round_end16(a, b, c, d, nk, r) \ vpaddd RY0, RX0, RX0; \ vpaddd RX0, RY0, RY0; \ vpbroadcastd ((nk)+((r)*8))(RK), RT0; \ vpaddd RT0, RX0, RX0; \ vpbroadcastd 4+((nk)+((r)*8))(RK), RT0; \ vpaddd RT0, RY0, RY0; \ \ vpxor RX0, c ## 0, c ## 0; \ \ vpxor RY0, d ## 0, d ## 0; \ vpsrld $1, d ## 0, RT0; \ vpslld $31, d ## 0, d ## 0; \ vpor RT0, d ## 0, d ## 0; \ \ vpaddd RY1, RX1, RX1; \ vpaddd RX1, RY1, RY1; \ vpbroadcastd ((nk)+((r)*8))(RK), RT0; \ vpaddd RT0, RX1, RX1; \ vpbroadcastd 4+((nk)+((r)*8))(RK), RT0; \ vpaddd RT0, RY1, RY1; \ \ vpxor RX1, c ## 1, c ## 1; \ \ vpxor RY1, d ## 1, d ## 1; \ vpsrld $1, d ## 1, RT0; \ vpslld $31, d ## 1, d ## 1; \ vpor RT0, d ## 1, d ## 1; #define decrypt_round16(a, b, c, d, nk, r) \ g1_16(a, RX); \ \ vpslld $1, a ## 0, RT0; \ vpsrld $31, a ## 0, a ## 0; \ vpor RT0, a ## 0, a ## 0; \ \ vpslld $1, a ## 1, RT0; \ vpsrld $31, a ## 1, a ## 1; \ vpor RT0, a ## 1, a ## 1; \ \ g2_16(b, RY); \ \ decrypt_round_end16(a, b, c, d, nk, r); #define decrypt_round_first16(a, b, c, d, nk, r) \ vpslld $1, c ## 0, RT0; \ vpsrld $31, c ## 0, c ## 0; \ vpor RT0, c ## 0, c ## 0; \ \ vpslld $1, c ## 1, RT0; \ vpsrld $31, c ## 1, c ## 1; \ vpor RT0, c ## 1, c ## 1; \ \ decrypt_round16(a, b, c, d, nk, r) #define decrypt_round_last16(a, b, c, d, nk, r) \ g1_16(a, RX); \ \ g2_16(b, RY); \ \ decrypt_round_end16(a, b, c, d, nk, r); #define encrypt_cycle16(r) \ encrypt_round16(RA, RB, RC, RD, 0, r); \ encrypt_round16(RC, RD, RA, RB, 8, r); #define encrypt_cycle_first16(r) \ encrypt_round_first16(RA, RB, RC, RD, 0, r); \ encrypt_round16(RC, RD, RA, RB, 8, r); #define encrypt_cycle_last16(r) \ encrypt_round16(RA, RB, RC, RD, 0, r); \ encrypt_round_last16(RC, RD, RA, RB, 8, r); #define decrypt_cycle16(r) \ decrypt_round16(RC, RD, RA, RB, 8, r); \ decrypt_round16(RA, RB, RC, RD, 0, r); #define decrypt_cycle_first16(r) \ decrypt_round_first16(RC, RD, RA, RB, 8, r); \ decrypt_round16(RA, RB, RC, RD, 0, r); #define decrypt_cycle_last16(r) \ decrypt_round16(RC, RD, RA, RB, 8, r); \ decrypt_round_last16(RA, RB, RC, RD, 0, r); #define transpose_4x4(x0,x1,x2,x3,t1,t2) \ vpunpckhdq x1, x0, t2; \ vpunpckldq x1, x0, x0; \ \ vpunpckldq x3, x2, t1; \ vpunpckhdq x3, x2, x2; \ \ vpunpckhqdq t1, x0, x1; \ vpunpcklqdq t1, x0, x0; \ \ vpunpckhqdq x2, t2, x3; \ vpunpcklqdq x2, t2, x2; #define read_blocks8(offs,a,b,c,d) \ vmovdqu 16*offs(RIO), a; \ vmovdqu 16*offs+32(RIO), b; \ vmovdqu 16*offs+64(RIO), c; \ vmovdqu 16*offs+96(RIO), d; \ \ transpose_4x4(a, b, c, d, RX0, RY0); #define write_blocks8(offs,a,b,c,d) \ transpose_4x4(a, b, c, d, RX0, RY0); \ \ vmovdqu a, 16*offs(RIO); \ vmovdqu b, 16*offs+32(RIO); \ vmovdqu c, 16*offs+64(RIO); \ vmovdqu d, 16*offs+96(RIO); #define inpack_enc8(a,b,c,d) \ vpbroadcastd 4*0(RW), RT0; \ vpxor RT0, a, a; \ \ vpbroadcastd 4*1(RW), RT0; \ vpxor RT0, b, b; \ \ 
vpbroadcastd 4*2(RW), RT0; \ vpxor RT0, c, c; \ \ vpbroadcastd 4*3(RW), RT0; \ vpxor RT0, d, d; #define outunpack_enc8(a,b,c,d) \ vpbroadcastd 4*4(RW), RX0; \ vpbroadcastd 4*5(RW), RY0; \ vpxor RX0, c, RX0; \ vpxor RY0, d, RY0; \ \ vpbroadcastd 4*6(RW), RT0; \ vpxor RT0, a, c; \ vpbroadcastd 4*7(RW), RT0; \ vpxor RT0, b, d; \ \ vmovdqa RX0, a; \ vmovdqa RY0, b; #define inpack_dec8(a,b,c,d) \ vpbroadcastd 4*4(RW), RX0; \ vpbroadcastd 4*5(RW), RY0; \ vpxor RX0, a, RX0; \ vpxor RY0, b, RY0; \ \ vpbroadcastd 4*6(RW), RT0; \ vpxor RT0, c, a; \ vpbroadcastd 4*7(RW), RT0; \ vpxor RT0, d, b; \ \ vmovdqa RX0, c; \ vmovdqa RY0, d; #define outunpack_dec8(a,b,c,d) \ vpbroadcastd 4*0(RW), RT0; \ vpxor RT0, a, a; \ \ vpbroadcastd 4*1(RW), RT0; \ vpxor RT0, b, b; \ \ vpbroadcastd 4*2(RW), RT0; \ vpxor RT0, c, c; \ \ vpbroadcastd 4*3(RW), RT0; \ vpxor RT0, d, d; #define transpose4x4_16(a,b,c,d) \ transpose_4x4(a ## 0, b ## 0, c ## 0, d ## 0, RX0, RY0); \ transpose_4x4(a ## 1, b ## 1, c ## 1, d ## 1, RX0, RY0); #define inpack_enc16(a,b,c,d) \ inpack_enc8(a ## 0, b ## 0, c ## 0, d ## 0); \ inpack_enc8(a ## 1, b ## 1, c ## 1, d ## 1); #define outunpack_enc16(a,b,c,d) \ outunpack_enc8(a ## 0, b ## 0, c ## 0, d ## 0); \ outunpack_enc8(a ## 1, b ## 1, c ## 1, d ## 1); #define inpack_dec16(a,b,c,d) \ inpack_dec8(a ## 0, b ## 0, c ## 0, d ## 0); \ inpack_dec8(a ## 1, b ## 1, c ## 1, d ## 1); #define outunpack_dec16(a,b,c,d) \ outunpack_dec8(a ## 0, b ## 0, c ## 0, d ## 0); \ outunpack_dec8(a ## 1, b ## 1, c ## 1, d ## 1); .align 8 ELF(.type __twofish_enc_blk16,@function;) __twofish_enc_blk16: /* input: * %rdi: ctx, CTX * RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: sixteen parallel * plaintext blocks * output: * RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: sixteen parallel * ciphertext blocks */ CFI_STARTPROC(); init_round_constants(); transpose4x4_16(RA, RB, RC, RD); inpack_enc16(RA, RB, RC, RD); encrypt_cycle_first16(0); encrypt_cycle16(2); encrypt_cycle16(4); encrypt_cycle16(6); encrypt_cycle16(8); encrypt_cycle16(10); encrypt_cycle16(12); encrypt_cycle_last16(14); outunpack_enc16(RA, RB, RC, RD); transpose4x4_16(RA, RB, RC, RD); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size __twofish_enc_blk16,.-__twofish_enc_blk16;) .align 8 ELF(.type __twofish_dec_blk16,@function;) __twofish_dec_blk16: /* input: * %rdi: ctx, CTX * RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: sixteen parallel * plaintext blocks * output: * RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: sixteen parallel * ciphertext blocks */ CFI_STARTPROC(); init_round_constants(); transpose4x4_16(RA, RB, RC, RD); inpack_dec16(RA, RB, RC, RD); decrypt_cycle_first16(14); decrypt_cycle16(12); decrypt_cycle16(10); decrypt_cycle16(8); decrypt_cycle16(6); decrypt_cycle16(4); decrypt_cycle16(2); decrypt_cycle_last16(0); outunpack_dec16(RA, RB, RC, RD); transpose4x4_16(RA, RB, RC, RD); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size __twofish_dec_blk16,.-__twofish_dec_blk16;) #define inc_le128(x, minus_one, tmp) \ vpcmpeqq minus_one, x, tmp; \ vpsubq minus_one, x, x; \ vpslldq $8, tmp, tmp; \ vpsubq tmp, x, x; .align 8 .globl _gcry_twofish_avx2_ctr_enc ELF(.type _gcry_twofish_avx2_ctr_enc,@function;) _gcry_twofish_avx2_ctr_enc: /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) * %rdx: src (16 blocks) * %rcx: iv (big endian, 128bit) */ CFI_STARTPROC(); movq 8(%rcx), %rax; bswapq %rax; vzeroupper; vbroadcasti128 .Lbswap128_mask rRIP, RTMP3; vpcmpeqd RNOT, RNOT, RNOT; vpsrldq $8, RNOT, RNOT; /* ab: -1:0 ; cd: -1:0 */ vpaddq RNOT, RNOT, RTMP2; /* ab: -2:0 ; cd: -2:0 */ /* load IV and 
byteswap */ vmovdqu (%rcx), RTMP4x; vpshufb RTMP3x, RTMP4x, RTMP4x; vmovdqa RTMP4x, RTMP0x; inc_le128(RTMP4x, RNOTx, RTMP1x); vinserti128 $1, RTMP4x, RTMP0, RTMP0; vpshufb RTMP3, RTMP0, RA0; /* +1 ; +0 */ /* check need for handling 64-bit overflow and carry */ cmpq $(0xffffffffffffffff - 16), %rax; ja .Lhandle_ctr_carry; /* construct IVs */ vpsubq RTMP2, RTMP0, RTMP0; /* +3 ; +2 */ vpshufb RTMP3, RTMP0, RB0; vpsubq RTMP2, RTMP0, RTMP0; /* +5 ; +4 */ vpshufb RTMP3, RTMP0, RC0; vpsubq RTMP2, RTMP0, RTMP0; /* +7 ; +6 */ vpshufb RTMP3, RTMP0, RD0; vpsubq RTMP2, RTMP0, RTMP0; /* +9 ; +8 */ vpshufb RTMP3, RTMP0, RA1; vpsubq RTMP2, RTMP0, RTMP0; /* +11 ; +10 */ vpshufb RTMP3, RTMP0, RB1; vpsubq RTMP2, RTMP0, RTMP0; /* +13 ; +12 */ vpshufb RTMP3, RTMP0, RC1; vpsubq RTMP2, RTMP0, RTMP0; /* +15 ; +14 */ vpshufb RTMP3, RTMP0, RD1; vpsubq RTMP2, RTMP0, RTMP0; /* +16 */ vpshufb RTMP3x, RTMP0x, RTMP0x; jmp .Lctr_carry_done; .Lhandle_ctr_carry: /* construct IVs */ inc_le128(RTMP0, RNOT, RTMP1); inc_le128(RTMP0, RNOT, RTMP1); vpshufb RTMP3, RTMP0, RB0; /* +3 ; +2 */ inc_le128(RTMP0, RNOT, RTMP1); inc_le128(RTMP0, RNOT, RTMP1); vpshufb RTMP3, RTMP0, RC0; /* +5 ; +4 */ inc_le128(RTMP0, RNOT, RTMP1); inc_le128(RTMP0, RNOT, RTMP1); vpshufb RTMP3, RTMP0, RD0; /* +7 ; +6 */ inc_le128(RTMP0, RNOT, RTMP1); inc_le128(RTMP0, RNOT, RTMP1); vpshufb RTMP3, RTMP0, RA1; /* +9 ; +8 */ inc_le128(RTMP0, RNOT, RTMP1); inc_le128(RTMP0, RNOT, RTMP1); vpshufb RTMP3, RTMP0, RB1; /* +11 ; +10 */ inc_le128(RTMP0, RNOT, RTMP1); inc_le128(RTMP0, RNOT, RTMP1); vpshufb RTMP3, RTMP0, RC1; /* +13 ; +12 */ inc_le128(RTMP0, RNOT, RTMP1); inc_le128(RTMP0, RNOT, RTMP1); vpshufb RTMP3, RTMP0, RD1; /* +15 ; +14 */ inc_le128(RTMP0, RNOT, RTMP1); vextracti128 $1, RTMP0, RTMP0x; vpshufb RTMP3x, RTMP0x, RTMP0x; /* +16 */ .align 4 .Lctr_carry_done: /* store new IV */ vmovdqu RTMP0x, (%rcx); call __twofish_enc_blk16; vpxor (0 * 32)(%rdx), RA0, RA0; vpxor (1 * 32)(%rdx), RB0, RB0; vpxor (2 * 32)(%rdx), RC0, RC0; vpxor (3 * 32)(%rdx), RD0, RD0; vpxor (4 * 32)(%rdx), RA1, RA1; vpxor (5 * 32)(%rdx), RB1, RB1; vpxor (6 * 32)(%rdx), RC1, RC1; vpxor (7 * 32)(%rdx), RD1, RD1; vmovdqu RA0, (0 * 32)(%rsi); vmovdqu RB0, (1 * 32)(%rsi); vmovdqu RC0, (2 * 32)(%rsi); vmovdqu RD0, (3 * 32)(%rsi); vmovdqu RA1, (4 * 32)(%rsi); vmovdqu RB1, (5 * 32)(%rsi); vmovdqu RC1, (6 * 32)(%rsi); vmovdqu RD1, (7 * 32)(%rsi); vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_twofish_avx2_ctr_enc,.-_gcry_twofish_avx2_ctr_enc;) .align 8 .globl _gcry_twofish_avx2_cbc_dec ELF(.type _gcry_twofish_avx2_cbc_dec,@function;) _gcry_twofish_avx2_cbc_dec: /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) * %rdx: src (16 blocks) * %rcx: iv */ CFI_STARTPROC(); vzeroupper; vmovdqu (0 * 32)(%rdx), RA0; vmovdqu (1 * 32)(%rdx), RB0; vmovdqu (2 * 32)(%rdx), RC0; vmovdqu (3 * 32)(%rdx), RD0; vmovdqu (4 * 32)(%rdx), RA1; vmovdqu (5 * 32)(%rdx), RB1; vmovdqu (6 * 32)(%rdx), RC1; vmovdqu (7 * 32)(%rdx), RD1; call __twofish_dec_blk16; vmovdqu (%rcx), RNOTx; vinserti128 $1, (%rdx), RNOT, RNOT; vpxor RNOT, RA0, RA0; vpxor (0 * 32 + 16)(%rdx), RB0, RB0; vpxor (1 * 32 + 16)(%rdx), RC0, RC0; vpxor (2 * 32 + 16)(%rdx), RD0, RD0; vpxor (3 * 32 + 16)(%rdx), RA1, RA1; vpxor (4 * 32 + 16)(%rdx), RB1, RB1; vpxor (5 * 32 + 16)(%rdx), RC1, RC1; vpxor (6 * 32 + 16)(%rdx), RD1, RD1; vmovdqu (7 * 32 + 16)(%rdx), RNOTx; vmovdqu RNOTx, (%rcx); /* store new IV */ vmovdqu RA0, (0 * 32)(%rsi); vmovdqu RB0, (1 * 32)(%rsi); vmovdqu RC0, (2 * 32)(%rsi); vmovdqu RD0, (3 * 32)(%rsi); vmovdqu RA1, (4 * 
32)(%rsi); vmovdqu RB1, (5 * 32)(%rsi); vmovdqu RC1, (6 * 32)(%rsi); vmovdqu RD1, (7 * 32)(%rsi); vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_twofish_avx2_cbc_dec,.-_gcry_twofish_avx2_cbc_dec;) .align 8 .globl _gcry_twofish_avx2_cfb_dec ELF(.type _gcry_twofish_avx2_cfb_dec,@function;) _gcry_twofish_avx2_cfb_dec: /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) * %rdx: src (16 blocks) * %rcx: iv */ CFI_STARTPROC(); vzeroupper; /* Load input */ vmovdqu (%rcx), RNOTx; vinserti128 $1, (%rdx), RNOT, RA0; vmovdqu (0 * 32 + 16)(%rdx), RB0; vmovdqu (1 * 32 + 16)(%rdx), RC0; vmovdqu (2 * 32 + 16)(%rdx), RD0; vmovdqu (3 * 32 + 16)(%rdx), RA1; vmovdqu (4 * 32 + 16)(%rdx), RB1; vmovdqu (5 * 32 + 16)(%rdx), RC1; vmovdqu (6 * 32 + 16)(%rdx), RD1; /* Update IV */ vmovdqu (7 * 32 + 16)(%rdx), RNOTx; vmovdqu RNOTx, (%rcx); call __twofish_enc_blk16; vpxor (0 * 32)(%rdx), RA0, RA0; vpxor (1 * 32)(%rdx), RB0, RB0; vpxor (2 * 32)(%rdx), RC0, RC0; vpxor (3 * 32)(%rdx), RD0, RD0; vpxor (4 * 32)(%rdx), RA1, RA1; vpxor (5 * 32)(%rdx), RB1, RB1; vpxor (6 * 32)(%rdx), RC1, RC1; vpxor (7 * 32)(%rdx), RD1, RD1; vmovdqu RA0, (0 * 32)(%rsi); vmovdqu RB0, (1 * 32)(%rsi); vmovdqu RC0, (2 * 32)(%rsi); vmovdqu RD0, (3 * 32)(%rsi); vmovdqu RA1, (4 * 32)(%rsi); vmovdqu RB1, (5 * 32)(%rsi); vmovdqu RC1, (6 * 32)(%rsi); vmovdqu RD1, (7 * 32)(%rsi); vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_twofish_avx2_cfb_dec,.-_gcry_twofish_avx2_cfb_dec;) .align 8 .globl _gcry_twofish_avx2_ocb_enc ELF(.type _gcry_twofish_avx2_ocb_enc,@function;) _gcry_twofish_avx2_ocb_enc: /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) * %rdx: src (16 blocks) * %rcx: offset * %r8 : checksum * %r9 : L pointers (void *L[16]) */ CFI_STARTPROC(); vzeroupper; subq $(4 * 8), %rsp; CFI_ADJUST_CFA_OFFSET(4 * 8); movq %r10, (0 * 8)(%rsp); movq %r11, (1 * 8)(%rsp); movq %r12, (2 * 8)(%rsp); movq %r13, (3 * 8)(%rsp); CFI_REL_OFFSET(%r10, 0 * 8); CFI_REL_OFFSET(%r11, 1 * 8); CFI_REL_OFFSET(%r12, 2 * 8); CFI_REL_OFFSET(%r13, 3 * 8); vmovdqu (%rcx), RTMP0x; vmovdqu (%r8), RTMP1x; /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* Checksum_i = Checksum_{i-1} xor P_i */ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ #define OCB_INPUT(n, l0reg, l1reg, yreg) \ vmovdqu (n * 32)(%rdx), yreg; \ vpxor (l0reg), RTMP0x, RNOTx; \ vpxor (l1reg), RNOTx, RTMP0x; \ vinserti128 $1, RTMP0x, RNOT, RNOT; \ vpxor yreg, RTMP1, RTMP1; \ vpxor yreg, RNOT, yreg; \ vmovdqu RNOT, (n * 32)(%rsi); movq (0 * 8)(%r9), %r10; movq (1 * 8)(%r9), %r11; movq (2 * 8)(%r9), %r12; movq (3 * 8)(%r9), %r13; OCB_INPUT(0, %r10, %r11, RA0); OCB_INPUT(1, %r12, %r13, RB0); movq (4 * 8)(%r9), %r10; movq (5 * 8)(%r9), %r11; movq (6 * 8)(%r9), %r12; movq (7 * 8)(%r9), %r13; OCB_INPUT(2, %r10, %r11, RC0); OCB_INPUT(3, %r12, %r13, RD0); movq (8 * 8)(%r9), %r10; movq (9 * 8)(%r9), %r11; movq (10 * 8)(%r9), %r12; movq (11 * 8)(%r9), %r13; OCB_INPUT(4, %r10, %r11, RA1); OCB_INPUT(5, %r12, %r13, RB1); movq (12 * 8)(%r9), %r10; movq (13 * 8)(%r9), %r11; movq (14 * 8)(%r9), %r12; movq (15 * 8)(%r9), %r13; OCB_INPUT(6, %r10, %r11, RC1); OCB_INPUT(7, %r12, %r13, RD1); #undef OCB_INPUT vextracti128 $1, RTMP1, RNOTx; vmovdqu RTMP0x, (%rcx); vpxor RNOTx, RTMP1x, RTMP1x; vmovdqu RTMP1x, (%r8); movq (0 * 8)(%rsp), %r10; movq (1 * 8)(%rsp), %r11; movq (2 * 8)(%rsp), %r12; movq (3 * 8)(%rsp), %r13; CFI_RESTORE(%r10); CFI_RESTORE(%r11); CFI_RESTORE(%r12); CFI_RESTORE(%r13); call __twofish_enc_blk16; addq $(4 * 8), %rsp; CFI_ADJUST_CFA_OFFSET(-4 * 8); vpxor (0 * 32)(%rsi), RA0, RA0; 
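/* Note: the OCB_INPUT macro above already stored the Offset_i blocks to dst,
 * so this run of vpxor forms C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i);
 * the stores that follow then overwrite dst with the final ciphertext. */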
vpxor (1 * 32)(%rsi), RB0, RB0; vpxor (2 * 32)(%rsi), RC0, RC0; vpxor (3 * 32)(%rsi), RD0, RD0; vpxor (4 * 32)(%rsi), RA1, RA1; vpxor (5 * 32)(%rsi), RB1, RB1; vpxor (6 * 32)(%rsi), RC1, RC1; vpxor (7 * 32)(%rsi), RD1, RD1; vmovdqu RA0, (0 * 32)(%rsi); vmovdqu RB0, (1 * 32)(%rsi); vmovdqu RC0, (2 * 32)(%rsi); vmovdqu RD0, (3 * 32)(%rsi); vmovdqu RA1, (4 * 32)(%rsi); vmovdqu RB1, (5 * 32)(%rsi); vmovdqu RC1, (6 * 32)(%rsi); vmovdqu RD1, (7 * 32)(%rsi); vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_twofish_avx2_ocb_enc,.-_gcry_twofish_avx2_ocb_enc;) .align 8 .globl _gcry_twofish_avx2_ocb_dec ELF(.type _gcry_twofish_avx2_ocb_dec,@function;) _gcry_twofish_avx2_ocb_dec: /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) * %rdx: src (16 blocks) * %rcx: offset * %r8 : checksum * %r9 : L pointers (void *L[16]) */ CFI_STARTPROC(); vzeroupper; subq $(4 * 8), %rsp; CFI_ADJUST_CFA_OFFSET(4 * 8); movq %r10, (0 * 8)(%rsp); movq %r11, (1 * 8)(%rsp); movq %r12, (2 * 8)(%rsp); movq %r13, (3 * 8)(%rsp); CFI_REL_OFFSET(%r10, 0 * 8); CFI_REL_OFFSET(%r11, 1 * 8); CFI_REL_OFFSET(%r12, 2 * 8); CFI_REL_OFFSET(%r13, 3 * 8); vmovdqu (%rcx), RTMP0x; /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ #define OCB_INPUT(n, l0reg, l1reg, yreg) \ vmovdqu (n * 32)(%rdx), yreg; \ vpxor (l0reg), RTMP0x, RNOTx; \ vpxor (l1reg), RNOTx, RTMP0x; \ vinserti128 $1, RTMP0x, RNOT, RNOT; \ vpxor yreg, RNOT, yreg; \ vmovdqu RNOT, (n * 32)(%rsi); movq (0 * 8)(%r9), %r10; movq (1 * 8)(%r9), %r11; movq (2 * 8)(%r9), %r12; movq (3 * 8)(%r9), %r13; OCB_INPUT(0, %r10, %r11, RA0); OCB_INPUT(1, %r12, %r13, RB0); movq (4 * 8)(%r9), %r10; movq (5 * 8)(%r9), %r11; movq (6 * 8)(%r9), %r12; movq (7 * 8)(%r9), %r13; OCB_INPUT(2, %r10, %r11, RC0); OCB_INPUT(3, %r12, %r13, RD0); movq (8 * 8)(%r9), %r10; movq (9 * 8)(%r9), %r11; movq (10 * 8)(%r9), %r12; movq (11 * 8)(%r9), %r13; OCB_INPUT(4, %r10, %r11, RA1); OCB_INPUT(5, %r12, %r13, RB1); movq (12 * 8)(%r9), %r10; movq (13 * 8)(%r9), %r11; movq (14 * 8)(%r9), %r12; movq (15 * 8)(%r9), %r13; OCB_INPUT(6, %r10, %r11, RC1); OCB_INPUT(7, %r12, %r13, RD1); #undef OCB_INPUT vmovdqu RTMP0x, (%rcx); mov %r8, %rcx movq (0 * 8)(%rsp), %r10; movq (1 * 8)(%rsp), %r11; movq (2 * 8)(%rsp), %r12; movq (3 * 8)(%rsp), %r13; CFI_RESTORE(%r10); CFI_RESTORE(%r11); CFI_RESTORE(%r12); CFI_RESTORE(%r13); call __twofish_dec_blk16; vmovdqu (%rcx), RTMP1x; vpxor (0 * 32)(%rsi), RA0, RA0; vpxor (1 * 32)(%rsi), RB0, RB0; vpxor (2 * 32)(%rsi), RC0, RC0; vpxor (3 * 32)(%rsi), RD0, RD0; vpxor (4 * 32)(%rsi), RA1, RA1; vpxor (5 * 32)(%rsi), RB1, RB1; vpxor (6 * 32)(%rsi), RC1, RC1; vpxor (7 * 32)(%rsi), RD1, RD1; addq $(4 * 8), %rsp; CFI_ADJUST_CFA_OFFSET(-4 * 8); /* Checksum_i = Checksum_{i-1} xor P_i */ vmovdqu RA0, (0 * 32)(%rsi); vpxor RA0, RTMP1, RTMP1; vmovdqu RB0, (1 * 32)(%rsi); vpxor RB0, RTMP1, RTMP1; vmovdqu RC0, (2 * 32)(%rsi); vpxor RC0, RTMP1, RTMP1; vmovdqu RD0, (3 * 32)(%rsi); vpxor RD0, RTMP1, RTMP1; vmovdqu RA1, (4 * 32)(%rsi); vpxor RA1, RTMP1, RTMP1; vmovdqu RB1, (5 * 32)(%rsi); vpxor RB1, RTMP1, RTMP1; vmovdqu RC1, (6 * 32)(%rsi); vpxor RC1, RTMP1, RTMP1; vmovdqu RD1, (7 * 32)(%rsi); vpxor RD1, RTMP1, RTMP1; vextracti128 $1, RTMP1, RNOTx; vpxor RNOTx, RTMP1x, RTMP1x; vmovdqu RTMP1x, (%rcx); vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_twofish_avx2_ocb_dec,.-_gcry_twofish_avx2_ocb_dec;) .align 8 .globl _gcry_twofish_avx2_ocb_auth ELF(.type _gcry_twofish_avx2_ocb_auth,@function;) _gcry_twofish_avx2_ocb_auth: /* input: * 
%rdi: ctx, CTX * %rsi: abuf (16 blocks) * %rdx: offset * %rcx: checksum * %r8 : L pointers (void *L[16]) */ CFI_STARTPROC(); vzeroupper; subq $(4 * 8), %rsp; CFI_ADJUST_CFA_OFFSET(4 * 8); movq %r10, (0 * 8)(%rsp); movq %r11, (1 * 8)(%rsp); movq %r12, (2 * 8)(%rsp); movq %r13, (3 * 8)(%rsp); CFI_REL_OFFSET(%r10, 0 * 8); CFI_REL_OFFSET(%r11, 1 * 8); CFI_REL_OFFSET(%r12, 2 * 8); CFI_REL_OFFSET(%r13, 3 * 8); vmovdqu (%rdx), RTMP0x; /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ #define OCB_INPUT(n, l0reg, l1reg, yreg) \ vmovdqu (n * 32)(%rsi), yreg; \ vpxor (l0reg), RTMP0x, RNOTx; \ vpxor (l1reg), RNOTx, RTMP0x; \ vinserti128 $1, RTMP0x, RNOT, RNOT; \ vpxor yreg, RNOT, yreg; movq (0 * 8)(%r8), %r10; movq (1 * 8)(%r8), %r11; movq (2 * 8)(%r8), %r12; movq (3 * 8)(%r8), %r13; OCB_INPUT(0, %r10, %r11, RA0); OCB_INPUT(1, %r12, %r13, RB0); movq (4 * 8)(%r8), %r10; movq (5 * 8)(%r8), %r11; movq (6 * 8)(%r8), %r12; movq (7 * 8)(%r8), %r13; OCB_INPUT(2, %r10, %r11, RC0); OCB_INPUT(3, %r12, %r13, RD0); movq (8 * 8)(%r8), %r10; movq (9 * 8)(%r8), %r11; movq (10 * 8)(%r8), %r12; movq (11 * 8)(%r8), %r13; OCB_INPUT(4, %r10, %r11, RA1); OCB_INPUT(5, %r12, %r13, RB1); movq (12 * 8)(%r8), %r10; movq (13 * 8)(%r8), %r11; movq (14 * 8)(%r8), %r12; movq (15 * 8)(%r8), %r13; OCB_INPUT(6, %r10, %r11, RC1); OCB_INPUT(7, %r12, %r13, RD1); #undef OCB_INPUT vmovdqu RTMP0x, (%rdx); movq (0 * 8)(%rsp), %r10; movq (1 * 8)(%rsp), %r11; movq (2 * 8)(%rsp), %r12; movq (3 * 8)(%rsp), %r13; CFI_RESTORE(%r10); CFI_RESTORE(%r11); CFI_RESTORE(%r12); CFI_RESTORE(%r13); call __twofish_enc_blk16; vpxor RA0, RB0, RA0; vpxor RC0, RD0, RC0; vpxor RA1, RB1, RA1; vpxor RC1, RD1, RC1; vpxor RA0, RC0, RA0; vpxor RA1, RC1, RA1; addq $(4 * 8), %rsp; CFI_ADJUST_CFA_OFFSET(-4 * 8); vpxor RA1, RA0, RTMP1; vextracti128 $1, RTMP1, RNOTx; vpxor (%rcx), RTMP1x, RTMP1x; vpxor RNOTx, RTMP1x, RTMP1x; vmovdqu RTMP1x, (%rcx); vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_twofish_avx2_ocb_auth,.-_gcry_twofish_avx2_ocb_auth;) .align 16 /* For CTR-mode IV byteswap */ _gcry_twofish_bswap128_mask: .Lbswap128_mask: .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 ELF(.size _gcry_twofish_bswap128_mask,.-_gcry_twofish_bswap128_mask;) #endif /*defined(USE_TWOFISH) && defined(ENABLE_AVX2_SUPPORT)*/ #endif /*__x86_64*/ diff --git a/cipher/whirlpool-sse2-amd64.S b/cipher/whirlpool-sse2-amd64.S index 5631dc56..37648faa 100644 --- a/cipher/whirlpool-sse2-amd64.S +++ b/cipher/whirlpool-sse2-amd64.S @@ -1,348 +1,348 @@ /* whirlpool-sse2-amd64.S - AMD64 assembly implementation of Whirlpool * * Copyright (C) 2014 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . 
*/ #ifdef __x86_64 #include #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && defined(USE_WHIRLPOOL) #include "asm-common-amd64.h" .text /* look-up table offsets on RTAB */ #define RC (0) #define C0 (RC + (8 * 10)) #define C1 (C0 + (8 * 256)) #define C2 (C1 + (8 * 256)) #define C3 (C2 + (8 * 256)) #define C4 (C3 + (8 * 256)) #define C5 (C4 + (8 * 256)) #define C6 (C5 + (8 * 256)) #define C7 (C6 + (8 * 256)) /* stack variables */ #define STACK_DATAP (0) #define STACK_STATEP (STACK_DATAP + 8) #define STACK_ROUNDS (STACK_STATEP + 8) #define STACK_NBLKS (STACK_ROUNDS + 8) #define STACK_RBP (STACK_NBLKS + 8) #define STACK_RBX (STACK_RBP + 8) #define STACK_R12 (STACK_RBX + 8) #define STACK_R13 (STACK_R12 + 8) #define STACK_R14 (STACK_R13 + 8) #define STACK_R15 (STACK_R14 + 8) #define STACK_MAX (STACK_R15 + 8) /* register macros */ #define RTAB %rbp #define RI1 %rax #define RI2 %rbx #define RI3 %rcx #define RI4 %rdx #define RI1d %eax #define RI2d %ebx #define RI3d %ecx #define RI4d %edx #define RI1bl %al #define RI2bl %bl #define RI3bl %cl #define RI4bl %dl #define RI1bh %ah #define RI2bh %bh #define RI3bh %ch #define RI4bh %dh #define RB0 %r8 #define RB1 %r9 #define RB2 %r10 #define RB3 %r11 #define RB4 %r12 #define RB5 %r13 #define RB6 %r14 #define RB7 %r15 #define RT0 %rsi #define RT1 %rdi #define RT0d %esi #define RT1d %edi #define XKEY0 %xmm0 #define XKEY1 %xmm1 #define XKEY2 %xmm2 #define XKEY3 %xmm3 #define XKEY4 %xmm4 #define XKEY5 %xmm5 #define XKEY6 %xmm6 #define XKEY7 %xmm7 #define XSTATE0 %xmm8 #define XSTATE1 %xmm9 #define XSTATE2 %xmm10 #define XSTATE3 %xmm11 #define XSTATE4 %xmm12 #define XSTATE5 %xmm13 #define XSTATE6 %xmm14 #define XSTATE7 %xmm15 /*********************************************************************** * AMD64 assembly implementation of Whirlpool. * - Using table-lookups * - Store state in XMM registers ***********************************************************************/ #define __do_whirl(op, ri, \ b0, b1, b2, b3, b4, b5, b6, b7, \ load_ri, load_arg) \ movzbl ri ## bl, RT0d; \ movzbl ri ## bh, RT1d; \ shrq $16, ri; \ op ## q C7(RTAB,RT0,8), b7; \ op ## q C6(RTAB,RT1,8), b6; \ movzbl ri ## bl, RT0d; \ movzbl ri ## bh, RT1d; \ shrq $16, ri; \ op ## q C5(RTAB,RT0,8), b5; \ op ## q C4(RTAB,RT1,8), b4; \ movzbl ri ## bl, RT0d; \ movzbl ri ## bh, RT1d; \ shrl $16, ri ## d; \ op ## q C3(RTAB,RT0,8), b3; \ op ## q C2(RTAB,RT1,8), b2; \ movzbl ri ## bl, RT0d; \ movzbl ri ## bh, RT1d; \ load_ri( load_arg, ri); \ op ## q C1(RTAB,RT0,8), b1; \ op ## q C0(RTAB,RT1,8), b0; #define do_whirl(op, ri, rb_add, load_ri, load_arg) \ __do_whirl(op, ##ri, rb_add, load_ri, load_arg) #define dummy(...) 
/*_*/ #define do_movq(src, dst) movq src, dst; #define RB_ADD0 RB0, RB1, RB2, RB3, RB4, RB5, RB6, RB7 #define RB_ADD1 RB1, RB2, RB3, RB4, RB5, RB6, RB7, RB0 #define RB_ADD2 RB2, RB3, RB4, RB5, RB6, RB7, RB0, RB1 #define RB_ADD3 RB3, RB4, RB5, RB6, RB7, RB0, RB1, RB2 #define RB_ADD4 RB4, RB5, RB6, RB7, RB0, RB1, RB2, RB3 #define RB_ADD5 RB5, RB6, RB7, RB0, RB1, RB2, RB3, RB4 #define RB_ADD6 RB6, RB7, RB0, RB1, RB2, RB3, RB4, RB5 #define RB_ADD7 RB7, RB0, RB1, RB2, RB3, RB4, RB5, RB6 .align 8 .globl _gcry_whirlpool_transform_amd64 ELF(.type _gcry_whirlpool_transform_amd64,@function;) _gcry_whirlpool_transform_amd64: /* input: * %rdi: state * %rsi: inblk * %rdx: nblks * %rcx: look-up tables */ CFI_STARTPROC(); cmp $0, %rdx; je .Lskip; subq $STACK_MAX, %rsp; CFI_ADJUST_CFA_OFFSET(STACK_MAX); movq %rbp, STACK_RBP(%rsp); movq %rbx, STACK_RBX(%rsp); movq %r12, STACK_R12(%rsp); movq %r13, STACK_R13(%rsp); movq %r14, STACK_R14(%rsp); movq %r15, STACK_R15(%rsp); CFI_REL_OFFSET(%rbp, STACK_RBP); CFI_REL_OFFSET(%rbx, STACK_RBX); CFI_REL_OFFSET(%r12, STACK_R12); CFI_REL_OFFSET(%r13, STACK_R13); CFI_REL_OFFSET(%r14, STACK_R14); CFI_REL_OFFSET(%r15, STACK_R15); movq %rdx, STACK_NBLKS(%rsp); movq %rdi, STACK_STATEP(%rsp); movq %rsi, STACK_DATAP(%rsp); movq %rcx, RTAB; jmp .Lfirst_block; .align 8 .Lblock_loop: movq STACK_DATAP(%rsp), %rsi; movq RI1, %rdi; .Lfirst_block: /* load data_block */ movq 0*8(%rsi), RB0; movq 1*8(%rsi), RB1; bswapq RB0; movq 2*8(%rsi), RB2; bswapq RB1; movq 3*8(%rsi), RB3; bswapq RB2; movq 4*8(%rsi), RB4; bswapq RB3; movq 5*8(%rsi), RB5; bswapq RB4; movq RB0, XSTATE0; movq 6*8(%rsi), RB6; bswapq RB5; movq RB1, XSTATE1; movq 7*8(%rsi), RB7; bswapq RB6; movq RB2, XSTATE2; bswapq RB7; movq RB3, XSTATE3; movq RB4, XSTATE4; movq RB5, XSTATE5; movq RB6, XSTATE6; movq RB7, XSTATE7; /* load key */ movq 0*8(%rdi), XKEY0; movq 1*8(%rdi), XKEY1; movq 2*8(%rdi), XKEY2; movq 3*8(%rdi), XKEY3; movq 4*8(%rdi), XKEY4; movq 5*8(%rdi), XKEY5; movq 6*8(%rdi), XKEY6; movq 7*8(%rdi), XKEY7; movq XKEY0, RI1; movq XKEY1, RI2; movq XKEY2, RI3; movq XKEY3, RI4; /* prepare and store state */ pxor XKEY0, XSTATE0; pxor XKEY1, XSTATE1; pxor XKEY2, XSTATE2; pxor XKEY3, XSTATE3; pxor XKEY4, XSTATE4; pxor XKEY5, XSTATE5; pxor XKEY6, XSTATE6; pxor XKEY7, XSTATE7; movq XSTATE0, 0*8(%rdi); movq XSTATE1, 1*8(%rdi); movq XSTATE2, 2*8(%rdi); movq XSTATE3, 3*8(%rdi); movq XSTATE4, 4*8(%rdi); movq XSTATE5, 5*8(%rdi); movq XSTATE6, 6*8(%rdi); movq XSTATE7, 7*8(%rdi); addq $64, STACK_DATAP(%rsp); movl $(0), STACK_ROUNDS(%rsp); .align 8 .Lround_loop: do_whirl(mov, RI1 /*XKEY0*/, RB_ADD0, do_movq, XKEY4); do_whirl(xor, RI2 /*XKEY1*/, RB_ADD1, do_movq, XKEY5); do_whirl(xor, RI3 /*XKEY2*/, RB_ADD2, do_movq, XKEY6); do_whirl(xor, RI4 /*XKEY3*/, RB_ADD3, do_movq, XKEY7); do_whirl(xor, RI1 /*XKEY0*/, RB_ADD4, do_movq, XSTATE0); do_whirl(xor, RI2 /*XKEY1*/, RB_ADD5, do_movq, XSTATE1); do_whirl(xor, RI3 /*XKEY2*/, RB_ADD6, do_movq, XSTATE2); do_whirl(xor, RI4 /*XKEY3*/, RB_ADD7, do_movq, XSTATE3); movl STACK_ROUNDS(%rsp), RT0d; movq RB1, XKEY1; addl $1, STACK_ROUNDS(%rsp); movq RB2, XKEY2; movq RB3, XKEY3; xorq RC(RTAB,RT0,8), RB0; /* Add round constant */ movq RB4, XKEY4; movq RB5, XKEY5; movq RB0, XKEY0; movq RB6, XKEY6; movq RB7, XKEY7; do_whirl(xor, RI1 /*XSTATE0*/, RB_ADD0, do_movq, XSTATE4); do_whirl(xor, RI2 /*XSTATE1*/, RB_ADD1, do_movq, XSTATE5); do_whirl(xor, RI3 /*XSTATE2*/, RB_ADD2, do_movq, XSTATE6); do_whirl(xor, RI4 /*XSTATE3*/, RB_ADD3, do_movq, XSTATE7); cmpl $10, STACK_ROUNDS(%rsp); je .Lis_last_round; 
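/* Not the last round: finish transforming state words 4..7 while pre-loading
 * XKEY0..XKEY3 into RI1..RI4 for the next iteration, then write the new state
 * back into XSTATE0..XSTATE7 and loop. */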
do_whirl(xor, RI1 /*XSTATE4*/, RB_ADD4, do_movq, XKEY0); do_whirl(xor, RI2 /*XSTATE5*/, RB_ADD5, do_movq, XKEY1); do_whirl(xor, RI3 /*XSTATE6*/, RB_ADD6, do_movq, XKEY2); do_whirl(xor, RI4 /*XSTATE7*/, RB_ADD7, do_movq, XKEY3); movq RB0, XSTATE0; movq RB1, XSTATE1; movq RB2, XSTATE2; movq RB3, XSTATE3; movq RB4, XSTATE4; movq RB5, XSTATE5; movq RB6, XSTATE6; movq RB7, XSTATE7; jmp .Lround_loop; .align 8 .Lis_last_round: do_whirl(xor, RI1 /*XSTATE4*/, RB_ADD4, dummy, _); movq STACK_STATEP(%rsp), RI1; do_whirl(xor, RI2 /*XSTATE5*/, RB_ADD5, dummy, _); do_whirl(xor, RI3 /*XSTATE6*/, RB_ADD6, dummy, _); do_whirl(xor, RI4 /*XSTATE7*/, RB_ADD7, dummy, _); /* store state */ xorq RB0, 0*8(RI1); xorq RB1, 1*8(RI1); xorq RB2, 2*8(RI1); xorq RB3, 3*8(RI1); xorq RB4, 4*8(RI1); xorq RB5, 5*8(RI1); xorq RB6, 6*8(RI1); xorq RB7, 7*8(RI1); subq $1, STACK_NBLKS(%rsp); jnz .Lblock_loop; movq STACK_RBP(%rsp), %rbp; movq STACK_RBX(%rsp), %rbx; movq STACK_R12(%rsp), %r12; movq STACK_R13(%rsp), %r13; movq STACK_R14(%rsp), %r14; movq STACK_R15(%rsp), %r15; CFI_RESTORE(%rbp); CFI_RESTORE(%rbx); CFI_RESTORE(%r12); CFI_RESTORE(%r13); CFI_RESTORE(%r14); CFI_RESTORE(%r15); addq $STACK_MAX, %rsp; CFI_ADJUST_CFA_OFFSET(-STACK_MAX); .Lskip: movl $(STACK_MAX + 8), %eax; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_whirlpool_transform_amd64,.-_gcry_whirlpool_transform_amd64;) #endif #endif diff --git a/mpi/amd64/func_abi.h b/mpi/amd64/func_abi.h index a60363e4..c3f2d026 100644 --- a/mpi/amd64/func_abi.h +++ b/mpi/amd64/func_abi.h @@ -1,56 +1,34 @@ #include -#ifdef __x86_64__ -#ifdef HAVE_GCC_ASM_CFI_DIRECTIVES -# define CFI_STARTPROC() .cfi_startproc -# define CFI_ENDPROC() .cfi_endproc -# define CFI_ADJUST_CFA_OFFSET(off) .cfi_adjust_cfa_offset off -# define CFI_REL_OFFSET(reg,off) .cfi_rel_offset reg, off -# define CFI_RESTORE(reg) .cfi_restore reg - -# define CFI_PUSH(reg) \ - CFI_ADJUST_CFA_OFFSET(8); CFI_REL_OFFSET(reg, 0) -# define CFI_POP(reg) \ - CFI_ADJUST_CFA_OFFSET(-8); CFI_RESTORE(reg) -#else -# define CFI_STARTPROC() -# define CFI_ENDPROC() -# define CFI_ADJUST_CFA_OFFSET(off) -# define CFI_REL_OFFSET(reg,off) -# define CFI_RESTORE(reg) - -# define CFI_PUSH(reg) -# define CFI_POP(reg) -#endif -#endif +#include "asm-common-amd64.h" #ifdef USE_MS_ABI /* Store registers and move four first input arguments from MS ABI to * SYSV ABI. */ #define FUNC_ENTRY() \ CFI_STARTPROC(); \ pushq %rsi; \ CFI_PUSH(%rsi); \ pushq %rdi; \ CFI_PUSH(%rdi); \ movq %rdx, %rsi; \ movq %rcx, %rdi; \ movq %r8, %rdx; \ movq %r9, %rcx; /* Restore registers. */ #define FUNC_EXIT() \ popq %rdi; \ CFI_POP(%rdi); \ popq %rsi; \ CFI_POP(%rsi); \ - ret; \ + ret_spec_stop; \ CFI_ENDPROC(); #else #define FUNC_ENTRY() \ CFI_STARTPROC(); #define FUNC_EXIT() \ - ret; \ + ret_spec_stop; \ CFI_ENDPROC(); #endif diff --git a/mpi/asm-common-amd64.h b/mpi/asm-common-amd64.h new file mode 100644 index 00000000..ad0e8e62 --- /dev/null +++ b/mpi/asm-common-amd64.h @@ -0,0 +1,26 @@ +/* asm-common-amd64.h - Common macros for AMD64 assembly + * + * Copyright (C) 2022 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. 
+ * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . + */ + +#ifndef MPI_ASM_COMMON_AMD64_H +#define MPI_ASM_COMMON_AMD64_H + +#include "../cipher/asm-common-amd64.h" + +#endif /* MPI_ASM_COMMON_AMD64_H */ diff --git a/mpi/i386/mpih-add1.S b/mpi/i386/mpih-add1.S index de78a0cb..95a75890 100644 --- a/mpi/i386/mpih-add1.S +++ b/mpi/i386/mpih-add1.S @@ -1,161 +1,161 @@ /* i80386 add_n -- Add two limb vectors of the same length > 0 and store * sum in a third limb vector. * * Copyright (C) 1992, 1994, 1995, 1998, * 2001, 2002 Free Software Foundation, Inc. * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA * * Note: This code is heavily based on the GNU MP Library. * Actually it's the same code with only minor changes in the * way the data is stored; this is to support the abstraction * of an optional secure memory allocation which may be used * to avoid revealing of sensitive data due to paging etc. */ #include "sysdep.h" #include "asm-syntax.h" /******************* * mpi_limb_t * _gcry_mpih_add_n( mpi_ptr_t res_ptr, (sp + 4) * mpi_ptr_t s1_ptr, (sp + 8) * mpi_ptr_t s2_ptr, (sp + 12) * mpi_size_t size) (sp + 16) */ .text ALIGN (3) .globl C_SYMBOL_NAME(_gcry_mpih_add_n) C_SYMBOL_NAME(_gcry_mpih_add_n:) CFI_STARTPROC() pushl %edi CFI_PUSH(%edi) pushl %esi CFI_PUSH(%esi) movl 12(%esp),%edi /* res_ptr */ movl 16(%esp),%esi /* s1_ptr */ movl 20(%esp),%edx /* s2_ptr */ movl 24(%esp),%ecx /* size */ #if defined __CET__ && (__CET__ & 1) != 0 pushl %ebx CFI_PUSH(%ebx) #endif movl %ecx,%eax shrl $3,%ecx /* compute count for unrolled loop */ negl %eax andl $7,%eax /* get index where to start loop */ jz Loop /* necessary special case for 0 */ incl %ecx /* adjust loop count */ shll $2,%eax /* adjustment for pointers... */ subl %eax,%edi /* ... since they are offset ... */ subl %eax,%esi /* ... by a constant when we ... */ subl %eax,%edx /* ... enter the loop */ shrl $2,%eax /* restore previous value */ #if defined __CET__ && (__CET__ & 1) != 0 leal -4(,%eax,4),%ebx /* Count for 4-byte endbr32 */ #endif #ifdef PIC /* Calculate start address in loop for PIC. Due to limitations in some assemblers, Loop-L0-3 cannot be put into the leal */ call L0 CFI_ADJUST_CFA_OFFSET(4) L0: leal (%eax,%eax,8),%eax addl (%esp),%eax addl $(Loop-L0-3),%eax addl $4,%esp CFI_ADJUST_CFA_OFFSET(-4) #else /* Calculate start address in loop for non-PIC. 
*/ leal (Loop - 3)(%eax,%eax,8),%eax #endif #if defined __CET__ && (__CET__ & 1) != 0 addl %ebx,%eax /* Adjust for endbr32 */ #endif jmp *%eax /* jump into loop */ ALIGN (3) Loop: movl (%esi),%eax adcl (%edx),%eax movl %eax,(%edi) #ifdef _CET_ENDBR _CET_ENDBR #endif movl 4(%esi),%eax adcl 4(%edx),%eax movl %eax,4(%edi) #ifdef _CET_ENDBR _CET_ENDBR #endif movl 8(%esi),%eax adcl 8(%edx),%eax movl %eax,8(%edi) #ifdef _CET_ENDBR _CET_ENDBR #endif movl 12(%esi),%eax adcl 12(%edx),%eax movl %eax,12(%edi) #ifdef _CET_ENDBR _CET_ENDBR #endif movl 16(%esi),%eax adcl 16(%edx),%eax movl %eax,16(%edi) #ifdef _CET_ENDBR _CET_ENDBR #endif movl 20(%esi),%eax adcl 20(%edx),%eax movl %eax,20(%edi) #ifdef _CET_ENDBR _CET_ENDBR #endif movl 24(%esi),%eax adcl 24(%edx),%eax movl %eax,24(%edi) #ifdef _CET_ENDBR _CET_ENDBR #endif movl 28(%esi),%eax adcl 28(%edx),%eax movl %eax,28(%edi) leal 32(%edi),%edi leal 32(%esi),%esi leal 32(%edx),%edx decl %ecx jnz Loop sbbl %eax,%eax negl %eax #if defined __CET__ && (__CET__ & 1) != 0 popl %ebx CFI_POP(%ebx) #endif popl %esi CFI_POP(%esi) popl %edi CFI_POP(%edi) - ret + ret_spec_stop CFI_ENDPROC() diff --git a/mpi/i386/mpih-lshift.S b/mpi/i386/mpih-lshift.S index 55da0678..3404cf55 100644 --- a/mpi/i386/mpih-lshift.S +++ b/mpi/i386/mpih-lshift.S @@ -1,102 +1,102 @@ /* i80386 lshift * Copyright (C) 1992, 1994, 1998, * 2001, 2002 Free Software Foundation, Inc. * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA * * Note: This code is heavily based on the GNU MP Library. * Actually it's the same code with only minor changes in the * way the data is stored; this is to support the abstraction * of an optional secure memory allocation which may be used * to avoid revealing of sensitive data due to paging etc. 
*/ #include "sysdep.h" #include "asm-syntax.h" /******************* * mpi_limb_t * _gcry_mpih_lshift( mpi_ptr_t wp, (sp + 4) * mpi_ptr_t up, (sp + 8) * mpi_size_t usize, (sp + 12) * unsigned cnt) (sp + 16) */ .text ALIGN (3) .globl C_SYMBOL_NAME(_gcry_mpih_lshift) C_SYMBOL_NAME(_gcry_mpih_lshift:) CFI_STARTPROC() pushl %edi CFI_PUSH(%edi) pushl %esi CFI_PUSH(%esi) pushl %ebx CFI_PUSH(%ebx) movl 16(%esp),%edi /* res_ptr */ movl 20(%esp),%esi /* s_ptr */ movl 24(%esp),%edx /* size */ movl 28(%esp),%ecx /* cnt */ subl $4,%esi /* adjust s_ptr */ movl (%esi,%edx,4),%ebx /* read most significant limb */ xorl %eax,%eax shldl %cl,%ebx,%eax /* compute carry limb */ decl %edx jz Lend pushl %eax /* push carry limb onto stack */ testb $1,%dl jnz L1 /* enter loop in the middle */ movl %ebx,%eax ALIGN (3) Loop: movl (%esi,%edx,4),%ebx /* load next lower limb */ shldl %cl,%ebx,%eax /* compute result limb */ movl %eax,(%edi,%edx,4) /* store it */ decl %edx L1: movl (%esi,%edx,4),%eax shldl %cl,%eax,%ebx movl %ebx,(%edi,%edx,4) decl %edx jnz Loop shll %cl,%eax /* compute least significant limb */ movl %eax,(%edi) /* store it */ popl %eax /* pop carry limb */ popl %ebx popl %esi popl %edi - ret + ret_spec_stop Lend: shll %cl,%ebx /* compute least significant limb */ movl %ebx,(%edi) /* store it */ popl %ebx CFI_POP(%ebx) popl %esi CFI_POP(%esi) popl %edi CFI_POP(%edi) - ret + ret_spec_stop CFI_ENDPROC() diff --git a/mpi/i386/mpih-mul1.S b/mpi/i386/mpih-mul1.S index 9679ea62..a672d052 100644 --- a/mpi/i386/mpih-mul1.S +++ b/mpi/i386/mpih-mul1.S @@ -1,94 +1,94 @@ /* i80386 mul_1 -- Multiply a limb vector with a limb and store * the result in a second limb vector. * Copyright (C) 1992, 1994, 1998, * 2001, 2002 Free Software Foundation, Inc. * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA * * Note: This code is heavily based on the GNU MP Library. * Actually it's the same code with only minor changes in the * way the data is stored; this is to support the abstraction * of an optional secure memory allocation which may be used * to avoid revealing of sensitive data due to paging etc. 
*/ #include "sysdep.h" #include "asm-syntax.h" /******************* * mpi_limb_t * _gcry_mpih_mul_1( mpi_ptr_t res_ptr, (sp + 4) * mpi_ptr_t s1_ptr, (sp + 8) * mpi_size_t s1_size, (sp + 12) * mpi_limb_t s2_limb) (sp + 16) */ #define res_ptr edi #define s1_ptr esi #define size ecx #define s2_limb ebp TEXT ALIGN (3) GLOBL C_SYMBOL_NAME(_gcry_mpih_mul_1) C_SYMBOL_NAME(_gcry_mpih_mul_1:) CFI_STARTPROC() INSN1(push,l ,R(edi)) CFI_PUSH(%edi) INSN1(push,l ,R(esi)) CFI_PUSH(%esi) INSN1(push,l ,R(ebx)) CFI_PUSH(%ebx) INSN1(push,l ,R(ebp)) CFI_PUSH(%ebp) INSN2(mov,l ,R(res_ptr),MEM_DISP(esp,20)) INSN2(mov,l ,R(s1_ptr),MEM_DISP(esp,24)) INSN2(mov,l ,R(size),MEM_DISP(esp,28)) INSN2(mov,l ,R(s2_limb),MEM_DISP(esp,32)) INSN2(lea,l ,R(res_ptr),MEM_INDEX(res_ptr,size,4)) INSN2(lea,l ,R(s1_ptr),MEM_INDEX(s1_ptr,size,4)) INSN1(neg,l ,R(size)) INSN2(xor,l ,R(ebx),R(ebx)) ALIGN (3) Loop: INSN2(mov,l ,R(eax),MEM_INDEX(s1_ptr,size,4)) INSN1(mul,l ,R(s2_limb)) INSN2(add,l ,R(eax),R(ebx)) INSN2(mov,l ,MEM_INDEX(res_ptr,size,4),R(eax)) INSN2(adc,l ,R(edx),$0) INSN2(mov,l ,R(ebx),R(edx)) INSN1(inc,l ,R(size)) INSN1(jnz, ,Loop) INSN2(mov,l ,R(eax),R(ebx)) INSN1(pop,l ,R(ebp)) CFI_POP(%ebp) INSN1(pop,l ,R(ebx)) CFI_POP(%ebx) INSN1(pop,l ,R(esi)) CFI_POP(%esi) INSN1(pop,l ,R(edi)) CFI_POP(%edi) - ret + ret_spec_stop CFI_ENDPROC() diff --git a/mpi/i386/mpih-mul2.S b/mpi/i386/mpih-mul2.S index fe4129c4..e09c3f7c 100644 --- a/mpi/i386/mpih-mul2.S +++ b/mpi/i386/mpih-mul2.S @@ -1,96 +1,96 @@ /* i80386 addmul_1 -- Multiply a limb vector with a limb and add * the result to a second limb vector. * * Copyright (C) 1992, 1994, 1998, * 2001, 2002 Free Software Foundation, Inc. * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA * * Note: This code is heavily based on the GNU MP Library. * Actually it's the same code with only minor changes in the * way the data is stored; this is to support the abstraction * of an optional secure memory allocation which may be used * to avoid revealing of sensitive data due to paging etc. 
*/ #include "sysdep.h" #include "asm-syntax.h" /******************* * mpi_limb_t * _gcry_mpih_addmul_1( mpi_ptr_t res_ptr, (sp + 4) * mpi_ptr_t s1_ptr, (sp + 8) * mpi_size_t s1_size, (sp + 12) * mpi_limb_t s2_limb) (sp + 16) */ #define res_ptr edi #define s1_ptr esi #define size ecx #define s2_limb ebp TEXT ALIGN (3) GLOBL C_SYMBOL_NAME(_gcry_mpih_addmul_1) C_SYMBOL_NAME(_gcry_mpih_addmul_1:) CFI_STARTPROC() INSN1(push,l ,R(edi)) CFI_PUSH(%edi) INSN1(push,l ,R(esi)) CFI_PUSH(%esi) INSN1(push,l ,R(ebx)) CFI_PUSH(%ebx) INSN1(push,l ,R(ebp)) CFI_PUSH(%ebp) INSN2(mov,l ,R(res_ptr),MEM_DISP(esp,20)) INSN2(mov,l ,R(s1_ptr),MEM_DISP(esp,24)) INSN2(mov,l ,R(size),MEM_DISP(esp,28)) INSN2(mov,l ,R(s2_limb),MEM_DISP(esp,32)) INSN2(lea,l ,R(res_ptr),MEM_INDEX(res_ptr,size,4)) INSN2(lea,l ,R(s1_ptr),MEM_INDEX(s1_ptr,size,4)) INSN1(neg,l ,R(size)) INSN2(xor,l ,R(ebx),R(ebx)) ALIGN (3) Loop: INSN2(mov,l ,R(eax),MEM_INDEX(s1_ptr,size,4)) INSN1(mul,l ,R(s2_limb)) INSN2(add,l ,R(eax),R(ebx)) INSN2(adc,l ,R(edx),$0) INSN2(add,l ,MEM_INDEX(res_ptr,size,4),R(eax)) INSN2(adc,l ,R(edx),$0) INSN2(mov,l ,R(ebx),R(edx)) INSN1(inc,l ,R(size)) INSN1(jnz, ,Loop) INSN2(mov,l ,R(eax),R(ebx)) INSN1(pop,l ,R(ebp)) CFI_POP(%ebp) INSN1(pop,l ,R(ebx)) CFI_POP(%ebx) INSN1(pop,l ,R(esi)) CFI_POP(%esi) INSN1(pop,l ,R(edi)) CFI_POP(%edi) - ret + ret_spec_stop CFI_ENDPROC() diff --git a/mpi/i386/mpih-mul3.S b/mpi/i386/mpih-mul3.S index 87577d54..4112c699 100644 --- a/mpi/i386/mpih-mul3.S +++ b/mpi/i386/mpih-mul3.S @@ -1,96 +1,96 @@ /* i80386 submul_1 -- Multiply a limb vector with a limb and add * the result to a second limb vector. * * Copyright (C) 1992, 1994, 1998, * 2001, 2002 Free Software Foundation, Inc. * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA * * Note: This code is heavily based on the GNU MP Library. * Actually it's the same code with only minor changes in the * way the data is stored; this is to support the abstraction * of an optional secure memory allocation which may be used * to avoid revealing of sensitive data due to paging etc. 
*/ #include "sysdep.h" #include "asm-syntax.h" /******************* * mpi_limb_t * _gcry_mpih_submul_1( mpi_ptr_t res_ptr, (sp + 4) * mpi_ptr_t s1_ptr, (sp + 8) * mpi_size_t s1_size, (sp + 12) * mpi_limb_t s2_limb) (sp + 16) */ #define res_ptr edi #define s1_ptr esi #define size ecx #define s2_limb ebp TEXT ALIGN (3) GLOBL C_SYMBOL_NAME(_gcry_mpih_submul_1) C_SYMBOL_NAME(_gcry_mpih_submul_1:) CFI_STARTPROC() INSN1(push,l ,R(edi)) CFI_PUSH(%edi) INSN1(push,l ,R(esi)) CFI_PUSH(%esi) INSN1(push,l ,R(ebx)) CFI_PUSH(%ebx) INSN1(push,l ,R(ebp)) CFI_PUSH(%ebp) INSN2(mov,l ,R(res_ptr),MEM_DISP(esp,20)) INSN2(mov,l ,R(s1_ptr),MEM_DISP(esp,24)) INSN2(mov,l ,R(size),MEM_DISP(esp,28)) INSN2(mov,l ,R(s2_limb),MEM_DISP(esp,32)) INSN2(lea,l ,R(res_ptr),MEM_INDEX(res_ptr,size,4)) INSN2(lea,l ,R(s1_ptr),MEM_INDEX(s1_ptr,size,4)) INSN1(neg,l ,R(size)) INSN2(xor,l ,R(ebx),R(ebx)) ALIGN (3) Loop: INSN2(mov,l ,R(eax),MEM_INDEX(s1_ptr,size,4)) INSN1(mul,l ,R(s2_limb)) INSN2(add,l ,R(eax),R(ebx)) INSN2(adc,l ,R(edx),$0) INSN2(sub,l ,MEM_INDEX(res_ptr,size,4),R(eax)) INSN2(adc,l ,R(edx),$0) INSN2(mov,l ,R(ebx),R(edx)) INSN1(inc,l ,R(size)) INSN1(jnz, ,Loop) INSN2(mov,l ,R(eax),R(ebx)) INSN1(pop,l ,R(ebp)) CFI_POP(%ebp) INSN1(pop,l ,R(ebx)) CFI_POP(%ebx) INSN1(pop,l ,R(esi)) CFI_POP(%esi) INSN1(pop,l ,R(edi)) CFI_POP(%edi) - ret + ret_spec_stop CFI_ENDPROC() diff --git a/mpi/i386/mpih-rshift.S b/mpi/i386/mpih-rshift.S index 35a8201f..5d34696c 100644 --- a/mpi/i386/mpih-rshift.S +++ b/mpi/i386/mpih-rshift.S @@ -1,105 +1,105 @@ /* i80386 rshift * * Copyright (C) 1992, 1994, 1998, * 2001, 2002 Free Software Foundation, Inc. * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA * * Note: This code is heavily based on the GNU MP Library. * Actually it's the same code with only minor changes in the * way the data is stored; this is to support the abstraction * of an optional secure memory allocation which may be used * to avoid revealing of sensitive data due to paging etc. 
*/ #include "sysdep.h" #include "asm-syntax.h" /******************* * mpi_limb_t * _gcry_mpih_rshift( mpi_ptr_t wp, (sp + 4) * mpi_ptr_t up, (sp + 8) * mpi_size_t usize, (sp + 12) * unsigned cnt) (sp + 16) */ .text ALIGN (3) .globl C_SYMBOL_NAME(_gcry_mpih_rshift) C_SYMBOL_NAME(_gcry_mpih_rshift:) CFI_STARTPROC() pushl %edi CFI_PUSH(%edi) pushl %esi CFI_PUSH(%esi) pushl %ebx CFI_PUSH(%ebx) movl 16(%esp),%edi /* wp */ movl 20(%esp),%esi /* up */ movl 24(%esp),%edx /* usize */ movl 28(%esp),%ecx /* cnt */ leal -4(%edi,%edx,4),%edi leal (%esi,%edx,4),%esi negl %edx movl (%esi,%edx,4),%ebx /* read least significant limb */ xorl %eax,%eax shrdl %cl,%ebx,%eax /* compute carry limb */ incl %edx jz Lend2 pushl %eax /* push carry limb onto stack */ testb $1,%dl jnz L2 /* enter loop in the middle */ movl %ebx,%eax ALIGN (3) Loop2: movl (%esi,%edx,4),%ebx /* load next higher limb */ shrdl %cl,%ebx,%eax /* compute result limb */ movl %eax,(%edi,%edx,4) /* store it */ incl %edx L2: movl (%esi,%edx,4),%eax shrdl %cl,%eax,%ebx movl %ebx,(%edi,%edx,4) incl %edx jnz Loop2 shrl %cl,%eax /* compute most significant limb */ movl %eax,(%edi) /* store it */ popl %eax /* pop carry limb */ popl %ebx popl %esi popl %edi - ret + ret_spec_stop Lend2: shrl %cl,%ebx /* compute most significant limb */ movl %ebx,(%edi) /* store it */ popl %ebx CFI_POP(%ebx) popl %esi CFI_POP(%esi) popl %edi CFI_POP(%edi) - ret + ret_spec_stop CFI_ENDPROC() diff --git a/mpi/i386/mpih-sub1.S b/mpi/i386/mpih-sub1.S index 2bdc1438..49477ae3 100644 --- a/mpi/i386/mpih-sub1.S +++ b/mpi/i386/mpih-sub1.S @@ -1,162 +1,162 @@ /* i80386 sub_n -- Sub two limb vectors of the same length > 0 and store * sum in a third limb vector. * * Copyright (C) 1992, 1994, 1995, 1998, * 2001, 2002 Free Software Foundation, Inc. * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA * * Note: This code is heavily based on the GNU MP Library. * Actually it's the same code with only minor changes in the * way the data is stored; this is to support the abstraction * of an optional secure memory allocation which may be used * to avoid revealing of sensitive data due to paging etc. 
*/ #include "sysdep.h" #include "asm-syntax.h" /******************* * mpi_limb_t * _gcry_mpih_sub_n( mpi_ptr_t res_ptr, (sp + 4) * mpi_ptr_t s1_ptr, (sp + 8) * mpi_ptr_t s2_ptr, (sp + 12) * mpi_size_t size) (sp + 16) */ .text ALIGN (3) .globl C_SYMBOL_NAME(_gcry_mpih_sub_n) C_SYMBOL_NAME(_gcry_mpih_sub_n:) CFI_STARTPROC() pushl %edi CFI_PUSH(%edi) pushl %esi CFI_PUSH(%esi) movl 12(%esp),%edi /* res_ptr */ movl 16(%esp),%esi /* s1_ptr */ movl 20(%esp),%edx /* s2_ptr */ movl 24(%esp),%ecx /* size */ #if defined __CET__ && (__CET__ & 1) != 0 pushl %ebx CFI_PUSH(%ebx) #endif movl %ecx,%eax shrl $3,%ecx /* compute count for unrolled loop */ negl %eax andl $7,%eax /* get index where to start loop */ jz Loop /* necessary special case for 0 */ incl %ecx /* adjust loop count */ shll $2,%eax /* adjustment for pointers... */ subl %eax,%edi /* ... since they are offset ... */ subl %eax,%esi /* ... by a constant when we ... */ subl %eax,%edx /* ... enter the loop */ shrl $2,%eax /* restore previous value */ #if defined __CET__ && (__CET__ & 1) != 0 leal -4(,%eax,4),%ebx /* Count for 4-byte endbr32 */ #endif #ifdef PIC /* Calculate start address in loop for PIC. Due to limitations in some assemblers, Loop-L0-3 cannot be put into the leal */ call L0 CFI_ADJUST_CFA_OFFSET(4) L0: leal (%eax,%eax,8),%eax addl (%esp),%eax addl $(Loop-L0-3),%eax addl $4,%esp CFI_ADJUST_CFA_OFFSET(-4) #else /* Calculate start address in loop for non-PIC. */ leal (Loop - 3)(%eax,%eax,8),%eax #endif #if defined __CET__ && (__CET__ & 1) != 0 addl %ebx,%eax /* Adjust for endbr32 */ #endif jmp *%eax /* jump into loop */ ALIGN (3) Loop: movl (%esi),%eax sbbl (%edx),%eax movl %eax,(%edi) #ifdef _CET_ENDBR _CET_ENDBR #endif movl 4(%esi),%eax sbbl 4(%edx),%eax movl %eax,4(%edi) #ifdef _CET_ENDBR _CET_ENDBR #endif movl 8(%esi),%eax sbbl 8(%edx),%eax movl %eax,8(%edi) #ifdef _CET_ENDBR _CET_ENDBR #endif movl 12(%esi),%eax sbbl 12(%edx),%eax movl %eax,12(%edi) #ifdef _CET_ENDBR _CET_ENDBR #endif movl 16(%esi),%eax sbbl 16(%edx),%eax movl %eax,16(%edi) #ifdef _CET_ENDBR _CET_ENDBR #endif movl 20(%esi),%eax sbbl 20(%edx),%eax movl %eax,20(%edi) #ifdef _CET_ENDBR _CET_ENDBR #endif movl 24(%esi),%eax sbbl 24(%edx),%eax movl %eax,24(%edi) #ifdef _CET_ENDBR _CET_ENDBR #endif movl 28(%esi),%eax sbbl 28(%edx),%eax movl %eax,28(%edi) leal 32(%edi),%edi leal 32(%esi),%esi leal 32(%edx),%edx decl %ecx jnz Loop sbbl %eax,%eax negl %eax #if defined __CET__ && (__CET__ & 1) != 0 popl %ebx CFI_POP(%ebx) #endif popl %esi CFI_POP(%esi) popl %edi CFI_POP(%edi) - ret + ret_spec_stop CFI_ENDPROC() diff --git a/mpi/i386/syntax.h b/mpi/i386/syntax.h index dd300319..af4d9e80 100644 --- a/mpi/i386/syntax.h +++ b/mpi/i386/syntax.h @@ -1,94 +1,98 @@ /* syntax.h -- Definitions for x86 syntax variations. * * Copyright (C) 1992, 1994, 1995, 1998, * 2001, 2002 Free Software Foundation, Inc. * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA * * Note: This code is heavily based on the GNU MP Library. * Actually it's the same code with only minor changes in the * way the data is stored; this is to support the abstraction * of an optional secure memory allocation which may be used * to avoid revealing of sensitive data due to paging etc. */ #include #ifdef __i386__ #ifdef HAVE_GCC_ASM_CFI_DIRECTIVES # define CFI_STARTPROC() .cfi_startproc # define CFI_ENDPROC() .cfi_endproc # define CFI_ADJUST_CFA_OFFSET(off) .cfi_adjust_cfa_offset off # define CFI_REL_OFFSET(reg,off) .cfi_rel_offset reg, off # define CFI_RESTORE(reg) .cfi_restore reg # define CFI_PUSH(reg) \ CFI_ADJUST_CFA_OFFSET(4); CFI_REL_OFFSET(reg, 0) # define CFI_POP(reg) \ CFI_ADJUST_CFA_OFFSET(-4); CFI_RESTORE(reg) #else # define CFI_STARTPROC() # define CFI_ENDPROC() # define CFI_ADJUST_CFA_OFFSET(off) # define CFI_REL_OFFSET(reg,off) # define CFI_RESTORE(reg) # define CFI_PUSH(reg) # define CFI_POP(reg) #endif #endif #undef ALIGN #if defined (BSD_SYNTAX) || defined (ELF_SYNTAX) #define R(r) %r #define MEM(base)(base) #define MEM_DISP(base,displacement)displacement(R(base)) #define MEM_INDEX(base,index,size)(R(base),R(index),size) #ifdef __STDC__ #define INSN1(mnemonic,size_suffix,dst)mnemonic##size_suffix dst #define INSN2(mnemonic,size_suffix,dst,src)mnemonic##size_suffix src,dst #else #define INSN1(mnemonic,size_suffix,dst)mnemonic/**/size_suffix dst #define INSN2(mnemonic,size_suffix,dst,src)mnemonic/**/size_suffix src,dst #endif #define TEXT .text #if defined (BSD_SYNTAX) #define ALIGN(log) .align log #endif #if defined (ELF_SYNTAX) #define ALIGN(log) .align 1<<(log) #endif #define GLOBL .globl #endif #ifdef INTEL_SYNTAX #define R(r) r #define MEM(base)[base] #define MEM_DISP(base,displacement)[base+(displacement)] #define MEM_INDEX(base,index,size)[base+index*size] #define INSN1(mnemonic,size_suffix,dst)mnemonic dst #define INSN2(mnemonic,size_suffix,dst,src)mnemonic dst,src #define TEXT .text #define ALIGN(log) .align log #define GLOBL .globl #endif #ifdef X86_BROKEN_ALIGN #undef ALIGN #define ALIGN(log) .align log,0x90 #endif + +/* 'ret' instruction replacement for straight-line speculation mitigation */ +#define ret_spec_stop \ + ret; int3;
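For reference, every `- ret` / `+ ret_spec_stop` hunk in this patch is the same straight-line speculation (SLS) hardening: the macro leaves the architectural behaviour of `ret` unchanged and merely places an `int3` in the fall-through path, so a CPU that speculates past the return executes a trapping instruction rather than whatever straight-line code happens to follow. The sketch below is not part of the patch; the file name and symbol are invented for illustration and an ELF/SysV AMD64 target is assumed. It shows the pattern in isolation:

/* sls-demo.S - stand-alone illustration of the ret_spec_stop pattern.
 * Build (capital .S so the C preprocessor runs):
 *   gcc -c sls-demo.S -o sls-demo.o
 */
	.text

/* Same expansion as the macro added in this patch: the int3 is never
 * reached architecturally, it only terminates straight-line speculation
 * past the ret. */
#define ret_spec_stop \
	ret; int3

	.globl	demo_add_one
	.type	demo_add_one, @function
demo_add_one:
	/* int demo_add_one(int x): SysV AMD64 passes x in %edi and
	 * returns the result in %eax. */
	leal	1(%edi), %eax
	ret_spec_stop
	.size	demo_add_one, .-demo_add_one

Because gcc preprocesses `.S` files, the macro expands to `ret; int3` at the return site, which is exactly the substitution made at each `ret` in the files above; the definition added to mpi/i386/syntax.h for the 32-bit code is the same.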