diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index 08baa7c4..a24b117c 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -1,130 +1,129 @@
 # Makefile for cipher modules
 # Copyright (C) 1998, 1999, 2000, 2001, 2002,
 #               2003, 2009 Free Software Foundation, Inc.
 #
 # This file is part of Libgcrypt.
 #
 # Libgcrypt is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as
 # published by the Free Software Foundation; either version 2.1 of
 # the License, or (at your option) any later version.
 #
 # Libgcrypt is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 # GNU Lesser General Public License for more details.
 #
 # You should have received a copy of the GNU Lesser General Public
 # License along with this program; if not, see <http://www.gnu.org/licenses/>.

 # Process this file with automake to produce Makefile.in

 # Need to include ../src in addition to top_srcdir because gcrypt.h is
 # a built header.
 AM_CPPFLAGS = -I../src -I$(top_srcdir)/src -I../mpi -I$(top_srcdir)/mpi
 AM_CFLAGS = $(GPG_ERROR_CFLAGS)
 AM_CCASFLAGS = $(NOEXECSTACK_FLAGS)

 EXTRA_DIST = gost-s-box.c

 CLEANFILES = gost-s-box
 DISTCLEANFILES = gost-sb.h

 noinst_LTLIBRARIES = libcipher.la

 GCRYPT_MODULES = @GCRYPT_CIPHERS@ @GCRYPT_PUBKEY_CIPHERS@ \
                  @GCRYPT_DIGESTS@ @GCRYPT_KDFS@

 libcipher_la_DEPENDENCIES = $(GCRYPT_MODULES)
 libcipher_la_LIBADD = $(GCRYPT_MODULES)

 libcipher_la_SOURCES = \
 cipher.c cipher-internal.h \
 cipher-cbc.c cipher-cfb.c cipher-ofb.c cipher-ctr.c cipher-aeswrap.c \
 cipher-ccm.c cipher-cmac.c cipher-gcm.c cipher-gcm-intel-pclmul.c \
 cipher-gcm-armv8-aarch32-ce.S cipher-gcm-armv8-aarch64-ce.S \
 cipher-poly1305.c cipher-ocb.c cipher-xts.c \
 cipher-selftest.c cipher-selftest.h \
 pubkey.c pubkey-internal.h pubkey-util.c \
 md.c \
 mac.c mac-internal.h \
 mac-hmac.c mac-cmac.c mac-gmac.c mac-poly1305.c \
 poly1305.c poly1305-internal.h \
 kdf.c kdf-internal.h \
 hmac-tests.c \
 bithelp.h \
 bufhelp.h \
 primegen.c \
 hash-common.c hash-common.h \
 dsa-common.c rsa-common.c \
 sha1.h

 EXTRA_libcipher_la_SOURCES = \
 arcfour.c arcfour-amd64.S \
 blowfish.c blowfish-amd64.S blowfish-arm.S \
 cast5.c cast5-amd64.S cast5-arm.S \
-chacha20.c chacha20-sse2-amd64.S chacha20-ssse3-amd64.S chacha20-avx2-amd64.S \
- chacha20-armv7-neon.S \
+chacha20.c chacha20-amd64-ssse3.S chacha20-amd64-avx2.S chacha20-armv7-neon.S \
 crc.c \
 crc-intel-pclmul.c \
 des.c des-amd64.S \
 dsa.c \
 elgamal.c \
 ecc.c ecc-curves.c ecc-misc.c ecc-common.h \
 ecc-ecdsa.c ecc-eddsa.c ecc-gost.c \
 idea.c \
 gost28147.c gost.h \
 gostr3411-94.c \
 md4.c \
 md5.c \
 rijndael.c rijndael-internal.h rijndael-tables.h rijndael-aesni.c \
 rijndael-padlock.c rijndael-amd64.S rijndael-arm.S \
 rijndael-ssse3-amd64.c rijndael-ssse3-amd64-asm.S \
 rijndael-armv8-ce.c rijndael-armv8-aarch32-ce.S rijndael-armv8-aarch64-ce.S \
 rijndael-aarch64.S \
 rmd160.c \
 rsa.c \
 salsa20.c salsa20-amd64.S salsa20-armv7-neon.S \
 scrypt.c \
 seed.c \
 serpent.c serpent-sse2-amd64.S serpent-avx2-amd64.S serpent-armv7-neon.S \
 sha1.c sha1-ssse3-amd64.S sha1-avx-amd64.S sha1-avx-bmi2-amd64.S \
 sha1-armv7-neon.S sha1-armv8-aarch32-ce.S sha1-armv8-aarch64-ce.S \
 sha256.c sha256-ssse3-amd64.S sha256-avx-amd64.S sha256-avx2-bmi2-amd64.S \
 sha256-armv8-aarch32-ce.S sha256-armv8-aarch64-ce.S \
 sha512.c sha512-ssse3-amd64.S sha512-avx-amd64.S sha512-avx2-bmi2-amd64.S \
 sha512-armv7-neon.S sha512-arm.S \
 sm3.c \
 keccak.c keccak_permute_32.h keccak_permute_64.h keccak-armv7-neon.S \
 stribog.c \
 tiger.c \
 whirlpool.c whirlpool-sse2-amd64.S \
 twofish.c twofish-amd64.S twofish-arm.S twofish-aarch64.S \
 twofish-avx2-amd64.S \
 rfc2268.c \
 camellia.c camellia.h camellia-glue.c camellia-aesni-avx-amd64.S \
 camellia-aesni-avx2-amd64.S camellia-arm.S camellia-aarch64.S \
 blake2.c

 gost28147.lo: gost-sb.h
 gost-sb.h: gost-s-box
 	./gost-s-box $@

 gost-s-box: gost-s-box.c
 	$(CC_FOR_BUILD) -o $@ $(srcdir)/gost-s-box.c

 if ENABLE_O_FLAG_MUNGING
 o_flag_munging = sed -e 's/-O\([2-9s][2-9s]*\)/-O1/' -e 's/-Ofast/-O1/g'
 else
 o_flag_munging = cat
 endif

 # We need to lower the optimization for this module.
 tiger.o: $(srcdir)/tiger.c
 	`echo $(COMPILE) -c $(srcdir)/tiger.c | $(o_flag_munging) `

 tiger.lo: $(srcdir)/tiger.c
 	`echo $(LTCOMPILE) -c $(srcdir)/tiger.c | $(o_flag_munging) `
diff --git a/cipher/chacha20-amd64-avx2.S b/cipher/chacha20-amd64-avx2.S
new file mode 100644
index 00000000..dad9e3e9
--- /dev/null
+++ b/cipher/chacha20-amd64-avx2.S
@@ -0,0 +1,323 @@
+/* chacha20-amd64-avx2.S  -  AVX2 implementation of ChaCha20 cipher
+ *
+ * Copyright (C) 2017,2018 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * Based on D. J. Bernstein reference implementation at
+ * http://cr.yp.to/chacha.html:
+ *
+ * chacha-regs.c version 20080118
+ * D. J. Bernstein
+ * Public domain.
+ */
+
+#ifdef __x86_64
+#include <config.h>
+#if defined(HAVE_GCC_INLINE_ASM_AVX2) && \
+   (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+    defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+
+.text
+
+#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS
+# define ELF(...) __VA_ARGS__
+#else
+# define ELF(...)
/*_*/ +#endif + +#ifdef __PIC__ +# define RIP (%rip) +#else +# define RIP +#endif + +/* register macros */ +#define INPUT %rdi +#define DST %rsi +#define SRC %rdx +#define NBLKS %rcx +#define ROUND %eax + +/* stack structure */ +#define STACK_VEC_X12 (32) +#define STACK_VEC_X13 (32 + STACK_VEC_X12) +#define STACK_TMP (32 + STACK_VEC_X13) +#define STACK_TMP1 (32 + STACK_TMP) +#define STACK_TMP2 (32 + STACK_TMP1) + +#define STACK_MAX (32 + STACK_TMP2) + +/* vector registers */ +#define X0 %ymm0 +#define X1 %ymm1 +#define X2 %ymm2 +#define X3 %ymm3 +#define X4 %ymm4 +#define X5 %ymm5 +#define X6 %ymm6 +#define X7 %ymm7 +#define X8 %ymm8 +#define X9 %ymm9 +#define X10 %ymm10 +#define X11 %ymm11 +#define X12 %ymm12 +#define X13 %ymm13 +#define X14 %ymm14 +#define X15 %ymm15 + +#define X0h %xmm0 +#define X1h %xmm1 +#define X2h %xmm2 +#define X3h %xmm3 +#define X4h %xmm4 +#define X5h %xmm5 +#define X6h %xmm6 +#define X7h %xmm7 +#define X8h %xmm8 +#define X9h %xmm9 +#define X10h %xmm10 +#define X11h %xmm11 +#define X12h %xmm12 +#define X13h %xmm13 +#define X14h %xmm14 +#define X15h %xmm15 + +/********************************************************************** + helper macros + **********************************************************************/ + +/* 4x4 32-bit integer matrix transpose */ +#define transpose_4x4(x0,x1,x2,x3,t1,t2) \ + vpunpckhdq x1, x0, t2; \ + vpunpckldq x1, x0, x0; \ + \ + vpunpckldq x3, x2, t1; \ + vpunpckhdq x3, x2, x2; \ + \ + vpunpckhqdq t1, x0, x1; \ + vpunpcklqdq t1, x0, x0; \ + \ + vpunpckhqdq x2, t2, x3; \ + vpunpcklqdq x2, t2, x2; + +/********************************************************************** + 8-way chacha20 + **********************************************************************/ + +#define ROTATE2(v1,v2,c,tmp) \ + vpsrld $(32 - (c)), v1, tmp; \ + vpslld $(c), v1, v1; \ + vpaddb tmp, v1, v1; \ + vpsrld $(32 - (c)), v2, tmp; \ + vpslld $(c), v2, v2; \ + vpaddb tmp, v2, v2; + +#define ROTATE_SHUF_2(v1,v2,shuf) \ + vpshufb shuf, v1, v1; \ + vpshufb shuf, v2, v2; + +#define XOR(ds,s) \ + vpxor s, ds, ds; + +#define PLUS(ds,s) \ + vpaddd s, ds, ds; + +#define QUARTERROUND2(a1,b1,c1,d1,a2,b2,c2,d2,ign,tmp1) \ + vbroadcasti128 .Lshuf_rol16 RIP, tmp1; \ + PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2); \ + ROTATE_SHUF_2(d1, d2, tmp1); \ + PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \ + ROTATE2(b1, b2, 12, tmp1); \ + vbroadcasti128 .Lshuf_rol8 RIP, tmp1; \ + PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2); \ + ROTATE_SHUF_2(d1, d2, tmp1); \ + PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \ + ROTATE2(b1, b2, 7, tmp1); + +#define BUF_XOR_256_TO_128(dst, src, offset_lo, offset_hi, yreg, tmp1) \ + vextracti128 $1, yreg, tmp1##h; \ + vpxor offset_lo(src), yreg##h, yreg##h; \ + vpxor offset_hi(src), tmp1##h, tmp1##h; \ + vmovdqu yreg##h, offset_lo(dst); \ + vmovdqu tmp1##h, offset_hi(dst); + +.align 32 +chacha20_data: +.Lshuf_rol16: + .byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 +.Lshuf_rol8: + .byte 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14 +.Linc_counter: + .byte 0,1,2,3,4,5,6,7 +.Lunsigned_cmp: + .long 0x80000000 + +.align 8 +.globl _gcry_chacha20_amd64_avx2_blocks8 +ELF(.type _gcry_chacha20_amd64_avx2_blocks8,@function;) + +_gcry_chacha20_amd64_avx2_blocks8: + /* input: + * %rdi: input + * %rsi: dst + * %rdx: src + * %rcx: nblks (multiple of 8) + */ + + vzeroupper; + + pushq %rbp; + movq %rsp, %rbp; + + subq $STACK_MAX, %rsp; + andq $~31, %rsp; + +.Loop4: + mov $20, ROUND; + + /* Construct counter vectors X12 and X13 */ + vpmovzxbd .Linc_counter RIP, X0; 
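
The QUARTERROUND2 macro defined above is the vector form of the scalar ChaCha20 quarter round: each ymm register holds the same state word for eight independent blocks, and two column groups are kept in flight at once. The 16-bit and 8-bit rotations go through vpshufb with the .Lshuf_rol16/.Lshuf_rol8 byte-shuffle masks, while the 12-bit and 7-bit ones use the shift pair in ROTATE2. As a reference point, a minimal C sketch of what one invocation computes per 32-bit lane (the helper names here are illustrative, not part of the patch):

#include <stdint.h>

static uint32_t rol32(uint32_t x, unsigned int c)
{
  return (x << c) | (x >> (32 - c));
}

/* One ChaCha20 quarter round; QUARTERROUND2 runs this on two column
 * groups at once, one 32-bit lane per block. */
static void quarterround(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
{
  *a += *b; *d = rol32(*d ^ *a, 16);  /* ROTATE_SHUF_2 with .Lshuf_rol16 */
  *c += *d; *b = rol32(*b ^ *c, 12);  /* ROTATE2(..., 12, ...) */
  *a += *b; *d = rol32(*d ^ *a, 8);   /* ROTATE_SHUF_2 with .Lshuf_rol8 */
  *c += *d; *b = rol32(*b ^ *c, 7);   /* ROTATE2(..., 7, ...) */
}

Note that ROTATE2 merges its two shift results with vpaddb rather than vpor: within every byte, (v << c) and (v >> (32 - c)) have no set bits in common, so the byte-wise add can never carry and produces exactly the OR that a rotate needs; the add form is presumably an instruction-scheduling choice.
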
+ vpbroadcastd .Lunsigned_cmp RIP, X2; + vpbroadcastd (12 * 4)(INPUT), X12; + vpbroadcastd (13 * 4)(INPUT), X13; + vpaddd X0, X12, X12; + vpxor X2, X0, X0; + vpxor X2, X12, X1; + vpcmpgtd X1, X0, X0; + vpsubd X0, X13, X13; + vmovdqa X12, (STACK_VEC_X12)(%rsp); + vmovdqa X13, (STACK_VEC_X13)(%rsp); + + /* Load vectors */ + vpbroadcastd (0 * 4)(INPUT), X0; + vpbroadcastd (1 * 4)(INPUT), X1; + vpbroadcastd (2 * 4)(INPUT), X2; + vpbroadcastd (3 * 4)(INPUT), X3; + vpbroadcastd (4 * 4)(INPUT), X4; + vpbroadcastd (5 * 4)(INPUT), X5; + vpbroadcastd (6 * 4)(INPUT), X6; + vpbroadcastd (7 * 4)(INPUT), X7; + vpbroadcastd (8 * 4)(INPUT), X8; + vpbroadcastd (9 * 4)(INPUT), X9; + vpbroadcastd (10 * 4)(INPUT), X10; + vpbroadcastd (11 * 4)(INPUT), X11; + vpbroadcastd (14 * 4)(INPUT), X14; + vpbroadcastd (15 * 4)(INPUT), X15; + vmovdqa X15, (STACK_TMP)(%rsp); + +.Lround2: + QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X15) + vmovdqa (STACK_TMP)(%rsp), X15; + vmovdqa X8, (STACK_TMP)(%rsp); + QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8) + QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8) + vmovdqa (STACK_TMP)(%rsp), X8; + vmovdqa X15, (STACK_TMP)(%rsp); + QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X15) + sub $2, ROUND; + jnz .Lround2; + + /* tmp := X15 */ + vpbroadcastd (0 * 4)(INPUT), X15; + PLUS(X0, X15); + vpbroadcastd (1 * 4)(INPUT), X15; + PLUS(X1, X15); + vpbroadcastd (2 * 4)(INPUT), X15; + PLUS(X2, X15); + vpbroadcastd (3 * 4)(INPUT), X15; + PLUS(X3, X15); + vpbroadcastd (4 * 4)(INPUT), X15; + PLUS(X4, X15); + vpbroadcastd (5 * 4)(INPUT), X15; + PLUS(X5, X15); + vpbroadcastd (6 * 4)(INPUT), X15; + PLUS(X6, X15); + vpbroadcastd (7 * 4)(INPUT), X15; + PLUS(X7, X15); + vpbroadcastd (8 * 4)(INPUT), X15; + PLUS(X8, X15); + vpbroadcastd (9 * 4)(INPUT), X15; + PLUS(X9, X15); + vpbroadcastd (10 * 4)(INPUT), X15; + PLUS(X10, X15); + vpbroadcastd (11 * 4)(INPUT), X15; + PLUS(X11, X15); + vmovdqa (STACK_VEC_X12)(%rsp), X15; + PLUS(X12, X15); + vmovdqa (STACK_VEC_X13)(%rsp), X15; + PLUS(X13, X15); + vmovdqa (STACK_TMP)(%rsp), X15; + vmovdqa X13, (STACK_TMP)(%rsp); + vpbroadcastd (14 * 4)(INPUT), X13; + PLUS(X14, X13); + vmovdqa X14, (STACK_TMP1)(%rsp); + vpbroadcastd (15 * 4)(INPUT), X13; + PLUS(X15, X13); + vmovdqa X15, (STACK_TMP2)(%rsp); + + /* Update counter */ + addq $8, (12 * 4)(INPUT); + + transpose_4x4(X0, X1, X2, X3, X13, X14); + transpose_4x4(X4, X5, X6, X7, X13, X14); + BUF_XOR_256_TO_128(DST, SRC, (64 * 0 + 16 * 0), (64 * 4 + 16 * 0), X0, X15); + BUF_XOR_256_TO_128(DST, SRC, (64 * 1 + 16 * 0), (64 * 5 + 16 * 0), X1, X15); + BUF_XOR_256_TO_128(DST, SRC, (64 * 2 + 16 * 0), (64 * 6 + 16 * 0), X2, X15); + BUF_XOR_256_TO_128(DST, SRC, (64 * 3 + 16 * 0), (64 * 7 + 16 * 0), X3, X15); + vmovdqa (STACK_TMP)(%rsp), X13; + vmovdqa (STACK_TMP1)(%rsp), X14; + vmovdqa (STACK_TMP2)(%rsp), X15; + transpose_4x4(X8, X9, X10, X11, X0, X1); + transpose_4x4(X12, X13, X14, X15, X0, X1); + BUF_XOR_256_TO_128(DST, SRC, (64 * 0 + 16 * 1), (64 * 4 + 16 * 1), X4, X0); + BUF_XOR_256_TO_128(DST, SRC, (64 * 1 + 16 * 1), (64 * 5 + 16 * 1), X5, X0); + BUF_XOR_256_TO_128(DST, SRC, (64 * 2 + 16 * 1), (64 * 6 + 16 * 1), X6, X0); + BUF_XOR_256_TO_128(DST, SRC, (64 * 3 + 16 * 1), (64 * 7 + 16 * 1), X7, X0); + BUF_XOR_256_TO_128(DST, SRC, (64 * 0 + 16 * 2), (64 * 4 + 16 * 2), X8, X0); + BUF_XOR_256_TO_128(DST, SRC, (64 * 1 + 16 * 2), (64 * 5 + 16 * 2), X9, X0); + BUF_XOR_256_TO_128(DST, SRC, (64 * 2 + 16 * 2), (64 * 6 + 16 * 2), X10, X0); + BUF_XOR_256_TO_128(DST, SRC, (64 * 3 + 16 * 2), (64 * 7 
+ 16 * 2), X11, X0);
+	BUF_XOR_256_TO_128(DST, SRC, (64 * 0 + 16 * 3), (64 * 4 + 16 * 3), X12, X0);
+	BUF_XOR_256_TO_128(DST, SRC, (64 * 1 + 16 * 3), (64 * 5 + 16 * 3), X13, X0);
+	BUF_XOR_256_TO_128(DST, SRC, (64 * 2 + 16 * 3), (64 * 6 + 16 * 3), X14, X0);
+	BUF_XOR_256_TO_128(DST, SRC, (64 * 3 + 16 * 3), (64 * 7 + 16 * 3), X15, X0);
+
+	sub $8, NBLKS;
+	lea (8 * 64)(DST), DST;
+	lea (8 * 64)(SRC), SRC;
+	jnz .Loop4;
+
+	/* clear the used vector registers and stack */
+	vpxor X0, X0, X0;
+	vmovdqa X0, (STACK_VEC_X12)(%rsp);
+	vmovdqa X0, (STACK_VEC_X13)(%rsp);
+	vmovdqa X0, (STACK_TMP)(%rsp);
+	vmovdqa X0, (STACK_TMP1)(%rsp);
+	vmovdqa X0, (STACK_TMP2)(%rsp);
+	vzeroall;
+
+	/* eax zeroed by round loop. */
+	leave;
+	ret;
+ELF(.size _gcry_chacha20_amd64_avx2_blocks8,
+	  .-_gcry_chacha20_amd64_avx2_blocks8;)
+
+#endif /*defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)*/
+#endif /*__x86_64*/
diff --git a/cipher/chacha20-amd64-ssse3.S b/cipher/chacha20-amd64-ssse3.S
new file mode 100644
index 00000000..7ad1c0ae
--- /dev/null
+++ b/cipher/chacha20-amd64-ssse3.S
@@ -0,0 +1,341 @@
+/* chacha20-amd64-ssse3.S  -  SSSE3 implementation of ChaCha20 cipher
+ *
+ * Copyright (C) 2017,2018 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * Based on D. J. Bernstein reference implementation at
+ * http://cr.yp.to/chacha.html:
+ *
+ * chacha-regs.c version 20080118
+ * D. J. Bernstein
+ * Public domain.
+ */
+
+#ifdef __x86_64
+#include <config.h>
+#if defined(HAVE_GCC_INLINE_ASM_SSSE3) && \
+   (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+    defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+
+.text
+
+#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS
+# define ELF(...) __VA_ARGS__
+#else
+# define ELF(...)
/*_*/ +#endif + +#ifdef __PIC__ +# define RIP (%rip) +#else +# define RIP +#endif + +/* register macros */ +#define INPUT %rdi +#define DST %rsi +#define SRC %rdx +#define NBLKS %rcx +#define ROUND %eax + +/* stack structure */ +#define STACK_VEC_X12 (16) +#define STACK_VEC_X13 (16 + STACK_VEC_X12) +#define STACK_TMP (16 + STACK_VEC_X13) +#define STACK_TMP1 (16 + STACK_TMP) +#define STACK_TMP2 (16 + STACK_TMP1) + +#define STACK_MAX (16 + STACK_TMP2) + +/* vector registers */ +#define X0 %xmm0 +#define X1 %xmm1 +#define X2 %xmm2 +#define X3 %xmm3 +#define X4 %xmm4 +#define X5 %xmm5 +#define X6 %xmm6 +#define X7 %xmm7 +#define X8 %xmm8 +#define X9 %xmm9 +#define X10 %xmm10 +#define X11 %xmm11 +#define X12 %xmm12 +#define X13 %xmm13 +#define X14 %xmm14 +#define X15 %xmm15 + +/********************************************************************** + helper macros + **********************************************************************/ + +/* 4x4 32-bit integer matrix transpose */ +#define transpose_4x4(x0, x1, x2, x3, t1, t2, t3) \ + movdqa x0, t2; \ + punpckhdq x1, t2; \ + punpckldq x1, x0; \ + \ + movdqa x2, t1; \ + punpckldq x3, t1; \ + punpckhdq x3, x2; \ + \ + movdqa x0, x1; \ + punpckhqdq t1, x1; \ + punpcklqdq t1, x0; \ + \ + movdqa t2, x3; \ + punpckhqdq x2, x3; \ + punpcklqdq x2, t2; \ + movdqa t2, x2; + +/* fill xmm register with 32-bit value from memory */ +#define pbroadcastd(mem32, xreg) \ + movd mem32, xreg; \ + pshufd $0, xreg, xreg; + +/* xor with unaligned memory operand */ +#define pxor_u(umem128, xreg, t) \ + movdqu umem128, t; \ + pxor t, xreg; + +/* xor register with unaligned src and save to unaligned dst */ +#define xor_src_dst(dst, src, offset, xreg, t) \ + pxor_u(offset(src), xreg, t); \ + movdqu xreg, offset(dst); + +#define clear(x) pxor x,x; + +/********************************************************************** + 4-way chacha20 + **********************************************************************/ + +#define ROTATE2(v1,v2,c,tmp1,tmp2) \ + movdqa v1, tmp1; \ + movdqa v2, tmp2; \ + psrld $(32 - (c)), v1; \ + pslld $(c), tmp1; \ + paddb tmp1, v1; \ + psrld $(32 - (c)), v2; \ + pslld $(c), tmp2; \ + paddb tmp2, v2; + +#define ROTATE_SHUF_2(v1,v2,shuf) \ + pshufb shuf, v1; \ + pshufb shuf, v2; + +#define XOR(ds,s) \ + pxor s, ds; + +#define PLUS(ds,s) \ + paddd s, ds; + +#define QUARTERROUND2(a1,b1,c1,d1,a2,b2,c2,d2,ign,tmp1,tmp2) \ + movdqa .Lshuf_rol16 RIP, tmp1; \ + PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2); \ + ROTATE_SHUF_2(d1, d2, tmp1); \ + PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \ + ROTATE2(b1, b2, 12, tmp1, tmp2); \ + movdqa .Lshuf_rol8 RIP, tmp1; \ + PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2); \ + ROTATE_SHUF_2(d1, d2, tmp1); \ + PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \ + ROTATE2(b1, b2, 7, tmp1, tmp2); + +chacha20_data: +.align 16 +.Lshuf_rol16: + .byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 +.Lshuf_rol8: + .byte 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14 +.Linc_counter: + .long 0,1,2,3 +.Lunsigned_cmp: + .long 0x80000000,0x80000000,0x80000000,0x80000000 + +.align 8 +.globl _gcry_chacha20_amd64_ssse3_blocks4 +ELF(.type _gcry_chacha20_amd64_ssse3_blocks4,@function;) + +_gcry_chacha20_amd64_ssse3_blocks4: + /* input: + * %rdi: input + * %rsi: dst + * %rdx: src + * %rcx: nblks (multiple of 4) + */ + + pushq %rbp; + movq %rsp, %rbp; + + subq $STACK_MAX, %rsp; + andq $~15, %rsp; + +.Loop4: + mov $20, ROUND; + + /* Construct counter vectors X12 and X13 */ + vmovdqa .Linc_counter RIP, X0; + vmovdqa .Lunsigned_cmp RIP, X2; + 
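
The .Linc_counter and .Lunsigned_cmp constants loaded here feed the counter setup that continues below: words 12 and 13 of the input block are the low and high halves of the 64-bit block counter, each lane gets lo + i, and the carry into the high word must be detected even though SSE and AVX2 provide only signed compares. Both this SSSE3 sequence and the AVX2 one earlier therefore bias both compare operands by 0x80000000 so that a signed pcmpgtd answers the unsigned overflow question. A C sketch of the per-lane computation (the function name is illustrative, not from the patch):

#include <stdint.h>

/* Per-lane 64-bit counter construction: lane i of X12 gets lo + i, and
 * lane i of X13 gets hi plus the carry out of that 32-bit addition. */
static void counter_lanes(const uint32_t input[16], int nlanes,
                          uint32_t x12[], uint32_t x13[])
{
  for (int i = 0; i < nlanes; i++)
    {
      uint32_t lo = input[12] + (uint32_t)i;
      /* A signed compare of sign-flipped values is an unsigned compare:
       * the addition wrapped exactly when lo < i holds unsigned. */
      int carry = (int32_t)((uint32_t)i ^ 0x80000000u)
                  > (int32_t)(lo ^ 0x80000000u);
      x12[i] = lo;
      x13[i] = input[13] + (uint32_t)carry;
    }
}

In the assembly the compare yields an all-ones mask in the lanes that wrapped, and psubd (vpsubd in the AVX2 file) subtracts that mask from the broadcast high word, adding exactly 1 where needed.
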
pbroadcastd((12 * 4)(INPUT), X12); + pbroadcastd((13 * 4)(INPUT), X13); + paddd X0, X12; + movdqa X12, X1; + pxor X2, X0; + pxor X2, X1; + pcmpgtd X1, X0; + psubd X0, X13; + movdqa X12, (STACK_VEC_X12)(%rsp); + movdqa X13, (STACK_VEC_X13)(%rsp); + + /* Load vectors */ + pbroadcastd((0 * 4)(INPUT), X0); + pbroadcastd((1 * 4)(INPUT), X1); + pbroadcastd((2 * 4)(INPUT), X2); + pbroadcastd((3 * 4)(INPUT), X3); + pbroadcastd((4 * 4)(INPUT), X4); + pbroadcastd((5 * 4)(INPUT), X5); + pbroadcastd((6 * 4)(INPUT), X6); + pbroadcastd((7 * 4)(INPUT), X7); + pbroadcastd((8 * 4)(INPUT), X8); + pbroadcastd((9 * 4)(INPUT), X9); + pbroadcastd((10 * 4)(INPUT), X10); + pbroadcastd((11 * 4)(INPUT), X11); + pbroadcastd((14 * 4)(INPUT), X14); + pbroadcastd((15 * 4)(INPUT), X15); + movdqa X11, (STACK_TMP)(%rsp); + movdqa X15, (STACK_TMP1)(%rsp); + +.Lround2: + QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15) + movdqa (STACK_TMP)(%rsp), X11; + movdqa (STACK_TMP1)(%rsp), X15; + movdqa X8, (STACK_TMP)(%rsp); + movdqa X9, (STACK_TMP1)(%rsp); + QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9) + QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9) + movdqa (STACK_TMP)(%rsp), X8; + movdqa (STACK_TMP1)(%rsp), X9; + movdqa X11, (STACK_TMP)(%rsp); + movdqa X15, (STACK_TMP1)(%rsp); + QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15) + sub $2, ROUND; + jnz .Lround2; + + /* tmp := X15 */ + movdqa (STACK_TMP)(%rsp), X11; + pbroadcastd((0 * 4)(INPUT), X15); + PLUS(X0, X15); + pbroadcastd((1 * 4)(INPUT), X15); + PLUS(X1, X15); + pbroadcastd((2 * 4)(INPUT), X15); + PLUS(X2, X15); + pbroadcastd((3 * 4)(INPUT), X15); + PLUS(X3, X15); + pbroadcastd((4 * 4)(INPUT), X15); + PLUS(X4, X15); + pbroadcastd((5 * 4)(INPUT), X15); + PLUS(X5, X15); + pbroadcastd((6 * 4)(INPUT), X15); + PLUS(X6, X15); + pbroadcastd((7 * 4)(INPUT), X15); + PLUS(X7, X15); + pbroadcastd((8 * 4)(INPUT), X15); + PLUS(X8, X15); + pbroadcastd((9 * 4)(INPUT), X15); + PLUS(X9, X15); + pbroadcastd((10 * 4)(INPUT), X15); + PLUS(X10, X15); + pbroadcastd((11 * 4)(INPUT), X15); + PLUS(X11, X15); + movdqa (STACK_VEC_X12)(%rsp), X15; + PLUS(X12, X15); + movdqa (STACK_VEC_X13)(%rsp), X15; + PLUS(X13, X15); + movdqa X13, (STACK_TMP)(%rsp); + pbroadcastd((14 * 4)(INPUT), X15); + PLUS(X14, X15); + movdqa (STACK_TMP1)(%rsp), X15; + movdqa X14, (STACK_TMP1)(%rsp); + pbroadcastd((15 * 4)(INPUT), X13); + PLUS(X15, X13); + movdqa X15, (STACK_TMP2)(%rsp); + + /* Update counter */ + addq $4, (12 * 4)(INPUT); + + transpose_4x4(X0, X1, X2, X3, X13, X14, X15); + xor_src_dst(DST, SRC, (64 * 0 + 16 * 0), X0, X15); + xor_src_dst(DST, SRC, (64 * 1 + 16 * 0), X1, X15); + xor_src_dst(DST, SRC, (64 * 2 + 16 * 0), X2, X15); + xor_src_dst(DST, SRC, (64 * 3 + 16 * 0), X3, X15); + transpose_4x4(X4, X5, X6, X7, X0, X1, X2); + movdqa (STACK_TMP)(%rsp), X13; + movdqa (STACK_TMP1)(%rsp), X14; + movdqa (STACK_TMP2)(%rsp), X15; + xor_src_dst(DST, SRC, (64 * 0 + 16 * 1), X4, X0); + xor_src_dst(DST, SRC, (64 * 1 + 16 * 1), X5, X0); + xor_src_dst(DST, SRC, (64 * 2 + 16 * 1), X6, X0); + xor_src_dst(DST, SRC, (64 * 3 + 16 * 1), X7, X0); + transpose_4x4(X8, X9, X10, X11, X0, X1, X2); + xor_src_dst(DST, SRC, (64 * 0 + 16 * 2), X8, X0); + xor_src_dst(DST, SRC, (64 * 1 + 16 * 2), X9, X0); + xor_src_dst(DST, SRC, (64 * 2 + 16 * 2), X10, X0); + xor_src_dst(DST, SRC, (64 * 3 + 16 * 2), X11, X0); + transpose_4x4(X12, X13, X14, X15, X0, X1, X2); + xor_src_dst(DST, SRC, (64 * 0 + 16 * 3), X12, X0); + xor_src_dst(DST, SRC, (64 * 1 + 16 * 3), X13, X0); + xor_src_dst(DST, 
SRC, (64 * 2 + 16 * 3), X14, X0); + xor_src_dst(DST, SRC, (64 * 3 + 16 * 3), X15, X0); + + sub $4, NBLKS; + lea (4 * 64)(DST), DST; + lea (4 * 64)(SRC), SRC; + jnz .Loop4; + + /* clear the used vector registers and stack */ + clear(X0); + movdqa X0, (STACK_VEC_X12)(%rsp); + movdqa X0, (STACK_VEC_X13)(%rsp); + movdqa X0, (STACK_TMP)(%rsp); + movdqa X0, (STACK_TMP1)(%rsp); + movdqa X0, (STACK_TMP2)(%rsp); + clear(X1); + clear(X2); + clear(X3); + clear(X4); + clear(X5); + clear(X6); + clear(X7); + clear(X8); + clear(X9); + clear(X10); + clear(X11); + clear(X12); + clear(X13); + clear(X14); + clear(X15); + + /* eax zeroed by round loop. */ + leave; + ret; +ELF(.size _gcry_chacha20_amd64_ssse3_blocks4, + .-_gcry_chacha20_amd64_ssse3_blocks4;) + +#endif /*defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)*/ +#endif /*__x86_64*/ diff --git a/cipher/chacha20-armv7-neon.S b/cipher/chacha20-armv7-neon.S index c1971fc7..33a43df1 100644 --- a/cipher/chacha20-armv7-neon.S +++ b/cipher/chacha20-armv7-neon.S @@ -1,750 +1,393 @@ -/* chacha20-armv7-neon.S - ARM/NEON accelerated chacha20 blocks function +/* chacha20-armv7-neon.S - ARMv7 NEON implementation of ChaCha20 cipher * - * Copyright (C) 2014 Jussi Kivilinna + * Copyright (C) 2017,2018 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . */ /* - * Based on public domain implementation by Andrew Moon at - * https://github.com/floodyberry/chacha-opt + * Based on D. J. Bernstein reference implementation at + * http://cr.yp.to/chacha.html: + * + * chacha-regs.c version 20080118 + * D. J. Bernstein + * Public domain. 
*/ #include #if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) && \ defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \ - defined(HAVE_GCC_INLINE_ASM_NEON) && defined(USE_CHACHA20) + defined(HAVE_GCC_INLINE_ASM_NEON) .syntax unified .fpu neon .arm -#define UNALIGNED_STMIA8(ptr, l0, l1, l2, l3, l4, l5, l6, l7) \ - tst ptr, #3; \ - beq 1f; \ - vpush {d0-d3}; \ - vmov s0, l0; \ - vmov s1, l1; \ - vmov s2, l2; \ - vmov s3, l3; \ - vmov s4, l4; \ - vmov s5, l5; \ - vmov s6, l6; \ - vmov s7, l7; \ - vst1.32 {d0-d3}, [ptr]; \ - add ptr, #32; \ - vpop {d0-d3}; \ - b 2f; \ - 1: stmia ptr!, {l0-l7}; \ - 2: ; - -#define UNALIGNED_LDMIA4(ptr, l0, l1, l2, l3) \ - tst ptr, #3; \ - beq 1f; \ - vpush {d0-d1}; \ - vld1.32 {d0-d1}, [ptr]; \ - add ptr, #16; \ - vmov l0, s0; \ - vmov l1, s1; \ - vmov l2, s2; \ - vmov l3, s3; \ - vpop {d0-d1}; \ - b 2f; \ - 1: ldmia ptr!, {l0-l3}; \ - 2: ; - .text -.globl _gcry_chacha20_armv7_neon_blocks -.type _gcry_chacha20_armv7_neon_blocks,%function; -_gcry_chacha20_armv7_neon_blocks: -.Lchacha_blocks_neon_local: - tst r3, r3 - beq .Lchacha_blocks_neon_nobytes - vstmdb sp!, {q4,q5,q6,q7} - stmfd sp!, {r4-r12, r14} - mov r8, sp - sub sp, sp, #196 - and sp, sp, #0xffffffe0 - str r0, [sp, #60] - str r1, [sp, #48] - str r2, [sp, #40] - str r3, [sp, #52] - str r8, [sp, #192] - add r1, sp, #64 - ldmia r0!, {r4-r11} - stmia r1!, {r4-r11} - ldmia r0!, {r4-r11} - stmia r1!, {r4-r11} - mov r4, #20 - str r4, [sp, #44] - cmp r3, #256 - blo .Lchacha_blocks_neon_mainloop2 -.Lchacha_blocks_neon_mainloop1: - ldr r0, [sp, #44] - str r0, [sp, #0] - add r1, sp, #(64) - mov r2, #1 - veor q12, q12 - vld1.32 {q0,q1}, [r1,:128]! - vld1.32 {q2,q3}, [r1,:128] - vmov.32 d24[0], r2 - vadd.u64 q3, q3, q12 - vmov q4, q0 - vmov q5, q1 - vmov q6, q2 - vadd.u64 q7, q3, q12 - vmov q8, q0 - vmov q9, q1 - vmov q10, q2 - vadd.u64 q11, q7, q12 - add r0, sp, #64 - ldm r0, {r0-r12} - ldr r14, [sp, #(64 +60)] - str r6, [sp, #8] - str r11, [sp, #12] - str r14, [sp, #28] - ldr r11, [sp, #(64 +52)] - ldr r14, [sp, #(64 +56)] -.Lchacha_blocks_neon_rounds1: - ldr r6, [sp, #0] - vadd.i32 q0, q0, q1 - add r0, r0, r4 - vadd.i32 q4, q4, q5 - add r1, r1, r5 - vadd.i32 q8, q8, q9 - eor r12, r12, r0 - veor q12, q3, q0 - eor r11, r11, r1 - veor q13, q7, q4 - ror r12, r12, #16 - veor q14, q11, q8 - ror r11, r11, #16 - vrev32.16 q3, q12 - subs r6, r6, #2 - vrev32.16 q7, q13 - add r8, r8, r12 - vrev32.16 q11, q14 - add r9, r9, r11 - vadd.i32 q2, q2, q3 - eor r4, r4, r8 - vadd.i32 q6, q6, q7 - eor r5, r5, r9 - vadd.i32 q10, q10, q11 - str r6, [sp, #0] - veor q12, q1, q2 - ror r4, r4, #20 - veor q13, q5, q6 - ror r5, r5, #20 - veor q14, q9, q10 - add r0, r0, r4 - vshl.i32 q1, q12, #12 - add r1, r1, r5 - vshl.i32 q5, q13, #12 - ldr r6, [sp, #8] - vshl.i32 q9, q14, #12 - eor r12, r12, r0 - vsri.u32 q1, q12, #20 - eor r11, r11, r1 - vsri.u32 q5, q13, #20 - ror r12, r12, #24 - vsri.u32 q9, q14, #20 - ror r11, r11, #24 - vadd.i32 q0, q0, q1 - add r8, r8, r12 - vadd.i32 q4, q4, q5 - add r9, r9, r11 - vadd.i32 q8, q8, q9 - eor r4, r4, r8 - veor q12, q3, q0 - eor r5, r5, r9 - veor q13, q7, q4 - str r11, [sp, #20] - veor q14, q11, q8 - ror r4, r4, #25 - vshl.i32 q3, q12, #8 - ror r5, r5, #25 - vshl.i32 q7, q13, #8 - str r4, [sp, #4] - vshl.i32 q11, q14, #8 - ldr r4, [sp, #28] - vsri.u32 q3, q12, #24 - add r2, r2, r6 - vsri.u32 q7, q13, #24 - add r3, r3, r7 - vsri.u32 q11, q14, #24 - ldr r11, [sp, #12] - vadd.i32 q2, q2, q3 - eor r14, r14, r2 - vadd.i32 q6, q6, q7 - eor r4, r4, r3 - vadd.i32 q10, q10, q11 - ror r14, r14, #16 - veor q12, q1, 
q2 - ror r4, r4, #16 - veor q13, q5, q6 - add r10, r10, r14 - veor q14, q9, q10 - add r11, r11, r4 - vshl.i32 q1, q12, #7 - eor r6, r6, r10 - vshl.i32 q5, q13, #7 - eor r7, r7, r11 - vshl.i32 q9, q14, #7 - ror r6, r6, #20 - vsri.u32 q1, q12, #25 - ror r7, r7, #20 - vsri.u32 q5, q13, #25 - add r2, r2, r6 - vsri.u32 q9, q14, #25 - add r3, r3, r7 - vext.32 q3, q3, q3, #3 - eor r14, r14, r2 - vext.32 q7, q7, q7, #3 - eor r4, r4, r3 - vext.32 q11, q11, q11, #3 - ror r14, r14, #24 - vext.32 q1, q1, q1, #1 - ror r4, r4, #24 - vext.32 q5, q5, q5, #1 - add r10, r10, r14 - vext.32 q9, q9, q9, #1 - add r11, r11, r4 - vext.32 q2, q2, q2, #2 - eor r6, r6, r10 - vext.32 q6, q6, q6, #2 - eor r7, r7, r11 - vext.32 q10, q10, q10, #2 - ror r6, r6, #25 - vadd.i32 q0, q0, q1 - ror r7, r7, #25 - vadd.i32 q4, q4, q5 - add r0, r0, r5 - vadd.i32 q8, q8, q9 - add r1, r1, r6 - veor q12, q3, q0 - eor r4, r4, r0 - veor q13, q7, q4 - eor r12, r12, r1 - veor q14, q11, q8 - ror r4, r4, #16 - vrev32.16 q3, q12 - ror r12, r12, #16 - vrev32.16 q7, q13 - add r10, r10, r4 - vrev32.16 q11, q14 - add r11, r11, r12 - vadd.i32 q2, q2, q3 - eor r5, r5, r10 - vadd.i32 q6, q6, q7 - eor r6, r6, r11 - vadd.i32 q10, q10, q11 - ror r5, r5, #20 - veor q12, q1, q2 - ror r6, r6, #20 - veor q13, q5, q6 - add r0, r0, r5 - veor q14, q9, q10 - add r1, r1, r6 - vshl.i32 q1, q12, #12 - eor r4, r4, r0 - vshl.i32 q5, q13, #12 - eor r12, r12, r1 - vshl.i32 q9, q14, #12 - ror r4, r4, #24 - vsri.u32 q1, q12, #20 - ror r12, r12, #24 - vsri.u32 q5, q13, #20 - add r10, r10, r4 - vsri.u32 q9, q14, #20 - add r11, r11, r12 - vadd.i32 q0, q0, q1 - eor r5, r5, r10 - vadd.i32 q4, q4, q5 - eor r6, r6, r11 - vadd.i32 q8, q8, q9 - str r11, [sp, #12] - veor q12, q3, q0 - ror r5, r5, #25 - veor q13, q7, q4 - ror r6, r6, #25 - veor q14, q11, q8 - str r4, [sp, #28] - vshl.i32 q3, q12, #8 - ldr r4, [sp, #4] - vshl.i32 q7, q13, #8 - add r2, r2, r7 - vshl.i32 q11, q14, #8 - add r3, r3, r4 - vsri.u32 q3, q12, #24 - ldr r11, [sp, #20] - vsri.u32 q7, q13, #24 - eor r11, r11, r2 - vsri.u32 q11, q14, #24 - eor r14, r14, r3 - vadd.i32 q2, q2, q3 - ror r11, r11, #16 - vadd.i32 q6, q6, q7 - ror r14, r14, #16 - vadd.i32 q10, q10, q11 - add r8, r8, r11 - veor q12, q1, q2 - add r9, r9, r14 - veor q13, q5, q6 - eor r7, r7, r8 - veor q14, q9, q10 - eor r4, r4, r9 - vshl.i32 q1, q12, #7 - ror r7, r7, #20 - vshl.i32 q5, q13, #7 - ror r4, r4, #20 - vshl.i32 q9, q14, #7 - str r6, [sp, #8] - vsri.u32 q1, q12, #25 - add r2, r2, r7 - vsri.u32 q5, q13, #25 - add r3, r3, r4 - vsri.u32 q9, q14, #25 - eor r11, r11, r2 - vext.32 q3, q3, q3, #1 - eor r14, r14, r3 - vext.32 q7, q7, q7, #1 - ror r11, r11, #24 - vext.32 q11, q11, q11, #1 - ror r14, r14, #24 - vext.32 q1, q1, q1, #3 - add r8, r8, r11 - vext.32 q5, q5, q5, #3 - add r9, r9, r14 - vext.32 q9, q9, q9, #3 - eor r7, r7, r8 - vext.32 q2, q2, q2, #2 - eor r4, r4, r9 - vext.32 q6, q6, q6, #2 - ror r7, r7, #25 - vext.32 q10, q10, q10, #2 - ror r4, r4, #25 - bne .Lchacha_blocks_neon_rounds1 - str r8, [sp, #0] - str r9, [sp, #4] - str r10, [sp, #8] - str r12, [sp, #16] - str r11, [sp, #20] - str r14, [sp, #24] - add r9, sp, #64 - vld1.32 {q12,q13}, [r9,:128]! 
- ldr r12, [sp, #48] - vld1.32 {q14,q15}, [r9,:128] - ldr r14, [sp, #40] - vadd.i32 q0, q0, q12 - ldr r8, [sp, #(64 +0)] - vadd.i32 q4, q4, q12 - ldr r9, [sp, #(64 +4)] - vadd.i32 q8, q8, q12 - ldr r10, [sp, #(64 +8)] - vadd.i32 q1, q1, q13 - ldr r11, [sp, #(64 +12)] - vadd.i32 q5, q5, q13 - add r0, r0, r8 - vadd.i32 q9, q9, q13 - add r1, r1, r9 - vadd.i32 q2, q2, q14 - add r2, r2, r10 - vadd.i32 q6, q6, q14 - ldr r8, [sp, #(64 +16)] - vadd.i32 q10, q10, q14 - add r3, r3, r11 - veor q14, q14, q14 - ldr r9, [sp, #(64 +20)] - mov r11, #1 - add r4, r4, r8 - vmov.32 d28[0], r11 - ldr r10, [sp, #(64 +24)] - vadd.u64 q12, q14, q15 - add r5, r5, r9 - vadd.u64 q13, q14, q12 - ldr r11, [sp, #(64 +28)] - vadd.u64 q14, q14, q13 - add r6, r6, r10 - vadd.i32 q3, q3, q12 - tst r12, r12 - vadd.i32 q7, q7, q13 - add r7, r7, r11 - vadd.i32 q11, q11, q14 - beq .Lchacha_blocks_neon_nomessage11 - UNALIGNED_LDMIA4(r12, r8, r9, r10, r11) - tst r12, r12 - eor r0, r0, r8 - eor r1, r1, r9 - eor r2, r2, r10 - ldr r8, [r12, #0] - eor r3, r3, r11 - ldr r9, [r12, #4] - eor r4, r4, r8 - ldr r10, [r12, #8] - eor r5, r5, r9 - ldr r11, [r12, #12] - eor r6, r6, r10 - add r12, r12, #16 - eor r7, r7, r11 -.Lchacha_blocks_neon_nomessage11: - UNALIGNED_STMIA8(r14, r0, r1, r2, r3, r4, r5, r6, r7) - tst r12, r12 - ldm sp, {r0-r7} - ldr r8, [sp, #(64 +32)] - ldr r9, [sp, #(64 +36)] - ldr r10, [sp, #(64 +40)] - ldr r11, [sp, #(64 +44)] - add r0, r0, r8 - add r1, r1, r9 - add r2, r2, r10 - ldr r8, [sp, #(64 +48)] - add r3, r3, r11 - ldr r9, [sp, #(64 +52)] - add r4, r4, r8 - ldr r10, [sp, #(64 +56)] - add r5, r5, r9 - ldr r11, [sp, #(64 +60)] - add r6, r6, r10 - adds r8, r8, #4 - add r7, r7, r11 - adc r9, r9, #0 - str r8, [sp, #(64 +48)] - tst r12, r12 - str r9, [sp, #(64 +52)] - beq .Lchacha_blocks_neon_nomessage12 - UNALIGNED_LDMIA4(r12, r8, r9, r10, r11) - tst r12, r12 - eor r0, r0, r8 - eor r1, r1, r9 - eor r2, r2, r10 - ldr r8, [r12, #0] - eor r3, r3, r11 - ldr r9, [r12, #4] - eor r4, r4, r8 - ldr r10, [r12, #8] - eor r5, r5, r9 - ldr r11, [r12, #12] - eor r6, r6, r10 - add r12, r12, #16 - eor r7, r7, r11 -.Lchacha_blocks_neon_nomessage12: - UNALIGNED_STMIA8(r14, r0, r1, r2, r3, r4, r5, r6, r7) - tst r12, r12 - beq .Lchacha_blocks_neon_nomessage13 - vld1.32 {q12,q13}, [r12]! - vld1.32 {q14,q15}, [r12]! - veor q0, q0, q12 - veor q1, q1, q13 - veor q2, q2, q14 - veor q3, q3, q15 -.Lchacha_blocks_neon_nomessage13: - vst1.32 {q0,q1}, [r14]! - vst1.32 {q2,q3}, [r14]! - beq .Lchacha_blocks_neon_nomessage14 - vld1.32 {q12,q13}, [r12]! - vld1.32 {q14,q15}, [r12]! - veor q4, q4, q12 - veor q5, q5, q13 - veor q6, q6, q14 - veor q7, q7, q15 -.Lchacha_blocks_neon_nomessage14: - vst1.32 {q4,q5}, [r14]! - vst1.32 {q6,q7}, [r14]! - beq .Lchacha_blocks_neon_nomessage15 - vld1.32 {q12,q13}, [r12]! - vld1.32 {q14,q15}, [r12]! - veor q8, q8, q12 - veor q9, q9, q13 - veor q10, q10, q14 - veor q11, q11, q15 -.Lchacha_blocks_neon_nomessage15: - vst1.32 {q8,q9}, [r14]! - vst1.32 {q10,q11}, [r14]! 
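
The routine being deleted around this point handled arbitrary byte counts itself (note the .Lchacha_blocks_neon_copyinput loops that stage partial blocks through a stack buffer), whereas every replacement kernel in this patch takes nblks as a strict multiple of its vector width, 4 or 8 blocks, and leaves the tail to the C glue in chacha20.c, which this excerpt does not show. A hypothetical sketch of that caller-side split, assuming the kernel updates the counter words in the state itself; the names and exact signature are mine, not from the patch:

#include <stddef.h>
#include <stdint.h>

typedef void (*chacha20_blocks_fn)(uint32_t *state, uint8_t *dst,
                                   const uint8_t *src, size_t nblks);

/* Run the SIMD kernel on the largest multiple-of-`way` prefix and
 * report how many blocks remain for the generic implementation. */
static size_t run_simd_prefix(chacha20_blocks_fn fn, size_t way,
                              uint32_t *state, uint8_t *dst,
                              const uint8_t *src, size_t nblks)
{
  size_t n = nblks - (nblks % way);
  if (n > 0)
    fn(state, dst, src, n);   /* advances state[12]/state[13] by n */
  return nblks - n;
}
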
- str r12, [sp, #48] - str r14, [sp, #40] - ldr r3, [sp, #52] - sub r3, r3, #256 - cmp r3, #256 - str r3, [sp, #52] - bhs .Lchacha_blocks_neon_mainloop1 - tst r3, r3 - beq .Lchacha_blocks_neon_done -.Lchacha_blocks_neon_mainloop2: - ldr r3, [sp, #52] - ldr r1, [sp, #48] - cmp r3, #64 - bhs .Lchacha_blocks_neon_noswap1 - add r4, sp, #128 - mov r5, r4 - tst r1, r1 - beq .Lchacha_blocks_neon_nocopy1 -.Lchacha_blocks_neon_copyinput1: - subs r3, r3, #1 - ldrb r0, [r1], #1 - strb r0, [r4], #1 - bne .Lchacha_blocks_neon_copyinput1 - str r5, [sp, #48] -.Lchacha_blocks_neon_nocopy1: - ldr r4, [sp, #40] - str r5, [sp, #40] - str r4, [sp, #56] -.Lchacha_blocks_neon_noswap1: - ldr r0, [sp, #44] - str r0, [sp, #0] - add r0, sp, #64 - ldm r0, {r0-r12} - ldr r14, [sp, #(64 +60)] - str r6, [sp, #8] - str r11, [sp, #12] - str r14, [sp, #28] - ldr r11, [sp, #(64 +52)] - ldr r14, [sp, #(64 +56)] -.Lchacha_blocks_neon_rounds2: - ldr r6, [sp, #0] - add r0, r0, r4 - add r1, r1, r5 - eor r12, r12, r0 - eor r11, r11, r1 - ror r12, r12, #16 - ror r11, r11, #16 - subs r6, r6, #2 - add r8, r8, r12 - add r9, r9, r11 - eor r4, r4, r8 - eor r5, r5, r9 - str r6, [sp, #0] - ror r4, r4, #20 - ror r5, r5, #20 - add r0, r0, r4 - add r1, r1, r5 - ldr r6, [sp, #8] - eor r12, r12, r0 - eor r11, r11, r1 - ror r12, r12, #24 - ror r11, r11, #24 - add r8, r8, r12 - add r9, r9, r11 - eor r4, r4, r8 - eor r5, r5, r9 - str r11, [sp, #20] - ror r4, r4, #25 - ror r5, r5, #25 - str r4, [sp, #4] - ldr r4, [sp, #28] - add r2, r2, r6 - add r3, r3, r7 - ldr r11, [sp, #12] - eor r14, r14, r2 - eor r4, r4, r3 - ror r14, r14, #16 - ror r4, r4, #16 - add r10, r10, r14 - add r11, r11, r4 - eor r6, r6, r10 - eor r7, r7, r11 - ror r6, r6, #20 - ror r7, r7, #20 - add r2, r2, r6 - add r3, r3, r7 - eor r14, r14, r2 - eor r4, r4, r3 - ror r14, r14, #24 - ror r4, r4, #24 - add r10, r10, r14 - add r11, r11, r4 - eor r6, r6, r10 - eor r7, r7, r11 - ror r6, r6, #25 - ror r7, r7, #25 - add r0, r0, r5 - add r1, r1, r6 - eor r4, r4, r0 - eor r12, r12, r1 - ror r4, r4, #16 - ror r12, r12, #16 - add r10, r10, r4 - add r11, r11, r12 - eor r5, r5, r10 - eor r6, r6, r11 - ror r5, r5, #20 - ror r6, r6, #20 - add r0, r0, r5 - add r1, r1, r6 - eor r4, r4, r0 - eor r12, r12, r1 - ror r4, r4, #24 - ror r12, r12, #24 - add r10, r10, r4 - add r11, r11, r12 - eor r5, r5, r10 - eor r6, r6, r11 - str r11, [sp, #12] - ror r5, r5, #25 - ror r6, r6, #25 - str r4, [sp, #28] - ldr r4, [sp, #4] - add r2, r2, r7 - add r3, r3, r4 - ldr r11, [sp, #20] - eor r11, r11, r2 - eor r14, r14, r3 - ror r11, r11, #16 - ror r14, r14, #16 - add r8, r8, r11 - add r9, r9, r14 - eor r7, r7, r8 - eor r4, r4, r9 - ror r7, r7, #20 - ror r4, r4, #20 - str r6, [sp, #8] - add r2, r2, r7 - add r3, r3, r4 - eor r11, r11, r2 - eor r14, r14, r3 - ror r11, r11, #24 - ror r14, r14, #24 - add r8, r8, r11 - add r9, r9, r14 - eor r7, r7, r8 - eor r4, r4, r9 - ror r7, r7, #25 - ror r4, r4, #25 - bne .Lchacha_blocks_neon_rounds2 - str r8, [sp, #0] - str r9, [sp, #4] - str r10, [sp, #8] - str r12, [sp, #16] - str r11, [sp, #20] - str r14, [sp, #24] - ldr r12, [sp, #48] - ldr r14, [sp, #40] - ldr r8, [sp, #(64 +0)] - ldr r9, [sp, #(64 +4)] - ldr r10, [sp, #(64 +8)] - ldr r11, [sp, #(64 +12)] - add r0, r0, r8 - add r1, r1, r9 - add r2, r2, r10 - ldr r8, [sp, #(64 +16)] - add r3, r3, r11 - ldr r9, [sp, #(64 +20)] - add r4, r4, r8 - ldr r10, [sp, #(64 +24)] - add r5, r5, r9 - ldr r11, [sp, #(64 +28)] - add r6, r6, r10 - tst r12, r12 - add r7, r7, r11 - beq .Lchacha_blocks_neon_nomessage21 - UNALIGNED_LDMIA4(r12, r8, 
r9, r10, r11) - tst r12, r12 - eor r0, r0, r8 - eor r1, r1, r9 - eor r2, r2, r10 - ldr r8, [r12, #0] - eor r3, r3, r11 - ldr r9, [r12, #4] - eor r4, r4, r8 - ldr r10, [r12, #8] - eor r5, r5, r9 - ldr r11, [r12, #12] - eor r6, r6, r10 - add r12, r12, #16 - eor r7, r7, r11 -.Lchacha_blocks_neon_nomessage21: - UNALIGNED_STMIA8(r14, r0, r1, r2, r3, r4, r5, r6, r7) - ldm sp, {r0-r7} - ldr r8, [sp, #(64 +32)] - ldr r9, [sp, #(64 +36)] - ldr r10, [sp, #(64 +40)] - ldr r11, [sp, #(64 +44)] - add r0, r0, r8 - add r1, r1, r9 - add r2, r2, r10 - ldr r8, [sp, #(64 +48)] - add r3, r3, r11 - ldr r9, [sp, #(64 +52)] - add r4, r4, r8 - ldr r10, [sp, #(64 +56)] - add r5, r5, r9 - ldr r11, [sp, #(64 +60)] - add r6, r6, r10 - adds r8, r8, #1 - add r7, r7, r11 - adc r9, r9, #0 - str r8, [sp, #(64 +48)] - tst r12, r12 - str r9, [sp, #(64 +52)] - beq .Lchacha_blocks_neon_nomessage22 - UNALIGNED_LDMIA4(r12, r8, r9, r10, r11) - tst r12, r12 - eor r0, r0, r8 - eor r1, r1, r9 - eor r2, r2, r10 - ldr r8, [r12, #0] - eor r3, r3, r11 - ldr r9, [r12, #4] - eor r4, r4, r8 - ldr r10, [r12, #8] - eor r5, r5, r9 - ldr r11, [r12, #12] - eor r6, r6, r10 - add r12, r12, #16 - eor r7, r7, r11 -.Lchacha_blocks_neon_nomessage22: - UNALIGNED_STMIA8(r14, r0, r1, r2, r3, r4, r5, r6, r7) - str r12, [sp, #48] - str r14, [sp, #40] - ldr r3, [sp, #52] - cmp r3, #64 - sub r4, r3, #64 - str r4, [sp, #52] - bhi .Lchacha_blocks_neon_mainloop2 - cmp r3, #64 - beq .Lchacha_blocks_neon_nocopy2 - ldr r1, [sp, #56] - sub r14, r14, #64 -.Lchacha_blocks_neon_copyinput2: - subs r3, r3, #1 - ldrb r0, [r14], #1 - strb r0, [r1], #1 - bne .Lchacha_blocks_neon_copyinput2 -.Lchacha_blocks_neon_nocopy2: -.Lchacha_blocks_neon_done: - ldr r7, [sp, #60] - ldr r8, [sp, #(64 +48)] - ldr r9, [sp, #(64 +52)] - str r8, [r7, #(48 + 0)] - str r9, [r7, #(48 + 4)] +#ifdef __PIC__ +# define GET_DATA_POINTER(reg, name, rtmp) \ + ldr reg, 1f; \ + ldr rtmp, 2f; \ + b 3f; \ + 1: .word _GLOBAL_OFFSET_TABLE_-(3f+8); \ + 2: .word name(GOT); \ + 3: add reg, pc, reg; \ + ldr reg, [reg, rtmp]; +#else +# define GET_DATA_POINTER(reg, name, rtmp) ldr reg, =name +#endif + +/* register macros */ +#define INPUT r0 +#define DST r1 +#define SRC r2 +#define NBLKS r3 +#define ROUND r4 + +/* stack structure */ +#define STACK_VEC_X12 (16) +#define STACK_VEC_X13 (STACK_VEC_X12 + 16) +#define STACK_TMP (STACK_VEC_X13 + 16) +#define STACK_TMP1 (16 + STACK_TMP) +#define STACK_TMP2 (16 + STACK_TMP1) + +#define STACK_MAX (16 + STACK_TMP2) + +/* vector registers */ +#define X0 q0 +#define X1 q1 +#define X2 q2 +#define X3 q3 +#define X4 q4 +#define X5 q5 +#define X6 q6 +#define X7 q7 +#define X8 q8 +#define X9 q9 +#define X10 q10 +#define X11 q11 +#define X12 q12 +#define X13 q13 +#define X14 q14 +#define X15 q15 + +#define X0l d0 +#define X1l d2 +#define X2l d4 +#define X3l d6 +#define X4l d8 +#define X5l d10 +#define X6l d12 +#define X7l d14 +#define X8l d16 +#define X9l d18 +#define X10l d20 +#define X11l d22 +#define X12l d24 +#define X13l d26 +#define X14l d28 +#define X15l d30 + +#define X0h d1 +#define X1h d3 +#define X2h d5 +#define X3h d7 +#define X4h d9 +#define X5h d11 +#define X6h d13 +#define X7h d15 +#define X8h d17 +#define X9h d19 +#define X10h d21 +#define X11h d23 +#define X12h d25 +#define X13h d27 +#define X14h d29 +#define X15h d31 + +/********************************************************************** + helper macros + **********************************************************************/ + +/* 4x4 32-bit integer matrix transpose */ +#define transpose_4x4_part1(_q0, _q1, 
_q2, _q3) \ + vtrn.32 _q0, _q1; \ + vtrn.32 _q2, _q3; +#define transpose_4x4_part2(_q0, _q1, _q2, _q3) \ + vswp _q0##h, _q2##l; \ + vswp _q1##h, _q3##l; + +#define clear(x) veor x,x,x; + +/********************************************************************** + 4-way chacha20 + **********************************************************************/ + +#define ROTATE2(dst1,dst2,c,src1,src2) \ + vshl.u32 dst1, src1, #(c); \ + vshl.u32 dst2, src2, #(c); \ + vsri.u32 dst1, src1, #(32 - (c)); \ + vsri.u32 dst2, src2, #(32 - (c)); + +#define ROTATE2_16(dst1,dst2,src1,src2) \ + vrev32.16 dst1, src1; \ + vrev32.16 dst2, src2; + +#define XOR(d,s1,s2) \ + veor d, s2, s1; + +#define PLUS(ds,s) \ + vadd.u32 ds, ds, s; + +#define QUARTERROUND2(a1,b1,c1,d1,a2,b2,c2,d2,ign,tmp1,tmp2) \ + PLUS(a1,b1); PLUS(a2,b2); XOR(tmp1,d1,a1); XOR(tmp2,d2,a2); \ + ROTATE2_16(d1, d2, tmp1, tmp2); \ + PLUS(c1,d1); PLUS(c2,d2); XOR(tmp1,b1,c1); XOR(tmp2,b2,c2); \ + ROTATE2(b1, b2, 12, tmp1, tmp2); \ + PLUS(a1,b1); PLUS(a2,b2); XOR(tmp1,d1,a1); XOR(tmp2,d2,a2); \ + ROTATE2(d1, d2, 8, tmp1, tmp2); \ + PLUS(c1,d1); PLUS(c2,d2); XOR(tmp1,b1,c1); XOR(tmp2,b2,c2); \ + ROTATE2(b1, b2, 7, tmp1, tmp2); + +chacha20_data: +.align 4 +.Linc_counter: + .long 0,1,2,3 + +.align 3 +.globl _gcry_chacha20_armv7_neon_blocks4 +.type _gcry_chacha20_armv7_neon_blocks4,%function; + +_gcry_chacha20_armv7_neon_blocks4: + /* input: + * r0: input + * r1: dst + * r2: src + * r3: nblks (multiple of 4) + */ + + vpush {q4-q7}; + push {r4-r12,lr}; + mov r12, sp - stmia r12!, {r0-r7} - add r12, r12, #48 - stmia r12!, {r0-r7} - sub r0, sp, #8 - ldr sp, [sp, #192] - ldmfd sp!, {r4-r12, r14} - vldm sp!, {q4-q7} - sub r0, sp, r0 - bx lr -.Lchacha_blocks_neon_nobytes: - mov r0, #0; + + mov r6, sp; + sub r6, r6, #(STACK_MAX); + and r6, r6, #(~15); + mov sp, r6; + GET_DATA_POINTER(r9, .Linc_counter, lr); + add lr, INPUT, #(12*4); + add r8, sp, #STACK_VEC_X12; + +.Loop4: + mov ROUND, #20; + + /* Construct counter vectors X12 and X13 */ + + vld1.8 {X15}, [lr]; + mov lr, INPUT; + vld1.8 {X8}, [r9]; + vdup.32 X12, X15l[0]; + vdup.32 X13, X15l[1]; + vld1.8 {X3}, [lr]!; + vadd.u32 X12, X12, X8; + vdup.32 X0, X3l[0]; + vdup.32 X1, X3l[1]; + vdup.32 X2, X3h[0]; + vcgt.u32 X8, X8, X12; + vdup.32 X3, X3h[1]; + vdup.32 X14, X15h[0]; + vdup.32 X15, X15h[1]; + vsub.u32 X13, X13, X8; + vld1.8 {X7}, [lr]!; + vld1.8 {X11}, [lr]; + vst1.8 {X12, X13}, [r8]; + vdup.32 X4, X7l[0]; + vdup.32 X5, X7l[1]; + vdup.32 X6, X7h[0]; + vdup.32 X7, X7h[1]; + vdup.32 X8, X11l[0]; + vdup.32 X9, X11l[1]; + vdup.32 X10, X11h[0]; + vdup.32 X11, X11h[1]; + + add r7, sp, #STACK_TMP2; + add r6, sp, #STACK_TMP1; + add r5, sp, #STACK_TMP; + vst1.8 {X15}, [r6]; + vst1.8 {X11}, [r5]; + + mov lr, INPUT; +.Lround2: + subs ROUND, ROUND, #2 + QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15) + vld1.8 {X11}, [r5]; + vld1.8 {X15}, [r6]; + vst1.8 {X8}, [r5]; + vst1.8 {X9}, [r6]; + QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9) + QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9) + vld1.8 {X8}, [r5]; + vld1.8 {X9}, [r6]; + vst1.8 {X11}, [r5]; + vst1.8 {X15}, [r6]; + QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15) + bne .Lround2; + + vld1.8 {X11}, [lr]!; + vst1.8 {X14}, [r7]; + + vdup.32 X14, X11l[0]; /* INPUT + 0 * 4 */ + vdup.32 X15, X11l[1]; /* INPUT + 1 * 4 */ + PLUS(X0, X14); + PLUS(X1, X15); + vdup.32 X14, X11h[0]; /* INPUT + 2 * 4 */ + vdup.32 X15, X11h[1]; /* INPUT + 3 * 4 */ + PLUS(X2, X14); + PLUS(X3, X15); + + vld1.8 {X11}, [r5]; + vld1.8 {X15}, 
[r6]; + vst1.8 {X0}, [r5]; + vld1.8 {X0}, [lr]!; + vst1.8 {X1}, [r6]; + + vdup.32 X14, X0l[0]; /* INPUT + 4 * 4 */ + vdup.32 X1, X0l[1]; /* INPUT + 5 * 4 */ + PLUS(X4, X14); + PLUS(X5, X1); + vdup.32 X14, X0h[0]; /* INPUT + 6 * 4 */ + vdup.32 X1, X0h[1]; /* INPUT + 7 * 4 */ + PLUS(X6, X14); + PLUS(X7, X1); + + vld1.8 {X0}, [lr]!; + + vdup.32 X14, X0l[0]; /* INPUT + 8 * 4 */ + vdup.32 X1, X0l[1]; /* INPUT + 9 * 4 */ + PLUS(X8, X14); + PLUS(X9, X1); + vdup.32 X14, X0h[0]; /* INPUT + 10 * 4 */ + vdup.32 X1, X0h[1]; /* INPUT + 11 * 4 */ + PLUS(X10, X14); + PLUS(X11, X1); + + vld1.8 {X0}, [lr]; + add lr, INPUT, #(12*4) + vld1.8 {X14}, [r7]; + + vdup.32 X1, X0h[0]; /* INPUT + 10 * 4 */ + ldm lr, {r10, r11}; /* Update counter */ + vdup.32 X0, X0h[1]; /* INPUT + 11 * 4 */ + PLUS(X14, X1); + PLUS(X15, X0); + adds r10, r10, #4; /* Update counter */ + vld1.8 {X0, X1}, [r8]; + + PLUS(X12, X0); + vld1.8 {X0}, [r5]; + PLUS(X13, X1); + adc r11, r11, #0; /* Update counter */ + + vld1.8 {X1}, [r6]; + stm lr, {r10, r11}; /* Update counter */ + transpose_4x4_part1(X0, X1, X2, X3); + transpose_4x4_part1(X4, X5, X6, X7); + transpose_4x4_part1(X8, X9, X10, X11); + transpose_4x4_part1(X12, X13, X14, X15); + transpose_4x4_part2(X0, X1, X2, X3); + transpose_4x4_part2(X4, X5, X6, X7); + transpose_4x4_part2(X8, X9, X10, X11); + transpose_4x4_part2(X12, X13, X14, X15); + + subs NBLKS, NBLKS, #4; + + vst1.8 {X10}, [r5]; + add lr, INPUT, #(12*4) + vst1.8 {X11}, [r6]; + vld1.8 {X10, X11}, [SRC]!; + veor X10, X0, X10; + vld1.8 {X0}, [SRC]!; + veor X11, X4, X11; + vld1.8 {X4}, [SRC]!; + vst1.8 {X10, X11}, [DST]!; + vld1.8 {X10, X11}, [SRC]!; + veor X0, X8, X0; + veor X4, X12, X4; + veor X10, X1, X10; + veor X11, X5, X11; + vst1.8 {X0}, [DST]!; + vld1.8 {X0, X1}, [SRC]!; + vst1.8 {X4}, [DST]!; + vld1.8 {X4, X5}, [SRC]!; + vst1.8 {X10, X11}, [DST]!; + vld1.8 {X10}, [r5]; + vld1.8 {X11}, [r6]; + veor X0, X9, X0; + vld1.8 {X8, X9}, [SRC]!; + veor X1, X13, X1; + vld1.8 {X12, X13}, [SRC]!; + veor X4, X2, X4; + veor X5, X6, X5; + vst1.8 {X0, X1}, [DST]!; + vld1.8 {X0, X1}, [SRC]!; + vst1.8 {X4, X5}, [DST]!; + veor X8, X10, X8; + veor X9, X14, X9; + veor X12, X3, X12; + veor X13, X7, X13; + veor X0, X11, X0; + veor X1, X15, X1; + vst1.8 {X8, X9}, [DST]!; + vst1.8 {X12, X13}, [DST]!; + vst1.8 {X0, X1}, [DST]!; + + bne .Loop4; + + /* clear the used vector registers and stack */ + clear(X0); + vst1.8 {X0}, [r5]; + vst1.8 {X0}, [r6]; + vst1.8 {X0}, [r7]; + vst1.8 {X0}, [r8]!; + vst1.8 {X0}, [r8]; + + mov sp, r12 + clear(X1); + clear(X2); + clear(X3); + clear(X4); + clear(X5); + clear(X6); + clear(X7); + clear(X8); + clear(X9); + clear(X10); + clear(X11); + clear(X12); + clear(X13); + clear(X14); + clear(X15); + + pop {r4-r12,lr} + vpop {q4-q7} + eor r0, r0, r0 bx lr -.ltorg -.size _gcry_chacha20_armv7_neon_blocks,.-_gcry_chacha20_armv7_neon_blocks; +.size _gcry_chacha20_armv7_neon_blocks4, .-_gcry_chacha20_armv7_neon_blocks4; #endif diff --git a/cipher/chacha20-avx2-amd64.S b/cipher/chacha20-avx2-amd64.S deleted file mode 100644 index 8c085bad..00000000 --- a/cipher/chacha20-avx2-amd64.S +++ /dev/null @@ -1,956 +0,0 @@ -/* chacha20-avx2-amd64.S - AMD64/AVX2 implementation of ChaCha20 - * - * Copyright (C) 2014 Jussi Kivilinna - * - * This file is part of Libgcrypt. - * - * Libgcrypt is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as - * published by the Free Software Foundation; either version 2.1 of - * the License, or (at your option) any later version. 
- * - * Libgcrypt is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this program; if not, see . - */ - -/* - * Based on public domain implementation by Andrew Moon at - * https://github.com/floodyberry/chacha-opt - */ - -#ifdef __x86_64__ -#include - -#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ - defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ - defined(ENABLE_AVX2_SUPPORT) && USE_CHACHA20 - -#ifdef __PIC__ -# define RIP (%rip) -#else -# define RIP -#endif - -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) /*_*/ -#endif - -.text - -.align 8 -.globl _gcry_chacha20_amd64_avx2_blocks -ELF(.type _gcry_chacha20_amd64_avx2_blocks,@function;) -_gcry_chacha20_amd64_avx2_blocks: -.Lchacha_blocks_avx2_local: - vzeroupper - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - movq %rsp, %rbp - andq $~63, %rsp - subq $512, %rsp - leaq .LC RIP, %rax - vmovdqu 0(%rax), %xmm6 - vmovdqu 16(%rax), %xmm7 - vmovdqu 0(%rdi), %xmm8 - vmovdqu 16(%rdi), %xmm9 - vmovdqu 32(%rdi), %xmm10 - vmovdqu 48(%rdi), %xmm11 - movl $20, %eax - movq $1, %r9 - vmovdqa %xmm8, 0(%rsp) - vmovdqa %xmm9, 16(%rsp) - vmovdqa %xmm10, 32(%rsp) - vmovdqa %xmm11, 48(%rsp) - movq %rax, 64(%rsp) - vmovdqa %xmm6, 448(%rsp) - vmovdqa %xmm6, 464(%rsp) - vmovdqa %xmm7, 480(%rsp) - vmovdqa %xmm7, 496(%rsp) - cmpq $512, %rcx - jae .Lchacha_blocks_avx2_atleast512 - cmp $256, %rcx - jae .Lchacha_blocks_avx2_atleast256 - jmp .Lchacha_blocks_avx2_below256 - .p2align 6,,63 -.Lchacha_blocks_avx2_atleast512: - movq 48(%rsp), %rax - leaq 1(%rax), %r8 - leaq 2(%rax), %r9 - leaq 3(%rax), %r10 - leaq 4(%rax), %rbx - leaq 5(%rax), %r11 - leaq 6(%rax), %r12 - leaq 7(%rax), %r13 - leaq 8(%rax), %r14 - movl %eax, 128(%rsp) - movl %r8d, 4+128(%rsp) - movl %r9d, 8+128(%rsp) - movl %r10d, 12+128(%rsp) - movl %ebx, 16+128(%rsp) - movl %r11d, 20+128(%rsp) - movl %r12d, 24+128(%rsp) - movl %r13d, 28+128(%rsp) - shrq $32, %rax - shrq $32, %r8 - shrq $32, %r9 - shrq $32, %r10 - shrq $32, %rbx - shrq $32, %r11 - shrq $32, %r12 - shrq $32, %r13 - movl %eax, 160(%rsp) - movl %r8d, 4+160(%rsp) - movl %r9d, 8+160(%rsp) - movl %r10d, 12+160(%rsp) - movl %ebx, 16+160(%rsp) - movl %r11d, 20+160(%rsp) - movl %r12d, 24+160(%rsp) - movl %r13d, 28+160(%rsp) - movq %r14, 48(%rsp) - movq 64(%rsp), %rax - vpbroadcastd 0(%rsp), %ymm0 - vpbroadcastd 4+0(%rsp), %ymm1 - vpbroadcastd 8+0(%rsp), %ymm2 - vpbroadcastd 12+0(%rsp), %ymm3 - vpbroadcastd 16(%rsp), %ymm4 - vpbroadcastd 4+16(%rsp), %ymm5 - vpbroadcastd 8+16(%rsp), %ymm6 - vpbroadcastd 12+16(%rsp), %ymm7 - vpbroadcastd 32(%rsp), %ymm8 - vpbroadcastd 4+32(%rsp), %ymm9 - vpbroadcastd 8+32(%rsp), %ymm10 - vpbroadcastd 12+32(%rsp), %ymm11 - vpbroadcastd 8+48(%rsp), %ymm14 - vpbroadcastd 12+48(%rsp), %ymm15 - vmovdqa 128(%rsp), %ymm12 - vmovdqa 160(%rsp), %ymm13 -.Lchacha_blocks_avx2_mainloop1: - vpaddd %ymm0, %ymm4, %ymm0 - vpaddd %ymm1, %ymm5, %ymm1 - vpxor %ymm12, %ymm0, %ymm12 - vpxor %ymm13, %ymm1, %ymm13 - vpaddd %ymm2, %ymm6, %ymm2 - vpaddd %ymm3, %ymm7, %ymm3 - vpxor %ymm14, %ymm2, %ymm14 - vpxor %ymm15, %ymm3, %ymm15 - vpshufb 448(%rsp), %ymm12, %ymm12 - vpshufb 448(%rsp), %ymm13, %ymm13 - vpaddd %ymm8, %ymm12, %ymm8 - vpaddd %ymm9, %ymm13, %ymm9 - vpshufb 
448(%rsp), %ymm14, %ymm14 - vpshufb 448(%rsp), %ymm15, %ymm15 - vpaddd %ymm10, %ymm14, %ymm10 - vpaddd %ymm11, %ymm15, %ymm11 - vmovdqa %ymm12, 96(%rsp) - vpxor %ymm4, %ymm8, %ymm4 - vpxor %ymm5, %ymm9, %ymm5 - vpslld $ 12, %ymm4, %ymm12 - vpsrld $20, %ymm4, %ymm4 - vpxor %ymm4, %ymm12, %ymm4 - vpslld $ 12, %ymm5, %ymm12 - vpsrld $20, %ymm5, %ymm5 - vpxor %ymm5, %ymm12, %ymm5 - vpxor %ymm6, %ymm10, %ymm6 - vpxor %ymm7, %ymm11, %ymm7 - vpslld $ 12, %ymm6, %ymm12 - vpsrld $20, %ymm6, %ymm6 - vpxor %ymm6, %ymm12, %ymm6 - vpslld $ 12, %ymm7, %ymm12 - vpsrld $20, %ymm7, %ymm7 - vpxor %ymm7, %ymm12, %ymm7 - vpaddd %ymm0, %ymm4, %ymm0 - vpaddd %ymm1, %ymm5, %ymm1 - vpxor 96(%rsp), %ymm0, %ymm12 - vpxor %ymm13, %ymm1, %ymm13 - vpaddd %ymm2, %ymm6, %ymm2 - vpaddd %ymm3, %ymm7, %ymm3 - vpxor %ymm14, %ymm2, %ymm14 - vpxor %ymm15, %ymm3, %ymm15 - vpshufb 480(%rsp), %ymm12, %ymm12 - vpshufb 480(%rsp), %ymm13, %ymm13 - vpaddd %ymm8, %ymm12, %ymm8 - vpaddd %ymm9, %ymm13, %ymm9 - vpshufb 480(%rsp), %ymm14, %ymm14 - vpshufb 480(%rsp), %ymm15, %ymm15 - vpaddd %ymm10, %ymm14, %ymm10 - vpaddd %ymm11, %ymm15, %ymm11 - vmovdqa %ymm12, 96(%rsp) - vpxor %ymm4, %ymm8, %ymm4 - vpxor %ymm5, %ymm9, %ymm5 - vpslld $ 7, %ymm4, %ymm12 - vpsrld $25, %ymm4, %ymm4 - vpxor %ymm4, %ymm12, %ymm4 - vpslld $ 7, %ymm5, %ymm12 - vpsrld $25, %ymm5, %ymm5 - vpxor %ymm5, %ymm12, %ymm5 - vpxor %ymm6, %ymm10, %ymm6 - vpxor %ymm7, %ymm11, %ymm7 - vpslld $ 7, %ymm6, %ymm12 - vpsrld $25, %ymm6, %ymm6 - vpxor %ymm6, %ymm12, %ymm6 - vpslld $ 7, %ymm7, %ymm12 - vpsrld $25, %ymm7, %ymm7 - vpxor %ymm7, %ymm12, %ymm7 - vpaddd %ymm0, %ymm5, %ymm0 - vpaddd %ymm1, %ymm6, %ymm1 - vpxor %ymm15, %ymm0, %ymm15 - vpxor 96(%rsp), %ymm1, %ymm12 - vpaddd %ymm2, %ymm7, %ymm2 - vpaddd %ymm3, %ymm4, %ymm3 - vpxor %ymm13, %ymm2, %ymm13 - vpxor %ymm14, %ymm3, %ymm14 - vpshufb 448(%rsp), %ymm15, %ymm15 - vpshufb 448(%rsp), %ymm12, %ymm12 - vpaddd %ymm10, %ymm15, %ymm10 - vpaddd %ymm11, %ymm12, %ymm11 - vpshufb 448(%rsp), %ymm13, %ymm13 - vpshufb 448(%rsp), %ymm14, %ymm14 - vpaddd %ymm8, %ymm13, %ymm8 - vpaddd %ymm9, %ymm14, %ymm9 - vmovdqa %ymm15, 96(%rsp) - vpxor %ymm5, %ymm10, %ymm5 - vpxor %ymm6, %ymm11, %ymm6 - vpslld $ 12, %ymm5, %ymm15 - vpsrld $20, %ymm5, %ymm5 - vpxor %ymm5, %ymm15, %ymm5 - vpslld $ 12, %ymm6, %ymm15 - vpsrld $20, %ymm6, %ymm6 - vpxor %ymm6, %ymm15, %ymm6 - vpxor %ymm7, %ymm8, %ymm7 - vpxor %ymm4, %ymm9, %ymm4 - vpslld $ 12, %ymm7, %ymm15 - vpsrld $20, %ymm7, %ymm7 - vpxor %ymm7, %ymm15, %ymm7 - vpslld $ 12, %ymm4, %ymm15 - vpsrld $20, %ymm4, %ymm4 - vpxor %ymm4, %ymm15, %ymm4 - vpaddd %ymm0, %ymm5, %ymm0 - vpaddd %ymm1, %ymm6, %ymm1 - vpxor 96(%rsp), %ymm0, %ymm15 - vpxor %ymm12, %ymm1, %ymm12 - vpaddd %ymm2, %ymm7, %ymm2 - vpaddd %ymm3, %ymm4, %ymm3 - vpxor %ymm13, %ymm2, %ymm13 - vpxor %ymm14, %ymm3, %ymm14 - vpshufb 480(%rsp), %ymm15, %ymm15 - vpshufb 480(%rsp), %ymm12, %ymm12 - vpaddd %ymm10, %ymm15, %ymm10 - vpaddd %ymm11, %ymm12, %ymm11 - vpshufb 480(%rsp), %ymm13, %ymm13 - vpshufb 480(%rsp), %ymm14, %ymm14 - vpaddd %ymm8, %ymm13, %ymm8 - vpaddd %ymm9, %ymm14, %ymm9 - vmovdqa %ymm15, 96(%rsp) - vpxor %ymm5, %ymm10, %ymm5 - vpxor %ymm6, %ymm11, %ymm6 - vpslld $ 7, %ymm5, %ymm15 - vpsrld $25, %ymm5, %ymm5 - vpxor %ymm5, %ymm15, %ymm5 - vpslld $ 7, %ymm6, %ymm15 - vpsrld $25, %ymm6, %ymm6 - vpxor %ymm6, %ymm15, %ymm6 - vpxor %ymm7, %ymm8, %ymm7 - vpxor %ymm4, %ymm9, %ymm4 - vpslld $ 7, %ymm7, %ymm15 - vpsrld $25, %ymm7, %ymm7 - vpxor %ymm7, %ymm15, %ymm7 - vpslld $ 7, %ymm4, %ymm15 - vpsrld $25, %ymm4, %ymm4 - vpxor %ymm4, %ymm15, 
%ymm4 - vmovdqa 96(%rsp), %ymm15 - subq $2, %rax - jnz .Lchacha_blocks_avx2_mainloop1 - vmovdqa %ymm8, 192(%rsp) - vmovdqa %ymm9, 224(%rsp) - vmovdqa %ymm10, 256(%rsp) - vmovdqa %ymm11, 288(%rsp) - vmovdqa %ymm12, 320(%rsp) - vmovdqa %ymm13, 352(%rsp) - vmovdqa %ymm14, 384(%rsp) - vmovdqa %ymm15, 416(%rsp) - vpbroadcastd 0(%rsp), %ymm8 - vpbroadcastd 4+0(%rsp), %ymm9 - vpbroadcastd 8+0(%rsp), %ymm10 - vpbroadcastd 12+0(%rsp), %ymm11 - vpbroadcastd 16(%rsp), %ymm12 - vpbroadcastd 4+16(%rsp), %ymm13 - vpbroadcastd 8+16(%rsp), %ymm14 - vpbroadcastd 12+16(%rsp), %ymm15 - vpaddd %ymm8, %ymm0, %ymm0 - vpaddd %ymm9, %ymm1, %ymm1 - vpaddd %ymm10, %ymm2, %ymm2 - vpaddd %ymm11, %ymm3, %ymm3 - vpaddd %ymm12, %ymm4, %ymm4 - vpaddd %ymm13, %ymm5, %ymm5 - vpaddd %ymm14, %ymm6, %ymm6 - vpaddd %ymm15, %ymm7, %ymm7 - vpunpckldq %ymm1, %ymm0, %ymm8 - vpunpckldq %ymm3, %ymm2, %ymm9 - vpunpckhdq %ymm1, %ymm0, %ymm12 - vpunpckhdq %ymm3, %ymm2, %ymm13 - vpunpckldq %ymm5, %ymm4, %ymm10 - vpunpckldq %ymm7, %ymm6, %ymm11 - vpunpckhdq %ymm5, %ymm4, %ymm14 - vpunpckhdq %ymm7, %ymm6, %ymm15 - vpunpcklqdq %ymm9, %ymm8, %ymm0 - vpunpcklqdq %ymm11, %ymm10, %ymm1 - vpunpckhqdq %ymm9, %ymm8, %ymm2 - vpunpckhqdq %ymm11, %ymm10, %ymm3 - vpunpcklqdq %ymm13, %ymm12, %ymm4 - vpunpcklqdq %ymm15, %ymm14, %ymm5 - vpunpckhqdq %ymm13, %ymm12, %ymm6 - vpunpckhqdq %ymm15, %ymm14, %ymm7 - vperm2i128 $0x20, %ymm1, %ymm0, %ymm8 - vperm2i128 $0x20, %ymm3, %ymm2, %ymm9 - vperm2i128 $0x31, %ymm1, %ymm0, %ymm12 - vperm2i128 $0x31, %ymm3, %ymm2, %ymm13 - vperm2i128 $0x20, %ymm5, %ymm4, %ymm10 - vperm2i128 $0x20, %ymm7, %ymm6, %ymm11 - vperm2i128 $0x31, %ymm5, %ymm4, %ymm14 - vperm2i128 $0x31, %ymm7, %ymm6, %ymm15 - andq %rsi, %rsi - jz .Lchacha_blocks_avx2_noinput1 - vpxor 0(%rsi), %ymm8, %ymm8 - vpxor 64(%rsi), %ymm9, %ymm9 - vpxor 128(%rsi), %ymm10, %ymm10 - vpxor 192(%rsi), %ymm11, %ymm11 - vpxor 256(%rsi), %ymm12, %ymm12 - vpxor 320(%rsi), %ymm13, %ymm13 - vpxor 384(%rsi), %ymm14, %ymm14 - vpxor 448(%rsi), %ymm15, %ymm15 - vmovdqu %ymm8, 0(%rdx) - vmovdqu %ymm9, 64(%rdx) - vmovdqu %ymm10, 128(%rdx) - vmovdqu %ymm11, 192(%rdx) - vmovdqu %ymm12, 256(%rdx) - vmovdqu %ymm13, 320(%rdx) - vmovdqu %ymm14, 384(%rdx) - vmovdqu %ymm15, 448(%rdx) - vmovdqa 192(%rsp), %ymm0 - vmovdqa 224(%rsp), %ymm1 - vmovdqa 256(%rsp), %ymm2 - vmovdqa 288(%rsp), %ymm3 - vmovdqa 320(%rsp), %ymm4 - vmovdqa 352(%rsp), %ymm5 - vmovdqa 384(%rsp), %ymm6 - vmovdqa 416(%rsp), %ymm7 - vpbroadcastd 32(%rsp), %ymm8 - vpbroadcastd 4+32(%rsp), %ymm9 - vpbroadcastd 8+32(%rsp), %ymm10 - vpbroadcastd 12+32(%rsp), %ymm11 - vmovdqa 128(%rsp), %ymm12 - vmovdqa 160(%rsp), %ymm13 - vpbroadcastd 8+48(%rsp), %ymm14 - vpbroadcastd 12+48(%rsp), %ymm15 - vpaddd %ymm8, %ymm0, %ymm0 - vpaddd %ymm9, %ymm1, %ymm1 - vpaddd %ymm10, %ymm2, %ymm2 - vpaddd %ymm11, %ymm3, %ymm3 - vpaddd %ymm12, %ymm4, %ymm4 - vpaddd %ymm13, %ymm5, %ymm5 - vpaddd %ymm14, %ymm6, %ymm6 - vpaddd %ymm15, %ymm7, %ymm7 - vpunpckldq %ymm1, %ymm0, %ymm8 - vpunpckldq %ymm3, %ymm2, %ymm9 - vpunpckhdq %ymm1, %ymm0, %ymm12 - vpunpckhdq %ymm3, %ymm2, %ymm13 - vpunpckldq %ymm5, %ymm4, %ymm10 - vpunpckldq %ymm7, %ymm6, %ymm11 - vpunpckhdq %ymm5, %ymm4, %ymm14 - vpunpckhdq %ymm7, %ymm6, %ymm15 - vpunpcklqdq %ymm9, %ymm8, %ymm0 - vpunpcklqdq %ymm11, %ymm10, %ymm1 - vpunpckhqdq %ymm9, %ymm8, %ymm2 - vpunpckhqdq %ymm11, %ymm10, %ymm3 - vpunpcklqdq %ymm13, %ymm12, %ymm4 - vpunpcklqdq %ymm15, %ymm14, %ymm5 - vpunpckhqdq %ymm13, %ymm12, %ymm6 - vpunpckhqdq %ymm15, %ymm14, %ymm7 - vperm2i128 $0x20, %ymm1, %ymm0, %ymm8 - vperm2i128 $0x20, 
%ymm3, %ymm2, %ymm9 - vperm2i128 $0x31, %ymm1, %ymm0, %ymm12 - vperm2i128 $0x31, %ymm3, %ymm2, %ymm13 - vperm2i128 $0x20, %ymm5, %ymm4, %ymm10 - vperm2i128 $0x20, %ymm7, %ymm6, %ymm11 - vperm2i128 $0x31, %ymm5, %ymm4, %ymm14 - vperm2i128 $0x31, %ymm7, %ymm6, %ymm15 - vpxor 32(%rsi), %ymm8, %ymm8 - vpxor 96(%rsi), %ymm9, %ymm9 - vpxor 160(%rsi), %ymm10, %ymm10 - vpxor 224(%rsi), %ymm11, %ymm11 - vpxor 288(%rsi), %ymm12, %ymm12 - vpxor 352(%rsi), %ymm13, %ymm13 - vpxor 416(%rsi), %ymm14, %ymm14 - vpxor 480(%rsi), %ymm15, %ymm15 - vmovdqu %ymm8, 32(%rdx) - vmovdqu %ymm9, 96(%rdx) - vmovdqu %ymm10, 160(%rdx) - vmovdqu %ymm11, 224(%rdx) - vmovdqu %ymm12, 288(%rdx) - vmovdqu %ymm13, 352(%rdx) - vmovdqu %ymm14, 416(%rdx) - vmovdqu %ymm15, 480(%rdx) - addq $512, %rsi - jmp .Lchacha_blocks_avx2_mainloop1_cont -.Lchacha_blocks_avx2_noinput1: - vmovdqu %ymm8, 0(%rdx) - vmovdqu %ymm9, 64(%rdx) - vmovdqu %ymm10, 128(%rdx) - vmovdqu %ymm11, 192(%rdx) - vmovdqu %ymm12, 256(%rdx) - vmovdqu %ymm13, 320(%rdx) - vmovdqu %ymm14, 384(%rdx) - vmovdqu %ymm15, 448(%rdx) - vmovdqa 192(%rsp), %ymm0 - vmovdqa 224(%rsp), %ymm1 - vmovdqa 256(%rsp), %ymm2 - vmovdqa 288(%rsp), %ymm3 - vmovdqa 320(%rsp), %ymm4 - vmovdqa 352(%rsp), %ymm5 - vmovdqa 384(%rsp), %ymm6 - vmovdqa 416(%rsp), %ymm7 - vpbroadcastd 32(%rsp), %ymm8 - vpbroadcastd 4+32(%rsp), %ymm9 - vpbroadcastd 8+32(%rsp), %ymm10 - vpbroadcastd 12+32(%rsp), %ymm11 - vmovdqa 128(%rsp), %ymm12 - vmovdqa 160(%rsp), %ymm13 - vpbroadcastd 8+48(%rsp), %ymm14 - vpbroadcastd 12+48(%rsp), %ymm15 - vpaddd %ymm8, %ymm0, %ymm0 - vpaddd %ymm9, %ymm1, %ymm1 - vpaddd %ymm10, %ymm2, %ymm2 - vpaddd %ymm11, %ymm3, %ymm3 - vpaddd %ymm12, %ymm4, %ymm4 - vpaddd %ymm13, %ymm5, %ymm5 - vpaddd %ymm14, %ymm6, %ymm6 - vpaddd %ymm15, %ymm7, %ymm7 - vpunpckldq %ymm1, %ymm0, %ymm8 - vpunpckldq %ymm3, %ymm2, %ymm9 - vpunpckhdq %ymm1, %ymm0, %ymm12 - vpunpckhdq %ymm3, %ymm2, %ymm13 - vpunpckldq %ymm5, %ymm4, %ymm10 - vpunpckldq %ymm7, %ymm6, %ymm11 - vpunpckhdq %ymm5, %ymm4, %ymm14 - vpunpckhdq %ymm7, %ymm6, %ymm15 - vpunpcklqdq %ymm9, %ymm8, %ymm0 - vpunpcklqdq %ymm11, %ymm10, %ymm1 - vpunpckhqdq %ymm9, %ymm8, %ymm2 - vpunpckhqdq %ymm11, %ymm10, %ymm3 - vpunpcklqdq %ymm13, %ymm12, %ymm4 - vpunpcklqdq %ymm15, %ymm14, %ymm5 - vpunpckhqdq %ymm13, %ymm12, %ymm6 - vpunpckhqdq %ymm15, %ymm14, %ymm7 - vperm2i128 $0x20, %ymm1, %ymm0, %ymm8 - vperm2i128 $0x20, %ymm3, %ymm2, %ymm9 - vperm2i128 $0x31, %ymm1, %ymm0, %ymm12 - vperm2i128 $0x31, %ymm3, %ymm2, %ymm13 - vperm2i128 $0x20, %ymm5, %ymm4, %ymm10 - vperm2i128 $0x20, %ymm7, %ymm6, %ymm11 - vperm2i128 $0x31, %ymm5, %ymm4, %ymm14 - vperm2i128 $0x31, %ymm7, %ymm6, %ymm15 - vmovdqu %ymm8, 32(%rdx) - vmovdqu %ymm9, 96(%rdx) - vmovdqu %ymm10, 160(%rdx) - vmovdqu %ymm11, 224(%rdx) - vmovdqu %ymm12, 288(%rdx) - vmovdqu %ymm13, 352(%rdx) - vmovdqu %ymm14, 416(%rdx) - vmovdqu %ymm15, 480(%rdx) -.Lchacha_blocks_avx2_mainloop1_cont: - addq $512, %rdx - subq $512, %rcx - cmp $512, %rcx - jae .Lchacha_blocks_avx2_atleast512 - cmp $256, %rcx - jb .Lchacha_blocks_avx2_below256_fixup -.Lchacha_blocks_avx2_atleast256: - movq 48(%rsp), %rax - leaq 1(%rax), %r8 - leaq 2(%rax), %r9 - leaq 3(%rax), %r10 - leaq 4(%rax), %rbx - movl %eax, 128(%rsp) - movl %r8d, 4+128(%rsp) - movl %r9d, 8+128(%rsp) - movl %r10d, 12+128(%rsp) - shrq $32, %rax - shrq $32, %r8 - shrq $32, %r9 - shrq $32, %r10 - movl %eax, 160(%rsp) - movl %r8d, 4+160(%rsp) - movl %r9d, 8+160(%rsp) - movl %r10d, 12+160(%rsp) - movq %rbx, 48(%rsp) - movq 64(%rsp), %rax - vpbroadcastd 0(%rsp), %xmm0 - vpbroadcastd 
4+0(%rsp), %xmm1 - vpbroadcastd 8+0(%rsp), %xmm2 - vpbroadcastd 12+0(%rsp), %xmm3 - vpbroadcastd 16(%rsp), %xmm4 - vpbroadcastd 4+16(%rsp), %xmm5 - vpbroadcastd 8+16(%rsp), %xmm6 - vpbroadcastd 12+16(%rsp), %xmm7 - vpbroadcastd 32(%rsp), %xmm8 - vpbroadcastd 4+32(%rsp), %xmm9 - vpbroadcastd 8+32(%rsp), %xmm10 - vpbroadcastd 12+32(%rsp), %xmm11 - vmovdqa 128(%rsp), %xmm12 - vmovdqa 160(%rsp), %xmm13 - vpbroadcastd 8+48(%rsp), %xmm14 - vpbroadcastd 12+48(%rsp), %xmm15 -.Lchacha_blocks_avx2_mainloop2: - vpaddd %xmm0, %xmm4, %xmm0 - vpaddd %xmm1, %xmm5, %xmm1 - vpxor %xmm12, %xmm0, %xmm12 - vpxor %xmm13, %xmm1, %xmm13 - vpaddd %xmm2, %xmm6, %xmm2 - vpaddd %xmm3, %xmm7, %xmm3 - vpxor %xmm14, %xmm2, %xmm14 - vpxor %xmm15, %xmm3, %xmm15 - vpshufb 448(%rsp), %xmm12, %xmm12 - vpshufb 448(%rsp), %xmm13, %xmm13 - vpaddd %xmm8, %xmm12, %xmm8 - vpaddd %xmm9, %xmm13, %xmm9 - vpshufb 448(%rsp), %xmm14, %xmm14 - vpshufb 448(%rsp), %xmm15, %xmm15 - vpaddd %xmm10, %xmm14, %xmm10 - vpaddd %xmm11, %xmm15, %xmm11 - vmovdqa %xmm12, 96(%rsp) - vpxor %xmm4, %xmm8, %xmm4 - vpxor %xmm5, %xmm9, %xmm5 - vpslld $ 12, %xmm4, %xmm12 - vpsrld $20, %xmm4, %xmm4 - vpxor %xmm4, %xmm12, %xmm4 - vpslld $ 12, %xmm5, %xmm12 - vpsrld $20, %xmm5, %xmm5 - vpxor %xmm5, %xmm12, %xmm5 - vpxor %xmm6, %xmm10, %xmm6 - vpxor %xmm7, %xmm11, %xmm7 - vpslld $ 12, %xmm6, %xmm12 - vpsrld $20, %xmm6, %xmm6 - vpxor %xmm6, %xmm12, %xmm6 - vpslld $ 12, %xmm7, %xmm12 - vpsrld $20, %xmm7, %xmm7 - vpxor %xmm7, %xmm12, %xmm7 - vpaddd %xmm0, %xmm4, %xmm0 - vpaddd %xmm1, %xmm5, %xmm1 - vpxor 96(%rsp), %xmm0, %xmm12 - vpxor %xmm13, %xmm1, %xmm13 - vpaddd %xmm2, %xmm6, %xmm2 - vpaddd %xmm3, %xmm7, %xmm3 - vpxor %xmm14, %xmm2, %xmm14 - vpxor %xmm15, %xmm3, %xmm15 - vpshufb 480(%rsp), %xmm12, %xmm12 - vpshufb 480(%rsp), %xmm13, %xmm13 - vpaddd %xmm8, %xmm12, %xmm8 - vpaddd %xmm9, %xmm13, %xmm9 - vpshufb 480(%rsp), %xmm14, %xmm14 - vpshufb 480(%rsp), %xmm15, %xmm15 - vpaddd %xmm10, %xmm14, %xmm10 - vpaddd %xmm11, %xmm15, %xmm11 - vmovdqa %xmm12, 96(%rsp) - vpxor %xmm4, %xmm8, %xmm4 - vpxor %xmm5, %xmm9, %xmm5 - vpslld $ 7, %xmm4, %xmm12 - vpsrld $25, %xmm4, %xmm4 - vpxor %xmm4, %xmm12, %xmm4 - vpslld $ 7, %xmm5, %xmm12 - vpsrld $25, %xmm5, %xmm5 - vpxor %xmm5, %xmm12, %xmm5 - vpxor %xmm6, %xmm10, %xmm6 - vpxor %xmm7, %xmm11, %xmm7 - vpslld $ 7, %xmm6, %xmm12 - vpsrld $25, %xmm6, %xmm6 - vpxor %xmm6, %xmm12, %xmm6 - vpslld $ 7, %xmm7, %xmm12 - vpsrld $25, %xmm7, %xmm7 - vpxor %xmm7, %xmm12, %xmm7 - vpaddd %xmm0, %xmm5, %xmm0 - vpaddd %xmm1, %xmm6, %xmm1 - vpxor %xmm15, %xmm0, %xmm15 - vpxor 96(%rsp), %xmm1, %xmm12 - vpaddd %xmm2, %xmm7, %xmm2 - vpaddd %xmm3, %xmm4, %xmm3 - vpxor %xmm13, %xmm2, %xmm13 - vpxor %xmm14, %xmm3, %xmm14 - vpshufb 448(%rsp), %xmm15, %xmm15 - vpshufb 448(%rsp), %xmm12, %xmm12 - vpaddd %xmm10, %xmm15, %xmm10 - vpaddd %xmm11, %xmm12, %xmm11 - vpshufb 448(%rsp), %xmm13, %xmm13 - vpshufb 448(%rsp), %xmm14, %xmm14 - vpaddd %xmm8, %xmm13, %xmm8 - vpaddd %xmm9, %xmm14, %xmm9 - vmovdqa %xmm15, 96(%rsp) - vpxor %xmm5, %xmm10, %xmm5 - vpxor %xmm6, %xmm11, %xmm6 - vpslld $ 12, %xmm5, %xmm15 - vpsrld $20, %xmm5, %xmm5 - vpxor %xmm5, %xmm15, %xmm5 - vpslld $ 12, %xmm6, %xmm15 - vpsrld $20, %xmm6, %xmm6 - vpxor %xmm6, %xmm15, %xmm6 - vpxor %xmm7, %xmm8, %xmm7 - vpxor %xmm4, %xmm9, %xmm4 - vpslld $ 12, %xmm7, %xmm15 - vpsrld $20, %xmm7, %xmm7 - vpxor %xmm7, %xmm15, %xmm7 - vpslld $ 12, %xmm4, %xmm15 - vpsrld $20, %xmm4, %xmm4 - vpxor %xmm4, %xmm15, %xmm4 - vpaddd %xmm0, %xmm5, %xmm0 - vpaddd %xmm1, %xmm6, %xmm1 - vpxor 96(%rsp), %xmm0, %xmm15 - 
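/* A side note on the vpslld/vpsrld/vpxor triples that dominate this
 * loop: SSE/AVX before AVX-512 has no 32-bit rotate instruction, so
 * each ChaCha rotation is built from two shifts (this loop only needs
 * the 12- and 7-bit rotations this way; the 16- and 8-bit ones go
 * through the vpshufb masks at 448(%rsp)/480(%rsp)).  A minimal C
 * model of the idiom; rotl32 is an illustrative name, not an
 * identifier from this patch: */
#include <stdint.h>

static inline uint32_t rotl32 (uint32_t x, unsigned int n)
{
  /* The two shifted halves occupy disjoint bits, so XOR acts as OR,
   * which is why the assembly can use vpxor; valid for 0 < n < 32. */
  return (x << n) ^ (x >> (32 - n));
}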
vpxor %xmm12, %xmm1, %xmm12 - vpaddd %xmm2, %xmm7, %xmm2 - vpaddd %xmm3, %xmm4, %xmm3 - vpxor %xmm13, %xmm2, %xmm13 - vpxor %xmm14, %xmm3, %xmm14 - vpshufb 480(%rsp), %xmm15, %xmm15 - vpshufb 480(%rsp), %xmm12, %xmm12 - vpaddd %xmm10, %xmm15, %xmm10 - vpaddd %xmm11, %xmm12, %xmm11 - vpshufb 480(%rsp), %xmm13, %xmm13 - vpshufb 480(%rsp), %xmm14, %xmm14 - vpaddd %xmm8, %xmm13, %xmm8 - vpaddd %xmm9, %xmm14, %xmm9 - vmovdqa %xmm15, 96(%rsp) - vpxor %xmm5, %xmm10, %xmm5 - vpxor %xmm6, %xmm11, %xmm6 - vpslld $ 7, %xmm5, %xmm15 - vpsrld $25, %xmm5, %xmm5 - vpxor %xmm5, %xmm15, %xmm5 - vpslld $ 7, %xmm6, %xmm15 - vpsrld $25, %xmm6, %xmm6 - vpxor %xmm6, %xmm15, %xmm6 - vpxor %xmm7, %xmm8, %xmm7 - vpxor %xmm4, %xmm9, %xmm4 - vpslld $ 7, %xmm7, %xmm15 - vpsrld $25, %xmm7, %xmm7 - vpxor %xmm7, %xmm15, %xmm7 - vpslld $ 7, %xmm4, %xmm15 - vpsrld $25, %xmm4, %xmm4 - vpxor %xmm4, %xmm15, %xmm4 - vmovdqa 96(%rsp), %xmm15 - subq $2, %rax - jnz .Lchacha_blocks_avx2_mainloop2 - vmovdqa %xmm8, 192(%rsp) - vmovdqa %xmm9, 208(%rsp) - vmovdqa %xmm10, 224(%rsp) - vmovdqa %xmm11, 240(%rsp) - vmovdqa %xmm12, 256(%rsp) - vmovdqa %xmm13, 272(%rsp) - vmovdqa %xmm14, 288(%rsp) - vmovdqa %xmm15, 304(%rsp) - vpbroadcastd 0(%rsp), %xmm8 - vpbroadcastd 4+0(%rsp), %xmm9 - vpbroadcastd 8+0(%rsp), %xmm10 - vpbroadcastd 12+0(%rsp), %xmm11 - vpbroadcastd 16(%rsp), %xmm12 - vpbroadcastd 4+16(%rsp), %xmm13 - vpbroadcastd 8+16(%rsp), %xmm14 - vpbroadcastd 12+16(%rsp), %xmm15 - vpaddd %xmm8, %xmm0, %xmm0 - vpaddd %xmm9, %xmm1, %xmm1 - vpaddd %xmm10, %xmm2, %xmm2 - vpaddd %xmm11, %xmm3, %xmm3 - vpaddd %xmm12, %xmm4, %xmm4 - vpaddd %xmm13, %xmm5, %xmm5 - vpaddd %xmm14, %xmm6, %xmm6 - vpaddd %xmm15, %xmm7, %xmm7 - vpunpckldq %xmm1, %xmm0, %xmm8 - vpunpckldq %xmm3, %xmm2, %xmm9 - vpunpckhdq %xmm1, %xmm0, %xmm12 - vpunpckhdq %xmm3, %xmm2, %xmm13 - vpunpckldq %xmm5, %xmm4, %xmm10 - vpunpckldq %xmm7, %xmm6, %xmm11 - vpunpckhdq %xmm5, %xmm4, %xmm14 - vpunpckhdq %xmm7, %xmm6, %xmm15 - vpunpcklqdq %xmm9, %xmm8, %xmm0 - vpunpcklqdq %xmm11, %xmm10, %xmm1 - vpunpckhqdq %xmm9, %xmm8, %xmm2 - vpunpckhqdq %xmm11, %xmm10, %xmm3 - vpunpcklqdq %xmm13, %xmm12, %xmm4 - vpunpcklqdq %xmm15, %xmm14, %xmm5 - vpunpckhqdq %xmm13, %xmm12, %xmm6 - vpunpckhqdq %xmm15, %xmm14, %xmm7 - andq %rsi, %rsi - jz .Lchacha_blocks_avx2_noinput2 - vpxor 0(%rsi), %xmm0, %xmm0 - vpxor 16(%rsi), %xmm1, %xmm1 - vpxor 64(%rsi), %xmm2, %xmm2 - vpxor 80(%rsi), %xmm3, %xmm3 - vpxor 128(%rsi), %xmm4, %xmm4 - vpxor 144(%rsi), %xmm5, %xmm5 - vpxor 192(%rsi), %xmm6, %xmm6 - vpxor 208(%rsi), %xmm7, %xmm7 - vmovdqu %xmm0, 0(%rdx) - vmovdqu %xmm1, 16(%rdx) - vmovdqu %xmm2, 64(%rdx) - vmovdqu %xmm3, 80(%rdx) - vmovdqu %xmm4, 128(%rdx) - vmovdqu %xmm5, 144(%rdx) - vmovdqu %xmm6, 192(%rdx) - vmovdqu %xmm7, 208(%rdx) - vmovdqa 192(%rsp), %xmm0 - vmovdqa 208(%rsp), %xmm1 - vmovdqa 224(%rsp), %xmm2 - vmovdqa 240(%rsp), %xmm3 - vmovdqa 256(%rsp), %xmm4 - vmovdqa 272(%rsp), %xmm5 - vmovdqa 288(%rsp), %xmm6 - vmovdqa 304(%rsp), %xmm7 - vpbroadcastd 32(%rsp), %xmm8 - vpbroadcastd 4+32(%rsp), %xmm9 - vpbroadcastd 8+32(%rsp), %xmm10 - vpbroadcastd 12+32(%rsp), %xmm11 - vmovdqa 128(%rsp), %xmm12 - vmovdqa 160(%rsp), %xmm13 - vpbroadcastd 8+48(%rsp), %xmm14 - vpbroadcastd 12+48(%rsp), %xmm15 - vpaddd %xmm8, %xmm0, %xmm0 - vpaddd %xmm9, %xmm1, %xmm1 - vpaddd %xmm10, %xmm2, %xmm2 - vpaddd %xmm11, %xmm3, %xmm3 - vpaddd %xmm12, %xmm4, %xmm4 - vpaddd %xmm13, %xmm5, %xmm5 - vpaddd %xmm14, %xmm6, %xmm6 - vpaddd %xmm15, %xmm7, %xmm7 - vpunpckldq %xmm1, %xmm0, %xmm8 - vpunpckldq %xmm3, %xmm2, %xmm9 - vpunpckhdq 
%xmm1, %xmm0, %xmm12 - vpunpckhdq %xmm3, %xmm2, %xmm13 - vpunpckldq %xmm5, %xmm4, %xmm10 - vpunpckldq %xmm7, %xmm6, %xmm11 - vpunpckhdq %xmm5, %xmm4, %xmm14 - vpunpckhdq %xmm7, %xmm6, %xmm15 - vpunpcklqdq %xmm9, %xmm8, %xmm0 - vpunpcklqdq %xmm11, %xmm10, %xmm1 - vpunpckhqdq %xmm9, %xmm8, %xmm2 - vpunpckhqdq %xmm11, %xmm10, %xmm3 - vpunpcklqdq %xmm13, %xmm12, %xmm4 - vpunpcklqdq %xmm15, %xmm14, %xmm5 - vpunpckhqdq %xmm13, %xmm12, %xmm6 - vpunpckhqdq %xmm15, %xmm14, %xmm7 - vpxor 32(%rsi), %xmm0, %xmm0 - vpxor 48(%rsi), %xmm1, %xmm1 - vpxor 96(%rsi), %xmm2, %xmm2 - vpxor 112(%rsi), %xmm3, %xmm3 - vpxor 160(%rsi), %xmm4, %xmm4 - vpxor 176(%rsi), %xmm5, %xmm5 - vpxor 224(%rsi), %xmm6, %xmm6 - vpxor 240(%rsi), %xmm7, %xmm7 - vmovdqu %xmm0, 32(%rdx) - vmovdqu %xmm1, 48(%rdx) - vmovdqu %xmm2, 96(%rdx) - vmovdqu %xmm3, 112(%rdx) - vmovdqu %xmm4, 160(%rdx) - vmovdqu %xmm5, 176(%rdx) - vmovdqu %xmm6, 224(%rdx) - vmovdqu %xmm7, 240(%rdx) - addq $256, %rsi - jmp .Lchacha_blocks_avx2_mainloop2_cont -.Lchacha_blocks_avx2_noinput2: - vmovdqu %xmm0, 0(%rdx) - vmovdqu %xmm1, 16(%rdx) - vmovdqu %xmm2, 64(%rdx) - vmovdqu %xmm3, 80(%rdx) - vmovdqu %xmm4, 128(%rdx) - vmovdqu %xmm5, 144(%rdx) - vmovdqu %xmm6, 192(%rdx) - vmovdqu %xmm7, 208(%rdx) - vmovdqa 192(%rsp), %xmm0 - vmovdqa 208(%rsp), %xmm1 - vmovdqa 224(%rsp), %xmm2 - vmovdqa 240(%rsp), %xmm3 - vmovdqa 256(%rsp), %xmm4 - vmovdqa 272(%rsp), %xmm5 - vmovdqa 288(%rsp), %xmm6 - vmovdqa 304(%rsp), %xmm7 - vpbroadcastd 32(%rsp), %xmm8 - vpbroadcastd 4+32(%rsp), %xmm9 - vpbroadcastd 8+32(%rsp), %xmm10 - vpbroadcastd 12+32(%rsp), %xmm11 - vmovdqa 128(%rsp), %xmm12 - vmovdqa 160(%rsp), %xmm13 - vpbroadcastd 8+48(%rsp), %xmm14 - vpbroadcastd 12+48(%rsp), %xmm15 - vpaddd %xmm8, %xmm0, %xmm0 - vpaddd %xmm9, %xmm1, %xmm1 - vpaddd %xmm10, %xmm2, %xmm2 - vpaddd %xmm11, %xmm3, %xmm3 - vpaddd %xmm12, %xmm4, %xmm4 - vpaddd %xmm13, %xmm5, %xmm5 - vpaddd %xmm14, %xmm6, %xmm6 - vpaddd %xmm15, %xmm7, %xmm7 - vpunpckldq %xmm1, %xmm0, %xmm8 - vpunpckldq %xmm3, %xmm2, %xmm9 - vpunpckhdq %xmm1, %xmm0, %xmm12 - vpunpckhdq %xmm3, %xmm2, %xmm13 - vpunpckldq %xmm5, %xmm4, %xmm10 - vpunpckldq %xmm7, %xmm6, %xmm11 - vpunpckhdq %xmm5, %xmm4, %xmm14 - vpunpckhdq %xmm7, %xmm6, %xmm15 - vpunpcklqdq %xmm9, %xmm8, %xmm0 - vpunpcklqdq %xmm11, %xmm10, %xmm1 - vpunpckhqdq %xmm9, %xmm8, %xmm2 - vpunpckhqdq %xmm11, %xmm10, %xmm3 - vpunpcklqdq %xmm13, %xmm12, %xmm4 - vpunpcklqdq %xmm15, %xmm14, %xmm5 - vpunpckhqdq %xmm13, %xmm12, %xmm6 - vpunpckhqdq %xmm15, %xmm14, %xmm7 - vmovdqu %xmm0, 32(%rdx) - vmovdqu %xmm1, 48(%rdx) - vmovdqu %xmm2, 96(%rdx) - vmovdqu %xmm3, 112(%rdx) - vmovdqu %xmm4, 160(%rdx) - vmovdqu %xmm5, 176(%rdx) - vmovdqu %xmm6, 224(%rdx) - vmovdqu %xmm7, 240(%rdx) -.Lchacha_blocks_avx2_mainloop2_cont: - addq $256, %rdx - subq $256, %rcx - cmp $256, %rcx - jae .Lchacha_blocks_avx2_atleast256 -.Lchacha_blocks_avx2_below256_fixup: - vmovdqa 448(%rsp), %xmm6 - vmovdqa 480(%rsp), %xmm7 - vmovdqa 0(%rsp), %xmm8 - vmovdqa 16(%rsp), %xmm9 - vmovdqa 32(%rsp), %xmm10 - vmovdqa 48(%rsp), %xmm11 - movq $1, %r9 -.Lchacha_blocks_avx2_below256: - vmovq %r9, %xmm5 - andq %rcx, %rcx - jz .Lchacha_blocks_avx2_done - cmpq $64, %rcx - jae .Lchacha_blocks_avx2_above63 - movq %rdx, %r9 - andq %rsi, %rsi - jz .Lchacha_blocks_avx2_noinput3 - movq %rcx, %r10 - movq %rsp, %rdx - addq %r10, %rsi - addq %r10, %rdx - negq %r10 -.Lchacha_blocks_avx2_copyinput: - movb (%rsi, %r10), %al - movb %al, (%rdx, %r10) - incq %r10 - jnz .Lchacha_blocks_avx2_copyinput - movq %rsp, %rsi -.Lchacha_blocks_avx2_noinput3: - 
movq %rsp, %rdx -.Lchacha_blocks_avx2_above63: - vmovdqa %xmm8, %xmm0 - vmovdqa %xmm9, %xmm1 - vmovdqa %xmm10, %xmm2 - vmovdqa %xmm11, %xmm3 - movq 64(%rsp), %rax -.Lchacha_blocks_avx2_mainloop3: - vpaddd %xmm0, %xmm1, %xmm0 - vpxor %xmm3, %xmm0, %xmm3 - vpshufb %xmm6, %xmm3, %xmm3 - vpaddd %xmm2, %xmm3, %xmm2 - vpxor %xmm1, %xmm2, %xmm1 - vpslld $12, %xmm1, %xmm4 - vpsrld $20, %xmm1, %xmm1 - vpxor %xmm1, %xmm4, %xmm1 - vpaddd %xmm0, %xmm1, %xmm0 - vpxor %xmm3, %xmm0, %xmm3 - vpshufb %xmm7, %xmm3, %xmm3 - vpshufd $0x93, %xmm0, %xmm0 - vpaddd %xmm2, %xmm3, %xmm2 - vpshufd $0x4e, %xmm3, %xmm3 - vpxor %xmm1, %xmm2, %xmm1 - vpshufd $0x39, %xmm2, %xmm2 - vpslld $7, %xmm1, %xmm4 - vpsrld $25, %xmm1, %xmm1 - vpxor %xmm1, %xmm4, %xmm1 - vpaddd %xmm0, %xmm1, %xmm0 - vpxor %xmm3, %xmm0, %xmm3 - vpshufb %xmm6, %xmm3, %xmm3 - vpaddd %xmm2, %xmm3, %xmm2 - vpxor %xmm1, %xmm2, %xmm1 - vpslld $12, %xmm1, %xmm4 - vpsrld $20, %xmm1, %xmm1 - vpxor %xmm1, %xmm4, %xmm1 - vpaddd %xmm0, %xmm1, %xmm0 - vpxor %xmm3, %xmm0, %xmm3 - vpshufb %xmm7, %xmm3, %xmm3 - vpshufd $0x39, %xmm0, %xmm0 - vpaddd %xmm2, %xmm3, %xmm2 - vpshufd $0x4e, %xmm3, %xmm3 - vpxor %xmm1, %xmm2, %xmm1 - vpshufd $0x93, %xmm2, %xmm2 - vpslld $7, %xmm1, %xmm4 - vpsrld $25, %xmm1, %xmm1 - vpxor %xmm1, %xmm4, %xmm1 - subq $2, %rax - jnz .Lchacha_blocks_avx2_mainloop3 - vpaddd %xmm0, %xmm8, %xmm0 - vpaddd %xmm1, %xmm9, %xmm1 - vpaddd %xmm2, %xmm10, %xmm2 - vpaddd %xmm3, %xmm11, %xmm3 - andq %rsi, %rsi - jz .Lchacha_blocks_avx2_noinput4 - vpxor 0(%rsi), %xmm0, %xmm0 - vpxor 16(%rsi), %xmm1, %xmm1 - vpxor 32(%rsi), %xmm2, %xmm2 - vpxor 48(%rsi), %xmm3, %xmm3 - addq $64, %rsi -.Lchacha_blocks_avx2_noinput4: - vmovdqu %xmm0, 0(%rdx) - vmovdqu %xmm1, 16(%rdx) - vmovdqu %xmm2, 32(%rdx) - vmovdqu %xmm3, 48(%rdx) - vpaddq %xmm11, %xmm5, %xmm11 - cmpq $64, %rcx - jbe .Lchacha_blocks_avx2_mainloop3_finishup - addq $64, %rdx - subq $64, %rcx - jmp .Lchacha_blocks_avx2_below256 -.Lchacha_blocks_avx2_mainloop3_finishup: - cmpq $64, %rcx - je .Lchacha_blocks_avx2_done - addq %rcx, %r9 - addq %rcx, %rdx - negq %rcx -.Lchacha_blocks_avx2_copyoutput: - movb (%rdx, %rcx), %al - movb %al, (%r9, %rcx) - incq %rcx - jnz .Lchacha_blocks_avx2_copyoutput -.Lchacha_blocks_avx2_done: - vmovdqu %xmm11, 48(%rdi) - movq %rbp, %rsp - popq %r14 - popq %r13 - popq %r12 - popq %rbp - popq %rbx - vzeroall - movl $(63 + 512), %eax - ret -ELF(.size _gcry_chacha20_amd64_avx2_blocks,.-_gcry_chacha20_amd64_avx2_blocks;) - -.align 16 -.LC: -.byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 /* pshufb rotate by 16 */ -.byte 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14 /* pshufb rotate by 8 */ - -#endif /*defined(USE_CHACHA20)*/ -#endif /*__x86_64*/ diff --git a/cipher/chacha20-sse2-amd64.S b/cipher/chacha20-sse2-amd64.S deleted file mode 100644 index 2b9842c1..00000000 --- a/cipher/chacha20-sse2-amd64.S +++ /dev/null @@ -1,659 +0,0 @@ -/* chacha20-sse2-amd64.S - AMD64/SSE2 implementation of ChaCha20 - * - * Copyright (C) 2014 Jussi Kivilinna - * - * This file is part of Libgcrypt. - * - * Libgcrypt is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as - * published by the Free Software Foundation; either version 2.1 of - * the License, or (at your option) any later version. - * - * Libgcrypt is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with this program; if not, see . - */ - -/* - * Based on public domain implementation by Andrew Moon at - * https://github.com/floodyberry/chacha-opt - */ - -#ifdef __x86_64__ -#include - -#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ - defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && USE_CHACHA20 - -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) /*_*/ -#endif - -.text - -.align 8 -.globl _gcry_chacha20_amd64_sse2_blocks -ELF(.type _gcry_chacha20_amd64_sse2_blocks,@function;) -_gcry_chacha20_amd64_sse2_blocks: -.Lchacha_blocks_sse2_local: - pushq %rbx - pushq %rbp - movq %rsp, %rbp - andq $~63, %rsp - subq $512, %rsp - movdqu (%rdi), %xmm8 - movdqu 16(%rdi), %xmm9 - movdqu 32(%rdi), %xmm10 - movdqu 48(%rdi), %xmm11 - movq $20, %rax - movq $1, %r9 - movdqa %xmm8, 0(%rsp) - movdqa %xmm9, 16(%rsp) - movdqa %xmm10, 32(%rsp) - movdqa %xmm11, 48(%rsp) - movq %rax, 64(%rsp) - cmpq $256, %rcx - jb .Lchacha_blocks_sse2_below256 - pshufd $0x00, %xmm8, %xmm0 - pshufd $0x55, %xmm8, %xmm1 - pshufd $0xaa, %xmm8, %xmm2 - pshufd $0xff, %xmm8, %xmm3 - movdqa %xmm0, 128(%rsp) - movdqa %xmm1, 144(%rsp) - movdqa %xmm2, 160(%rsp) - movdqa %xmm3, 176(%rsp) - pshufd $0x00, %xmm9, %xmm0 - pshufd $0x55, %xmm9, %xmm1 - pshufd $0xaa, %xmm9, %xmm2 - pshufd $0xff, %xmm9, %xmm3 - movdqa %xmm0, 192(%rsp) - movdqa %xmm1, 208(%rsp) - movdqa %xmm2, 224(%rsp) - movdqa %xmm3, 240(%rsp) - pshufd $0x00, %xmm10, %xmm0 - pshufd $0x55, %xmm10, %xmm1 - pshufd $0xaa, %xmm10, %xmm2 - pshufd $0xff, %xmm10, %xmm3 - movdqa %xmm0, 256(%rsp) - movdqa %xmm1, 272(%rsp) - movdqa %xmm2, 288(%rsp) - movdqa %xmm3, 304(%rsp) - pshufd $0xaa, %xmm11, %xmm0 - pshufd $0xff, %xmm11, %xmm1 - movdqa %xmm0, 352(%rsp) - movdqa %xmm1, 368(%rsp) - jmp .Lchacha_blocks_sse2_atleast256 -.p2align 6,,63 -.Lchacha_blocks_sse2_atleast256: - movq 48(%rsp), %rax - leaq 1(%rax), %r8 - leaq 2(%rax), %r9 - leaq 3(%rax), %r10 - leaq 4(%rax), %rbx - movl %eax, 320(%rsp) - movl %r8d, 4+320(%rsp) - movl %r9d, 8+320(%rsp) - movl %r10d, 12+320(%rsp) - shrq $32, %rax - shrq $32, %r8 - shrq $32, %r9 - shrq $32, %r10 - movl %eax, 336(%rsp) - movl %r8d, 4+336(%rsp) - movl %r9d, 8+336(%rsp) - movl %r10d, 12+336(%rsp) - movq %rbx, 48(%rsp) - movq 64(%rsp), %rax - movdqa 128(%rsp), %xmm0 - movdqa 144(%rsp), %xmm1 - movdqa 160(%rsp), %xmm2 - movdqa 176(%rsp), %xmm3 - movdqa 192(%rsp), %xmm4 - movdqa 208(%rsp), %xmm5 - movdqa 224(%rsp), %xmm6 - movdqa 240(%rsp), %xmm7 - movdqa 256(%rsp), %xmm8 - movdqa 272(%rsp), %xmm9 - movdqa 288(%rsp), %xmm10 - movdqa 304(%rsp), %xmm11 - movdqa 320(%rsp), %xmm12 - movdqa 336(%rsp), %xmm13 - movdqa 352(%rsp), %xmm14 - movdqa 368(%rsp), %xmm15 -.Lchacha_blocks_sse2_mainloop1: - paddd %xmm4, %xmm0 - paddd %xmm5, %xmm1 - pxor %xmm0, %xmm12 - pxor %xmm1, %xmm13 - paddd %xmm6, %xmm2 - paddd %xmm7, %xmm3 - movdqa %xmm6, 96(%rsp) - pxor %xmm2, %xmm14 - pxor %xmm3, %xmm15 - pshuflw $0xb1,%xmm12,%xmm12 - pshufhw $0xb1,%xmm12,%xmm12 - pshuflw $0xb1,%xmm13,%xmm13 - pshufhw $0xb1,%xmm13,%xmm13 - pshuflw $0xb1,%xmm14,%xmm14 - pshufhw $0xb1,%xmm14,%xmm14 - pshuflw $0xb1,%xmm15,%xmm15 - pshufhw $0xb1,%xmm15,%xmm15 - paddd %xmm12, %xmm8 - paddd %xmm13, %xmm9 - paddd %xmm14, %xmm10 - paddd %xmm15, %xmm11 - movdqa %xmm12, 112(%rsp) - pxor %xmm8, %xmm4 - pxor %xmm9, %xmm5 - movdqa 96(%rsp), %xmm6 - movdqa %xmm4, %xmm12 - pslld $ 12, %xmm4 - psrld $20, %xmm12 - pxor %xmm12, %xmm4 - movdqa %xmm5, %xmm12 - pslld 
$ 12, %xmm5 - psrld $20, %xmm12 - pxor %xmm12, %xmm5 - pxor %xmm10, %xmm6 - pxor %xmm11, %xmm7 - movdqa %xmm6, %xmm12 - pslld $ 12, %xmm6 - psrld $20, %xmm12 - pxor %xmm12, %xmm6 - movdqa %xmm7, %xmm12 - pslld $ 12, %xmm7 - psrld $20, %xmm12 - pxor %xmm12, %xmm7 - movdqa 112(%rsp), %xmm12 - paddd %xmm4, %xmm0 - paddd %xmm5, %xmm1 - pxor %xmm0, %xmm12 - pxor %xmm1, %xmm13 - paddd %xmm6, %xmm2 - paddd %xmm7, %xmm3 - movdqa %xmm6, 96(%rsp) - pxor %xmm2, %xmm14 - pxor %xmm3, %xmm15 - movdqa %xmm12, %xmm6 - pslld $ 8, %xmm12 - psrld $24, %xmm6 - pxor %xmm6, %xmm12 - movdqa %xmm13, %xmm6 - pslld $ 8, %xmm13 - psrld $24, %xmm6 - pxor %xmm6, %xmm13 - paddd %xmm12, %xmm8 - paddd %xmm13, %xmm9 - movdqa %xmm14, %xmm6 - pslld $ 8, %xmm14 - psrld $24, %xmm6 - pxor %xmm6, %xmm14 - movdqa %xmm15, %xmm6 - pslld $ 8, %xmm15 - psrld $24, %xmm6 - pxor %xmm6, %xmm15 - paddd %xmm14, %xmm10 - paddd %xmm15, %xmm11 - movdqa %xmm12, 112(%rsp) - pxor %xmm8, %xmm4 - pxor %xmm9, %xmm5 - movdqa 96(%rsp), %xmm6 - movdqa %xmm4, %xmm12 - pslld $ 7, %xmm4 - psrld $25, %xmm12 - pxor %xmm12, %xmm4 - movdqa %xmm5, %xmm12 - pslld $ 7, %xmm5 - psrld $25, %xmm12 - pxor %xmm12, %xmm5 - pxor %xmm10, %xmm6 - pxor %xmm11, %xmm7 - movdqa %xmm6, %xmm12 - pslld $ 7, %xmm6 - psrld $25, %xmm12 - pxor %xmm12, %xmm6 - movdqa %xmm7, %xmm12 - pslld $ 7, %xmm7 - psrld $25, %xmm12 - pxor %xmm12, %xmm7 - movdqa 112(%rsp), %xmm12 - paddd %xmm5, %xmm0 - paddd %xmm6, %xmm1 - pxor %xmm0, %xmm15 - pxor %xmm1, %xmm12 - paddd %xmm7, %xmm2 - paddd %xmm4, %xmm3 - movdqa %xmm7, 96(%rsp) - pxor %xmm2, %xmm13 - pxor %xmm3, %xmm14 - pshuflw $0xb1,%xmm15,%xmm15 - pshufhw $0xb1,%xmm15,%xmm15 - pshuflw $0xb1,%xmm12,%xmm12 - pshufhw $0xb1,%xmm12,%xmm12 - pshuflw $0xb1,%xmm13,%xmm13 - pshufhw $0xb1,%xmm13,%xmm13 - pshuflw $0xb1,%xmm14,%xmm14 - pshufhw $0xb1,%xmm14,%xmm14 - paddd %xmm15, %xmm10 - paddd %xmm12, %xmm11 - paddd %xmm13, %xmm8 - paddd %xmm14, %xmm9 - movdqa %xmm15, 112(%rsp) - pxor %xmm10, %xmm5 - pxor %xmm11, %xmm6 - movdqa 96(%rsp), %xmm7 - movdqa %xmm5, %xmm15 - pslld $ 12, %xmm5 - psrld $20, %xmm15 - pxor %xmm15, %xmm5 - movdqa %xmm6, %xmm15 - pslld $ 12, %xmm6 - psrld $20, %xmm15 - pxor %xmm15, %xmm6 - pxor %xmm8, %xmm7 - pxor %xmm9, %xmm4 - movdqa %xmm7, %xmm15 - pslld $ 12, %xmm7 - psrld $20, %xmm15 - pxor %xmm15, %xmm7 - movdqa %xmm4, %xmm15 - pslld $ 12, %xmm4 - psrld $20, %xmm15 - pxor %xmm15, %xmm4 - movdqa 112(%rsp), %xmm15 - paddd %xmm5, %xmm0 - paddd %xmm6, %xmm1 - pxor %xmm0, %xmm15 - pxor %xmm1, %xmm12 - paddd %xmm7, %xmm2 - paddd %xmm4, %xmm3 - movdqa %xmm7, 96(%rsp) - pxor %xmm2, %xmm13 - pxor %xmm3, %xmm14 - movdqa %xmm15, %xmm7 - pslld $ 8, %xmm15 - psrld $24, %xmm7 - pxor %xmm7, %xmm15 - movdqa %xmm12, %xmm7 - pslld $ 8, %xmm12 - psrld $24, %xmm7 - pxor %xmm7, %xmm12 - paddd %xmm15, %xmm10 - paddd %xmm12, %xmm11 - movdqa %xmm13, %xmm7 - pslld $ 8, %xmm13 - psrld $24, %xmm7 - pxor %xmm7, %xmm13 - movdqa %xmm14, %xmm7 - pslld $ 8, %xmm14 - psrld $24, %xmm7 - pxor %xmm7, %xmm14 - paddd %xmm13, %xmm8 - paddd %xmm14, %xmm9 - movdqa %xmm15, 112(%rsp) - pxor %xmm10, %xmm5 - pxor %xmm11, %xmm6 - movdqa 96(%rsp), %xmm7 - movdqa %xmm5, %xmm15 - pslld $ 7, %xmm5 - psrld $25, %xmm15 - pxor %xmm15, %xmm5 - movdqa %xmm6, %xmm15 - pslld $ 7, %xmm6 - psrld $25, %xmm15 - pxor %xmm15, %xmm6 - pxor %xmm8, %xmm7 - pxor %xmm9, %xmm4 - movdqa %xmm7, %xmm15 - pslld $ 7, %xmm7 - psrld $25, %xmm15 - pxor %xmm15, %xmm7 - movdqa %xmm4, %xmm15 - pslld $ 7, %xmm4 - psrld $25, %xmm15 - pxor %xmm15, %xmm4 - movdqa 112(%rsp), %xmm15 - subq $2, %rax - jnz 
.Lchacha_blocks_sse2_mainloop1 - paddd 128(%rsp), %xmm0 - paddd 144(%rsp), %xmm1 - paddd 160(%rsp), %xmm2 - paddd 176(%rsp), %xmm3 - paddd 192(%rsp), %xmm4 - paddd 208(%rsp), %xmm5 - paddd 224(%rsp), %xmm6 - paddd 240(%rsp), %xmm7 - paddd 256(%rsp), %xmm8 - paddd 272(%rsp), %xmm9 - paddd 288(%rsp), %xmm10 - paddd 304(%rsp), %xmm11 - paddd 320(%rsp), %xmm12 - paddd 336(%rsp), %xmm13 - paddd 352(%rsp), %xmm14 - paddd 368(%rsp), %xmm15 - movdqa %xmm8, 384(%rsp) - movdqa %xmm9, 400(%rsp) - movdqa %xmm10, 416(%rsp) - movdqa %xmm11, 432(%rsp) - movdqa %xmm12, 448(%rsp) - movdqa %xmm13, 464(%rsp) - movdqa %xmm14, 480(%rsp) - movdqa %xmm15, 496(%rsp) - movdqa %xmm0, %xmm8 - movdqa %xmm2, %xmm9 - movdqa %xmm4, %xmm10 - movdqa %xmm6, %xmm11 - punpckhdq %xmm1, %xmm0 - punpckhdq %xmm3, %xmm2 - punpckhdq %xmm5, %xmm4 - punpckhdq %xmm7, %xmm6 - punpckldq %xmm1, %xmm8 - punpckldq %xmm3, %xmm9 - punpckldq %xmm5, %xmm10 - punpckldq %xmm7, %xmm11 - movdqa %xmm0, %xmm1 - movdqa %xmm4, %xmm3 - movdqa %xmm8, %xmm5 - movdqa %xmm10, %xmm7 - punpckhqdq %xmm2, %xmm0 - punpckhqdq %xmm6, %xmm4 - punpckhqdq %xmm9, %xmm8 - punpckhqdq %xmm11, %xmm10 - punpcklqdq %xmm2, %xmm1 - punpcklqdq %xmm6, %xmm3 - punpcklqdq %xmm9, %xmm5 - punpcklqdq %xmm11, %xmm7 - andq %rsi, %rsi - jz .Lchacha_blocks_sse2_noinput1 - movdqu 0(%rsi), %xmm2 - movdqu 16(%rsi), %xmm6 - movdqu 64(%rsi), %xmm9 - movdqu 80(%rsi), %xmm11 - movdqu 128(%rsi), %xmm12 - movdqu 144(%rsi), %xmm13 - movdqu 192(%rsi), %xmm14 - movdqu 208(%rsi), %xmm15 - pxor %xmm2, %xmm5 - pxor %xmm6, %xmm7 - pxor %xmm9, %xmm8 - pxor %xmm11, %xmm10 - pxor %xmm12, %xmm1 - pxor %xmm13, %xmm3 - pxor %xmm14, %xmm0 - pxor %xmm15, %xmm4 - movdqu %xmm5, 0(%rdx) - movdqu %xmm7, 16(%rdx) - movdqu %xmm8, 64(%rdx) - movdqu %xmm10, 80(%rdx) - movdqu %xmm1, 128(%rdx) - movdqu %xmm3, 144(%rdx) - movdqu %xmm0, 192(%rdx) - movdqu %xmm4, 208(%rdx) - movdqa 384(%rsp), %xmm0 - movdqa 400(%rsp), %xmm1 - movdqa 416(%rsp), %xmm2 - movdqa 432(%rsp), %xmm3 - movdqa 448(%rsp), %xmm4 - movdqa 464(%rsp), %xmm5 - movdqa 480(%rsp), %xmm6 - movdqa 496(%rsp), %xmm7 - movdqa %xmm0, %xmm8 - movdqa %xmm2, %xmm9 - movdqa %xmm4, %xmm10 - movdqa %xmm6, %xmm11 - punpckldq %xmm1, %xmm8 - punpckldq %xmm3, %xmm9 - punpckhdq %xmm1, %xmm0 - punpckhdq %xmm3, %xmm2 - punpckldq %xmm5, %xmm10 - punpckldq %xmm7, %xmm11 - punpckhdq %xmm5, %xmm4 - punpckhdq %xmm7, %xmm6 - movdqa %xmm8, %xmm1 - movdqa %xmm0, %xmm3 - movdqa %xmm10, %xmm5 - movdqa %xmm4, %xmm7 - punpcklqdq %xmm9, %xmm1 - punpcklqdq %xmm11, %xmm5 - punpckhqdq %xmm9, %xmm8 - punpckhqdq %xmm11, %xmm10 - punpcklqdq %xmm2, %xmm3 - punpcklqdq %xmm6, %xmm7 - punpckhqdq %xmm2, %xmm0 - punpckhqdq %xmm6, %xmm4 - movdqu 32(%rsi), %xmm2 - movdqu 48(%rsi), %xmm6 - movdqu 96(%rsi), %xmm9 - movdqu 112(%rsi), %xmm11 - movdqu 160(%rsi), %xmm12 - movdqu 176(%rsi), %xmm13 - movdqu 224(%rsi), %xmm14 - movdqu 240(%rsi), %xmm15 - pxor %xmm2, %xmm1 - pxor %xmm6, %xmm5 - pxor %xmm9, %xmm8 - pxor %xmm11, %xmm10 - pxor %xmm12, %xmm3 - pxor %xmm13, %xmm7 - pxor %xmm14, %xmm0 - pxor %xmm15, %xmm4 - movdqu %xmm1, 32(%rdx) - movdqu %xmm5, 48(%rdx) - movdqu %xmm8, 96(%rdx) - movdqu %xmm10, 112(%rdx) - movdqu %xmm3, 160(%rdx) - movdqu %xmm7, 176(%rdx) - movdqu %xmm0, 224(%rdx) - movdqu %xmm4, 240(%rdx) - addq $256, %rsi - jmp .Lchacha_blocks_sse2_mainloop_cont -.Lchacha_blocks_sse2_noinput1: - movdqu %xmm5, 0(%rdx) - movdqu %xmm7, 16(%rdx) - movdqu %xmm8, 64(%rdx) - movdqu %xmm10, 80(%rdx) - movdqu %xmm1, 128(%rdx) - movdqu %xmm3, 144(%rdx) - movdqu %xmm0, 192(%rdx) - movdqu %xmm4, 208(%rdx) - 
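/* For orientation, the punpck{l,h}dq / punpck{l,h}qdq block above is a
 * transpose of 4x4 matrices of 32-bit words: during the rounds each
 * register holds one state word across four parallel blocks, and the
 * transpose regroups the words so that every output block is
 * contiguous before the XOR/store sequence.  A scalar sketch of the
 * same data movement; transpose_4x4 is an illustrative name: */
#include <stdint.h>

static void transpose_4x4 (uint32_t m[4][4])
{
  unsigned int r, c;
  uint32_t t;

  for (r = 0; r < 4; r++)
    for (c = r + 1; c < 4; c++)
      {
        t = m[r][c];
        m[r][c] = m[c][r];
        m[c][r] = t;
      }
}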
movdqa 384(%rsp), %xmm0 - movdqa 400(%rsp), %xmm1 - movdqa 416(%rsp), %xmm2 - movdqa 432(%rsp), %xmm3 - movdqa 448(%rsp), %xmm4 - movdqa 464(%rsp), %xmm5 - movdqa 480(%rsp), %xmm6 - movdqa 496(%rsp), %xmm7 - movdqa %xmm0, %xmm8 - movdqa %xmm2, %xmm9 - movdqa %xmm4, %xmm10 - movdqa %xmm6, %xmm11 - punpckldq %xmm1, %xmm8 - punpckldq %xmm3, %xmm9 - punpckhdq %xmm1, %xmm0 - punpckhdq %xmm3, %xmm2 - punpckldq %xmm5, %xmm10 - punpckldq %xmm7, %xmm11 - punpckhdq %xmm5, %xmm4 - punpckhdq %xmm7, %xmm6 - movdqa %xmm8, %xmm1 - movdqa %xmm0, %xmm3 - movdqa %xmm10, %xmm5 - movdqa %xmm4, %xmm7 - punpcklqdq %xmm9, %xmm1 - punpcklqdq %xmm11, %xmm5 - punpckhqdq %xmm9, %xmm8 - punpckhqdq %xmm11, %xmm10 - punpcklqdq %xmm2, %xmm3 - punpcklqdq %xmm6, %xmm7 - punpckhqdq %xmm2, %xmm0 - punpckhqdq %xmm6, %xmm4 - movdqu %xmm1, 32(%rdx) - movdqu %xmm5, 48(%rdx) - movdqu %xmm8, 96(%rdx) - movdqu %xmm10, 112(%rdx) - movdqu %xmm3, 160(%rdx) - movdqu %xmm7, 176(%rdx) - movdqu %xmm0, 224(%rdx) - movdqu %xmm4, 240(%rdx) -.Lchacha_blocks_sse2_mainloop_cont: - addq $256, %rdx - subq $256, %rcx - cmp $256, %rcx - jae .Lchacha_blocks_sse2_atleast256 - movdqa 0(%rsp), %xmm8 - movdqa 16(%rsp), %xmm9 - movdqa 32(%rsp), %xmm10 - movdqa 48(%rsp), %xmm11 - movq $1, %r9 -.Lchacha_blocks_sse2_below256: - movq %r9, %xmm5 - andq %rcx, %rcx - jz .Lchacha_blocks_sse2_done - cmpq $64, %rcx - jae .Lchacha_blocks_sse2_above63 - movq %rdx, %r9 - andq %rsi, %rsi - jz .Lchacha_blocks_sse2_noinput2 - movq %rcx, %r10 - movq %rsp, %rdx - addq %r10, %rsi - addq %r10, %rdx - negq %r10 -.Lchacha_blocks_sse2_copyinput: - movb (%rsi, %r10), %al - movb %al, (%rdx, %r10) - incq %r10 - jnz .Lchacha_blocks_sse2_copyinput - movq %rsp, %rsi -.Lchacha_blocks_sse2_noinput2: - movq %rsp, %rdx -.Lchacha_blocks_sse2_above63: - movdqa %xmm8, %xmm0 - movdqa %xmm9, %xmm1 - movdqa %xmm10, %xmm2 - movdqa %xmm11, %xmm3 - movq 64(%rsp), %rax -.Lchacha_blocks_sse2_mainloop2: - paddd %xmm1, %xmm0 - pxor %xmm0, %xmm3 - pshuflw $0xb1,%xmm3,%xmm3 - pshufhw $0xb1,%xmm3,%xmm3 - paddd %xmm3, %xmm2 - pxor %xmm2, %xmm1 - movdqa %xmm1,%xmm4 - pslld $12, %xmm1 - psrld $20, %xmm4 - pxor %xmm4, %xmm1 - paddd %xmm1, %xmm0 - pxor %xmm0, %xmm3 - movdqa %xmm3,%xmm4 - pslld $8, %xmm3 - psrld $24, %xmm4 - pshufd $0x93,%xmm0,%xmm0 - pxor %xmm4, %xmm3 - paddd %xmm3, %xmm2 - pshufd $0x4e,%xmm3,%xmm3 - pxor %xmm2, %xmm1 - pshufd $0x39,%xmm2,%xmm2 - movdqa %xmm1,%xmm4 - pslld $7, %xmm1 - psrld $25, %xmm4 - pxor %xmm4, %xmm1 - subq $2, %rax - paddd %xmm1, %xmm0 - pxor %xmm0, %xmm3 - pshuflw $0xb1,%xmm3,%xmm3 - pshufhw $0xb1,%xmm3,%xmm3 - paddd %xmm3, %xmm2 - pxor %xmm2, %xmm1 - movdqa %xmm1,%xmm4 - pslld $12, %xmm1 - psrld $20, %xmm4 - pxor %xmm4, %xmm1 - paddd %xmm1, %xmm0 - pxor %xmm0, %xmm3 - movdqa %xmm3,%xmm4 - pslld $8, %xmm3 - psrld $24, %xmm4 - pshufd $0x39,%xmm0,%xmm0 - pxor %xmm4, %xmm3 - paddd %xmm3, %xmm2 - pshufd $0x4e,%xmm3,%xmm3 - pxor %xmm2, %xmm1 - pshufd $0x93,%xmm2,%xmm2 - movdqa %xmm1,%xmm4 - pslld $7, %xmm1 - psrld $25, %xmm4 - pxor %xmm4, %xmm1 - jnz .Lchacha_blocks_sse2_mainloop2 - paddd %xmm8, %xmm0 - paddd %xmm9, %xmm1 - paddd %xmm10, %xmm2 - paddd %xmm11, %xmm3 - andq %rsi, %rsi - jz .Lchacha_blocks_sse2_noinput3 - movdqu 0(%rsi), %xmm12 - movdqu 16(%rsi), %xmm13 - movdqu 32(%rsi), %xmm14 - movdqu 48(%rsi), %xmm15 - pxor %xmm12, %xmm0 - pxor %xmm13, %xmm1 - pxor %xmm14, %xmm2 - pxor %xmm15, %xmm3 - addq $64, %rsi -.Lchacha_blocks_sse2_noinput3: - movdqu %xmm0, 0(%rdx) - movdqu %xmm1, 16(%rdx) - movdqu %xmm2, 32(%rdx) - movdqu %xmm3, 48(%rdx) - paddq %xmm5, %xmm11 - 
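/* Around the paddq above: %xmm5 holds the constant 1 loaded from %r9,
 * and adding it as a quadword to %xmm11 steps the 64-bit block counter
 * kept in state words 12 and 13, carry included.  The copyinput /
 * copyoutput loops that follow stage a short final chunk through the
 * aligned stack buffer so the round code always sees a full 64-byte
 * block.  A sketch of that tail path; tail_block and chacha20_block
 * are hypothetical names, assuming a helper that produces one
 * keystream block: */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* hypothetical: writes one 64-byte block, dst may alias src */
extern void chacha20_block (uint32_t state[16], unsigned char *dst,
                            const unsigned char *src);

static void tail_block (uint32_t state[16], unsigned char *dst,
                        const unsigned char *src, size_t len)
{
  unsigned char buf[64] = { 0 };

  if (src)
    memcpy (buf, src, len);          /* the ..._copyinput loop */
  chacha20_block (state, buf, buf);  /* full 64-byte block on stack */
  memcpy (dst, buf, len);            /* the ..._copyoutput loop */
}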
cmpq $64, %rcx - jbe .Lchacha_blocks_sse2_mainloop2_finishup - addq $64, %rdx - subq $64, %rcx - jmp .Lchacha_blocks_sse2_below256 -.Lchacha_blocks_sse2_mainloop2_finishup: - cmpq $64, %rcx - je .Lchacha_blocks_sse2_done - addq %rcx, %r9 - addq %rcx, %rdx - negq %rcx -.Lchacha_blocks_sse2_copyoutput: - movb (%rdx, %rcx), %al - movb %al, (%r9, %rcx) - incq %rcx - jnz .Lchacha_blocks_sse2_copyoutput -.Lchacha_blocks_sse2_done: - movdqu %xmm11, 48(%rdi) - movq %rbp, %rsp - pxor %xmm15, %xmm15 - pxor %xmm7, %xmm7 - pxor %xmm14, %xmm14 - pxor %xmm6, %xmm6 - pxor %xmm13, %xmm13 - pxor %xmm5, %xmm5 - pxor %xmm12, %xmm12 - pxor %xmm4, %xmm4 - popq %rbp - popq %rbx - movl $(63 + 512 + 16), %eax - pxor %xmm11, %xmm11 - pxor %xmm3, %xmm3 - pxor %xmm10, %xmm10 - pxor %xmm2, %xmm2 - pxor %xmm9, %xmm9 - pxor %xmm1, %xmm1 - pxor %xmm8, %xmm8 - pxor %xmm0, %xmm0 - ret -ELF(.size _gcry_chacha20_amd64_sse2_blocks,.-_gcry_chacha20_amd64_sse2_blocks;) - -#endif /*defined(USE_CHACHA20)*/ -#endif /*__x86_64*/ diff --git a/cipher/chacha20-ssse3-amd64.S b/cipher/chacha20-ssse3-amd64.S deleted file mode 100644 index c04010e7..00000000 --- a/cipher/chacha20-ssse3-amd64.S +++ /dev/null @@ -1,632 +0,0 @@ -/* chacha20-ssse3-amd64.S - AMD64/SSSE3 implementation of ChaCha20 - * - * Copyright (C) 2014 Jussi Kivilinna - * - * This file is part of Libgcrypt. - * - * Libgcrypt is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as - * published by the Free Software Foundation; either version 2.1 of - * the License, or (at your option) any later version. - * - * Libgcrypt is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this program; if not, see . - */ - -/* - * Based on public domain implementation by Andrew Moon at - * https://github.com/floodyberry/chacha-opt - */ - -#ifdef __x86_64__ -#include - -#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ - defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ - defined(HAVE_GCC_INLINE_ASM_SSSE3) && USE_CHACHA20 - -#ifdef __PIC__ -# define RIP (%rip) -#else -# define RIP -#endif - -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) 
/*_*/ -#endif - -.text - -.align 8 -.globl _gcry_chacha20_amd64_ssse3_blocks -ELF(.type _gcry_chacha20_amd64_ssse3_blocks,@function;) -_gcry_chacha20_amd64_ssse3_blocks: -.Lchacha_blocks_ssse3_local: - pushq %rbx - pushq %rbp - movq %rsp, %rbp - andq $~63, %rsp - subq $512, %rsp - leaq .LC RIP, %rax - movdqa 0(%rax), %xmm6 - movdqa 16(%rax), %xmm7 - movdqu 0(%rdi), %xmm8 - movdqu 16(%rdi), %xmm9 - movdqu 32(%rdi), %xmm10 - movdqu 48(%rdi), %xmm11 - movl $20, %eax - movq $1, %r9 - movdqa %xmm8, 0(%rsp) - movdqa %xmm9, 16(%rsp) - movdqa %xmm10, 32(%rsp) - movdqa %xmm11, 48(%rsp) - movdqa %xmm6, 80(%rsp) - movdqa %xmm7, 96(%rsp) - movq %rax, 64(%rsp) - cmpq $256, %rcx - jb .Lchacha_blocks_ssse3_below256 - pshufd $0x00, %xmm8, %xmm0 - pshufd $0x55, %xmm8, %xmm1 - pshufd $0xaa, %xmm8, %xmm2 - pshufd $0xff, %xmm8, %xmm3 - movdqa %xmm0, 128(%rsp) - movdqa %xmm1, 144(%rsp) - movdqa %xmm2, 160(%rsp) - movdqa %xmm3, 176(%rsp) - pshufd $0x00, %xmm9, %xmm0 - pshufd $0x55, %xmm9, %xmm1 - pshufd $0xaa, %xmm9, %xmm2 - pshufd $0xff, %xmm9, %xmm3 - movdqa %xmm0, 192(%rsp) - movdqa %xmm1, 208(%rsp) - movdqa %xmm2, 224(%rsp) - movdqa %xmm3, 240(%rsp) - pshufd $0x00, %xmm10, %xmm0 - pshufd $0x55, %xmm10, %xmm1 - pshufd $0xaa, %xmm10, %xmm2 - pshufd $0xff, %xmm10, %xmm3 - movdqa %xmm0, 256(%rsp) - movdqa %xmm1, 272(%rsp) - movdqa %xmm2, 288(%rsp) - movdqa %xmm3, 304(%rsp) - pshufd $0xaa, %xmm11, %xmm0 - pshufd $0xff, %xmm11, %xmm1 - movdqa %xmm0, 352(%rsp) - movdqa %xmm1, 368(%rsp) - jmp .Lchacha_blocks_ssse3_atleast256 -.p2align 6,,63 - # align to 4 mod 64 - nop;nop;nop;nop; -.Lchacha_blocks_ssse3_atleast256: - movq 48(%rsp), %rax - leaq 1(%rax), %r8 - leaq 2(%rax), %r9 - leaq 3(%rax), %r10 - leaq 4(%rax), %rbx - movl %eax, 320(%rsp) - movl %r8d, 4+320(%rsp) - movl %r9d, 8+320(%rsp) - movl %r10d, 12+320(%rsp) - shrq $32, %rax - shrq $32, %r8 - shrq $32, %r9 - shrq $32, %r10 - movl %eax, 336(%rsp) - movl %r8d, 4+336(%rsp) - movl %r9d, 8+336(%rsp) - movl %r10d, 12+336(%rsp) - movq %rbx, 48(%rsp) - movq 64(%rsp), %rax - movdqa 128(%rsp), %xmm0 - movdqa 144(%rsp), %xmm1 - movdqa 160(%rsp), %xmm2 - movdqa 176(%rsp), %xmm3 - movdqa 192(%rsp), %xmm4 - movdqa 208(%rsp), %xmm5 - movdqa 224(%rsp), %xmm6 - movdqa 240(%rsp), %xmm7 - movdqa 256(%rsp), %xmm8 - movdqa 272(%rsp), %xmm9 - movdqa 288(%rsp), %xmm10 - movdqa 304(%rsp), %xmm11 - movdqa 320(%rsp), %xmm12 - movdqa 336(%rsp), %xmm13 - movdqa 352(%rsp), %xmm14 - movdqa 368(%rsp), %xmm15 -.Lchacha_blocks_ssse3_mainloop1: - paddd %xmm4, %xmm0 - paddd %xmm5, %xmm1 - pxor %xmm0, %xmm12 - pxor %xmm1, %xmm13 - paddd %xmm6, %xmm2 - paddd %xmm7, %xmm3 - pxor %xmm2, %xmm14 - pxor %xmm3, %xmm15 - pshufb 80(%rsp), %xmm12 - pshufb 80(%rsp), %xmm13 - paddd %xmm12, %xmm8 - paddd %xmm13, %xmm9 - pshufb 80(%rsp), %xmm14 - pshufb 80(%rsp), %xmm15 - paddd %xmm14, %xmm10 - paddd %xmm15, %xmm11 - movdqa %xmm12, 112(%rsp) - pxor %xmm8, %xmm4 - pxor %xmm9, %xmm5 - movdqa %xmm4, %xmm12 - pslld $ 12, %xmm4 - psrld $20, %xmm12 - pxor %xmm12, %xmm4 - movdqa %xmm5, %xmm12 - pslld $ 12, %xmm5 - psrld $20, %xmm12 - pxor %xmm12, %xmm5 - pxor %xmm10, %xmm6 - pxor %xmm11, %xmm7 - movdqa %xmm6, %xmm12 - pslld $ 12, %xmm6 - psrld $20, %xmm12 - pxor %xmm12, %xmm6 - movdqa %xmm7, %xmm12 - pslld $ 12, %xmm7 - psrld $20, %xmm12 - pxor %xmm12, %xmm7 - movdqa 112(%rsp), %xmm12 - paddd %xmm4, %xmm0 - paddd %xmm5, %xmm1 - pxor %xmm0, %xmm12 - pxor %xmm1, %xmm13 - paddd %xmm6, %xmm2 - paddd %xmm7, %xmm3 - pxor %xmm2, %xmm14 - pxor %xmm3, %xmm15 - pshufb 96(%rsp), %xmm12 - pshufb 96(%rsp), %xmm13 - paddd 
%xmm12, %xmm8 - paddd %xmm13, %xmm9 - pshufb 96(%rsp), %xmm14 - pshufb 96(%rsp), %xmm15 - paddd %xmm14, %xmm10 - paddd %xmm15, %xmm11 - movdqa %xmm12, 112(%rsp) - pxor %xmm8, %xmm4 - pxor %xmm9, %xmm5 - movdqa %xmm4, %xmm12 - pslld $ 7, %xmm4 - psrld $25, %xmm12 - pxor %xmm12, %xmm4 - movdqa %xmm5, %xmm12 - pslld $ 7, %xmm5 - psrld $25, %xmm12 - pxor %xmm12, %xmm5 - pxor %xmm10, %xmm6 - pxor %xmm11, %xmm7 - movdqa %xmm6, %xmm12 - pslld $ 7, %xmm6 - psrld $25, %xmm12 - pxor %xmm12, %xmm6 - movdqa %xmm7, %xmm12 - pslld $ 7, %xmm7 - psrld $25, %xmm12 - pxor %xmm12, %xmm7 - movdqa 112(%rsp), %xmm12 - paddd %xmm5, %xmm0 - paddd %xmm6, %xmm1 - pxor %xmm0, %xmm15 - pxor %xmm1, %xmm12 - paddd %xmm7, %xmm2 - paddd %xmm4, %xmm3 - pxor %xmm2, %xmm13 - pxor %xmm3, %xmm14 - pshufb 80(%rsp), %xmm15 - pshufb 80(%rsp), %xmm12 - paddd %xmm15, %xmm10 - paddd %xmm12, %xmm11 - pshufb 80(%rsp), %xmm13 - pshufb 80(%rsp), %xmm14 - paddd %xmm13, %xmm8 - paddd %xmm14, %xmm9 - movdqa %xmm15, 112(%rsp) - pxor %xmm10, %xmm5 - pxor %xmm11, %xmm6 - movdqa %xmm5, %xmm15 - pslld $ 12, %xmm5 - psrld $20, %xmm15 - pxor %xmm15, %xmm5 - movdqa %xmm6, %xmm15 - pslld $ 12, %xmm6 - psrld $20, %xmm15 - pxor %xmm15, %xmm6 - pxor %xmm8, %xmm7 - pxor %xmm9, %xmm4 - movdqa %xmm7, %xmm15 - pslld $ 12, %xmm7 - psrld $20, %xmm15 - pxor %xmm15, %xmm7 - movdqa %xmm4, %xmm15 - pslld $ 12, %xmm4 - psrld $20, %xmm15 - pxor %xmm15, %xmm4 - movdqa 112(%rsp), %xmm15 - paddd %xmm5, %xmm0 - paddd %xmm6, %xmm1 - pxor %xmm0, %xmm15 - pxor %xmm1, %xmm12 - paddd %xmm7, %xmm2 - paddd %xmm4, %xmm3 - pxor %xmm2, %xmm13 - pxor %xmm3, %xmm14 - pshufb 96(%rsp), %xmm15 - pshufb 96(%rsp), %xmm12 - paddd %xmm15, %xmm10 - paddd %xmm12, %xmm11 - pshufb 96(%rsp), %xmm13 - pshufb 96(%rsp), %xmm14 - paddd %xmm13, %xmm8 - paddd %xmm14, %xmm9 - movdqa %xmm15, 112(%rsp) - pxor %xmm10, %xmm5 - pxor %xmm11, %xmm6 - movdqa %xmm5, %xmm15 - pslld $ 7, %xmm5 - psrld $25, %xmm15 - pxor %xmm15, %xmm5 - movdqa %xmm6, %xmm15 - pslld $ 7, %xmm6 - psrld $25, %xmm15 - pxor %xmm15, %xmm6 - pxor %xmm8, %xmm7 - pxor %xmm9, %xmm4 - movdqa %xmm7, %xmm15 - pslld $ 7, %xmm7 - psrld $25, %xmm15 - pxor %xmm15, %xmm7 - movdqa %xmm4, %xmm15 - pslld $ 7, %xmm4 - psrld $25, %xmm15 - pxor %xmm15, %xmm4 - subq $2, %rax - movdqa 112(%rsp), %xmm15 - jnz .Lchacha_blocks_ssse3_mainloop1 - paddd 128(%rsp), %xmm0 - paddd 144(%rsp), %xmm1 - paddd 160(%rsp), %xmm2 - paddd 176(%rsp), %xmm3 - paddd 192(%rsp), %xmm4 - paddd 208(%rsp), %xmm5 - paddd 224(%rsp), %xmm6 - paddd 240(%rsp), %xmm7 - paddd 256(%rsp), %xmm8 - paddd 272(%rsp), %xmm9 - paddd 288(%rsp), %xmm10 - paddd 304(%rsp), %xmm11 - paddd 320(%rsp), %xmm12 - paddd 336(%rsp), %xmm13 - paddd 352(%rsp), %xmm14 - paddd 368(%rsp), %xmm15 - movdqa %xmm8, 384(%rsp) - movdqa %xmm9, 400(%rsp) - movdqa %xmm10, 416(%rsp) - movdqa %xmm11, 432(%rsp) - movdqa %xmm12, 448(%rsp) - movdqa %xmm13, 464(%rsp) - movdqa %xmm14, 480(%rsp) - movdqa %xmm15, 496(%rsp) - movdqa %xmm0, %xmm8 - movdqa %xmm2, %xmm9 - movdqa %xmm4, %xmm10 - movdqa %xmm6, %xmm11 - punpckhdq %xmm1, %xmm0 - punpckhdq %xmm3, %xmm2 - punpckhdq %xmm5, %xmm4 - punpckhdq %xmm7, %xmm6 - punpckldq %xmm1, %xmm8 - punpckldq %xmm3, %xmm9 - punpckldq %xmm5, %xmm10 - punpckldq %xmm7, %xmm11 - movdqa %xmm0, %xmm1 - movdqa %xmm4, %xmm3 - movdqa %xmm8, %xmm5 - movdqa %xmm10, %xmm7 - punpckhqdq %xmm2, %xmm0 - punpckhqdq %xmm6, %xmm4 - punpckhqdq %xmm9, %xmm8 - punpckhqdq %xmm11, %xmm10 - punpcklqdq %xmm2, %xmm1 - punpcklqdq %xmm6, %xmm3 - punpcklqdq %xmm9, %xmm5 - punpcklqdq %xmm11, %xmm7 - andq %rsi, %rsi - jz 
.Lchacha_blocks_ssse3_noinput1 - movdqu 0(%rsi), %xmm2 - movdqu 16(%rsi), %xmm6 - movdqu 64(%rsi), %xmm9 - movdqu 80(%rsi), %xmm11 - movdqu 128(%rsi), %xmm12 - movdqu 144(%rsi), %xmm13 - movdqu 192(%rsi), %xmm14 - movdqu 208(%rsi), %xmm15 - pxor %xmm2, %xmm5 - pxor %xmm6, %xmm7 - pxor %xmm9, %xmm8 - pxor %xmm11, %xmm10 - pxor %xmm12, %xmm1 - pxor %xmm13, %xmm3 - pxor %xmm14, %xmm0 - pxor %xmm15, %xmm4 - movdqu %xmm5, 0(%rdx) - movdqu %xmm7, 16(%rdx) - movdqu %xmm8, 64(%rdx) - movdqu %xmm10, 80(%rdx) - movdqu %xmm1, 128(%rdx) - movdqu %xmm3, 144(%rdx) - movdqu %xmm0, 192(%rdx) - movdqu %xmm4, 208(%rdx) - movdqa 384(%rsp), %xmm0 - movdqa 400(%rsp), %xmm1 - movdqa 416(%rsp), %xmm2 - movdqa 432(%rsp), %xmm3 - movdqa 448(%rsp), %xmm4 - movdqa 464(%rsp), %xmm5 - movdqa 480(%rsp), %xmm6 - movdqa 496(%rsp), %xmm7 - movdqa %xmm0, %xmm8 - movdqa %xmm2, %xmm9 - movdqa %xmm4, %xmm10 - movdqa %xmm6, %xmm11 - punpckldq %xmm1, %xmm8 - punpckldq %xmm3, %xmm9 - punpckhdq %xmm1, %xmm0 - punpckhdq %xmm3, %xmm2 - punpckldq %xmm5, %xmm10 - punpckldq %xmm7, %xmm11 - punpckhdq %xmm5, %xmm4 - punpckhdq %xmm7, %xmm6 - movdqa %xmm8, %xmm1 - movdqa %xmm0, %xmm3 - movdqa %xmm10, %xmm5 - movdqa %xmm4, %xmm7 - punpcklqdq %xmm9, %xmm1 - punpcklqdq %xmm11, %xmm5 - punpckhqdq %xmm9, %xmm8 - punpckhqdq %xmm11, %xmm10 - punpcklqdq %xmm2, %xmm3 - punpcklqdq %xmm6, %xmm7 - punpckhqdq %xmm2, %xmm0 - punpckhqdq %xmm6, %xmm4 - movdqu 32(%rsi), %xmm2 - movdqu 48(%rsi), %xmm6 - movdqu 96(%rsi), %xmm9 - movdqu 112(%rsi), %xmm11 - movdqu 160(%rsi), %xmm12 - movdqu 176(%rsi), %xmm13 - movdqu 224(%rsi), %xmm14 - movdqu 240(%rsi), %xmm15 - pxor %xmm2, %xmm1 - pxor %xmm6, %xmm5 - pxor %xmm9, %xmm8 - pxor %xmm11, %xmm10 - pxor %xmm12, %xmm3 - pxor %xmm13, %xmm7 - pxor %xmm14, %xmm0 - pxor %xmm15, %xmm4 - movdqu %xmm1, 32(%rdx) - movdqu %xmm5, 48(%rdx) - movdqu %xmm8, 96(%rdx) - movdqu %xmm10, 112(%rdx) - movdqu %xmm3, 160(%rdx) - movdqu %xmm7, 176(%rdx) - movdqu %xmm0, 224(%rdx) - movdqu %xmm4, 240(%rdx) - addq $256, %rsi - jmp .Lchacha_blocks_ssse3_mainloop_cont -.Lchacha_blocks_ssse3_noinput1: - movdqu %xmm5, 0(%rdx) - movdqu %xmm7, 16(%rdx) - movdqu %xmm8, 64(%rdx) - movdqu %xmm10, 80(%rdx) - movdqu %xmm1, 128(%rdx) - movdqu %xmm3, 144(%rdx) - movdqu %xmm0, 192(%rdx) - movdqu %xmm4, 208(%rdx) - movdqa 384(%rsp), %xmm0 - movdqa 400(%rsp), %xmm1 - movdqa 416(%rsp), %xmm2 - movdqa 432(%rsp), %xmm3 - movdqa 448(%rsp), %xmm4 - movdqa 464(%rsp), %xmm5 - movdqa 480(%rsp), %xmm6 - movdqa 496(%rsp), %xmm7 - movdqa %xmm0, %xmm8 - movdqa %xmm2, %xmm9 - movdqa %xmm4, %xmm10 - movdqa %xmm6, %xmm11 - punpckldq %xmm1, %xmm8 - punpckldq %xmm3, %xmm9 - punpckhdq %xmm1, %xmm0 - punpckhdq %xmm3, %xmm2 - punpckldq %xmm5, %xmm10 - punpckldq %xmm7, %xmm11 - punpckhdq %xmm5, %xmm4 - punpckhdq %xmm7, %xmm6 - movdqa %xmm8, %xmm1 - movdqa %xmm0, %xmm3 - movdqa %xmm10, %xmm5 - movdqa %xmm4, %xmm7 - punpcklqdq %xmm9, %xmm1 - punpcklqdq %xmm11, %xmm5 - punpckhqdq %xmm9, %xmm8 - punpckhqdq %xmm11, %xmm10 - punpcklqdq %xmm2, %xmm3 - punpcklqdq %xmm6, %xmm7 - punpckhqdq %xmm2, %xmm0 - punpckhqdq %xmm6, %xmm4 - movdqu %xmm1, 32(%rdx) - movdqu %xmm5, 48(%rdx) - movdqu %xmm8, 96(%rdx) - movdqu %xmm10, 112(%rdx) - movdqu %xmm3, 160(%rdx) - movdqu %xmm7, 176(%rdx) - movdqu %xmm0, 224(%rdx) - movdqu %xmm4, 240(%rdx) -.Lchacha_blocks_ssse3_mainloop_cont: - addq $256, %rdx - subq $256, %rcx - cmp $256, %rcx - jae .Lchacha_blocks_ssse3_atleast256 - movdqa 80(%rsp), %xmm6 - movdqa 96(%rsp), %xmm7 - movdqa 0(%rsp), %xmm8 - movdqa 16(%rsp), %xmm9 - movdqa 32(%rsp), %xmm10 - 
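/* The pshufb operands at 80(%rsp)/96(%rsp) are the .LC masks from the
 * end of this file; they implement the 16- and 8-bit ChaCha rotations
 * as byte shuffles, leaving only the 12- and 7-bit rotations to the
 * shift pairs.  In a little-endian 32-bit lane, selecting source bytes
 * 2,3,0,1 is a rotate by 16 and 3,0,1,2 a rotate-left by 8.  Minimal
 * check of the first claim; shuffle_rot16 is an illustrative name: */
#include <stdint.h>
#include <string.h>

static uint32_t shuffle_rot16 (uint32_t x)
{
  unsigned char b[4], r[4];

  memcpy (b, &x, 4);             /* little-endian byte view */
  r[0] = b[2]; r[1] = b[3];      /* source bytes 2,3,0,1 ... */
  r[2] = b[0]; r[3] = b[1];
  memcpy (&x, r, 4);
  return x;                      /* equals (x << 16) | (x >> 16) */
}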
movdqa 48(%rsp), %xmm11 - movq $1, %r9 -.Lchacha_blocks_ssse3_below256: - movq %r9, %xmm5 - andq %rcx, %rcx - jz .Lchacha_blocks_ssse3_done - cmpq $64, %rcx - jae .Lchacha_blocks_ssse3_above63 - movq %rdx, %r9 - andq %rsi, %rsi - jz .Lchacha_blocks_ssse3_noinput2 - movq %rcx, %r10 - movq %rsp, %rdx - addq %r10, %rsi - addq %r10, %rdx - negq %r10 -.Lchacha_blocks_ssse3_copyinput: - movb (%rsi, %r10), %al - movb %al, (%rdx, %r10) - incq %r10 - jnz .Lchacha_blocks_ssse3_copyinput - movq %rsp, %rsi -.Lchacha_blocks_ssse3_noinput2: - movq %rsp, %rdx -.Lchacha_blocks_ssse3_above63: - movdqa %xmm8, %xmm0 - movdqa %xmm9, %xmm1 - movdqa %xmm10, %xmm2 - movdqa %xmm11, %xmm3 - movq 64(%rsp), %rax -.Lchacha_blocks_ssse3_mainloop2: - paddd %xmm1, %xmm0 - pxor %xmm0, %xmm3 - pshufb %xmm6, %xmm3 - paddd %xmm3, %xmm2 - pxor %xmm2, %xmm1 - movdqa %xmm1, %xmm4 - pslld $12, %xmm4 - psrld $20, %xmm1 - pxor %xmm4, %xmm1 - paddd %xmm1, %xmm0 - pxor %xmm0, %xmm3 - pshufb %xmm7, %xmm3 - pshufd $0x93, %xmm0, %xmm0 - paddd %xmm3, %xmm2 - pshufd $0x4e, %xmm3, %xmm3 - pxor %xmm2, %xmm1 - pshufd $0x39, %xmm2, %xmm2 - movdqa %xmm1, %xmm4 - pslld $7, %xmm4 - psrld $25, %xmm1 - pxor %xmm4, %xmm1 - paddd %xmm1, %xmm0 - pxor %xmm0, %xmm3 - pshufb %xmm6, %xmm3 - paddd %xmm3, %xmm2 - pxor %xmm2, %xmm1 - movdqa %xmm1, %xmm4 - pslld $12, %xmm4 - psrld $20, %xmm1 - pxor %xmm4, %xmm1 - paddd %xmm1, %xmm0 - pxor %xmm0, %xmm3 - pshufb %xmm7, %xmm3 - pshufd $0x39, %xmm0, %xmm0 - paddd %xmm3, %xmm2 - pshufd $0x4e, %xmm3, %xmm3 - pxor %xmm2, %xmm1 - pshufd $0x93, %xmm2, %xmm2 - movdqa %xmm1, %xmm4 - pslld $7, %xmm4 - psrld $25, %xmm1 - pxor %xmm4, %xmm1 - subq $2, %rax - jnz .Lchacha_blocks_ssse3_mainloop2 - paddd %xmm8, %xmm0 - paddd %xmm9, %xmm1 - paddd %xmm10, %xmm2 - paddd %xmm11, %xmm3 - andq %rsi, %rsi - jz .Lchacha_blocks_ssse3_noinput3 - movdqu 0(%rsi), %xmm12 - movdqu 16(%rsi), %xmm13 - movdqu 32(%rsi), %xmm14 - movdqu 48(%rsi), %xmm15 - pxor %xmm12, %xmm0 - pxor %xmm13, %xmm1 - pxor %xmm14, %xmm2 - pxor %xmm15, %xmm3 - addq $64, %rsi -.Lchacha_blocks_ssse3_noinput3: - movdqu %xmm0, 0(%rdx) - movdqu %xmm1, 16(%rdx) - movdqu %xmm2, 32(%rdx) - movdqu %xmm3, 48(%rdx) - paddq %xmm5, %xmm11 - cmpq $64, %rcx - jbe .Lchacha_blocks_ssse3_mainloop2_finishup - addq $64, %rdx - subq $64, %rcx - jmp .Lchacha_blocks_ssse3_below256 -.Lchacha_blocks_ssse3_mainloop2_finishup: - cmpq $64, %rcx - je .Lchacha_blocks_ssse3_done - addq %rcx, %r9 - addq %rcx, %rdx - negq %rcx -.Lchacha_blocks_ssse3_copyoutput: - movb (%rdx, %rcx), %al - movb %al, (%r9, %rcx) - incq %rcx - jnz .Lchacha_blocks_ssse3_copyoutput -.Lchacha_blocks_ssse3_done: - movdqu %xmm11, 48(%rdi) - movq %rbp, %rsp - pxor %xmm15, %xmm15 - pxor %xmm7, %xmm7 - pxor %xmm14, %xmm14 - pxor %xmm6, %xmm6 - pxor %xmm13, %xmm13 - pxor %xmm5, %xmm5 - pxor %xmm12, %xmm12 - pxor %xmm4, %xmm4 - popq %rbp - popq %rbx - movl $(63 + 512 + 16), %eax - pxor %xmm11, %xmm11 - pxor %xmm3, %xmm3 - pxor %xmm10, %xmm10 - pxor %xmm2, %xmm2 - pxor %xmm9, %xmm9 - pxor %xmm1, %xmm1 - pxor %xmm8, %xmm8 - pxor %xmm0, %xmm0 - ret -ELF(.size _gcry_chacha20_amd64_ssse3_blocks,.-_gcry_chacha20_amd64_ssse3_blocks;) - -.align 16; -.LC: -.byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 /* pshufb rotate by 16 */ -.byte 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14 /* pshufb rotate by 8 */ - -#endif /*defined(USE_CHACHA20)*/ -#endif /*__x86_64*/ diff --git a/cipher/chacha20.c b/cipher/chacha20.c index 613fa82a..ac6cc29e 100644 --- a/cipher/chacha20.c +++ b/cipher/chacha20.c @@ -1,637 +1,583 @@ /* chacha20.c - Bernstein's ChaCha20 
cipher - * Copyright (C) 2014 Jussi Kivilinna + * Copyright (C) 2014,2017,2018 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see <http://www.gnu.org/licenses/>. * * For a description of the algorithm, see: * http://cr.yp.to/chacha.html */ -/* The code is based on salsa20.c and public-domain ChaCha implementations: - * chacha-ref.c version 20080118 - * D. J. Bernstein - * Public domain. - * and - * Andrew Moon - * https://github.com/floodyberry/chacha-opt +/* + * Based on D. J. Bernstein reference implementation at + * http://cr.yp.to/chacha.html: + * + * chacha-regs.c version 20080118 + * D. J. Bernstein + * Public domain. */ - #include <config.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include "types.h" #include "g10lib.h" #include "cipher.h" #include "bufhelp.h" #define CHACHA20_MIN_KEY_SIZE 16 /* Bytes. */ #define CHACHA20_MAX_KEY_SIZE 32 /* Bytes. */ #define CHACHA20_BLOCK_SIZE 64 /* Bytes. */ #define CHACHA20_MIN_IV_SIZE 8 /* Bytes. */ #define CHACHA20_MAX_IV_SIZE 12 /* Bytes. */ #define CHACHA20_CTR_SIZE 16 /* Bytes. */ -#define CHACHA20_INPUT_LENGTH (CHACHA20_BLOCK_SIZE / 4) -/* USE_SSE2 indicates whether to compile with Intel SSE2 code. */ -#undef USE_SSE2 -#if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ - defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) -# define USE_SSE2 1 -#endif /* USE_SSSE3 indicates whether to compile with Intel SSSE3 code. */ #undef USE_SSSE3 -#if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ - defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ - defined(HAVE_GCC_INLINE_ASM_SSSE3) +#if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_SSSE3) && \ + (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ + defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) # define USE_SSSE3 1 #endif /* USE_AVX2 indicates whether to compile with Intel AVX2 code. */ #undef USE_AVX2 -#if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ - defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ - defined(ENABLE_AVX2_SUPPORT) +#if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_AVX2) && \ + (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ + defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) # define USE_AVX2 1 #endif -/* USE_NEON indicates whether to enable ARM NEON assembly code. */ -#undef USE_NEON +/* USE_ARMV7_NEON indicates whether to enable ARMv7 NEON assembly code. */ +#undef USE_ARMV7_NEON #ifdef ENABLE_NEON_SUPPORT # if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) \ && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) \ && defined(HAVE_GCC_INLINE_ASM_NEON) -# define USE_NEON 1 +# define USE_ARMV7_NEON 1 # endif -#endif /*ENABLE_NEON_SUPPORT*/ - - -struct CHACHA20_context_s; - +#endif /* Assembly implementations use SystemV ABI, ABI conversion and additional * stack to store XMM6-XMM15 needed on Win64.
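/* For context on the Win64 remark above: the Microsoft x64 ABI passes
 * the first arguments in RCX/RDX/R8/R9 and treats XMM6-XMM15 as
 * callee-saved, whereas the SysV AMD64 ABI uses RDI/RSI/RDX/RCX and
 * leaves all XMM registers call-clobbered.  Tagging the assembly
 * prototypes with sysv_abi makes a Win64 compiler emit SysV-style
 * calls to them; the "additional stack" is what the assembly uses to
 * preserve XMM6-XMM15 in that configuration.  Sketch of such a
 * declaration; example_blocks is a placeholder, not a symbol from
 * this patch: */
unsigned int example_blocks (u32 *state, byte *dst, const byte *src,
                             size_t nblks) ASM_FUNC_ABI;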
*/ #undef ASM_FUNC_ABI #undef ASM_EXTRA_STACK -#if (defined(USE_SSE2) || defined(USE_SSSE3) || defined(USE_AVX2)) && \ - defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS) +#if defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS) # define ASM_FUNC_ABI __attribute__((sysv_abi)) -# define ASM_EXTRA_STACK (10 * 16) #else # define ASM_FUNC_ABI -# define ASM_EXTRA_STACK 0 #endif -typedef unsigned int (* chacha20_blocks_t)(u32 *state, const byte *src, - byte *dst, - size_t bytes) ASM_FUNC_ABI; - typedef struct CHACHA20_context_s { - u32 input[CHACHA20_INPUT_LENGTH]; - u32 pad[CHACHA20_INPUT_LENGTH]; - chacha20_blocks_t blocks; + u32 input[16]; + unsigned char pad[CHACHA20_BLOCK_SIZE]; unsigned int unused; /* bytes in the pad. */ + int use_ssse3:1; + int use_avx2:1; + int use_neon:1; } CHACHA20_context_t; -#ifdef USE_SSE2 - -unsigned int _gcry_chacha20_amd64_sse2_blocks(u32 *state, const byte *in, - byte *out, - size_t bytes) ASM_FUNC_ABI; - -#endif /* USE_SSE2 */ - #ifdef USE_SSSE3 -unsigned int _gcry_chacha20_amd64_ssse3_blocks(u32 *state, const byte *in, - byte *out, - size_t bytes) ASM_FUNC_ABI; +unsigned int _gcry_chacha20_amd64_ssse3_blocks4(u32 *state, byte *dst, + const byte *src, + size_t nblks) ASM_FUNC_ABI; #endif /* USE_SSSE3 */ #ifdef USE_AVX2 -unsigned int _gcry_chacha20_amd64_avx2_blocks(u32 *state, const byte *in, - byte *out, - size_t bytes) ASM_FUNC_ABI; +unsigned int _gcry_chacha20_amd64_avx2_blocks8(u32 *state, byte *dst, + const byte *src, + size_t nblks) ASM_FUNC_ABI; #endif /* USE_AVX2 */ -#ifdef USE_NEON +#ifdef USE_ARMV7_NEON -unsigned int _gcry_chacha20_armv7_neon_blocks(u32 *state, const byte *in, - byte *out, - size_t bytes) ASM_FUNC_ABI; +unsigned int _gcry_chacha20_armv7_neon_blocks4(u32 *state, byte *dst, + const byte *src, + size_t nblks); -#endif /* USE_NEON */ +#endif /* USE_ARMV7_NEON */ -static void chacha20_setiv (void *context, const byte * iv, size_t ivlen); static const char *selftest (void); +#define ROTATE(v,c) (rol(v,c)) +#define XOR(v,w) ((v) ^ (w)) +#define PLUS(v,w) ((u32)((v) + (w))) +#define PLUSONE(v) (PLUS((v),1)) -#define QROUND(a,b,c,d) \ - do { \ - a += b; d = rol(d ^ a, 16); \ - c += d; b = rol(b ^ c, 12); \ - a += b; d = rol(d ^ a, 8); \ - c += d; b = rol(b ^ c, 7); \ - } while (0) +#define QUARTERROUND(a,b,c,d) \ + a = PLUS(a,b); d = ROTATE(XOR(d,a),16); \ + c = PLUS(c,d); b = ROTATE(XOR(b,c),12); \ + a = PLUS(a,b); d = ROTATE(XOR(d,a), 8); \ + c = PLUS(c,d); b = ROTATE(XOR(b,c), 7); -#define QOUT(ai, bi, ci, di) \ - DO_OUT(ai); DO_OUT(bi); DO_OUT(ci); DO_OUT(di) +#define BUF_XOR_LE32(dst, src, offset, x) \ + buf_put_le32((dst) + (offset), buf_get_le32((src) + (offset)) ^ (x)) - -#ifndef USE_SSE2 -ASM_FUNC_ABI static unsigned int -chacha20_blocks (u32 *state, const byte *src, byte *dst, size_t bytes) +static unsigned int +chacha20_blocks (u32 *input, byte *dst, const byte *src, size_t nblks) { - u32 pad[CHACHA20_INPUT_LENGTH]; - u32 inp[CHACHA20_INPUT_LENGTH]; + u32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; unsigned int i; - /* Note: 'bytes' must be multiple of 64 and not zero. */ - - inp[0] = state[0]; - inp[1] = state[1]; - inp[2] = state[2]; - inp[3] = state[3]; - inp[4] = state[4]; - inp[5] = state[5]; - inp[6] = state[6]; - inp[7] = state[7]; - inp[8] = state[8]; - inp[9] = state[9]; - inp[10] = state[10]; - inp[11] = state[11]; - inp[12] = state[12]; - inp[13] = state[13]; - inp[14] = state[14]; - inp[15] = state[15]; - - do + while (nblks) { - /* First round. 
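/* The QUARTERROUND defined above is the standard ChaCha quarter round
 * and can be sanity-checked against the test vector in RFC 7539,
 * section 2.1.1.  A standalone sketch reusing the macros;
 * quarterround_check is an illustrative name: */
#include <assert.h>

static void quarterround_check (void)
{
  u32 a = 0x11111111, b = 0x01020304, c = 0x9b8d6f43, d = 0x01234567;

  QUARTERROUND (a, b, c, d)
  assert (a == 0xea2a92f4 && b == 0xcb1cf8ce
          && c == 0x4581472e && d == 0x5881c4bb);
}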
*/ - pad[0] = inp[0]; - pad[4] = inp[4]; - pad[8] = inp[8]; - pad[12] = inp[12]; - QROUND (pad[0], pad[4], pad[8], pad[12]); - pad[1] = inp[1]; - pad[5] = inp[5]; - pad[9] = inp[9]; - pad[13] = inp[13]; - QROUND (pad[1], pad[5], pad[9], pad[13]); - pad[2] = inp[2]; - pad[6] = inp[6]; - pad[10] = inp[10]; - pad[14] = inp[14]; - QROUND (pad[2], pad[6], pad[10], pad[14]); - pad[3] = inp[3]; - pad[7] = inp[7]; - pad[11] = inp[11]; - pad[15] = inp[15]; - QROUND (pad[3], pad[7], pad[11], pad[15]); - - QROUND (pad[0], pad[5], pad[10], pad[15]); - QROUND (pad[1], pad[6], pad[11], pad[12]); - QROUND (pad[2], pad[7], pad[8], pad[13]); - QROUND (pad[3], pad[4], pad[9], pad[14]); - - for (i = 2; i < 20 - 2; i += 2) - { - QROUND (pad[0], pad[4], pad[8], pad[12]); - QROUND (pad[1], pad[5], pad[9], pad[13]); - QROUND (pad[2], pad[6], pad[10], pad[14]); - QROUND (pad[3], pad[7], pad[11], pad[15]); - - QROUND (pad[0], pad[5], pad[10], pad[15]); - QROUND (pad[1], pad[6], pad[11], pad[12]); - QROUND (pad[2], pad[7], pad[8], pad[13]); - QROUND (pad[3], pad[4], pad[9], pad[14]); - } - - QROUND (pad[0], pad[4], pad[8], pad[12]); - QROUND (pad[1], pad[5], pad[9], pad[13]); - QROUND (pad[2], pad[6], pad[10], pad[14]); - QROUND (pad[3], pad[7], pad[11], pad[15]); - - if (src) - { -#define DO_OUT(idx) buf_put_le32(dst + (idx) * 4, \ - (pad[idx] + inp[idx]) ^ \ - buf_get_le32(src + (idx) * 4)) - /* Last round. */ - QROUND (pad[0], pad[5], pad[10], pad[15]); - QOUT(0, 5, 10, 15); - QROUND (pad[1], pad[6], pad[11], pad[12]); - QOUT(1, 6, 11, 12); - QROUND (pad[2], pad[7], pad[8], pad[13]); - QOUT(2, 7, 8, 13); - QROUND (pad[3], pad[4], pad[9], pad[14]); - QOUT(3, 4, 9, 14); -#undef DO_OUT - } - else - { -#define DO_OUT(idx) buf_put_le32(dst + (idx) * 4, pad[idx] + inp[idx]) - /* Last round. */ - QROUND (pad[0], pad[5], pad[10], pad[15]); - QOUT(0, 5, 10, 15); - QROUND (pad[1], pad[6], pad[11], pad[12]); - QOUT(1, 6, 11, 12); - QROUND (pad[2], pad[7], pad[8], pad[13]); - QOUT(2, 7, 8, 13); - QROUND (pad[3], pad[4], pad[9], pad[14]); - QOUT(3, 4, 9, 14); -#undef DO_OUT - } - - /* Update counter. 
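/* The counter update just below treats input[12..13] as one 64-bit
 * little-endian block counter: after PLUSONE, !input[12] is 1 exactly
 * when word 12 wrapped to zero, which propagates the carry.
 * Equivalent u64 view (u32/u64 as in types.h); counter_step is an
 * illustrative name: */
static void counter_step (u32 input[16])
{
  u64 ctr = ((u64) input[13] << 32) | input[12];

  ctr++;
  input[12] = (u32) ctr;
  input[13] = (u32) (ctr >> 32);
}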
*/ - inp[13] += (!++inp[12]); - - bytes -= CHACHA20_BLOCK_SIZE; + x0 = input[0]; + x1 = input[1]; + x2 = input[2]; + x3 = input[3]; + x4 = input[4]; + x5 = input[5]; + x6 = input[6]; + x7 = input[7]; + x8 = input[8]; + x9 = input[9]; + x10 = input[10]; + x11 = input[11]; + x12 = input[12]; + x13 = input[13]; + x14 = input[14]; + x15 = input[15]; + + for (i = 20; i > 0; i -= 2) + { + QUARTERROUND(x0, x4, x8, x12) + QUARTERROUND(x1, x5, x9, x13) + QUARTERROUND(x2, x6, x10, x14) + QUARTERROUND(x3, x7, x11, x15) + QUARTERROUND(x0, x5, x10, x15) + QUARTERROUND(x1, x6, x11, x12) + QUARTERROUND(x2, x7, x8, x13) + QUARTERROUND(x3, x4, x9, x14) + } + + x0 = PLUS(x0, input[0]); + x1 = PLUS(x1, input[1]); + x2 = PLUS(x2, input[2]); + x3 = PLUS(x3, input[3]); + x4 = PLUS(x4, input[4]); + x5 = PLUS(x5, input[5]); + x6 = PLUS(x6, input[6]); + x7 = PLUS(x7, input[7]); + x8 = PLUS(x8, input[8]); + x9 = PLUS(x9, input[9]); + x10 = PLUS(x10, input[10]); + x11 = PLUS(x11, input[11]); + x12 = PLUS(x12, input[12]); + x13 = PLUS(x13, input[13]); + x14 = PLUS(x14, input[14]); + x15 = PLUS(x15, input[15]); + + input[12] = PLUSONE(input[12]); + input[13] = PLUS(input[13], !input[12]); + + BUF_XOR_LE32(dst, src, 0, x0); + BUF_XOR_LE32(dst, src, 4, x1); + BUF_XOR_LE32(dst, src, 8, x2); + BUF_XOR_LE32(dst, src, 12, x3); + BUF_XOR_LE32(dst, src, 16, x4); + BUF_XOR_LE32(dst, src, 20, x5); + BUF_XOR_LE32(dst, src, 24, x6); + BUF_XOR_LE32(dst, src, 28, x7); + BUF_XOR_LE32(dst, src, 32, x8); + BUF_XOR_LE32(dst, src, 36, x9); + BUF_XOR_LE32(dst, src, 40, x10); + BUF_XOR_LE32(dst, src, 44, x11); + BUF_XOR_LE32(dst, src, 48, x12); + BUF_XOR_LE32(dst, src, 52, x13); + BUF_XOR_LE32(dst, src, 56, x14); + BUF_XOR_LE32(dst, src, 60, x15); + + src += CHACHA20_BLOCK_SIZE; dst += CHACHA20_BLOCK_SIZE; - src += (src) ? CHACHA20_BLOCK_SIZE : 0; + nblks--; } - while (bytes >= CHACHA20_BLOCK_SIZE); - - state[12] = inp[12]; - state[13] = inp[13]; /* burn_stack */ - return (2 * CHACHA20_INPUT_LENGTH * sizeof(u32) + 6 * sizeof(void *)); -} -#endif /*!USE_SSE2*/ - -#undef QROUND -#undef QOUT - - -static unsigned int -chacha20_core(u32 *dst, struct CHACHA20_context_s *ctx) -{ - return ctx->blocks(ctx->input, NULL, (byte *)dst, CHACHA20_BLOCK_SIZE) - + ASM_EXTRA_STACK; + return (17 * sizeof(u32) + 6 * sizeof(void *)); } static void -chacha20_keysetup (CHACHA20_context_t * ctx, const byte * key, +chacha20_keysetup (CHACHA20_context_t *ctx, const byte *key, unsigned int keylen) { - /* These constants are the little endian encoding of the string - "expand 32-byte k". For the 128 bit variant, the "32" in that - string will be fixed up to "16". 
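Two quick standalone checks of the rewritten core may help the reader (scaffolding only, not library code): the new QUARTERROUND macro with PLUS/ROTATE/XOR spelled out as plain C, verified against the quarter-round test vector from RFC 8439 (formerly RFC 7539), section 2.1.1, plus a re-derivation of the constant words that the old code hard-coded as 0x61707865 and friends and that the new keysetup reads out of the sigma/tau strings. The get_le32 helper is a stand-in for the library's buf_get_le32.

#include <stdint.h>
#include <stdio.h>

#define ROL(v,c)  (((v) << (c)) | ((v) >> (32 - (c))))
#define QUARTERROUND(a,b,c,d) \
  a += b; d = ROL(d ^ a, 16); \
  c += d; b = ROL(b ^ c, 12); \
  a += b; d = ROL(d ^ a,  8); \
  c += d; b = ROL(b ^ c,  7);

static uint32_t get_le32 (const unsigned char *p)
{
  /* Little-endian 32-bit load, as buf_get_le32 does. */
  return (uint32_t)p[0] | ((uint32_t)p[1] << 8)
         | ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
}

int main (void)
{
  static const unsigned char sigma[16] = "expand 32-byte k";
  uint32_t a = 0x11111111, b = 0x01020304, c = 0x9b8d6f43, d = 0x01234567;
  int i;

  /* RFC 8439, 2.1.1 expects ea2a92f4 cb1cf8ce 4581472e 5881c4bb. */
  QUARTERROUND (a, b, c, d);
  printf ("%08x %08x %08x %08x\n",
          (unsigned) a, (unsigned) b, (unsigned) c, (unsigned) d);

  /* Prints 0x61707865 0x3320646e 0x79622d32 0x6b206574, i.e. the four
   * constant words documented in the removed comment here. */
  for (i = 0; i < 4; i++)
    printf ("0x%08x ", (unsigned) get_le32 (sigma + 4 * i));
  printf ("\n");
  return 0;
}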
*/ - ctx->input[0] = 0x61707865; /* "apxe" */ - ctx->input[1] = 0x3320646e; /* "3 dn" */ - ctx->input[2] = 0x79622d32; /* "yb-2" */ - ctx->input[3] = 0x6b206574; /* "k et" */ - - ctx->input[4] = buf_get_le32 (key + 0); - ctx->input[5] = buf_get_le32 (key + 4); - ctx->input[6] = buf_get_le32 (key + 8); - ctx->input[7] = buf_get_le32 (key + 12); - + static const char sigma[16] = "expand 32-byte k"; + static const char tau[16] = "expand 16-byte k"; + const char *constants; + + ctx->input[4] = buf_get_le32(key + 0); + ctx->input[5] = buf_get_le32(key + 4); + ctx->input[6] = buf_get_le32(key + 8); + ctx->input[7] = buf_get_le32(key + 12); if (keylen == CHACHA20_MAX_KEY_SIZE) /* 256 bits */ { - ctx->input[8] = buf_get_le32 (key + 16); - ctx->input[9] = buf_get_le32 (key + 20); - ctx->input[10] = buf_get_le32 (key + 24); - ctx->input[11] = buf_get_le32 (key + 28); + key += 16; + constants = sigma; } else /* 128 bits */ { - ctx->input[8] = ctx->input[4]; - ctx->input[9] = ctx->input[5]; - ctx->input[10] = ctx->input[6]; - ctx->input[11] = ctx->input[7]; - - ctx->input[1] -= 0x02000000; /* Change to "1 dn". */ - ctx->input[2] += 0x00000004; /* Change to "yb-6". */ + constants = tau; } + ctx->input[8] = buf_get_le32(key + 0); + ctx->input[9] = buf_get_le32(key + 4); + ctx->input[10] = buf_get_le32(key + 8); + ctx->input[11] = buf_get_le32(key + 12); + ctx->input[0] = buf_get_le32(constants + 0); + ctx->input[1] = buf_get_le32(constants + 4); + ctx->input[2] = buf_get_le32(constants + 8); + ctx->input[3] = buf_get_le32(constants + 12); } static void -chacha20_ivsetup (CHACHA20_context_t * ctx, const byte * iv, size_t ivlen) +chacha20_ivsetup (CHACHA20_context_t * ctx, const byte *iv, size_t ivlen) { if (ivlen == CHACHA20_CTR_SIZE) { ctx->input[12] = buf_get_le32 (iv + 0); ctx->input[13] = buf_get_le32 (iv + 4); ctx->input[14] = buf_get_le32 (iv + 8); ctx->input[15] = buf_get_le32 (iv + 12); } else if (ivlen == CHACHA20_MAX_IV_SIZE) { ctx->input[12] = 0; ctx->input[13] = buf_get_le32 (iv + 0); ctx->input[14] = buf_get_le32 (iv + 4); ctx->input[15] = buf_get_le32 (iv + 8); } else if (ivlen == CHACHA20_MIN_IV_SIZE) { ctx->input[12] = 0; ctx->input[13] = 0; ctx->input[14] = buf_get_le32 (iv + 0); ctx->input[15] = buf_get_le32 (iv + 4); } else { ctx->input[12] = 0; ctx->input[13] = 0; ctx->input[14] = 0; ctx->input[15] = 0; } } +static void +chacha20_setiv (void *context, const byte *iv, size_t ivlen) +{ + CHACHA20_context_t *ctx = (CHACHA20_context_t *) context; + + /* draft-nir-cfrg-chacha20-poly1305-02 defines 96-bit and 64-bit nonce. */ + if (iv && ivlen != CHACHA20_MAX_IV_SIZE && ivlen != CHACHA20_MIN_IV_SIZE + && ivlen != CHACHA20_CTR_SIZE) + log_info ("WARNING: chacha20_setiv: bad ivlen=%u\n", (u32) ivlen); + + if (iv && (ivlen == CHACHA20_MAX_IV_SIZE || ivlen == CHACHA20_MIN_IV_SIZE + || ivlen == CHACHA20_CTR_SIZE)) + chacha20_ivsetup (ctx, iv, ivlen); + else + chacha20_ivsetup (ctx, NULL, 0); + + /* Reset the unused pad bytes counter. 
*/ + ctx->unused = 0; +} + + static gcry_err_code_t -chacha20_do_setkey (CHACHA20_context_t * ctx, - const byte * key, unsigned int keylen) +chacha20_do_setkey (CHACHA20_context_t *ctx, + const byte *key, unsigned int keylen) { static int initialized; static const char *selftest_failed; unsigned int features = _gcry_get_hw_features (); if (!initialized) { initialized = 1; selftest_failed = selftest (); if (selftest_failed) log_error ("CHACHA20 selftest failed (%s)\n", selftest_failed); } if (selftest_failed) return GPG_ERR_SELFTEST_FAILED; if (keylen != CHACHA20_MAX_KEY_SIZE && keylen != CHACHA20_MIN_KEY_SIZE) return GPG_ERR_INV_KEYLEN; -#ifdef USE_SSE2 - ctx->blocks = _gcry_chacha20_amd64_sse2_blocks; -#else - ctx->blocks = chacha20_blocks; -#endif - #ifdef USE_SSSE3 - if (features & HWF_INTEL_SSSE3) - ctx->blocks = _gcry_chacha20_amd64_ssse3_blocks; + ctx->use_ssse3 = (features & HWF_INTEL_SSSE3) != 0; #endif #ifdef USE_AVX2 - if (features & HWF_INTEL_AVX2) - ctx->blocks = _gcry_chacha20_amd64_avx2_blocks; + ctx->use_avx2 = (features & HWF_INTEL_AVX2) != 0; #endif -#ifdef USE_NEON - if (features & HWF_ARM_NEON) - ctx->blocks = _gcry_chacha20_armv7_neon_blocks; +#ifdef USE_ARMV7_NEON + ctx->use_neon = (features & HWF_ARM_NEON) != 0; #endif - (void)features; chacha20_keysetup (ctx, key, keylen); /* We default to a zero nonce. */ chacha20_setiv (ctx, NULL, 0); return 0; } static gcry_err_code_t -chacha20_setkey (void *context, const byte * key, unsigned int keylen) +chacha20_setkey (void *context, const byte *key, unsigned int keylen) { CHACHA20_context_t *ctx = (CHACHA20_context_t *) context; gcry_err_code_t rc = chacha20_do_setkey (ctx, key, keylen); _gcry_burn_stack (4 + sizeof (void *) + 4 * sizeof (void *)); return rc; } static void -chacha20_setiv (void *context, const byte * iv, size_t ivlen) +chacha20_encrypt_stream (void *context, byte *outbuf, const byte *inbuf, + size_t length) { + static const unsigned char zero_pad[CHACHA20_BLOCK_SIZE] = { 0, }; CHACHA20_context_t *ctx = (CHACHA20_context_t *) context; - - /* draft-nir-cfrg-chacha20-poly1305-02 defines 96-bit and 64-bit nonce. */ - if (iv && ivlen != CHACHA20_MAX_IV_SIZE && ivlen != CHACHA20_MIN_IV_SIZE - && ivlen != CHACHA20_CTR_SIZE) - log_info ("WARNING: chacha20_setiv: bad ivlen=%u\n", (u32) ivlen); - - if (iv && (ivlen == CHACHA20_MAX_IV_SIZE || ivlen == CHACHA20_MIN_IV_SIZE - || ivlen == CHACHA20_CTR_SIZE)) - chacha20_ivsetup (ctx, iv, ivlen); - else - chacha20_ivsetup (ctx, NULL, 0); - - /* Reset the unused pad bytes counter. */ - ctx->unused = 0; -} - - - -/* Note: This function requires LENGTH > 0. */ -static void -chacha20_do_encrypt_stream (CHACHA20_context_t * ctx, - byte * outbuf, const byte * inbuf, size_t length) -{ unsigned int nburn, burn = 0; + if (!length) + return; + if (ctx->unused) { - unsigned char *p = (void *) ctx->pad; + unsigned char *p = ctx->pad; size_t n; gcry_assert (ctx->unused < CHACHA20_BLOCK_SIZE); n = ctx->unused; if (n > length) n = length; + buf_xor (outbuf, inbuf, p + CHACHA20_BLOCK_SIZE - ctx->unused, n); length -= n; outbuf += n; inbuf += n; ctx->unused -= n; + if (!length) return; gcry_assert (!ctx->unused); } +#ifdef USE_AVX2 + if (ctx->use_avx2 && length >= CHACHA20_BLOCK_SIZE * 8) + { + size_t nblocks = length / CHACHA20_BLOCK_SIZE; + nblocks -= nblocks % 8; + nburn = _gcry_chacha20_amd64_avx2_blocks8(ctx->input, outbuf, inbuf, + nblocks); + burn = nburn > burn ? 
nburn : burn; + length -= nblocks * CHACHA20_BLOCK_SIZE; + outbuf += nblocks * CHACHA20_BLOCK_SIZE; + inbuf += nblocks * CHACHA20_BLOCK_SIZE; + } +#endif + +#ifdef USE_SSSE3 + if (ctx->use_ssse3 && length >= CHACHA20_BLOCK_SIZE * 4) + { + size_t nblocks = length / CHACHA20_BLOCK_SIZE; + nblocks -= nblocks % 4; + nburn = _gcry_chacha20_amd64_ssse3_blocks4(ctx->input, outbuf, inbuf, + nblocks); + burn = nburn > burn ? nburn : burn; + length -= nblocks * CHACHA20_BLOCK_SIZE; + outbuf += nblocks * CHACHA20_BLOCK_SIZE; + inbuf += nblocks * CHACHA20_BLOCK_SIZE; + } +#endif + +#ifdef USE_ARMV7_NEON + if (ctx->use_neon && length >= CHACHA20_BLOCK_SIZE * 4) + { + size_t nblocks = length / CHACHA20_BLOCK_SIZE; + nblocks -= nblocks % 4; + nburn = _gcry_chacha20_armv7_neon_blocks4(ctx->input, outbuf, inbuf, + nblocks); + burn = nburn > burn ? nburn : burn; + length -= nblocks * CHACHA20_BLOCK_SIZE; + outbuf += nblocks * CHACHA20_BLOCK_SIZE; + inbuf += nblocks * CHACHA20_BLOCK_SIZE; + } +#endif + if (length >= CHACHA20_BLOCK_SIZE) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; - size_t bytes = nblocks * CHACHA20_BLOCK_SIZE; - burn = ctx->blocks(ctx->input, inbuf, outbuf, bytes); - length -= bytes; - outbuf += bytes; - inbuf += bytes; + nburn = chacha20_blocks(ctx->input, outbuf, inbuf, nblocks); + burn = nburn > burn ? nburn : burn; + length -= nblocks * CHACHA20_BLOCK_SIZE; + outbuf += nblocks * CHACHA20_BLOCK_SIZE; + inbuf += nblocks * CHACHA20_BLOCK_SIZE; } if (length > 0) { - nburn = chacha20_core (ctx->pad, ctx); + nburn = chacha20_blocks(ctx->input, ctx->pad, zero_pad, 1); burn = nburn > burn ? nburn : burn; buf_xor (outbuf, inbuf, ctx->pad, length); ctx->unused = CHACHA20_BLOCK_SIZE - length; } _gcry_burn_stack (burn); } -static void -chacha20_encrypt_stream (void *context, byte * outbuf, const byte * inbuf, - size_t length) -{ - CHACHA20_context_t *ctx = (CHACHA20_context_t *) context; - - if (length) - chacha20_do_encrypt_stream (ctx, outbuf, inbuf, length); -} - - static const char * selftest (void) { byte ctxbuf[sizeof(CHACHA20_context_t) + 15]; CHACHA20_context_t *ctx; byte scratch[127 + 1]; byte buf[512 + 64 + 4]; int i; /* From draft-strombergson-chacha-test-vectors */ static byte key_1[] = { 0xc4, 0x6e, 0xc1, 0xb1, 0x8c, 0xe8, 0xa8, 0x78, 0x72, 0x5a, 0x37, 0xe7, 0x80, 0xdf, 0xb7, 0x35, 0x1f, 0x68, 0xed, 0x2e, 0x19, 0x4c, 0x79, 0xfb, 0xc6, 0xae, 0xbe, 0xe1, 0xa6, 0x67, 0x97, 0x5d }; static const byte nonce_1[] = { 0x1a, 0xda, 0x31, 0xd5, 0xcf, 0x68, 0x82, 0x21 }; static const byte plaintext_1[127] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, }; static const byte ciphertext_1[127] = { 0xf6, 0x3a, 0x89, 0xb7, 0x5c, 0x22, 0x71, 0xf9, 0x36, 0x88, 0x16, 0x54, 0x2b, 0xa5, 0x2f, 0x06, 0xed, 0x49, 0x24, 0x17, 0x92, 0x30, 0x2b, 0x00, 0xb5, 0xe8, 0xf8, 0x0a, 0xe9, 0xa4, 0x73, 0xaf, 0xc2, 
0x5b, 0x21, 0x8f, 0x51, 0x9a, 0xf0, 0xfd, 0xd4, 0x06, 0x36, 0x2e, 0x8d, 0x69, 0xde, 0x7f, 0x54, 0xc6, 0x04, 0xa6, 0xe0, 0x0f, 0x35, 0x3f, 0x11, 0x0f, 0x77, 0x1b, 0xdc, 0xa8, 0xab, 0x92, 0xe5, 0xfb, 0xc3, 0x4e, 0x60, 0xa1, 0xd9, 0xa9, 0xdb, 0x17, 0x34, 0x5b, 0x0a, 0x40, 0x27, 0x36, 0x85, 0x3b, 0xf9, 0x10, 0xb0, 0x60, 0xbd, 0xf1, 0xf8, 0x97, 0xb6, 0x29, 0x0f, 0x01, 0xd1, 0x38, 0xae, 0x2c, 0x4c, 0x90, 0x22, 0x5b, 0xa9, 0xea, 0x14, 0xd5, 0x18, 0xf5, 0x59, 0x29, 0xde, 0xa0, 0x98, 0xca, 0x7a, 0x6c, 0xcf, 0xe6, 0x12, 0x27, 0x05, 0x3c, 0x84, 0xe4, 0x9a, 0x4a, 0x33 }; /* 16-byte alignment required for amd64 implementation. */ ctx = (CHACHA20_context_t *)((uintptr_t)(ctxbuf + 15) & ~(uintptr_t)15); chacha20_setkey (ctx, key_1, sizeof key_1); chacha20_setiv (ctx, nonce_1, sizeof nonce_1); scratch[sizeof (scratch) - 1] = 0; chacha20_encrypt_stream (ctx, scratch, plaintext_1, sizeof plaintext_1); if (memcmp (scratch, ciphertext_1, sizeof ciphertext_1)) return "ChaCha20 encryption test 1 failed."; if (scratch[sizeof (scratch) - 1]) return "ChaCha20 wrote too much."; chacha20_setkey (ctx, key_1, sizeof (key_1)); chacha20_setiv (ctx, nonce_1, sizeof nonce_1); chacha20_encrypt_stream (ctx, scratch, scratch, sizeof plaintext_1); if (memcmp (scratch, plaintext_1, sizeof plaintext_1)) return "ChaCha20 decryption test 1 failed."; for (i = 0; i < sizeof buf; i++) buf[i] = i; chacha20_setkey (ctx, key_1, sizeof key_1); chacha20_setiv (ctx, nonce_1, sizeof nonce_1); /*encrypt */ chacha20_encrypt_stream (ctx, buf, buf, sizeof buf); /*decrypt */ chacha20_setkey (ctx, key_1, sizeof key_1); chacha20_setiv (ctx, nonce_1, sizeof nonce_1); chacha20_encrypt_stream (ctx, buf, buf, 1); chacha20_encrypt_stream (ctx, buf + 1, buf + 1, (sizeof buf) - 1 - 1); chacha20_encrypt_stream (ctx, buf + (sizeof buf) - 1, buf + (sizeof buf) - 1, 1); for (i = 0; i < sizeof buf; i++) if (buf[i] != (byte) i) return "ChaCha20 encryption test 2 failed."; chacha20_setkey (ctx, key_1, sizeof key_1); chacha20_setiv (ctx, nonce_1, sizeof nonce_1); /* encrypt */ for (i = 0; i < sizeof buf; i++) chacha20_encrypt_stream (ctx, &buf[i], &buf[i], 1); /* decrypt */ chacha20_setkey (ctx, key_1, sizeof key_1); chacha20_setiv (ctx, nonce_1, sizeof nonce_1); chacha20_encrypt_stream (ctx, buf, buf, sizeof buf); for (i = 0; i < sizeof buf; i++) if (buf[i] != (byte) i) return "ChaCha20 encryption test 3 failed."; return NULL; } gcry_cipher_spec_t _gcry_cipher_spec_chacha20 = { GCRY_CIPHER_CHACHA20, {0, 0}, /* flags */ "CHACHA20", /* name */ NULL, /* aliases */ NULL, /* oids */ 1, /* blocksize in bytes. */ CHACHA20_MAX_KEY_SIZE * 8, /* standard key length in bits. */ sizeof (CHACHA20_context_t), chacha20_setkey, NULL, NULL, chacha20_encrypt_stream, chacha20_encrypt_stream, NULL, NULL, chacha20_setiv }; diff --git a/configure.ac b/configure.ac index c4b59f4d..a5aba144 100644 --- a/configure.ac +++ b/configure.ac @@ -1,2652 +1,2651 @@ # Configure.ac script for Libgcrypt # Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2006, # 2007, 2008, 2009, 2011 Free Software Foundation, Inc. # Copyright (C) 2012-2017 g10 Code GmbH # # This file is part of Libgcrypt. # # Libgcrypt is free software; you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public License as # published by the Free Software Foundation; either version 2.1 of # the License, or (at your option) any later version. 
# # Libgcrypt is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public # License along with this program; if not, see . # (Process this file with autoconf to produce a configure script.) AC_REVISION($Revision$) AC_PREREQ(2.60) min_automake_version="1.14" # To build a release you need to create a tag with the version number # (git tag -s libgcrypt-n.m.k) and run "./autogen.sh --force". Please # bump the version number immediately after the release and do another # commit and push so that the git magic is able to work. See below # for the LT versions. m4_define(mym4_version_major, [1]) m4_define(mym4_version_minor, [9]) m4_define(mym4_version_micro, [0]) # Below is m4 magic to extract and compute the revision number, the # decimalized short revision number, a beta version string, and a flag # indicating a development version (mym4_isgit). Note that the m4 # processing is done by autoconf and not during the configure run. m4_define(mym4_version, [mym4_version_major.mym4_version_minor.mym4_version_micro]) m4_define([mym4_revision], m4_esyscmd([git rev-parse --short HEAD | tr -d '\n\r'])) m4_define([mym4_revision_dec], m4_esyscmd_s([echo $((0x$(echo ]mym4_revision[|head -c 4)))])) m4_define([mym4_betastring], m4_esyscmd_s([git describe --match 'libgcrypt-[0-9].*[0-9]' --long|\ awk -F- '$3!=0{print"-beta"$3}'])) m4_define([mym4_isgit],m4_if(mym4_betastring,[],[no],[yes])) m4_define([mym4_full_version],[mym4_version[]mym4_betastring]) AC_INIT([libgcrypt],[mym4_full_version],[http://bugs.gnupg.org]) # LT Version numbers, remember to change them just *before* a release. # (Interfaces removed: CURRENT++, AGE=0, REVISION=0) # (Interfaces added: CURRENT++, AGE++, REVISION=0) # (No interfaces changed: REVISION++) LIBGCRYPT_LT_CURRENT=23 LIBGCRYPT_LT_AGE=3 LIBGCRYPT_LT_REVISION=0 # If the API is changed in an incompatible way: increment the next counter. # # 1.6: ABI and API change but the change is to most users irrelevant # and thus the API version number has not been incremented. LIBGCRYPT_CONFIG_API_VERSION=1 # If you change the required gpg-error version, please remove # unnecessary error code defines in src/gcrypt-int.h. NEED_GPG_ERROR_VERSION=1.25 PACKAGE=$PACKAGE_NAME VERSION=$PACKAGE_VERSION AC_CONFIG_AUX_DIR([build-aux]) AC_CONFIG_SRCDIR([src/libgcrypt.vers]) AM_INIT_AUTOMAKE([serial-tests dist-bzip2]) AC_CONFIG_HEADER(config.h) AC_CONFIG_MACRO_DIR([m4]) AC_CONFIG_LIBOBJ_DIR([compat]) AC_CANONICAL_HOST AM_MAINTAINER_MODE AM_SILENT_RULES AC_ARG_VAR(SYSROOT,[locate config scripts also below that directory]) AH_TOP([ #ifndef _GCRYPT_CONFIG_H_INCLUDED #define _GCRYPT_CONFIG_H_INCLUDED /* Enable gpg-error's strerror macro for W32CE. */ #define GPG_ERR_ENABLE_ERRNO_MACROS 1 ]) AH_BOTTOM([ #define _GCRYPT_IN_LIBGCRYPT 1 /* If the configure check for endianness has been disabled, get it from OS macros. This is intended for making fat binary builds on OS X. */ #ifdef DISABLED_ENDIAN_CHECK # if defined(__BIG_ENDIAN__) # define WORDS_BIGENDIAN 1 # elif defined(__LITTLE_ENDIAN__) # undef WORDS_BIGENDIAN # else # error "No endianness found" # endif #endif /*DISABLED_ENDIAN_CHECK*/ /* We basically use the original Camellia source. Make sure the symbols properly prefixed. 
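As an aside on the VERSION_NUMBER substitution above: the m4_esyscmd printf packs major/minor/micro into one hex constant, which is plain shift-and-or arithmetic. A throwaway C check (values taken from the mym4_version_* defines above):

#include <stdio.h>

int main (void)
{
  unsigned int major = 1, minor = 9, micro = 0;   /* mym4_version_* */
  unsigned int version_number = (major << 16) | (minor << 8) | micro;

  printf ("0x%06x\n", version_number);   /* 0x010900 for 1.9.0 */
  return 0;
}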
*/ #define CAMELLIA_EXT_SYM_PREFIX _gcry_ #endif /*_GCRYPT_CONFIG_H_INCLUDED*/ ]) AH_VERBATIM([_REENTRANT], [/* To allow the use of Libgcrypt in multithreaded programs we have to use special features from the library. */ #ifndef _REENTRANT # define _REENTRANT 1 #endif ]) AC_SUBST(LIBGCRYPT_LT_CURRENT) AC_SUBST(LIBGCRYPT_LT_AGE) AC_SUBST(LIBGCRYPT_LT_REVISION) AC_SUBST(PACKAGE) AC_SUBST(VERSION) AC_DEFINE_UNQUOTED(PACKAGE, "$PACKAGE", [Name of this package]) AC_DEFINE_UNQUOTED(VERSION, "$VERSION", [Version of this package]) VERSION_NUMBER=m4_esyscmd(printf "0x%02x%02x%02x" mym4_version_major \ mym4_version_minor mym4_version_micro) AC_SUBST(VERSION_NUMBER) ###################### ## Basic checks. ### (we need some results later on (e.g. $GCC) ###################### AC_PROG_MAKE_SET missing_dir=`cd $ac_aux_dir && pwd` AM_MISSING_PROG(ACLOCAL, aclocal, $missing_dir) AM_MISSING_PROG(AUTOCONF, autoconf, $missing_dir) AM_MISSING_PROG(AUTOMAKE, automake, $missing_dir) AM_MISSING_PROG(AUTOHEADER, autoheader, $missing_dir) # AM_MISSING_PROG(MAKEINFO, makeinfo, $missing_dir) AC_PROG_CC AC_PROG_CPP AM_PROG_CC_C_O AM_PROG_AS AC_ISC_POSIX AC_PROG_INSTALL AC_PROG_AWK AC_GNU_SOURCE # We need to compile and run a program on the build machine. A # comment in libgpg-error says that the AC_PROG_CC_FOR_BUILD macro in # the AC archive is broken for autoconf 2.57. Given that there is no # newer version of that macro, we assume that it is also broken for # autoconf 2.61 and thus we use a simple but usually sufficient # approach. AC_MSG_CHECKING(for cc for build) if test "$cross_compiling" = "yes"; then CC_FOR_BUILD="${CC_FOR_BUILD-cc}" else CC_FOR_BUILD="${CC_FOR_BUILD-$CC}" fi AC_MSG_RESULT($CC_FOR_BUILD) AC_ARG_VAR(CC_FOR_BUILD,[build system C compiler]) LT_PREREQ([2.2.6]) LT_INIT([win32-dll disable-static]) LT_LANG([Windows Resource]) ########################## ## General definitions. ## ########################## # Used by libgcrypt-config LIBGCRYPT_CONFIG_LIBS="-lgcrypt" LIBGCRYPT_CONFIG_CFLAGS="" LIBGCRYPT_CONFIG_HOST="$host" # Definitions for symmetric ciphers. available_ciphers="arcfour blowfish cast5 des aes twofish serpent rfc2268 seed" available_ciphers="$available_ciphers camellia idea salsa20 gost28147 chacha20" enabled_ciphers="" # Definitions for public-key ciphers. available_pubkey_ciphers="dsa elgamal rsa ecc" enabled_pubkey_ciphers="" # Definitions for message digests. available_digests="crc gostr3411-94 md2 md4 md5 rmd160 sha1 sha256 sha512" available_digests="$available_digests sha3 tiger whirlpool stribog blake2" available_digests="$available_digests sm3" enabled_digests="" # Definitions for kdfs (optional ones) available_kdfs="s2k pkdf2 scrypt" enabled_kdfs="" # Definitions for random modules. available_random_modules="linux egd unix" auto_random_modules="$available_random_modules" # Supported thread backends. LIBGCRYPT_THREAD_MODULES="" # Other definitions. have_w32_system=no have_w32ce_system=no have_pthread=no # Setup some stuff depending on host. case "${host}" in *-*-mingw32*) ac_cv_have_dev_random=no have_w32_system=yes case "${host}" in *-mingw32ce*) have_w32ce_system=yes available_random_modules="w32ce" ;; *) available_random_modules="w32" ;; esac AC_DEFINE(USE_ONLY_8DOT3,1, [set this to limit filenames to the 8.3 format]) AC_DEFINE(HAVE_DRIVE_LETTERS,1, [defined if we must run on a stupid file system]) AC_DEFINE(HAVE_DOSISH_SYSTEM,1, [defined if we run on some of the PCDOS like systems (DOS, Windoze. 
OS/2) with special properties like no file modes]) ;; i?86-emx-os2 | i?86-*-os2*emx) # OS/2 with the EMX environment ac_cv_have_dev_random=no AC_DEFINE(HAVE_DRIVE_LETTERS) AC_DEFINE(HAVE_DOSISH_SYSTEM) ;; i?86-*-msdosdjgpp*) # DOS with the DJGPP environment ac_cv_have_dev_random=no AC_DEFINE(HAVE_DRIVE_LETTERS) AC_DEFINE(HAVE_DOSISH_SYSTEM) ;; *-*-hpux*) if test -z "$GCC" ; then CFLAGS="$CFLAGS -Ae -D_HPUX_SOURCE" fi ;; *-dec-osf4*) if test -z "$GCC" ; then # Suppress all warnings # to get rid of the unsigned/signed char mismatch warnings. CFLAGS="$CFLAGS -w" fi ;; m68k-atari-mint) ;; *-apple-darwin*) AC_DEFINE(_DARWIN_C_SOURCE, 900000L, Expose all libc features (__DARWIN_C_FULL).) ;; *) ;; esac if test "$have_w32_system" = yes; then AC_DEFINE(HAVE_W32_SYSTEM,1, [Defined if we run on a W32 API based system]) if test "$have_w32ce_system" = yes; then AC_DEFINE(HAVE_W32CE_SYSTEM,1,[Defined if we run on WindowsCE]) fi fi AM_CONDITIONAL(HAVE_W32_SYSTEM, test "$have_w32_system" = yes) AM_CONDITIONAL(HAVE_W32CE_SYSTEM, test "$have_w32ce_system" = yes) # A printable OS Name is sometimes useful. case "${host}" in *-*-mingw32ce*) PRINTABLE_OS_NAME="W32CE" ;; *-*-mingw32*) PRINTABLE_OS_NAME="W32" ;; i?86-emx-os2 | i?86-*-os2*emx ) PRINTABLE_OS_NAME="OS/2" ;; i?86-*-msdosdjgpp*) PRINTABLE_OS_NAME="MSDOS/DJGPP" ;; *-linux*) PRINTABLE_OS_NAME="GNU/Linux" ;; *) PRINTABLE_OS_NAME=`uname -s || echo "Unknown"` ;; esac NAME_OF_DEV_RANDOM="/dev/random" NAME_OF_DEV_URANDOM="/dev/urandom" AC_ARG_ENABLE(endian-check, AC_HELP_STRING([--disable-endian-check], [disable the endian check and trust the OS provided macros]), endiancheck=$enableval,endiancheck=yes) if test x"$endiancheck" = xyes ; then AC_C_BIGENDIAN else AC_DEFINE(DISABLED_ENDIAN_CHECK,1,[configure did not test for endianness]) fi AC_CHECK_SIZEOF(unsigned short, 2) AC_CHECK_SIZEOF(unsigned int, 4) AC_CHECK_SIZEOF(unsigned long, 4) AC_CHECK_SIZEOF(unsigned long long, 0) AC_CHECK_SIZEOF(void *, 0) AC_TYPE_UINTPTR_T if test "$ac_cv_sizeof_unsigned_short" = "0" \ || test "$ac_cv_sizeof_unsigned_int" = "0" \ || test "$ac_cv_sizeof_unsigned_long" = "0"; then AC_MSG_WARN([Hmmm, something is wrong with the sizes - using defaults]); fi # Ensure that we have UINT64_C before we bother to check for uint64_t AC_CACHE_CHECK([for UINT64_C],[gnupg_cv_uint64_c_works], AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[#include ]], [[uint64_t foo=UINT64_C(42);]])], gnupg_cv_uint64_c_works=yes,gnupg_cv_uint64_c_works=no)) if test "$gnupg_cv_uint64_c_works" = "yes" ; then AC_CHECK_SIZEOF(uint64_t) fi # Do we have any 64-bit data types? if test "$ac_cv_sizeof_unsigned_int" != "8" \ && test "$ac_cv_sizeof_unsigned_long" != "8" \ && test "$ac_cv_sizeof_unsigned_long_long" != "8" \ && test "$ac_cv_sizeof_uint64_t" != "8"; then AC_MSG_ERROR([[ *** *** No 64-bit integer type available. *** It is not possible to build Libgcrypt on this platform. ***]]) fi # If not specified otherwise, all available algorithms will be # included. default_ciphers="$available_ciphers" default_pubkey_ciphers="$available_pubkey_ciphers" default_digests="$available_digests" default_kdfs="$available_kdfs" # Blacklist MD2 by default default_digests=`echo $default_digests | sed -e 's/md2//g'` # Substitutions to set generated files in a Emacs buffer to read-only. AC_SUBST(emacs_local_vars_begin, ['Local Variables:']) AC_SUBST(emacs_local_vars_read_only, ['buffer-read-only: t']) AC_SUBST(emacs_local_vars_end, ['End:']) ############################ ## Command line switches. 
## ############################
# Implementation of the --enable-ciphers switch.
AC_ARG_ENABLE(ciphers,
   AC_HELP_STRING([--enable-ciphers=ciphers],
                  [select the symmetric ciphers to include]),
   [enabled_ciphers=`echo $enableval | tr ',:' ' ' | tr '[A-Z]' '[a-z]'`],
   [enabled_ciphers=""])
if test "x$enabled_ciphers" = "x" \
   -o "$enabled_ciphers" = "yes" \
   -o "$enabled_ciphers" = "no"; then
   enabled_ciphers=$default_ciphers
fi
AC_MSG_CHECKING([which symmetric ciphers to include])
for cipher in $enabled_ciphers; do
    LIST_MEMBER($cipher, $available_ciphers)
    if test "$found" = "0"; then
       AC_MSG_ERROR([unsupported cipher "$cipher" specified])
    fi
done
AC_MSG_RESULT([$enabled_ciphers])

# Implementation of the --enable-pubkey-ciphers switch.
AC_ARG_ENABLE(pubkey-ciphers,
   AC_HELP_STRING([--enable-pubkey-ciphers=ciphers],
                  [select the public-key ciphers to include]),
   [enabled_pubkey_ciphers=`echo $enableval | tr ',:' ' ' | tr '[A-Z]' '[a-z]'`],
   [enabled_pubkey_ciphers=""])
if test "x$enabled_pubkey_ciphers" = "x" \
   -o "$enabled_pubkey_ciphers" = "yes" \
   -o "$enabled_pubkey_ciphers" = "no"; then
   enabled_pubkey_ciphers=$default_pubkey_ciphers
fi
AC_MSG_CHECKING([which public-key ciphers to include])
for cipher in $enabled_pubkey_ciphers; do
    LIST_MEMBER($cipher, $available_pubkey_ciphers)
    if test "$found" = "0"; then
       AC_MSG_ERROR([unsupported public-key cipher specified])
    fi
done
AC_MSG_RESULT([$enabled_pubkey_ciphers])

# Implementation of the --enable-digests switch.
AC_ARG_ENABLE(digests,
   AC_HELP_STRING([--enable-digests=digests],
                  [select the message digests to include]),
   [enabled_digests=`echo $enableval | tr ',:' ' ' | tr '[A-Z]' '[a-z]'`],
   [enabled_digests=""])
if test "x$enabled_digests" = "x" \
   -o "$enabled_digests" = "yes" \
   -o "$enabled_digests" = "no"; then
   enabled_digests=$default_digests
fi
AC_MSG_CHECKING([which message digests to include])
for digest in $enabled_digests; do
    LIST_MEMBER($digest, $available_digests)
    if test "$found" = "0"; then
       AC_MSG_ERROR([unsupported message digest specified])
    fi
done
AC_MSG_RESULT([$enabled_digests])

# Implementation of the --enable-kdfs switch.
AC_ARG_ENABLE(kdfs,
   AC_HELP_STRING([--enable-kdfs=kdfs],
                  [select the KDFs to include]),
   [enabled_kdfs=`echo $enableval | tr ',:' ' ' | tr '[A-Z]' '[a-z]'`],
   [enabled_kdfs=""])
if test "x$enabled_kdfs" = "x" \
   -o "$enabled_kdfs" = "yes" \
   -o "$enabled_kdfs" = "no"; then
   enabled_kdfs=$default_kdfs
fi
AC_MSG_CHECKING([which key derivation functions to include])
for kdf in $enabled_kdfs; do
    LIST_MEMBER($kdf, $available_kdfs)
    if test "$found" = "0"; then
       AC_MSG_ERROR([unsupported key derivation function specified])
    fi
done
AC_MSG_RESULT([$enabled_kdfs])

# Implementation of the --enable-random switch.
AC_ARG_ENABLE(random,
   AC_HELP_STRING([--enable-random=name],
                  [select which random number generator to use]),
   [random=`echo $enableval | tr '[A-Z]' '[a-z]'`],
   [])
if test "x$random" = "x" -o "$random" = "yes" -o "$random" = "no"; then
    random=default
fi
AC_MSG_CHECKING([which random module to use])
if test "$random" != "default" -a "$random" != "auto"; then
    LIST_MEMBER($random, $available_random_modules)
    if test "$found" = "0"; then
       AC_MSG_ERROR([unsupported random module specified])
    fi
fi
AC_MSG_RESULT($random)

# Implementation of the --disable-dev-random switch.
AC_MSG_CHECKING([whether use of /dev/random is requested])
AC_ARG_ENABLE(dev-random,
   [  --disable-dev-random    disable the use of dev random],
   try_dev_random=$enableval, try_dev_random=yes)
AC_MSG_RESULT($try_dev_random)
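Every list-valued switch above is validated the same way: the value is split on ',' or ':', lowercased, and each token must be a LIST_MEMBER of the matching available_* list or configure aborts. A rough C rendering of that loop, for readers who do not speak m4 (hypothetical and abbreviated; the available set here is a truncated stand-in for $available_ciphers):

#include <ctype.h>
#include <stdio.h>
#include <string.h>

static const char *available[] = { "arcfour", "blowfish", "chacha20", NULL };

static int list_member (const char *name)
{
  int i;
  for (i = 0; available[i]; i++)
    if (!strcmp (name, available[i]))
      return 1;
  return 0;
}

int main (void)
{
  char buf[] = "CHACHA20,arcfour";  /* e.g. --enable-ciphers=CHACHA20,arcfour */
  char *tok, *p;

  for (tok = strtok (buf, ",: "); tok; tok = strtok (NULL, ",: "))
    {
      /* Lowercase the token, as the tr '[A-Z]' '[a-z]' step does. */
      for (p = tok; *p; p++)
        *p = tolower ((unsigned char)*p);
      if (!list_member (tok))
        {
          fprintf (stderr, "unsupported cipher \"%s\" specified\n", tok);
          return 1;
        }
      printf ("enabled: %s\n", tok);
    }
  return 0;
}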
# Implementation of the --with-egd-socket switch.
AC_ARG_WITH(egd-socket,
   [  --with-egd-socket=NAME  Use NAME for the EGD socket],
   egd_socket_name="$withval", egd_socket_name="")
AC_DEFINE_UNQUOTED(EGD_SOCKET_NAME, "$egd_socket_name",
   [Define if you don't want the default EGD socket name.
    For details see cipher/rndegd.c])

# Implementation of the --enable-random-daemon
AC_MSG_CHECKING([whether the experimental random daemon is requested])
AC_ARG_ENABLE([random-daemon],
   AC_HELP_STRING([--enable-random-daemon],
                  [Build and support the experimental gcryptrnd]),
   [use_random_daemon=$enableval],
   [use_random_daemon=no])
AC_MSG_RESULT($use_random_daemon)
if test x$use_random_daemon = xyes ; then
    AC_DEFINE(USE_RANDOM_DAEMON,1,
              [Define to support the experimental random daemon])
fi
AM_CONDITIONAL(USE_RANDOM_DAEMON, test x$use_random_daemon = xyes)

# Implementation of --disable-asm.
AC_MSG_CHECKING([whether MPI assembler modules are requested])
AC_ARG_ENABLE([asm],
   AC_HELP_STRING([--disable-asm],
                  [Disable MPI assembler modules]),
   [try_asm_modules=$enableval],
   [try_asm_modules=yes])
AC_MSG_RESULT($try_asm_modules)

# Implementation of the --enable-m-guard switch.
AC_MSG_CHECKING([whether memory guard is requested])
AC_ARG_ENABLE(m-guard,
   AC_HELP_STRING([--enable-m-guard],
                  [Enable memory guard facility]),
   [use_m_guard=$enableval], [use_m_guard=no])
AC_MSG_RESULT($use_m_guard)
if test "$use_m_guard" = yes ; then
    AC_DEFINE(M_GUARD,1,[Define to use the (obsolete) malloc guarding feature])
fi

# Implementation of the --enable-large-data-tests switch.
AC_MSG_CHECKING([whether to run large data tests])
AC_ARG_ENABLE(large-data-tests,
   AC_HELP_STRING([--enable-large-data-tests],
                  [Enable the real long running large data tests]),
   large_data_tests=$enableval, large_data_tests=no)
AC_MSG_RESULT($large_data_tests)
AC_SUBST(RUN_LARGE_DATA_TESTS, $large_data_tests)

# Implementation of the --with-capabilities switch.
# Check whether we want to use Linux capabilities
AC_MSG_CHECKING([whether use of capabilities is requested])
AC_ARG_WITH(capabilities,
   AC_HELP_STRING([--with-capabilities],
                  [Use linux capabilities [default=no]]),
   [use_capabilities="$withval"], [use_capabilities=no])
AC_MSG_RESULT($use_capabilities)

# Implementation of the --enable-hmac-binary-check.
AC_MSG_CHECKING([whether an HMAC binary check is requested])
AC_ARG_ENABLE(hmac-binary-check,
   AC_HELP_STRING([--enable-hmac-binary-check],
                  [Enable library integrity check]),
   [use_hmac_binary_check=$enableval],
   [use_hmac_binary_check=no])
AC_MSG_RESULT($use_hmac_binary_check)
if test "$use_hmac_binary_check" = yes ; then
    AC_DEFINE(ENABLE_HMAC_BINARY_CHECK,1,
              [Define to support an HMAC based integrity check])
fi

# Implementation of the --disable-jent-support switch.
AC_MSG_CHECKING([whether jitter entropy support is requested])
AC_ARG_ENABLE(jent-support,
   AC_HELP_STRING([--disable-jent-support],
                  [Disable support for the Jitter entropy collector]),
   jentsupport=$enableval, jentsupport=yes)
AC_MSG_RESULT($jentsupport)

# Implementation of the --disable-padlock-support switch.
AC_MSG_CHECKING([whether padlock support is requested])
AC_ARG_ENABLE(padlock-support,
   AC_HELP_STRING([--disable-padlock-support],
                  [Disable support for the PadLock Engine of VIA processors]),
   padlocksupport=$enableval, padlocksupport=yes)
AC_MSG_RESULT($padlocksupport)

# Implementation of the --disable-aesni-support switch.
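This switch and the PCLMUL/SSE4.1/DRNG/AVX/AVX2/NEON ones that follow only decide whether support is compiled in; the cipher code still checks the CPU at run time, exactly as the chacha20_do_setkey hunk above does with HWF_INTEL_AVX2. A self-contained sketch of that two-level gate (the macro value and the get_hw_features stand-in are made up for illustration; the real names come from config.h and _gcry_get_hw_features):

#include <stdio.h>

#define USE_AVX2 1                    /* configure-generated, illustrative */
#define HWF_INTEL_AVX2 (1u << 3)      /* flag value made up for the sketch */

static unsigned int get_hw_features (void)
{
  return HWF_INTEL_AVX2;   /* pretend the CPU reports AVX2 */
}

int main (void)
{
  unsigned int features = get_hw_features ();
  int use_avx2 = 0;

#ifdef USE_AVX2
  /* Run-time gate layered on the compile-time one. */
  use_avx2 = (features & HWF_INTEL_AVX2) != 0;
#endif

  printf ("AVX2 path enabled: %d\n", use_avx2);
  return 0;
}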
AC_MSG_CHECKING([whether AESNI support is requested]) AC_ARG_ENABLE(aesni-support, AC_HELP_STRING([--disable-aesni-support], [Disable support for the Intel AES-NI instructions]), aesnisupport=$enableval,aesnisupport=yes) AC_MSG_RESULT($aesnisupport) # Implementation of the --disable-pclmul-support switch. AC_MSG_CHECKING([whether PCLMUL support is requested]) AC_ARG_ENABLE(pclmul-support, AC_HELP_STRING([--disable-pclmul-support], [Disable support for the Intel PCLMUL instructions]), pclmulsupport=$enableval,pclmulsupport=yes) AC_MSG_RESULT($pclmulsupport) # Implementation of the --disable-sse41-support switch. AC_MSG_CHECKING([whether SSE4.1 support is requested]) AC_ARG_ENABLE(sse41-support, AC_HELP_STRING([--disable-sse41-support], [Disable support for the Intel SSE4.1 instructions]), sse41support=$enableval,sse41support=yes) AC_MSG_RESULT($sse41support) # Implementation of the --disable-drng-support switch. AC_MSG_CHECKING([whether DRNG support is requested]) AC_ARG_ENABLE(drng-support, AC_HELP_STRING([--disable-drng-support], [Disable support for the Intel DRNG (RDRAND instruction)]), drngsupport=$enableval,drngsupport=yes) AC_MSG_RESULT($drngsupport) # Implementation of the --disable-avx-support switch. AC_MSG_CHECKING([whether AVX support is requested]) AC_ARG_ENABLE(avx-support, AC_HELP_STRING([--disable-avx-support], [Disable support for the Intel AVX instructions]), avxsupport=$enableval,avxsupport=yes) AC_MSG_RESULT($avxsupport) # Implementation of the --disable-avx2-support switch. AC_MSG_CHECKING([whether AVX2 support is requested]) AC_ARG_ENABLE(avx2-support, AC_HELP_STRING([--disable-avx2-support], [Disable support for the Intel AVX2 instructions]), avx2support=$enableval,avx2support=yes) AC_MSG_RESULT($avx2support) # Implementation of the --disable-neon-support switch. AC_MSG_CHECKING([whether NEON support is requested]) AC_ARG_ENABLE(neon-support, AC_HELP_STRING([--disable-neon-support], [Disable support for the ARM NEON instructions]), neonsupport=$enableval,neonsupport=yes) AC_MSG_RESULT($neonsupport) # Implementation of the --disable-arm-crypto-support switch. AC_MSG_CHECKING([whether ARMv8 Crypto Extension support is requested]) AC_ARG_ENABLE(arm-crypto-support, AC_HELP_STRING([--disable-arm-crypto-support], [Disable support for the ARMv8 Crypto Extension instructions]), armcryptosupport=$enableval,armcryptosupport=yes) AC_MSG_RESULT($armcryptosupport) # Implementation of the --disable-O-flag-munging switch. AC_MSG_CHECKING([whether a -O flag munging is requested]) AC_ARG_ENABLE([O-flag-munging], AC_HELP_STRING([--disable-O-flag-munging], [Disable modification of the cc -O flag]), [enable_o_flag_munging=$enableval], [enable_o_flag_munging=yes]) AC_MSG_RESULT($enable_o_flag_munging) AM_CONDITIONAL(ENABLE_O_FLAG_MUNGING, test "$enable_o_flag_munging" = "yes") # Implementation of the --disable-amd64-as-feature-detection switch. AC_MSG_CHECKING([whether to enable AMD64 as(1) feature detection]) AC_ARG_ENABLE(amd64-as-feature-detection, AC_HELP_STRING([--disable-amd64-as-feature-detection], [Disable the auto-detection of AMD64 as(1) features]), amd64_as_feature_detection=$enableval, amd64_as_feature_detection=yes) AC_MSG_RESULT($amd64_as_feature_detection) AC_DEFINE_UNQUOTED(PRINTABLE_OS_NAME, "$PRINTABLE_OS_NAME", [A human readable text with the name of the OS]) # For some systems we know that we have ld_version scripts. # Use it then as default. 
have_ld_version_script=no case "${host}" in *-*-linux*) have_ld_version_script=yes ;; *-*-gnu*) have_ld_version_script=yes ;; esac AC_ARG_ENABLE([ld-version-script], AC_HELP_STRING([--enable-ld-version-script], [enable/disable use of linker version script. (default is system dependent)]), [have_ld_version_script=$enableval], [ : ] ) AM_CONDITIONAL(HAVE_LD_VERSION_SCRIPT, test "$have_ld_version_script" = "yes") AC_DEFINE_UNQUOTED(NAME_OF_DEV_RANDOM, "$NAME_OF_DEV_RANDOM", [defined to the name of the strong random device]) AC_DEFINE_UNQUOTED(NAME_OF_DEV_URANDOM, "$NAME_OF_DEV_URANDOM", [defined to the name of the weaker random device]) ############################### #### Checks for libraries. #### ############################### # # gpg-error is required. # AM_PATH_GPG_ERROR("$NEED_GPG_ERROR_VERSION") if test "x$GPG_ERROR_LIBS" = "x"; then AC_MSG_ERROR([libgpg-error is needed. See ftp://ftp.gnupg.org/gcrypt/libgpg-error/ .]) fi AC_DEFINE(GPG_ERR_SOURCE_DEFAULT, GPG_ERR_SOURCE_GCRYPT, [The default error source for libgcrypt.]) # # Check whether the GNU Pth library is available. We require this # to build the optional gcryptrnd program. # AC_ARG_WITH(pth-prefix, AC_HELP_STRING([--with-pth-prefix=PFX], [prefix where GNU Pth is installed (optional)]), pth_config_prefix="$withval", pth_config_prefix="") if test x$pth_config_prefix != x ; then PTH_CONFIG="$pth_config_prefix/bin/pth-config" fi if test "$use_random_daemon" = "yes"; then AC_PATH_PROG(PTH_CONFIG, pth-config, no) if test "$PTH_CONFIG" = "no"; then AC_MSG_WARN([[ *** *** To build the Libgcrypt's random number daemon *** we need the support of the GNU Portable Threads Library. *** Download it from ftp://ftp.gnu.org/gnu/pth/ *** On a Debian GNU/Linux system you might want to try *** apt-get install libpth-dev ***]]) else GNUPG_PTH_VERSION_CHECK([1.3.7]) if test $have_pth = yes; then PTH_CFLAGS=`$PTH_CONFIG --cflags` PTH_LIBS=`$PTH_CONFIG --ldflags` PTH_LIBS="$PTH_LIBS `$PTH_CONFIG --libs --all`" AC_DEFINE(USE_GNU_PTH, 1, [Defined if the GNU Portable Thread Library should be used]) AC_DEFINE(HAVE_PTH, 1, [Defined if the GNU Pth is available]) fi fi fi AC_SUBST(PTH_CFLAGS) AC_SUBST(PTH_LIBS) # # Check whether pthreads is available # if test "$have_w32_system" != yes; then AC_CHECK_LIB(pthread,pthread_create,have_pthread=yes) if test "$have_pthread" = yes; then AC_DEFINE(HAVE_PTHREAD, 1 ,[Define if we have pthread.]) fi fi # Solaris needs -lsocket and -lnsl. Unisys system includes # gethostbyname in libsocket but needs libnsl for socket. AC_SEARCH_LIBS(setsockopt, [socket], , [AC_SEARCH_LIBS(setsockopt, [socket], , , [-lnsl])]) AC_SEARCH_LIBS(setsockopt, [nsl]) ################################## #### Checks for header files. #### ################################## AC_HEADER_STDC AC_CHECK_HEADERS(unistd.h sys/select.h sys/msg.h) INSERT_SYS_SELECT_H= if test x"$ac_cv_header_sys_select_h" = xyes; then INSERT_SYS_SELECT_H=" include " fi AC_SUBST(INSERT_SYS_SELECT_H) ########################################## #### Checks for typedefs, structures, #### #### and compiler characteristics. 
#### ########################################## AC_C_CONST AC_C_INLINE AC_TYPE_SIZE_T AC_TYPE_SIGNAL AC_DECL_SYS_SIGLIST AC_TYPE_PID_T GNUPG_CHECK_TYPEDEF(byte, HAVE_BYTE_TYPEDEF) GNUPG_CHECK_TYPEDEF(ushort, HAVE_USHORT_TYPEDEF) GNUPG_CHECK_TYPEDEF(ulong, HAVE_ULONG_TYPEDEF) GNUPG_CHECK_TYPEDEF(u16, HAVE_U16_TYPEDEF) GNUPG_CHECK_TYPEDEF(u32, HAVE_U32_TYPEDEF) gl_TYPE_SOCKLEN_T case "${host}" in *-*-mingw32*) # socklen_t may or may not be defined depending on what headers # are included. To be safe we use int as this is the actual type. FALLBACK_SOCKLEN_T="typedef int gcry_socklen_t;" ;; *) if test ".$gl_cv_socklen_t_equiv" = "."; then FALLBACK_SOCKLEN_T="typedef socklen_t gcry_socklen_t;" else FALLBACK_SOCKLEN_T="typedef ${gl_cv_socklen_t_equiv} gcry_socklen_t;" fi esac AC_SUBST(FALLBACK_SOCKLEN_T) # # Check for __builtin_bswap32 intrinsic. # AC_CACHE_CHECK(for __builtin_bswap32, [gcry_cv_have_builtin_bswap32], [gcry_cv_have_builtin_bswap32=no AC_LINK_IFELSE([AC_LANG_PROGRAM([], [int x = 0; int y = __builtin_bswap32(x); return y;])], [gcry_cv_have_builtin_bswap32=yes])]) if test "$gcry_cv_have_builtin_bswap32" = "yes" ; then AC_DEFINE(HAVE_BUILTIN_BSWAP32,1, [Defined if compiler has '__builtin_bswap32' intrinsic]) fi # # Check for __builtin_bswap64 intrinsic. # AC_CACHE_CHECK(for __builtin_bswap64, [gcry_cv_have_builtin_bswap64], [gcry_cv_have_builtin_bswap64=no AC_LINK_IFELSE([AC_LANG_PROGRAM([], [long long x = 0; long long y = __builtin_bswap64(x); return y;])], [gcry_cv_have_builtin_bswap64=yes])]) if test "$gcry_cv_have_builtin_bswap64" = "yes" ; then AC_DEFINE(HAVE_BUILTIN_BSWAP64,1, [Defined if compiler has '__builtin_bswap64' intrinsic]) fi # # Check for __builtin_ctz intrinsic. # AC_CACHE_CHECK(for __builtin_ctz, [gcry_cv_have_builtin_ctz], [gcry_cv_have_builtin_ctz=no AC_LINK_IFELSE([AC_LANG_PROGRAM([], [unsigned int x = 0; int y = __builtin_ctz(x); return y;])], [gcry_cv_have_builtin_ctz=yes])]) if test "$gcry_cv_have_builtin_ctz" = "yes" ; then AC_DEFINE(HAVE_BUILTIN_CTZ, 1, [Defined if compiler has '__builtin_ctz' intrinsic]) fi # # Check for VLA support (variable length arrays). # AC_CACHE_CHECK(whether the variable length arrays are supported, [gcry_cv_have_vla], [gcry_cv_have_vla=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[void f1(char *, int); char foo(int i) { char b[(i < 0 ? 0 : i) + 1]; f1(b, sizeof b); return b[0];}]])], [gcry_cv_have_vla=yes])]) if test "$gcry_cv_have_vla" = "yes" ; then AC_DEFINE(HAVE_VLA,1, [Defined if variable length arrays are supported]) fi # # Check for ELF visibility support. 
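The __builtin_bswap32/64 probes above matter because the portable fallback is a shift-and-mask ladder; the resulting HAVE_BUILTIN_BSWAP32 define is typically consumed like this (helper name hypothetical, not a libgcrypt function):

#include <stdint.h>
#include <stdio.h>

static uint32_t my_bswap32 (uint32_t x)
{
#ifdef HAVE_BUILTIN_BSWAP32
  return __builtin_bswap32 (x);          /* single bswap instruction */
#else
  /* Portable fallback when the intrinsic is unavailable. */
  return (x << 24) | ((x << 8) & 0x00ff0000U)
         | ((x >> 8) & 0x0000ff00U) | (x >> 24);
#endif
}

int main (void)
{
  printf ("0x%08x\n", (unsigned) my_bswap32 (0x11223344));   /* 0x44332211 */
  return 0;
}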
# AC_CACHE_CHECK(whether the visibility attribute is supported, gcry_cv_visibility_attribute, [gcry_cv_visibility_attribute=no AC_LANG_CONFTEST([AC_LANG_SOURCE( [[int foo __attribute__ ((visibility ("hidden"))) = 1; int bar __attribute__ ((visibility ("protected"))) = 1; ]])]) if ${CC-cc} -Werror -S conftest.c -o conftest.s \ 1>&AS_MESSAGE_LOG_FD 2>&AS_MESSAGE_LOG_FD ; then if grep '\.hidden.*foo' conftest.s >/dev/null 2>&1 ; then if grep '\.protected.*bar' conftest.s >/dev/null 2>&1; then gcry_cv_visibility_attribute=yes fi fi fi ]) if test "$gcry_cv_visibility_attribute" = "yes"; then AC_CACHE_CHECK(for broken visibility attribute, gcry_cv_broken_visibility_attribute, [gcry_cv_broken_visibility_attribute=yes AC_LANG_CONFTEST([AC_LANG_SOURCE( [[int foo (int x); int bar (int x) __asm__ ("foo") __attribute__ ((visibility ("hidden"))); int bar (int x) { return x; } ]])]) if ${CC-cc} -Werror -S conftest.c -o conftest.s \ 1>&AS_MESSAGE_LOG_FD 2>&AS_MESSAGE_LOG_FD ; then if grep '\.hidden@<:@ _@:>@foo' conftest.s >/dev/null 2>&1; then gcry_cv_broken_visibility_attribute=no fi fi ]) fi if test "$gcry_cv_visibility_attribute" = "yes"; then AC_CACHE_CHECK(for broken alias attribute, gcry_cv_broken_alias_attribute, [gcry_cv_broken_alias_attribute=yes AC_LANG_CONFTEST([AC_LANG_SOURCE( [[extern int foo (int x) __asm ("xyzzy"); int bar (int x) { return x; } extern __typeof (bar) foo __attribute ((weak, alias ("bar"))); extern int dfoo; extern __typeof (dfoo) dfoo __asm ("abccb"); int dfoo = 1; ]])]) if ${CC-cc} -Werror -S conftest.c -o conftest.s \ 1>&AS_MESSAGE_LOG_FD 2>&AS_MESSAGE_LOG_FD ; then if grep 'xyzzy' conftest.s >/dev/null 2>&1 && \ grep 'abccb' conftest.s >/dev/null 2>&1; then gcry_cv_broken_alias_attribute=no fi fi ]) fi if test "$gcry_cv_visibility_attribute" = "yes"; then AC_CACHE_CHECK(if gcc supports -fvisibility=hidden, gcry_cv_gcc_has_f_visibility, [gcry_cv_gcc_has_f_visibility=no _gcc_cflags_save=$CFLAGS CFLAGS="-fvisibility=hidden" AC_COMPILE_IFELSE([AC_LANG_PROGRAM([],[])], gcry_cv_gcc_has_f_visibility=yes) CFLAGS=$_gcc_cflags_save; ]) fi if test "$gcry_cv_visibility_attribute" = "yes" \ && test "$gcry_cv_broken_visibility_attribute" != "yes" \ && test "$gcry_cv_broken_alias_attribute" != "yes" \ && test "$gcry_cv_gcc_has_f_visibility" = "yes" then AC_DEFINE(GCRY_USE_VISIBILITY, 1, [Define to use the GNU C visibility attribute.]) CFLAGS="$CFLAGS -fvisibility=hidden" fi # Following attribute tests depend on warnings to cause compile to fail, # so set -Werror temporarily. 
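When all of the visibility probes above succeed, GCRY_USE_VISIBILITY is defined and -fvisibility=hidden is appended to CFLAGS, so every symbol is hidden unless re-marked. Schematically (the MY_EXPORT macro is illustrative only; libgcrypt's actual machinery lives in src/visibility.h):

#ifdef GCRY_USE_VISIBILITY
# define MY_EXPORT __attribute__ ((visibility ("default")))
#else
# define MY_EXPORT
#endif

/* Re-exported entry point; everything else in the object stays hidden
 * under -fvisibility=hidden. */
MY_EXPORT int my_public_entry_point (void);

int my_public_entry_point (void)
{
  return 0;
}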
_gcc_cflags_save=$CFLAGS CFLAGS="$CFLAGS -Werror" # # Check whether the compiler supports the GCC style aligned attribute # AC_CACHE_CHECK([whether the GCC style aligned attribute is supported], [gcry_cv_gcc_attribute_aligned], [gcry_cv_gcc_attribute_aligned=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[struct { int a; } foo __attribute__ ((aligned (16)));]])], [gcry_cv_gcc_attribute_aligned=yes])]) if test "$gcry_cv_gcc_attribute_aligned" = "yes" ; then AC_DEFINE(HAVE_GCC_ATTRIBUTE_ALIGNED,1, [Defined if a GCC style "__attribute__ ((aligned (n))" is supported]) fi # # Check whether the compiler supports the GCC style packed attribute # AC_CACHE_CHECK([whether the GCC style packed attribute is supported], [gcry_cv_gcc_attribute_packed], [gcry_cv_gcc_attribute_packed=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[struct foolong_s { long b; } __attribute__ ((packed)); struct foo_s { char a; struct foolong_s b; } __attribute__ ((packed)); enum bar { FOO = 1 / (sizeof(struct foo_s) == (sizeof(char) + sizeof(long))), };]])], [gcry_cv_gcc_attribute_packed=yes])]) if test "$gcry_cv_gcc_attribute_packed" = "yes" ; then AC_DEFINE(HAVE_GCC_ATTRIBUTE_PACKED,1, [Defined if a GCC style "__attribute__ ((packed))" is supported]) fi # # Check whether the compiler supports the GCC style may_alias attribute # AC_CACHE_CHECK([whether the GCC style may_alias attribute is supported], [gcry_cv_gcc_attribute_may_alias], [gcry_cv_gcc_attribute_may_alias=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[typedef struct foo_s { int a; } __attribute__ ((may_alias)) foo_t;]])], [gcry_cv_gcc_attribute_may_alias=yes])]) if test "$gcry_cv_gcc_attribute_may_alias" = "yes" ; then AC_DEFINE(HAVE_GCC_ATTRIBUTE_MAY_ALIAS,1, [Defined if a GCC style "__attribute__ ((may_alias))" is supported]) fi # Restore flags. CFLAGS=$_gcc_cflags_save; # # Check whether the compiler supports 'asm' or '__asm__' keyword for # assembler blocks. # AC_CACHE_CHECK([whether 'asm' assembler keyword is supported], [gcry_cv_have_asm], [gcry_cv_have_asm=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[void a(void) { asm("":::"memory"); }]])], [gcry_cv_have_asm=yes])]) AC_CACHE_CHECK([whether '__asm__' assembler keyword is supported], [gcry_cv_have___asm__], [gcry_cv_have___asm__=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[void a(void) { __asm__("":::"memory"); }]])], [gcry_cv_have___asm__=yes])]) if test "$gcry_cv_have_asm" = "no" ; then if test "$gcry_cv_have___asm__" = "yes" ; then AC_DEFINE(asm,__asm__, [Define to supported assembler block keyword, if plain 'asm' was not supported]) fi fi # # Check whether the compiler supports inline assembly memory barrier. # if test "$gcry_cv_have_asm" = "no" ; then if test "$gcry_cv_have___asm__" = "yes" ; then AC_CACHE_CHECK([whether inline assembly memory barrier is supported], [gcry_cv_have_asm_volatile_memory], [gcry_cv_have_asm_volatile_memory=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[void a(void) { __asm__ volatile("":::"memory"); }]])], [gcry_cv_have_asm_volatile_memory=yes])]) fi else AC_CACHE_CHECK([whether inline assembly memory barrier is supported], [gcry_cv_have_asm_volatile_memory], [gcry_cv_have_asm_volatile_memory=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[void a(void) { asm volatile("":::"memory"); }]])], [gcry_cv_have_asm_volatile_memory=yes])]) fi if test "$gcry_cv_have_asm_volatile_memory" = "yes" ; then AC_DEFINE(HAVE_GCC_ASM_VOLATILE_MEMORY,1, [Define if inline asm memory barrier is supported]) fi # # Check whether GCC assembler supports features needed for our ARM # implementations. 
This needs to be done before setting up the # assembler stuff. # AC_CACHE_CHECK([whether GCC assembler is compatible for ARM assembly implementations], [gcry_cv_gcc_arm_platform_as_ok], [gcry_cv_gcc_arm_platform_as_ok=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[__asm__( /* Test if assembler supports UAL syntax. */ ".syntax unified\n\t" ".arm\n\t" /* our assembly code is in ARM mode */ /* Following causes error if assembler ignored '.syntax unified'. */ "asmfunc:\n\t" "add %r0, %r0, %r4, ror #12;\n\t" /* Test if '.type' and '.size' are supported. */ ".size asmfunc,.-asmfunc;\n\t" ".type asmfunc,%function;\n\t" );]])], [gcry_cv_gcc_arm_platform_as_ok=yes])]) if test "$gcry_cv_gcc_arm_platform_as_ok" = "yes" ; then AC_DEFINE(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS,1, [Defined if underlying assembler is compatible with ARM assembly implementations]) fi # # Check whether GCC assembler supports features needed for our ARMv8/Aarch64 # implementations. This needs to be done before setting up the # assembler stuff. # AC_CACHE_CHECK([whether GCC assembler is compatible for ARMv8/Aarch64 assembly implementations], [gcry_cv_gcc_aarch64_platform_as_ok], [gcry_cv_gcc_aarch64_platform_as_ok=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[__asm__( "asmfunc:\n\t" "eor x0, x0, x30, ror #12;\n\t" "add x0, x0, x30, asr #12;\n\t" "eor v0.16b, v0.16b, v31.16b;\n\t" /* Test if '.type' and '.size' are supported. */ ".size asmfunc,.-asmfunc;\n\t" ".type asmfunc,@function;\n\t" );]])], [gcry_cv_gcc_aarch64_platform_as_ok=yes])]) if test "$gcry_cv_gcc_aarch64_platform_as_ok" = "yes" ; then AC_DEFINE(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS,1, [Defined if underlying assembler is compatible with ARMv8/Aarch64 assembly implementations]) fi # # Check whether underscores in symbols are required. This needs to be # done before setting up the assembler stuff. # GNUPG_SYS_SYMBOL_UNDERSCORE() ################################# #### #### #### Setup assembler stuff. #### #### Define mpi_cpu_arch. #### #### #### ################################# AC_ARG_ENABLE(mpi-path, AC_HELP_STRING([--enable-mpi-path=EXTRA_PATH], [prepend EXTRA_PATH to list of CPU specific optimizations]), mpi_extra_path="$enableval",mpi_extra_path="") AC_MSG_CHECKING(architecture and mpi assembler functions) if test -f $srcdir/mpi/config.links ; then . 
$srcdir/mpi/config.links AC_CONFIG_LINKS("$mpi_ln_list") ac_cv_mpi_sflags="$mpi_sflags" AC_MSG_RESULT($mpi_cpu_arch) else AC_MSG_RESULT(failed) AC_MSG_ERROR([mpi/config.links missing!]) fi MPI_SFLAGS="$ac_cv_mpi_sflags" AC_SUBST(MPI_SFLAGS) AM_CONDITIONAL(MPI_MOD_ASM_MPIH_ADD1, test "$mpi_mod_asm_mpih_add1" = yes) AM_CONDITIONAL(MPI_MOD_ASM_MPIH_SUB1, test "$mpi_mod_asm_mpih_sub1" = yes) AM_CONDITIONAL(MPI_MOD_ASM_MPIH_MUL1, test "$mpi_mod_asm_mpih_mul1" = yes) AM_CONDITIONAL(MPI_MOD_ASM_MPIH_MUL2, test "$mpi_mod_asm_mpih_mul2" = yes) AM_CONDITIONAL(MPI_MOD_ASM_MPIH_MUL3, test "$mpi_mod_asm_mpih_mul3" = yes) AM_CONDITIONAL(MPI_MOD_ASM_MPIH_LSHIFT, test "$mpi_mod_asm_mpih_lshift" = yes) AM_CONDITIONAL(MPI_MOD_ASM_MPIH_RSHIFT, test "$mpi_mod_asm_mpih_rshift" = yes) AM_CONDITIONAL(MPI_MOD_ASM_UDIV, test "$mpi_mod_asm_udiv" = yes) AM_CONDITIONAL(MPI_MOD_ASM_UDIV_QRNND, test "$mpi_mod_asm_udiv_qrnnd" = yes) AM_CONDITIONAL(MPI_MOD_C_MPIH_ADD1, test "$mpi_mod_c_mpih_add1" = yes) AM_CONDITIONAL(MPI_MOD_C_MPIH_SUB1, test "$mpi_mod_c_mpih_sub1" = yes) AM_CONDITIONAL(MPI_MOD_C_MPIH_MUL1, test "$mpi_mod_c_mpih_mul1" = yes) AM_CONDITIONAL(MPI_MOD_C_MPIH_MUL2, test "$mpi_mod_c_mpih_mul2" = yes) AM_CONDITIONAL(MPI_MOD_C_MPIH_MUL3, test "$mpi_mod_c_mpih_mul3" = yes) AM_CONDITIONAL(MPI_MOD_C_MPIH_LSHIFT, test "$mpi_mod_c_mpih_lshift" = yes) AM_CONDITIONAL(MPI_MOD_C_MPIH_RSHIFT, test "$mpi_mod_c_mpih_rshift" = yes) AM_CONDITIONAL(MPI_MOD_C_UDIV, test "$mpi_mod_c_udiv" = yes) AM_CONDITIONAL(MPI_MOD_C_UDIV_QRNND, test "$mpi_mod_c_udiv_qrnnd" = yes) # Reset non applicable feature flags. if test "$mpi_cpu_arch" != "x86" ; then aesnisupport="n/a" pclmulsupport="n/a" sse41support="n/a" avxsupport="n/a" avx2support="n/a" padlocksupport="n/a" jentsupport="n/a" drngsupport="n/a" fi if test "$mpi_cpu_arch" != "arm" ; then if test "$mpi_cpu_arch" != "aarch64" ; then neonsupport="n/a" armcryptosupport="n/a" fi fi ############################################# #### #### #### Platform specific compiler checks. #### #### #### ############################################# # Following tests depend on warnings to cause compile to fail, so set -Werror # temporarily. _gcc_cflags_save=$CFLAGS CFLAGS="$CFLAGS -Werror" # # Check whether compiler supports 'ms_abi' function attribute. # AC_CACHE_CHECK([whether compiler supports 'ms_abi' function attribute], [gcry_cv_gcc_attribute_ms_abi], [gcry_cv_gcc_attribute_ms_abi=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[int __attribute__ ((ms_abi)) proto(int);]])], [gcry_cv_gcc_attribute_ms_abi=yes])]) if test "$gcry_cv_gcc_attribute_ms_abi" = "yes" ; then AC_DEFINE(HAVE_GCC_ATTRIBUTE_MS_ABI,1, [Defined if compiler supports "__attribute__ ((ms_abi))" function attribute]) fi # # Check whether compiler supports 'sysv_abi' function attribute. # AC_CACHE_CHECK([whether compiler supports 'sysv_abi' function attribute], [gcry_cv_gcc_attribute_sysv_abi], [gcry_cv_gcc_attribute_sysv_abi=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[int __attribute__ ((sysv_abi)) proto(int);]])], [gcry_cv_gcc_attribute_sysv_abi=yes])]) if test "$gcry_cv_gcc_attribute_sysv_abi" = "yes" ; then AC_DEFINE(HAVE_GCC_ATTRIBUTE_SYSV_ABI,1, [Defined if compiler supports "__attribute__ ((sysv_abi))" function attribute]) fi # # Check whether default calling convention is 'ms_abi'. 
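These ms_abi/sysv_abi probes are what ultimately let the ASM_FUNC_ABI definition at the top of cipher/chacha20.c (see the first hunk above) work: on WIN64 the default calling convention is ms_abi while the amd64 assembly is written for the SysV convention, so the assembly prototypes get tagged explicitly. Restated with standard types in place of u32/byte:

#include <stddef.h>
#include <stdint.h>

#if defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)
# define ASM_FUNC_ABI __attribute__ ((sysv_abi))   /* force SysV on WIN64 */
#else
# define ASM_FUNC_ABI
#endif

unsigned int _gcry_chacha20_amd64_avx2_blocks8 (uint32_t *state,
                                                unsigned char *dst,
                                                const unsigned char *src,
                                                size_t nblks) ASM_FUNC_ABI;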
# if test "$gcry_cv_gcc_attribute_ms_abi" = "yes" ; then AC_CACHE_CHECK([whether default calling convention is 'ms_abi'], [gcry_cv_gcc_default_abi_is_ms_abi], [gcry_cv_gcc_default_abi_is_ms_abi=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[void *test(void) { void *(*def_func)(void) = test; void *__attribute__((ms_abi))(*msabi_func)(void); /* warning on SysV abi targets, passes on Windows based targets */ msabi_func = def_func; return msabi_func; }]])], [gcry_cv_gcc_default_abi_is_ms_abi=yes])]) if test "$gcry_cv_gcc_default_abi_is_ms_abi" = "yes" ; then AC_DEFINE(HAVE_GCC_DEFAULT_ABI_IS_MS_ABI,1, [Defined if default calling convention is 'ms_abi']) fi fi # # Check whether default calling convention is 'sysv_abi'. # if test "$gcry_cv_gcc_attribute_sysv_abi" = "yes" ; then AC_CACHE_CHECK([whether default calling convention is 'sysv_abi'], [gcry_cv_gcc_default_abi_is_sysv_abi], [gcry_cv_gcc_default_abi_is_sysv_abi=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[void *test(void) { void *(*def_func)(void) = test; void *__attribute__((sysv_abi))(*sysvabi_func)(void); /* warning on MS ABI targets, passes on SysV ABI targets */ sysvabi_func = def_func; return sysvabi_func; }]])], [gcry_cv_gcc_default_abi_is_sysv_abi=yes])]) if test "$gcry_cv_gcc_default_abi_is_sysv_abi" = "yes" ; then AC_DEFINE(HAVE_GCC_DEFAULT_ABI_IS_SYSV_ABI,1, [Defined if default calling convention is 'sysv_abi']) fi fi # Restore flags. CFLAGS=$_gcc_cflags_save; # # Check whether GCC inline assembler supports SSSE3 instructions # This is required for the AES-NI instructions. # AC_CACHE_CHECK([whether GCC inline assembler supports SSSE3 instructions], [gcry_cv_gcc_inline_asm_ssse3], [if test "$mpi_cpu_arch" != "x86" ; then gcry_cv_gcc_inline_asm_ssse3="n/a" else gcry_cv_gcc_inline_asm_ssse3=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[static unsigned char be_mask[16] __attribute__ ((aligned (16))) = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; void a(void) { __asm__("pshufb %[mask], %%xmm2\n\t"::[mask]"m"(*be_mask):); }]])], [gcry_cv_gcc_inline_asm_ssse3=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_ssse3" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_SSSE3,1, [Defined if inline assembler supports SSSE3 instructions]) fi # # Check whether GCC inline assembler supports PCLMUL instructions. # AC_CACHE_CHECK([whether GCC inline assembler supports PCLMUL instructions], [gcry_cv_gcc_inline_asm_pclmul], [if test "$mpi_cpu_arch" != "x86" ; then gcry_cv_gcc_inline_asm_pclmul="n/a" else gcry_cv_gcc_inline_asm_pclmul=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[void a(void) { __asm__("pclmulqdq \$0, %%xmm1, %%xmm3\n\t":::"cc"); }]])], [gcry_cv_gcc_inline_asm_pclmul=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_pclmul" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_PCLMUL,1, [Defined if inline assembler supports PCLMUL instructions]) fi # # Check whether GCC inline assembler supports SSE4.1 instructions. 
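Each of these inline-asm probes compiles a one-instruction function that only an extension-aware assembler accepts; pshufb, for instance, exists only from SSSE3 on. The SSSE3 probe above, lifted out as a standalone file one can feed to the compiler by hand:

/* Build test only: successful compilation means the toolchain can emit
 * SSSE3 instructions. */
static unsigned char be_mask[16] __attribute__ ((aligned (16))) =
  { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };

void a (void)
{
  /* pshufb with this mask byte-reverses xmm2. */
  __asm__ ("pshufb %[mask], %%xmm2\n\t" : : [mask] "m" (*be_mask));
}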

#
# Check whether GCC inline assembler supports SSE4.1 instructions.
#
AC_CACHE_CHECK([whether GCC inline assembler supports SSE4.1 instructions],
       [gcry_cv_gcc_inline_asm_sse41],
       [if test "$mpi_cpu_arch" != "x86" ; then
          gcry_cv_gcc_inline_asm_sse41="n/a"
        else
          gcry_cv_gcc_inline_asm_sse41=no
          AC_COMPILE_IFELSE([AC_LANG_SOURCE(
          [[void a(void) {
              int i;
              __asm__("pextrd \$2, %%xmm0, %[out]\n\t" : [out] "=m" (i));
            }]])],
          [gcry_cv_gcc_inline_asm_sse41=yes])
        fi])
if test "$gcry_cv_gcc_inline_asm_sse41" = "yes" ; then
   AC_DEFINE(HAVE_GCC_INLINE_ASM_SSE41,1,
     [Defined if inline assembler supports SSE4.1 instructions])
fi


#
# Check whether GCC inline assembler supports AVX instructions
#
AC_CACHE_CHECK([whether GCC inline assembler supports AVX instructions],
       [gcry_cv_gcc_inline_asm_avx],
       [if test "$mpi_cpu_arch" != "x86" ; then
          gcry_cv_gcc_inline_asm_avx="n/a"
        else
          gcry_cv_gcc_inline_asm_avx=no
          AC_COMPILE_IFELSE([AC_LANG_SOURCE(
          [[void a(void) {
              __asm__("xgetbv; vaesdeclast (%[mem]),%%xmm0,%%xmm7\n\t"::[mem]"r"(0):);
            }]])],
          [gcry_cv_gcc_inline_asm_avx=yes])
        fi])
if test "$gcry_cv_gcc_inline_asm_avx" = "yes" ; then
   AC_DEFINE(HAVE_GCC_INLINE_ASM_AVX,1,
     [Defined if inline assembler supports AVX instructions])
fi


#
# Check whether GCC inline assembler supports AVX2 instructions
#
AC_CACHE_CHECK([whether GCC inline assembler supports AVX2 instructions],
       [gcry_cv_gcc_inline_asm_avx2],
       [if test "$mpi_cpu_arch" != "x86" ; then
          gcry_cv_gcc_inline_asm_avx2="n/a"
        else
          gcry_cv_gcc_inline_asm_avx2=no
          AC_COMPILE_IFELSE([AC_LANG_SOURCE(
          [[void a(void) {
              __asm__("xgetbv; vpbroadcastb %%xmm7,%%ymm1\n\t":::"cc");
            }]])],
          [gcry_cv_gcc_inline_asm_avx2=yes])
        fi])
if test "$gcry_cv_gcc_inline_asm_avx2" = "yes" ; then
   AC_DEFINE(HAVE_GCC_INLINE_ASM_AVX2,1,
     [Defined if inline assembler supports AVX2 instructions])
fi


#
# Check whether GCC inline assembler supports BMI2 instructions
#
AC_CACHE_CHECK([whether GCC inline assembler supports BMI2 instructions],
       [gcry_cv_gcc_inline_asm_bmi2],
       [if test "$mpi_cpu_arch" != "x86" ; then
          gcry_cv_gcc_inline_asm_bmi2="n/a"
        else
          gcry_cv_gcc_inline_asm_bmi2=no
          AC_COMPILE_IFELSE([AC_LANG_SOURCE(
          [[void a(void) {
              __asm__("rorxl \$23, %%eax, %%edx\\n\\t":::"memory");
            }]])],
          [gcry_cv_gcc_inline_asm_bmi2=yes])
        fi])
if test "$gcry_cv_gcc_inline_asm_bmi2" = "yes" ; then
   AC_DEFINE(HAVE_GCC_INLINE_ASM_BMI2,1,
     [Defined if inline assembler supports BMI2 instructions])
fi


#
# Check whether GCC assembler needs "-Wa,--divide" to correctly handle
# constant division
#
if test $amd64_as_feature_detection = yes; then
  AC_CACHE_CHECK([whether GCC assembler handles division correctly],
       [gcry_cv_gcc_as_const_division_ok],
       [gcry_cv_gcc_as_const_division_ok=no
        AC_COMPILE_IFELSE([AC_LANG_SOURCE(
          [[__asm__("xorl \$(123456789/12345678), %ebp;\n\t");]])],
          [gcry_cv_gcc_as_const_division_ok=yes])])
  if test "$gcry_cv_gcc_as_const_division_ok" = "no" ; then
    #
    # Add '-Wa,--divide' to CPPFLAGS and try check again.
    #
    _gcc_cppflags_save="$CPPFLAGS"
    CPPFLAGS="$CPPFLAGS -Wa,--divide"
    AC_CACHE_CHECK([whether GCC assembler handles division correctly with "-Wa,--divide"],
         [gcry_cv_gcc_as_const_division_with_wadivide_ok],
         [gcry_cv_gcc_as_const_division_with_wadivide_ok=no
          AC_COMPILE_IFELSE([AC_LANG_SOURCE(
            [[__asm__("xorl \$(123456789/12345678), %ebp;\n\t");]])],
            [gcry_cv_gcc_as_const_division_with_wadivide_ok=yes])])
    if test "$gcry_cv_gcc_as_const_division_with_wadivide_ok" = "no" ; then
      # '-Wa,--divide' did not work, restore old flags.
      CPPFLAGS="$_gcc_cppflags_save"
    fi
  fi
fi
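
# The probe/retry pattern above distils to the following sketch: keep the
# extra assembler flag only when it actually fixes the first failure.
#
#   if ! first_probe_ok; then
#     saved=$CPPFLAGS
#     CPPFLAGS="$CPPFLAGS -Wa,--divide"    # tell gcc to pass --divide to as
#     second_probe_ok || CPPFLAGS=$saved   # did not help; drop it again
#   fi
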

#
# Check whether GCC assembler supports features needed for our amd64
# implementations
#
if test $amd64_as_feature_detection = yes; then
  AC_CACHE_CHECK([whether GCC assembler is compatible for amd64 assembly implementations],
       [gcry_cv_gcc_amd64_platform_as_ok],
       [if test "$mpi_cpu_arch" != "x86" ; then
          gcry_cv_gcc_amd64_platform_as_ok="n/a"
        else
          gcry_cv_gcc_amd64_platform_as_ok=no
          AC_COMPILE_IFELSE([AC_LANG_SOURCE(
          [[__asm__(
                /* Test if '.type' and '.size' are supported.  */
                /* These work only on ELF targets. */
                "asmfunc:\n\t"
                ".size asmfunc,.-asmfunc;\n\t"
                ".type asmfunc,@function;\n\t"
                /* Test if assembler allows use of '/' for constant division
                 * (Solaris/x86 issue). If previous constant division check
                 * and "-Wa,--divide" workaround failed, this causes assembly
                 * to be disabled on this machine. */
                "xorl \$(123456789/12345678), %ebp;\n\t"
            );]])],
          [gcry_cv_gcc_amd64_platform_as_ok=yes])
        fi])
  if test "$gcry_cv_gcc_amd64_platform_as_ok" = "yes" ; then
     AC_DEFINE(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS,1,
       [Defined if underlying assembler is compatible with amd64 assembly implementations])
  fi
  if test "$gcry_cv_gcc_amd64_platform_as_ok" = "no" &&
     test "$gcry_cv_gcc_attribute_sysv_abi" = "yes" &&
     test "$gcry_cv_gcc_default_abi_is_ms_abi" = "yes"; then
    AC_CACHE_CHECK([whether GCC assembler is compatible for WIN64 assembly implementations],
      [gcry_cv_gcc_win64_platform_as_ok],
      [gcry_cv_gcc_win64_platform_as_ok=no
       AC_COMPILE_IFELSE([AC_LANG_SOURCE(
         [[__asm__(
              ".globl asmfunc\n\t"
              "asmfunc:\n\t"
              "xorq \$(1234), %rbp;\n\t"
          );]])],
         [gcry_cv_gcc_win64_platform_as_ok=yes])])
    if test "$gcry_cv_gcc_win64_platform_as_ok" = "yes" ; then
      AC_DEFINE(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS,1,
        [Defined if underlying assembler is compatible with WIN64 assembly implementations])
    fi
  fi
fi


#
# Check whether GCC assembler supports features needed for assembly
# implementations that use Intel syntax
#
AC_CACHE_CHECK([whether GCC assembler is compatible for Intel syntax assembly implementations],
       [gcry_cv_gcc_platform_as_ok_for_intel_syntax],
       [if test "$mpi_cpu_arch" != "x86" ; then
          gcry_cv_gcc_platform_as_ok_for_intel_syntax="n/a"
        else
          gcry_cv_gcc_platform_as_ok_for_intel_syntax=no
          AC_COMPILE_IFELSE([AC_LANG_SOURCE(
          [[__asm__(
                ".intel_syntax noprefix\n\t"
                "pxor xmm1, xmm7;\n\t"
                /* Intel syntax implementations also use GAS macros, so check
                 * for them here. */
                "VAL_A = xmm4\n\t"
                "VAL_B = xmm2\n\t"
                ".macro SET_VAL_A p1\n\t"
                " VAL_A = \\\\p1 \n\t"
                ".endm\n\t"
                ".macro SET_VAL_B p1\n\t"
                " VAL_B = \\\\p1 \n\t"
                ".endm\n\t"
                "vmovdqa VAL_A, VAL_B;\n\t"
                "SET_VAL_A eax\n\t"
                "SET_VAL_B ebp\n\t"
                "add VAL_A, VAL_B;\n\t"
                "add VAL_B, 0b10101;\n\t"
            );]])],
          [gcry_cv_gcc_platform_as_ok_for_intel_syntax=yes])
        fi])
if test "$gcry_cv_gcc_platform_as_ok_for_intel_syntax" = "yes" ; then
  AC_DEFINE(HAVE_INTEL_SYNTAX_PLATFORM_AS,1,
    [Defined if underlying assembler is compatible with Intel syntax assembly implementations])
fi
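
# Modules written in Intel syntax then guard themselves on this define, so
# the object compiles to nothing when the probe failed.  A typical guard at
# the top of such a .S file looks roughly like this (illustrative):
#
#   #if defined(__x86_64__) && defined(HAVE_INTEL_SYNTAX_PLATFORM_AS)
#   .intel_syntax noprefix
#   /* ... implementation ... */
#   #endif
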
*/ "VAL_A = xmm4\n\t" "VAL_B = xmm2\n\t" ".macro SET_VAL_A p1\n\t" " VAL_A = \\\\p1 \n\t" ".endm\n\t" ".macro SET_VAL_B p1\n\t" " VAL_B = \\\\p1 \n\t" ".endm\n\t" "vmovdqa VAL_A, VAL_B;\n\t" "SET_VAL_A eax\n\t" "SET_VAL_B ebp\n\t" "add VAL_A, VAL_B;\n\t" "add VAL_B, 0b10101;\n\t" );]])], [gcry_cv_gcc_platform_as_ok_for_intel_syntax=yes]) fi]) if test "$gcry_cv_gcc_platform_as_ok_for_intel_syntax" = "yes" ; then AC_DEFINE(HAVE_INTEL_SYNTAX_PLATFORM_AS,1, [Defined if underlying assembler is compatible with Intel syntax assembly implementations]) fi # # Check whether compiler is configured for ARMv6 or newer architecture # AC_CACHE_CHECK([whether compiler is configured for ARMv6 or newer architecture], [gcry_cv_cc_arm_arch_is_v6], [if test "$mpi_cpu_arch" != "arm" ; then gcry_cv_cc_arm_arch_is_v6="n/a" else gcry_cv_cc_arm_arch_is_v6=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[ #if defined(__arm__) && \ ((defined(__ARM_ARCH) && __ARM_ARCH >= 6) \ || defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) \ || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) \ || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6T2__) \ || defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) \ || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) \ || defined(__ARM_ARCH_7EM__)) /* empty */ #else /* fail compile if not ARMv6. */ not_armv6 not_armv6 = (not_armv6)not_armv6; #endif ]])], [gcry_cv_cc_arm_arch_is_v6=yes]) fi]) if test "$gcry_cv_cc_arm_arch_is_v6" = "yes" ; then AC_DEFINE(HAVE_ARM_ARCH_V6,1, [Defined if ARM architecture is v6 or newer]) fi # # Check whether GCC inline assembler supports NEON instructions # AC_CACHE_CHECK([whether GCC inline assembler supports NEON instructions], [gcry_cv_gcc_inline_asm_neon], [if test "$mpi_cpu_arch" != "arm" ; then gcry_cv_gcc_inline_asm_neon="n/a" else gcry_cv_gcc_inline_asm_neon=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[__asm__( ".syntax unified\n\t" ".arm\n\t" ".fpu neon\n\t" "vld1.64 {%q0-%q1}, [%r0]!;\n\t" "vrev64.8 %q0, %q3;\n\t" "vadd.u64 %q0, %q1;\n\t" "vadd.s64 %d3, %d2, %d3;\n\t" ); ]])], [gcry_cv_gcc_inline_asm_neon=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_neon" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_NEON,1, [Defined if inline assembler supports NEON instructions]) fi # # Check whether GCC inline assembler supports AArch32 Crypto Extension instructions # AC_CACHE_CHECK([whether GCC inline assembler supports AArch32 Crypto Extension instructions], [gcry_cv_gcc_inline_asm_aarch32_crypto], [if test "$mpi_cpu_arch" != "arm" ; then gcry_cv_gcc_inline_asm_aarch32_crypto="n/a" else gcry_cv_gcc_inline_asm_aarch32_crypto=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[__asm__( ".syntax unified\n\t" ".arch armv8-a\n\t" ".arm\n\t" ".fpu crypto-neon-fp-armv8\n\t" "sha1h.32 q0, q0;\n\t" "sha1c.32 q0, q0, q0;\n\t" "sha1p.32 q0, q0, q0;\n\t" "sha1su0.32 q0, q0, q0;\n\t" "sha1su1.32 q0, q0;\n\t" "sha256h.32 q0, q0, q0;\n\t" "sha256h2.32 q0, q0, q0;\n\t" "sha1p.32 q0, q0, q0;\n\t" "sha256su0.32 q0, q0;\n\t" "sha256su1.32 q0, q0, q15;\n\t" "aese.8 q0, q0;\n\t" "aesd.8 q0, q0;\n\t" "aesmc.8 q0, q0;\n\t" "aesimc.8 q0, q0;\n\t" "vmull.p64 q0, d0, d0;\n\t" ); ]])], [gcry_cv_gcc_inline_asm_aarch32_crypto=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_aarch32_crypto" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_AARCH32_CRYPTO,1, [Defined if inline assembler supports AArch32 Crypto Extension instructions]) fi # # Check whether GCC inline assembler supports AArch64 NEON instructions # AC_CACHE_CHECK([whether GCC inline assembler supports AArch64 NEON instructions], 

#
# Check whether GCC inline assembler supports AArch64 NEON instructions
#
AC_CACHE_CHECK([whether GCC inline assembler supports AArch64 NEON instructions],
       [gcry_cv_gcc_inline_asm_aarch64_neon],
       [if test "$mpi_cpu_arch" != "aarch64" ; then
          gcry_cv_gcc_inline_asm_aarch64_neon="n/a"
        else
          gcry_cv_gcc_inline_asm_aarch64_neon=no
          AC_COMPILE_IFELSE([AC_LANG_SOURCE(
          [[__asm__(
                ".cpu generic+simd\n\t"
                "mov w0, \#42;\n\t"
                "dup v0.8b, w0;\n\t"
                "ld4 {v0.8b,v1.8b,v2.8b,v3.8b},[x0],\#32;\n\t"
            );
          ]])],
          [gcry_cv_gcc_inline_asm_aarch64_neon=yes])
        fi])
if test "$gcry_cv_gcc_inline_asm_aarch64_neon" = "yes" ; then
   AC_DEFINE(HAVE_GCC_INLINE_ASM_AARCH64_NEON,1,
     [Defined if inline assembler supports AArch64 NEON instructions])
fi


#
# Check whether GCC inline assembler supports AArch64 Crypto Extension instructions
#
AC_CACHE_CHECK([whether GCC inline assembler supports AArch64 Crypto Extension instructions],
       [gcry_cv_gcc_inline_asm_aarch64_crypto],
       [if test "$mpi_cpu_arch" != "aarch64" ; then
          gcry_cv_gcc_inline_asm_aarch64_crypto="n/a"
        else
          gcry_cv_gcc_inline_asm_aarch64_crypto=no
          AC_COMPILE_IFELSE([AC_LANG_SOURCE(
          [[__asm__(
                ".cpu generic+simd+crypto\n\t"

                "mov w0, \#42;\n\t"
                "dup v0.8b, w0;\n\t"
                "ld4 {v0.8b,v1.8b,v2.8b,v3.8b},[x0],\#32;\n\t"

                "sha1h s0, s0;\n\t"
                "sha1c q0, s0, v0.4s;\n\t"
                "sha1p q0, s0, v0.4s;\n\t"
                "sha1su0 v0.4s, v0.4s, v0.4s;\n\t"
                "sha1su1 v0.4s, v0.4s;\n\t"

                "sha256h q0, q0, v0.4s;\n\t"
                "sha256h2 q0, q0, v0.4s;\n\t"
                "sha1p q0, s0, v0.4s;\n\t"
                "sha256su0 v0.4s, v0.4s;\n\t"
                "sha256su1 v0.4s, v0.4s, v31.4s;\n\t"

                "aese v0.16b, v0.16b;\n\t"
                "aesd v0.16b, v0.16b;\n\t"
                "aesmc v0.16b, v0.16b;\n\t"
                "aesimc v0.16b, v0.16b;\n\t"

                "pmull v0.1q, v0.1d, v31.1d;\n\t"
                "pmull2 v0.1q, v0.2d, v31.2d;\n\t"
            );
          ]])],
          [gcry_cv_gcc_inline_asm_aarch64_crypto=yes])
        fi])
if test "$gcry_cv_gcc_inline_asm_aarch64_crypto" = "yes" ; then
   AC_DEFINE(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO,1,
     [Defined if inline assembler supports AArch64 Crypto Extension instructions])
fi


#######################################
#### Checks for library functions. ####
#######################################

AC_FUNC_VPRINTF
# We have replacements for these in src/missing-string.c
AC_CHECK_FUNCS(stpcpy strcasecmp)
# We have replacements for these in src/g10lib.h
AC_CHECK_FUNCS(strtoul memmove stricmp atexit raise)
# Other checks
AC_CHECK_FUNCS(strerror rand mmap getpagesize sysconf waitpid wait4)
AC_CHECK_FUNCS(gettimeofday getrusage gethrtime clock_gettime syslog)
AC_CHECK_FUNCS(syscall fcntl ftruncate flockfile)

GNUPG_CHECK_MLOCK

#
# Replacement functions.
#
AC_REPLACE_FUNCS([getpid clock])


#
# Check whether it is necessary to link against libdl.
#
DL_LIBS=""
if test "$use_hmac_binary_check" = yes ; then
  _gcry_save_libs="$LIBS"
  LIBS=""
  AC_SEARCH_LIBS(dlopen, c dl,,,)
  DL_LIBS=$LIBS
  LIBS="$_gcry_save_libs"
  LIBGCRYPT_CONFIG_LIBS="${LIBGCRYPT_CONFIG_LIBS} ${DL_LIBS}"
fi
AC_SUBST(DL_LIBS)


#
# Check whether we can use Linux capabilities as requested.
#
if test "$use_capabilities" = "yes" ; then
  use_capabilities=no
  AC_CHECK_HEADERS(sys/capability.h)
  if test "$ac_cv_header_sys_capability_h" = "yes" ; then
    AC_CHECK_LIB(cap, cap_init, ac_need_libcap=1)
    if test "$ac_cv_lib_cap_cap_init" = "yes"; then
      AC_DEFINE(USE_CAPABILITIES,1,
                [define if capabilities should be used])
      LIBS="$LIBS -lcap"
      use_capabilities=yes
    fi
  fi
  if test "$use_capabilities" = "no" ; then
    AC_MSG_WARN([[
***
*** The use of capabilities on this system is not possible.
*** You need a recent Linux kernel and some patches:
***   fcaps-2.2.9-990610.patch      (kernel patch for 2.2.9)
***   fcap-module-990613.tar.gz     (kernel module)
***   libcap-1.92.tar.gz            (user mode library and utilities)
*** And you have to configure the kernel with CONFIG_VFS_CAP_PLUGIN
*** set (filesystems menu). Be warned: This code is *really* ALPHA.
***]])
  fi
fi
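
# The LIBS save/restore around AC_SEARCH_LIBS above is the standard way to
# capture only the extra dependency for one symbol: LIBS is emptied first,
# AC_SEARCH_LIBS prepends e.g. '-ldl' when needed, and whatever is left in
# LIBS afterwards is exactly the flag to remember.  In shell terms (sketch):
#
#   saved=$LIBS; LIBS=""
#   AC_SEARCH_LIBS(dlopen, c dl)
#   DL_LIBS=$LIBS              # now "" or e.g. "-ldl"
#   LIBS=$saved
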

# Check whether a random device is available.
if test "$try_dev_random" = yes ; then
  AC_CACHE_CHECK(for random device, ac_cv_have_dev_random,
    [if test -r "$NAME_OF_DEV_RANDOM" && test -r "$NAME_OF_DEV_URANDOM" ; then
       ac_cv_have_dev_random=yes; else ac_cv_have_dev_random=no; fi])
  if test "$ac_cv_have_dev_random" = yes; then
    AC_DEFINE(HAVE_DEV_RANDOM,1,
              [defined if the system supports a random device])
  fi
else
  AC_MSG_CHECKING(for random device)
  ac_cv_have_dev_random=no
  AC_MSG_RESULT(has been disabled)
fi

# Figure out the random modules for this configuration.
if test "$random" = "default"; then
  # Select default value.
  if test "$ac_cv_have_dev_random" = yes; then
    # Try Linuxish random device.
    random_modules="linux"
  else
    case "${host}" in
      *-*-mingw32ce*)
        # WindowsCE random device.
        random_modules="w32ce"
        ;;
      *-*-mingw32*|*-*-cygwin*)
        # Windows random device.
        random_modules="w32"
        ;;
      *)
        # Build everything, allow to select at runtime.
        random_modules="$auto_random_modules"
        ;;
    esac
  fi
else
  if test "$random" = "auto"; then
    # Build everything, allow to select at runtime.
    random_modules="$auto_random_modules"
  else
    random_modules="$random"
  fi
fi


#
# Other defines
#
if test mym4_isgit = "yes"; then
    AC_DEFINE(IS_DEVELOPMENT_VERSION,1,
              [Defined if this is not a regular release])
fi


AM_CONDITIONAL(CROSS_COMPILING, test x$cross_compiling = xyes)


# This is handy for debugging so the compiler doesn't rearrange
# things and eliminate variables.
AC_ARG_ENABLE(optimization,
       AC_HELP_STRING([--disable-optimization],
                      [disable compiler optimization]),
                      [if test $enableval = no ; then
                         CFLAGS=`echo $CFLAGS | sed 's/-O[[0-9]]//'`
                       fi])

# CFLAGS mangling when using gcc.
if test "$GCC" = yes; then
    CFLAGS="$CFLAGS -Wall"
    if test "$USE_MAINTAINER_MODE" = "yes"; then
        CFLAGS="$CFLAGS -Wcast-align -Wshadow -Wstrict-prototypes"
        CFLAGS="$CFLAGS -Wformat -Wno-format-y2k -Wformat-security"

        # If -Wno-missing-field-initializers is supported we can enable
        # a bunch of really useful warnings.
        AC_MSG_CHECKING([if gcc supports -Wno-missing-field-initializers])
        _gcc_cflags_save=$CFLAGS
        CFLAGS="-Wno-missing-field-initializers"
        AC_COMPILE_IFELSE([AC_LANG_PROGRAM([],[])],_gcc_wopt=yes,_gcc_wopt=no)
        AC_MSG_RESULT($_gcc_wopt)
        CFLAGS=$_gcc_cflags_save;
        if test x"$_gcc_wopt" = xyes ; then
          CFLAGS="$CFLAGS -W -Wextra -Wbad-function-cast"
          CFLAGS="$CFLAGS -Wwrite-strings"
          CFLAGS="$CFLAGS -Wdeclaration-after-statement"
          CFLAGS="$CFLAGS -Wno-missing-field-initializers"
          CFLAGS="$CFLAGS -Wno-sign-compare"
        fi

        AC_MSG_CHECKING([if gcc supports -Wpointer-arith])
        _gcc_cflags_save=$CFLAGS
        CFLAGS="-Wpointer-arith"
        AC_COMPILE_IFELSE([AC_LANG_PROGRAM([],[])],_gcc_wopt=yes,_gcc_wopt=no)
        AC_MSG_RESULT($_gcc_wopt)
        CFLAGS=$_gcc_cflags_save;
        if test x"$_gcc_wopt" = xyes ; then
          CFLAGS="$CFLAGS -Wpointer-arith"
        fi
    fi
fi
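
# The warning-flag test used twice above follows the usual probe-then-append
# idiom ('-Wsome-option' below is a placeholder):
#
#   _save=$CFLAGS
#   CFLAGS="-Wsome-option"     # compile an empty program with just this flag
#   AC_COMPILE_IFELSE([AC_LANG_PROGRAM([],[])],[ok=yes],[ok=no])
#   CFLAGS=$_save
#   test "$ok" = yes && CFLAGS="$CFLAGS -Wsome-option"
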
# Check whether as(1) supports a noexecstack feature.  This test
# includes an override option.
CL_AS_NOEXECSTACK


AC_SUBST(LIBGCRYPT_CONFIG_API_VERSION)
AC_SUBST(LIBGCRYPT_CONFIG_LIBS)
AC_SUBST(LIBGCRYPT_CONFIG_CFLAGS)
AC_SUBST(LIBGCRYPT_CONFIG_HOST)
AC_SUBST(LIBGCRYPT_THREAD_MODULES)

AC_CONFIG_COMMANDS([gcrypt-conf],[[
chmod +x src/libgcrypt-config
]],[[
prefix=$prefix
exec_prefix=$exec_prefix
libdir=$libdir
datadir=$datadir
DATADIRNAME=$DATADIRNAME
]])

#####################
#### Conclusion. ####
#####################

# Check that requested feature can actually be used and define
# ENABLE_foo_SUPPORT macros.

if test x"$aesnisupport" = xyes ; then
  if test "$gcry_cv_gcc_inline_asm_ssse3" != "yes" ; then
    aesnisupport="no (unsupported by compiler)"
  fi
fi
if test x"$pclmulsupport" = xyes ; then
  if test "$gcry_cv_gcc_inline_asm_pclmul" != "yes" ; then
    pclmulsupport="no (unsupported by compiler)"
  fi
fi
if test x"$sse41support" = xyes ; then
  if test "$gcry_cv_gcc_inline_asm_sse41" != "yes" ; then
    sse41support="no (unsupported by compiler)"
  fi
fi
if test x"$avxsupport" = xyes ; then
  if test "$gcry_cv_gcc_inline_asm_avx" != "yes" ; then
    avxsupport="no (unsupported by compiler)"
  fi
fi
if test x"$avx2support" = xyes ; then
  if test "$gcry_cv_gcc_inline_asm_avx2" != "yes" ; then
    avx2support="no (unsupported by compiler)"
  fi
fi
if test x"$neonsupport" = xyes ; then
  if test "$gcry_cv_gcc_inline_asm_neon" != "yes" ; then
    if test "$gcry_cv_gcc_inline_asm_aarch64_neon" != "yes" ; then
      neonsupport="no (unsupported by compiler)"
    fi
  fi
fi
if test x"$armcryptosupport" = xyes ; then
  if test "$gcry_cv_gcc_inline_asm_aarch32_crypto" != "yes" ; then
    if test "$gcry_cv_gcc_inline_asm_aarch64_crypto" != "yes" ; then
      armcryptosupport="no (unsupported by compiler)"
    fi
  fi
fi

if test x"$aesnisupport" = xyes ; then
  AC_DEFINE(ENABLE_AESNI_SUPPORT, 1,
            [Enable support for Intel AES-NI instructions.])
fi
if test x"$pclmulsupport" = xyes ; then
  AC_DEFINE(ENABLE_PCLMUL_SUPPORT, 1,
            [Enable support for Intel PCLMUL instructions.])
fi
if test x"$sse41support" = xyes ; then
  AC_DEFINE(ENABLE_SSE41_SUPPORT, 1,
            [Enable support for Intel SSE4.1 instructions.])
fi
if test x"$avxsupport" = xyes ; then
  AC_DEFINE(ENABLE_AVX_SUPPORT,1,
            [Enable support for Intel AVX instructions.])
fi
if test x"$avx2support" = xyes ; then
  AC_DEFINE(ENABLE_AVX2_SUPPORT,1,
            [Enable support for Intel AVX2 instructions.])
fi
if test x"$neonsupport" = xyes ; then
  AC_DEFINE(ENABLE_NEON_SUPPORT,1,
            [Enable support for ARM NEON instructions.])
fi
if test x"$armcryptosupport" = xyes ; then
  AC_DEFINE(ENABLE_ARM_CRYPTO_SUPPORT,1,
            [Enable support for ARMv8 Crypto Extension instructions.])
fi
if test x"$jentsupport" = xyes ; then
  AC_DEFINE(ENABLE_JENT_SUPPORT, 1,
            [Enable support for the jitter entropy collector.])
fi
if test x"$padlocksupport" = xyes ; then
  AC_DEFINE(ENABLE_PADLOCK_SUPPORT, 1,
            [Enable support for the PadLock engine.])
fi
if test x"$drngsupport" = xyes ; then
  AC_DEFINE(ENABLE_DRNG_SUPPORT, 1,
            [Enable support for Intel DRNG (RDRAND instruction).])
fi
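
# A cipher module usually requires both gates before it compiles a fast
# path: the user/arch-level ENABLE_*_SUPPORT define from here and the
# compiler-level HAVE_* probe result.  Roughly (illustrative):
#
#   #if defined(ENABLE_AVX2_SUPPORT) && defined(HAVE_GCC_INLINE_ASM_AVX2)
#   # define USE_AVX2 1
#   #endif
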

# Define conditional sources and config.h symbols depending on the
# selected ciphers, pubkey-ciphers, digests, kdfs, and random modules.

LIST_MEMBER(arcfour, $enabled_ciphers)
if test "$found" = "1"; then
   GCRYPT_CIPHERS="$GCRYPT_CIPHERS arcfour.lo"
   AC_DEFINE(USE_ARCFOUR, 1, [Defined if this module should be included])

   case "${host}" in
      x86_64-*-*)
         # Build with the assembly implementation
         GCRYPT_CIPHERS="$GCRYPT_CIPHERS arcfour-amd64.lo"
      ;;
   esac
fi

LIST_MEMBER(blowfish, $enabled_ciphers)
if test "$found" = "1" ; then
   GCRYPT_CIPHERS="$GCRYPT_CIPHERS blowfish.lo"
   AC_DEFINE(USE_BLOWFISH, 1, [Defined if this module should be included])

   case "${host}" in
      x86_64-*-*)
         # Build with the assembly implementation
         GCRYPT_CIPHERS="$GCRYPT_CIPHERS blowfish-amd64.lo"
      ;;
      arm*-*-*)
         # Build with the assembly implementation
         GCRYPT_CIPHERS="$GCRYPT_CIPHERS blowfish-arm.lo"
      ;;
   esac
fi

LIST_MEMBER(cast5, $enabled_ciphers)
if test "$found" = "1" ; then
   GCRYPT_CIPHERS="$GCRYPT_CIPHERS cast5.lo"
   AC_DEFINE(USE_CAST5, 1, [Defined if this module should be included])

   case "${host}" in
      x86_64-*-*)
         # Build with the assembly implementation
         GCRYPT_CIPHERS="$GCRYPT_CIPHERS cast5-amd64.lo"
      ;;
      arm*-*-*)
         # Build with the assembly implementation
         GCRYPT_CIPHERS="$GCRYPT_CIPHERS cast5-arm.lo"
      ;;
   esac
fi

LIST_MEMBER(des, $enabled_ciphers)
if test "$found" = "1" ; then
   GCRYPT_CIPHERS="$GCRYPT_CIPHERS des.lo"
   AC_DEFINE(USE_DES, 1, [Defined if this module should be included])

   case "${host}" in
      x86_64-*-*)
         # Build with the assembly implementation
         GCRYPT_CIPHERS="$GCRYPT_CIPHERS des-amd64.lo"
      ;;
   esac
fi

LIST_MEMBER(aes, $enabled_ciphers)
if test "$found" = "1" ; then
   GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael.lo"
   AC_DEFINE(USE_AES, 1, [Defined if this module should be included])

   case "${host}" in
      x86_64-*-*)
         # Build with the assembly implementation
         GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-amd64.lo"

         # Build with the SSSE3 implementation
         GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-ssse3-amd64.lo"
         GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-ssse3-amd64-asm.lo"
      ;;
      arm*-*-*)
         # Build with the assembly implementation
         GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-arm.lo"

         # Build with the ARMv8/AArch32 CE implementation
         GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-armv8-ce.lo"
         GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-armv8-aarch32-ce.lo"
      ;;
      aarch64-*-*)
         # Build with the assembly implementation
         GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-aarch64.lo"

         # Build with the ARMv8/AArch64 CE implementation
         GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-armv8-ce.lo"
         GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-armv8-aarch64-ce.lo"
      ;;
   esac

   case "$mpi_cpu_arch" in
      x86)
         # Build with the AES-NI implementation
         GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-aesni.lo"

         # Build with the Padlock implementation
         GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-padlock.lo"
      ;;
   esac
fi

LIST_MEMBER(twofish, $enabled_ciphers)
if test "$found" = "1" ; then
   GCRYPT_CIPHERS="$GCRYPT_CIPHERS twofish.lo"
   AC_DEFINE(USE_TWOFISH, 1, [Defined if this module should be included])

   case "${host}" in
      x86_64-*-*)
         # Build with the assembly implementation
         GCRYPT_CIPHERS="$GCRYPT_CIPHERS twofish-amd64.lo"

         if test x"$avx2support" = xyes ; then
            # Build with the AVX2 implementation
            GCRYPT_CIPHERS="$GCRYPT_CIPHERS twofish-avx2-amd64.lo"
         fi
      ;;
      arm*-*-*)
         # Build with the assembly implementation
         GCRYPT_CIPHERS="$GCRYPT_CIPHERS twofish-arm.lo"
      ;;
      aarch64-*-*)
         # Build with the assembly implementation
         GCRYPT_CIPHERS="$GCRYPT_CIPHERS twofish-aarch64.lo"
      ;;
   esac
fi
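
# LIST_MEMBER(name, $list) is a helper macro (defined elsewhere in this
# build system) that sets the shell variable 'found' to 1 when 'name'
# occurs in the given whitespace-separated list, so each of these blocks
# reads as "if this algorithm was enabled".  Shell equivalent (sketch):
#
#   found=0
#   for item in $enabled_ciphers; do
#     test "$item" = arcfour && found=1
#   done
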
LIST_MEMBER(serpent, $enabled_ciphers)
if test "$found" = "1" ; then
   GCRYPT_CIPHERS="$GCRYPT_CIPHERS serpent.lo"
   AC_DEFINE(USE_SERPENT, 1, [Defined if this module should be included])

   case "${host}" in
      x86_64-*-*)
         # Build with the SSE2 implementation
         GCRYPT_CIPHERS="$GCRYPT_CIPHERS serpent-sse2-amd64.lo"
      ;;
   esac

   if test x"$avx2support" = xyes ; then
      # Build with the AVX2 implementation
      GCRYPT_CIPHERS="$GCRYPT_CIPHERS serpent-avx2-amd64.lo"
   fi

   if test x"$neonsupport" = xyes ; then
      # Build with the NEON implementation
      GCRYPT_CIPHERS="$GCRYPT_CIPHERS serpent-armv7-neon.lo"
   fi
fi

LIST_MEMBER(rfc2268, $enabled_ciphers)
if test "$found" = "1" ; then
   GCRYPT_CIPHERS="$GCRYPT_CIPHERS rfc2268.lo"
   AC_DEFINE(USE_RFC2268, 1, [Defined if this module should be included])
fi

LIST_MEMBER(seed, $enabled_ciphers)
if test "$found" = "1" ; then
   GCRYPT_CIPHERS="$GCRYPT_CIPHERS seed.lo"
   AC_DEFINE(USE_SEED, 1, [Defined if this module should be included])
fi

LIST_MEMBER(camellia, $enabled_ciphers)
if test "$found" = "1" ; then
   GCRYPT_CIPHERS="$GCRYPT_CIPHERS camellia.lo camellia-glue.lo"
   AC_DEFINE(USE_CAMELLIA, 1, [Defined if this module should be included])

   case "${host}" in
      arm*-*-*)
         # Build with the assembly implementation
         GCRYPT_CIPHERS="$GCRYPT_CIPHERS camellia-arm.lo"
      ;;
      aarch64-*-*)
         # Build with the assembly implementation
         GCRYPT_CIPHERS="$GCRYPT_CIPHERS camellia-aarch64.lo"
      ;;
   esac

   if test x"$avxsupport" = xyes ; then
      if test x"$aesnisupport" = xyes ; then
        # Build with the AES-NI/AVX implementation
        GCRYPT_CIPHERS="$GCRYPT_CIPHERS camellia-aesni-avx-amd64.lo"
      fi
   fi

   if test x"$avx2support" = xyes ; then
      if test x"$aesnisupport" = xyes ; then
        # Build with the AES-NI/AVX2 implementation
        GCRYPT_CIPHERS="$GCRYPT_CIPHERS camellia-aesni-avx2-amd64.lo"
      fi
   fi
fi

LIST_MEMBER(idea, $enabled_ciphers)
if test "$found" = "1" ; then
   GCRYPT_CIPHERS="$GCRYPT_CIPHERS idea.lo"
   AC_DEFINE(USE_IDEA, 1, [Defined if this module should be included])
fi

LIST_MEMBER(salsa20, $enabled_ciphers)
if test "$found" = "1" ; then
   GCRYPT_CIPHERS="$GCRYPT_CIPHERS salsa20.lo"
   AC_DEFINE(USE_SALSA20, 1, [Defined if this module should be included])

   case "${host}" in
      x86_64-*-*)
         # Build with the assembly implementation
         GCRYPT_CIPHERS="$GCRYPT_CIPHERS salsa20-amd64.lo"
      ;;
   esac

   if test x"$neonsupport" = xyes ; then
     # Build with the NEON implementation
     GCRYPT_CIPHERS="$GCRYPT_CIPHERS salsa20-armv7-neon.lo"
   fi
fi

LIST_MEMBER(gost28147, $enabled_ciphers)
if test "$found" = "1" ; then
   GCRYPT_CIPHERS="$GCRYPT_CIPHERS gost28147.lo"
   AC_DEFINE(USE_GOST28147, 1, [Defined if this module should be included])
fi

LIST_MEMBER(chacha20, $enabled_ciphers)
if test "$found" = "1" ; then
   GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20.lo"
   AC_DEFINE(USE_CHACHA20, 1, [Defined if this module should be included])

   case "${host}" in
      x86_64-*-*)
         # Build with the assembly implementation
-         GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-sse2-amd64.lo"
-         GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-ssse3-amd64.lo"
-         GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-avx2-amd64.lo"
+         GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-amd64-ssse3.lo"
+         GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-amd64-avx2.lo"
      ;;
   esac

   if test x"$neonsupport" = xyes ; then
     # Build with the NEON implementation
     GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-armv7-neon.lo"
   fi
fi

LIST_MEMBER(dsa, $enabled_pubkey_ciphers)
if test "$found" = "1" ; then
   GCRYPT_PUBKEY_CIPHERS="$GCRYPT_PUBKEY_CIPHERS dsa.lo"
   AC_DEFINE(USE_DSA, 1, [Defined if this module should be included])
fi

LIST_MEMBER(rsa, $enabled_pubkey_ciphers)
if test "$found" = "1" ; then
   GCRYPT_PUBKEY_CIPHERS="$GCRYPT_PUBKEY_CIPHERS rsa.lo"
   AC_DEFINE(USE_RSA, 1, [Defined if this module should be included])
fi
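
# All amd64 ChaCha20 objects are added unconditionally on x86_64; choosing
# between the SSSE3 and AVX2 kernels happens at run time in chacha20.c from
# the detected hardware features.  A sketch of the idea:
#
#   ctx->use_ssse3 = (features & HWF_INTEL_SSSE3) != 0;
#   ctx->use_avx2  = (features & HWF_INTEL_AVX2) != 0;
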
LIST_MEMBER(elgamal, $enabled_pubkey_ciphers)
if test "$found" = "1" ; then
   GCRYPT_PUBKEY_CIPHERS="$GCRYPT_PUBKEY_CIPHERS elgamal.lo"
   AC_DEFINE(USE_ELGAMAL, 1, [Defined if this module should be included])
fi

LIST_MEMBER(ecc, $enabled_pubkey_ciphers)
if test "$found" = "1" ; then
   GCRYPT_PUBKEY_CIPHERS="$GCRYPT_PUBKEY_CIPHERS \
                          ecc.lo ecc-curves.lo ecc-misc.lo \
                          ecc-ecdsa.lo ecc-eddsa.lo ecc-gost.lo"
   AC_DEFINE(USE_ECC, 1, [Defined if this module should be included])
fi

LIST_MEMBER(crc, $enabled_digests)
if test "$found" = "1" ; then
   GCRYPT_DIGESTS="$GCRYPT_DIGESTS crc.lo"
   AC_DEFINE(USE_CRC, 1, [Defined if this module should be included])

   case "${host}" in
      i?86-*-* | x86_64-*-*)
         # Build with the assembly implementation
         GCRYPT_DIGESTS="$GCRYPT_DIGESTS crc-intel-pclmul.lo"
      ;;
   esac
fi

LIST_MEMBER(gostr3411-94, $enabled_digests)
if test "$found" = "1" ; then
   # GOST R 34.11-94 internally uses GOST 28147-89
   LIST_MEMBER(gost28147, $enabled_ciphers)
   if test "$found" = "1" ; then
      GCRYPT_DIGESTS="$GCRYPT_DIGESTS gostr3411-94.lo"
      AC_DEFINE(USE_GOST_R_3411_94, 1, [Defined if this module should be included])
   fi
fi

LIST_MEMBER(stribog, $enabled_digests)
if test "$found" = "1" ; then
   GCRYPT_DIGESTS="$GCRYPT_DIGESTS stribog.lo"
   AC_DEFINE(USE_GOST_R_3411_12, 1, [Defined if this module should be included])
fi

LIST_MEMBER(md2, $enabled_digests)
if test "$found" = "1" ; then
   GCRYPT_DIGESTS="$GCRYPT_DIGESTS md2.lo"
   AC_DEFINE(USE_MD2, 1, [Defined if this module should be included])
fi

LIST_MEMBER(md4, $enabled_digests)
if test "$found" = "1" ; then
   GCRYPT_DIGESTS="$GCRYPT_DIGESTS md4.lo"
   AC_DEFINE(USE_MD4, 1, [Defined if this module should be included])
fi

LIST_MEMBER(md5, $enabled_digests)
if test "$found" = "1" ; then
   GCRYPT_DIGESTS="$GCRYPT_DIGESTS md5.lo"
   AC_DEFINE(USE_MD5, 1, [Defined if this module should be included])
fi

LIST_MEMBER(rmd160, $enabled_digests)
if test "$found" = "1" ; then
   GCRYPT_DIGESTS="$GCRYPT_DIGESTS rmd160.lo"
   AC_DEFINE(USE_RMD160, 1, [Defined if this module should be included])
fi

LIST_MEMBER(sha256, $enabled_digests)
if test "$found" = "1" ; then
   GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha256.lo"
   AC_DEFINE(USE_SHA256, 1, [Defined if this module should be included])

   case "${host}" in
      x86_64-*-*)
         # Build with the assembly implementation
         GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha256-ssse3-amd64.lo"
         GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha256-avx-amd64.lo"
         GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha256-avx2-bmi2-amd64.lo"
      ;;
      arm*-*-*)
         # Build with the assembly implementation
         GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha256-armv8-aarch32-ce.lo"
      ;;
      aarch64-*-*)
         # Build with the assembly implementation
         GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha256-armv8-aarch64-ce.lo"
      ;;
   esac
fi

LIST_MEMBER(sha512, $enabled_digests)
if test "$found" = "1" ; then
   GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha512.lo"
   AC_DEFINE(USE_SHA512, 1, [Defined if this module should be included])

   case "${host}" in
      x86_64-*-*)
         # Build with the assembly implementation
         GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha512-ssse3-amd64.lo"
         GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha512-avx-amd64.lo"
         GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha512-avx2-bmi2-amd64.lo"
      ;;
      arm*-*-*)
         # Build with the assembly implementation
         GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha512-arm.lo"
      ;;
   esac

   if test x"$neonsupport" = xyes ; then
     # Build with the NEON implementation
     GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha512-armv7-neon.lo"
   fi
fi
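
# As with the ciphers, the digest asm objects are selected purely by
# ${host}; each .S file additionally guards its body with the HAVE_*
# results from the probes earlier in this file, so a failed probe merely
# yields an empty object.  Illustrative guard (not a literal quote):
#
#   #ifdef __x86_64__
#   #include <config.h>
#   #ifdef HAVE_GCC_INLINE_ASM_SSSE3
#   /* ... implementation ... */
#   #endif
#   #endif
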
LIST_MEMBER(sha3, $enabled_digests)
if test "$found" = "1" ; then
   GCRYPT_DIGESTS="$GCRYPT_DIGESTS keccak.lo"
   AC_DEFINE(USE_SHA3, 1, [Defined if this module should be included])

   case "${host}" in
      x86_64-*-*)
         # Build with the assembly implementation
         :
      ;;
   esac

   if test x"$neonsupport" = xyes ; then
     # Build with the NEON implementation
     GCRYPT_DIGESTS="$GCRYPT_DIGESTS keccak-armv7-neon.lo"
   fi
fi

LIST_MEMBER(tiger, $enabled_digests)
if test "$found" = "1" ; then
   GCRYPT_DIGESTS="$GCRYPT_DIGESTS tiger.lo"
   AC_DEFINE(USE_TIGER, 1, [Defined if this module should be included])
fi

LIST_MEMBER(whirlpool, $enabled_digests)
if test "$found" = "1" ; then
   GCRYPT_DIGESTS="$GCRYPT_DIGESTS whirlpool.lo"
   AC_DEFINE(USE_WHIRLPOOL, 1, [Defined if this module should be included])

   case "${host}" in
      x86_64-*-*)
         # Build with the assembly implementation
         GCRYPT_DIGESTS="$GCRYPT_DIGESTS whirlpool-sse2-amd64.lo"
      ;;
   esac
fi

LIST_MEMBER(blake2, $enabled_digests)
if test "$found" = "1" ; then
   GCRYPT_DIGESTS="$GCRYPT_DIGESTS blake2.lo"
   AC_DEFINE(USE_BLAKE2, 1, [Defined if this module should be included])
fi

# SHA-1 always needs to be included, for example because it is used by
# random-csprng.c.
GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha1.lo"
AC_DEFINE(USE_SHA1, 1, [Defined if this module should be included])

case "${host}" in
   x86_64-*-*)
      # Build with the assembly implementation
      GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha1-ssse3-amd64.lo"
      GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha1-avx-amd64.lo"
      GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha1-avx-bmi2-amd64.lo"
   ;;
   arm*-*-*)
      # Build with the assembly implementation
      GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha1-armv7-neon.lo"
      GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha1-armv8-aarch32-ce.lo"
   ;;
   aarch64-*-*)
      # Build with the assembly implementation
      GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha1-armv8-aarch64-ce.lo"
   ;;
esac

LIST_MEMBER(sm3, $enabled_digests)
if test "$found" = "1" ; then
   GCRYPT_DIGESTS="$GCRYPT_DIGESTS sm3.lo"
   AC_DEFINE(USE_SM3, 1, [Defined if this module should be included])
fi

LIST_MEMBER(scrypt, $enabled_kdfs)
if test "$found" = "1" ; then
   GCRYPT_KDFS="$GCRYPT_KDFS scrypt.lo"
   AC_DEFINE(USE_SCRYPT, 1, [Defined if this module should be included])
fi

LIST_MEMBER(linux, $random_modules)
if test "$found" = "1" ; then
   GCRYPT_RANDOM="$GCRYPT_RANDOM rndlinux.lo"
   AC_DEFINE(USE_RNDLINUX, 1, [Defined if the /dev/random RNG should be used.])
fi

LIST_MEMBER(unix, $random_modules)
if test "$found" = "1" ; then
   GCRYPT_RANDOM="$GCRYPT_RANDOM rndunix.lo"
   AC_DEFINE(USE_RNDUNIX, 1, [Defined if the default Unix RNG should be used.])
fi

LIST_MEMBER(egd, $random_modules)
if test "$found" = "1" ; then
   GCRYPT_RANDOM="$GCRYPT_RANDOM rndegd.lo"
   AC_DEFINE(USE_RNDEGD, 1, [Defined if the EGD based RNG should be used.])
fi

LIST_MEMBER(w32, $random_modules)
if test "$found" = "1" ; then
   GCRYPT_RANDOM="$GCRYPT_RANDOM rndw32.lo"
   AC_DEFINE(USE_RNDW32, 1,
             [Defined if the Windows specific RNG should be used.])
fi

LIST_MEMBER(w32ce, $random_modules)
if test "$found" = "1" ; then
   GCRYPT_RANDOM="$GCRYPT_RANDOM rndw32ce.lo"
   AC_DEFINE(USE_RNDW32CE, 1,
             [Defined if the WindowsCE specific RNG should be used.])
fi

AC_SUBST([GCRYPT_CIPHERS])
AC_SUBST([GCRYPT_PUBKEY_CIPHERS])
AC_SUBST([GCRYPT_DIGESTS])
AC_SUBST([GCRYPT_KDFS])
AC_SUBST([GCRYPT_RANDOM])

AC_SUBST(LIBGCRYPT_CIPHERS, $enabled_ciphers)
AC_SUBST(LIBGCRYPT_PUBKEY_CIPHERS, $enabled_pubkey_ciphers)
AC_SUBST(LIBGCRYPT_DIGESTS, $enabled_digests)
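
# The GCRYPT_* lists collected above surface in cipher/Makefile.am through
# AC_SUBST, where the @GCRYPT_CIPHERS@, @GCRYPT_DIGESTS@ etc. substitutions
# name the objects linked into libcipher.la; this is why a new assembler
# module has to be listed both in the Makefile.am sources and in the
# matching GCRYPT_* assignment here.
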

# For printing the configuration we need a colon separated list of
# algorithm names.
tmp=`echo "$enabled_ciphers" | tr ' ' : `
AC_DEFINE_UNQUOTED(LIBGCRYPT_CIPHERS, "$tmp",
                   [List of available cipher algorithms])
tmp=`echo "$enabled_pubkey_ciphers" | tr ' ' : `
AC_DEFINE_UNQUOTED(LIBGCRYPT_PUBKEY_CIPHERS, "$tmp",
                   [List of available public key cipher algorithms])
tmp=`echo "$enabled_digests" | tr ' ' : `
AC_DEFINE_UNQUOTED(LIBGCRYPT_DIGESTS, "$tmp",
                   [List of available digest algorithms])
tmp=`echo "$enabled_kdfs" | tr ' ' : `
AC_DEFINE_UNQUOTED(LIBGCRYPT_KDFS, "$tmp",
                   [List of available KDF algorithms])


#
# Define conditional sources depending on the used hardware platform.
# Note that all possible modules must also be listed in
# src/Makefile.am (EXTRA_libgcrypt_la_SOURCES).
#
GCRYPT_HWF_MODULES=
case "$mpi_cpu_arch" in
     x86)
        AC_DEFINE(HAVE_CPU_ARCH_X86, 1, [Defined for the x86 platforms])
        GCRYPT_HWF_MODULES="hwf-x86.lo"
        ;;
     alpha)
        AC_DEFINE(HAVE_CPU_ARCH_ALPHA, 1, [Defined for Alpha platforms])
        ;;
     sparc)
        AC_DEFINE(HAVE_CPU_ARCH_SPARC, 1, [Defined for SPARC platforms])
        ;;
     mips)
        AC_DEFINE(HAVE_CPU_ARCH_MIPS, 1, [Defined for MIPS platforms])
        ;;
     m68k)
        AC_DEFINE(HAVE_CPU_ARCH_M68K, 1, [Defined for M68k platforms])
        ;;
     ppc)
        AC_DEFINE(HAVE_CPU_ARCH_PPC, 1, [Defined for PPC platforms])
        ;;
     arm)
        AC_DEFINE(HAVE_CPU_ARCH_ARM, 1, [Defined for ARM platforms])
        GCRYPT_HWF_MODULES="hwf-arm.lo"
        ;;
     aarch64)
        AC_DEFINE(HAVE_CPU_ARCH_ARM, 1, [Defined for ARM AArch64 platforms])
        GCRYPT_HWF_MODULES="hwf-arm.lo"
        ;;
esac
AC_SUBST([GCRYPT_HWF_MODULES])


#
# Option to disable building of doc file
#
build_doc=yes
AC_ARG_ENABLE([doc], AC_HELP_STRING([--disable-doc],
                                    [do not build the documentation]),
                     build_doc=$enableval, build_doc=yes)
AM_CONDITIONAL([BUILD_DOC], [test "x$build_doc" != xno])


#
# Provide information about the build.
#
BUILD_REVISION="mym4_revision"
AC_SUBST(BUILD_REVISION)
AC_DEFINE_UNQUOTED(BUILD_REVISION, "$BUILD_REVISION",
                   [GIT commit id revision used to build this package])

changequote(,)dnl
BUILD_FILEVERSION=`echo "$VERSION" | sed 's/\([0-9.]*\).*/\1./;s/\./,/g'`
changequote([,])dnl
BUILD_FILEVERSION="${BUILD_FILEVERSION}mym4_revision_dec"
AC_SUBST(BUILD_FILEVERSION)

AC_ARG_ENABLE([build-timestamp],
  AC_HELP_STRING([--enable-build-timestamp],
                 [set an explicit build timestamp for reproducibility.
                  (default is the current time in ISO-8601 format)]),
     [if test "$enableval" = "yes"; then
        BUILD_TIMESTAMP=`date -u +%Y-%m-%dT%H:%M+0000 2>/dev/null || date`
      else
        BUILD_TIMESTAMP="$enableval"
      fi],
     [BUILD_TIMESTAMP=""])
AC_SUBST(BUILD_TIMESTAMP)
AC_DEFINE_UNQUOTED(BUILD_TIMESTAMP, "$BUILD_TIMESTAMP",
                   [The time this package was configured for a build])
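
# For reference, the sed expression above maps a dotted version to the
# comma-separated form a Windows resource file expects, e.g.:
#
#   $ echo "1.8.1" | sed 's/\([0-9.]*\).*/\1./;s/\./,/g'
#   1,8,1,
#
# after which the decimal revision is appended, giving something like
# "1,8,1,4242" (the revision number here is made up for illustration).
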

# And create the files.
AC_CONFIG_FILES([
Makefile
m4/Makefile
compat/Makefile
mpi/Makefile
cipher/Makefile
random/Makefile
doc/Makefile
src/Makefile
src/gcrypt.h
src/libgcrypt-config
src/versioninfo.rc
tests/Makefile
])
AC_CONFIG_FILES([tests/hashtest-256g], [chmod +x tests/hashtest-256g])
AC_CONFIG_FILES([tests/basic-disable-all-hwf], [chmod +x tests/basic-disable-all-hwf])
AC_OUTPUT


detection_module="${GCRYPT_HWF_MODULES%.lo}"
test -n "$detection_module" || detection_module="none"

# Give some feedback
GCRY_MSG_SHOW([],[])
GCRY_MSG_SHOW([Libgcrypt],[v${VERSION} has been configured as follows:])
GCRY_MSG_SHOW([],[])
GCRY_MSG_SHOW([Platform:                 ],[$PRINTABLE_OS_NAME ($host)])
GCRY_MSG_SHOW([Hardware detection module:],[$detection_module])
GCRY_MSG_WRAP([Enabled cipher algorithms:],[$enabled_ciphers])
GCRY_MSG_WRAP([Enabled digest algorithms:],[$enabled_digests])
GCRY_MSG_WRAP([Enabled kdf algorithms:   ],[$enabled_kdfs])
GCRY_MSG_WRAP([Enabled pubkey algorithms:],[$enabled_pubkey_ciphers])
GCRY_MSG_SHOW([Random number generator:  ],[$random])
GCRY_MSG_SHOW([Try using jitter entropy: ],[$jentsupport])
GCRY_MSG_SHOW([Using linux capabilities: ],[$use_capabilities])
GCRY_MSG_SHOW([Try using Padlock crypto: ],[$padlocksupport])
GCRY_MSG_SHOW([Try using AES-NI crypto:  ],[$aesnisupport])
GCRY_MSG_SHOW([Try using Intel PCLMUL:   ],[$pclmulsupport])
GCRY_MSG_SHOW([Try using Intel SSE4.1:   ],[$sse41support])
GCRY_MSG_SHOW([Try using DRNG (RDRAND):  ],[$drngsupport])
GCRY_MSG_SHOW([Try using Intel AVX:      ],[$avxsupport])
GCRY_MSG_SHOW([Try using Intel AVX2:     ],[$avx2support])
GCRY_MSG_SHOW([Try using ARM NEON:       ],[$neonsupport])
GCRY_MSG_SHOW([Try using ARMv8 crypto:   ],[$armcryptosupport])
GCRY_MSG_SHOW([],[])

if test "x${gpg_config_script_warn}" != x; then
cat <