diff --git a/cipher/Makefile.am b/cipher/Makefile.am index bf13c199..dc63a736 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -1,239 +1,240 @@ # Makefile for cipher modules # Copyright (C) 1998, 1999, 2000, 2001, 2002, # 2003, 2009 Free Software Foundation, Inc. # # This file is part of Libgcrypt. # # Libgcrypt is free software; you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public License as # published by the Free Software Foundation; either version 2.1 of # the License, or (at your option) any later version. # # Libgcrypt is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public # License along with this program; if not, see . # Process this file with automake to produce Makefile.in # Need to include ../src in addition to top_srcdir because gcrypt.h is # a built header. AM_CPPFLAGS = -I../src -I$(top_srcdir)/src -I../mpi -I$(top_srcdir)/mpi AM_CFLAGS = $(GPG_ERROR_CFLAGS) AM_CCASFLAGS = $(NOEXECSTACK_FLAGS) EXTRA_DIST = gost-s-box.c CLEANFILES = gost-s-box DISTCLEANFILES = gost-sb.h noinst_LTLIBRARIES = libcipher.la GCRYPT_MODULES = @GCRYPT_CIPHERS@ @GCRYPT_PUBKEY_CIPHERS@ \ @GCRYPT_DIGESTS@ @GCRYPT_KDFS@ libcipher_la_DEPENDENCIES = $(GCRYPT_MODULES) libcipher_la_LIBADD = $(GCRYPT_MODULES) libcipher_la_SOURCES = \ cipher.c cipher-internal.h \ cipher-cbc.c \ cipher-cfb.c \ cipher-ofb.c \ cipher-ctr.c \ cipher-aeswrap.c \ cipher-ccm.c \ cipher-cmac.c \ cipher-gcm.c cipher-gcm-intel-pclmul.c cipher-gcm-armv7-neon.S \ cipher-gcm-armv8-aarch32-ce.S cipher-gcm-armv8-aarch64-ce.S \ cipher-poly1305.c \ cipher-ocb.c \ cipher-xts.c \ cipher-eax.c \ cipher-selftest.c cipher-selftest.h \ pubkey.c pubkey-internal.h pubkey-util.c \ md.c \ mac.c mac-internal.h \ mac-hmac.c mac-cmac.c mac-gmac.c mac-poly1305.c \ poly1305.c poly1305-internal.h \ kdf.c kdf-internal.h \ hmac-tests.c \ bithelp.h \ bufhelp.h \ primegen.c \ hash-common.c hash-common.h \ dsa-common.c rsa-common.c \ sha1.h EXTRA_libcipher_la_SOURCES = \ - asm-common-amd64.h \ asm-common-aarch64.h \ + asm-common-amd64.h \ + asm-poly1305-aarch64.h \ asm-poly1305-amd64.h \ arcfour.c arcfour-amd64.S \ blowfish.c blowfish-amd64.S blowfish-arm.S \ cast5.c cast5-amd64.S cast5-arm.S \ chacha20.c chacha20-amd64-ssse3.S chacha20-amd64-avx2.S \ chacha20-armv7-neon.S chacha20-aarch64.S \ chacha20-ppc.c \ crc.c crc-intel-pclmul.c crc-armv8-ce.c \ crc-armv8-aarch64-ce.S \ crc-ppc.c \ des.c des-amd64.S \ dsa.c \ elgamal.c \ ecc.c ecc-curves.c ecc-misc.c ecc-common.h \ ecc-ecdh.c ecc-ecdsa.c ecc-eddsa.c ecc-gost.c \ idea.c \ gost28147.c gost.h \ gostr3411-94.c \ md4.c \ md5.c \ rijndael.c rijndael-internal.h rijndael-tables.h \ rijndael-aesni.c rijndael-padlock.c \ rijndael-amd64.S rijndael-arm.S \ rijndael-ssse3-amd64.c rijndael-ssse3-amd64-asm.S \ rijndael-armv8-ce.c rijndael-armv8-aarch32-ce.S \ rijndael-armv8-aarch64-ce.S rijndael-aarch64.S \ rijndael-ppc.c \ rmd160.c \ rsa.c \ salsa20.c salsa20-amd64.S salsa20-armv7-neon.S \ scrypt.c \ seed.c \ serpent.c serpent-sse2-amd64.S \ serpent-avx2-amd64.S serpent-armv7-neon.S \ sha1.c sha1-ssse3-amd64.S sha1-avx-amd64.S sha1-avx-bmi2-amd64.S \ sha1-avx2-bmi2-amd64.S sha1-armv7-neon.S sha1-armv8-aarch32-ce.S \ sha1-armv8-aarch64-ce.S sha1-intel-shaext.c \ sha256.c sha256-ssse3-amd64.S sha256-avx-amd64.S \ 
sha256-avx2-bmi2-amd64.S \ sha256-armv8-aarch32-ce.S sha256-armv8-aarch64-ce.S \ sha256-intel-shaext.c sha256-ppc.c \ sha512.c sha512-ssse3-amd64.S sha512-avx-amd64.S \ sha512-avx2-bmi2-amd64.S \ sha512-armv7-neon.S sha512-arm.S \ sha512-ppc.c \ sm3.c \ keccak.c keccak_permute_32.h keccak_permute_64.h keccak-armv7-neon.S \ stribog.c \ tiger.c \ whirlpool.c whirlpool-sse2-amd64.S \ twofish.c twofish-amd64.S twofish-arm.S twofish-aarch64.S \ twofish-avx2-amd64.S \ rfc2268.c \ camellia.c camellia.h camellia-glue.c camellia-aesni-avx-amd64.S \ camellia-aesni-avx2-amd64.S camellia-arm.S camellia-aarch64.S \ blake2.c \ blake2b-amd64-avx2.S blake2s-amd64-avx.S gost28147.lo: gost-sb.h gost-sb.h: gost-s-box ./gost-s-box $@ gost-s-box: gost-s-box.c $(CC_FOR_BUILD) $(CFLAGS_FOR_BUILD) $(LDFLAGS_FOR_BUILD) \ $(CPPFLAGS_FOR_BUILD)-o $@ $(srcdir)/gost-s-box.c if ENABLE_O_FLAG_MUNGING o_flag_munging = sed -e 's/-O\([2-9s][2-9s]*\)/-O1/' -e 's/-Ofast/-O1/g' else o_flag_munging = cat endif # We need to lower the optimization for this module. tiger.o: $(srcdir)/tiger.c Makefile `echo $(COMPILE) -c $< | $(o_flag_munging) ` tiger.lo: $(srcdir)/tiger.c Makefile `echo $(LTCOMPILE) -c $< | $(o_flag_munging) ` # We need to disable instrumentation for these modules as they use cc as # thin assembly front-end and do not tolerate in-between function calls # inserted by compiler as those functions may clobber the XMM registers. if ENABLE_INSTRUMENTATION_MUNGING instrumentation_munging = sed \ -e 's/-fsanitize[=,\-][=,a-z,A-Z,0-9,\,,\-]*//g' \ -e 's/-fprofile[=,\-][=,a-z,A-Z,0-9,\,,\-]*//g' \ -e 's/-fcoverage[=,\-][=,a-z,A-Z,0-9,\,,\-]*//g' else instrumentation_munging = cat endif rijndael-aesni.o: $(srcdir)/rijndael-aesni.c Makefile `echo $(COMPILE) -c $< | $(instrumentation_munging) ` rijndael-aesni.lo: $(srcdir)/rijndael-aesni.c Makefile `echo $(LTCOMPILE) -c $< | $(instrumentation_munging) ` rijndael-ssse3-amd64.o: $(srcdir)/rijndael-ssse3-amd64.c Makefile `echo $(COMPILE) -c $< | $(instrumentation_munging) ` rijndael-ssse3-amd64.lo: $(srcdir)/rijndael-ssse3-amd64.c Makefile `echo $(LTCOMPILE) -c $< | $(instrumentation_munging) ` cipher-gcm-intel-pclmul.o: $(srcdir)/cipher-gcm-intel-pclmul.c Makefile `echo $(COMPILE) -c $< | $(instrumentation_munging) ` cipher-gcm-intel-pclmul.lo: $(srcdir)/cipher-gcm-intel-pclmul.c Makefile `echo $(LTCOMPILE) -c $< | $(instrumentation_munging) ` sha1-intel-shaext.o: $(srcdir)/sha1-intel-shaext.c Makefile `echo $(COMPILE) -c $< | $(instrumentation_munging) ` sha1-intel-shaext.lo: $(srcdir)/sha1-intel-shaext.c Makefile `echo $(LTCOMPILE) -c $< | $(instrumentation_munging) ` sha256-intel-shaext.o: $(srcdir)/sha256-intel-shaext.c Makefile `echo $(COMPILE) -c $< | $(instrumentation_munging) ` sha256-intel-shaext.lo: $(srcdir)/sha256-intel-shaext.c Makefile `echo $(LTCOMPILE) -c $< | $(instrumentation_munging) ` crc-intel-pclmul.o: $(srcdir)/crc-intel-pclmul.c Makefile `echo $(COMPILE) -c $< | $(instrumentation_munging) ` crc-intel-pclmul.lo: $(srcdir)/crc-intel-pclmul.c Makefile `echo $(LTCOMPILE) -c $< | $(instrumentation_munging) ` if ENABLE_PPC_VCRYPTO_EXTRA_CFLAGS ppc_vcrypto_cflags = -maltivec -mvsx -mcrypto else ppc_vcrypto_cflags = endif rijndael-ppc.o: $(srcdir)/rijndael-ppc.c Makefile `echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) ` rijndael-ppc.lo: $(srcdir)/rijndael-ppc.c Makefile `echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) ` sha256-ppc.o: $(srcdir)/sha256-ppc.c Makefile `echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< 
| $(instrumentation_munging) `

sha256-ppc.lo: $(srcdir)/sha256-ppc.c Makefile
	`echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `

sha512-ppc.o: $(srcdir)/sha512-ppc.c Makefile
	`echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `

sha512-ppc.lo: $(srcdir)/sha512-ppc.c Makefile
	`echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `

chacha20-ppc.o: $(srcdir)/chacha20-ppc.c Makefile
	`echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `

chacha20-ppc.lo: $(srcdir)/chacha20-ppc.c Makefile
	`echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `

crc-ppc.o: $(srcdir)/crc-ppc.c Makefile
	`echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `

crc-ppc.lo: $(srcdir)/crc-ppc.c Makefile
	`echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `

diff --git a/cipher/asm-poly1305-aarch64.h b/cipher/asm-poly1305-aarch64.h
new file mode 100644
index 00000000..6c342bee
--- /dev/null
+++ b/cipher/asm-poly1305-aarch64.h
@@ -0,0 +1,245 @@
+/* asm-poly1305-aarch64.h - Poly1305 macros for ARMv8/AArch64 assembly
+ *
+ * Copyright (C) 2019 Jussi Kivilinna
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */ + +#ifndef GCRY_ASM_POLY1305_AARCH64_H +#define GCRY_ASM_POLY1305_AARCH64_H + +#include "asm-common-aarch64.h" + +#ifdef __AARCH64EL__ + #define le_to_host(reg) /*_*/ +#else + #define le_to_host(reg) rev reg, reg; +#endif + +/********************************************************************** + poly1305 for stitched chacha20-poly1305 Aarch64 implementations + **********************************************************************/ + +#define POLY_RSTATE x8 +#define POLY_RSRC x9 + +#define POLY_R_H0 x10 +#define POLY_R_H1 x11 +#define POLY_R_H2 x12 +#define POLY_R_H2d w12 +#define POLY_R_R0 x13 +#define POLY_R_R1 x14 +#define POLY_R_R1_MUL5 x15 +#define POLY_R_X0_HI x16 +#define POLY_R_X0_LO x17 +#define POLY_R_X1_HI x19 +#define POLY_R_X1_LO x20 +#define POLY_R_ONE x21 +#define POLY_R_ONEd w21 + +#define POLY_TMP0 x22 +#define POLY_TMP1 x23 +#define POLY_TMP2 x24 +#define POLY_TMP3 x25 + +#define POLY_CHACHA_ROUND x26 + +#define POLY_S_R0 (4 * 4 + 0 * 8) +#define POLY_S_R1 (4 * 4 + 1 * 8) +#define POLY_S_H0 (4 * 4 + 2 * 8 + 0 * 8) +#define POLY_S_H1 (4 * 4 + 2 * 8 + 1 * 8) +#define POLY_S_H2d (4 * 4 + 2 * 8 + 2 * 8) + +#define POLY1305_PUSH_REGS() \ + stp x19, x20, [sp, #-16]!; \ + CFI_ADJUST_CFA_OFFSET(16); \ + CFI_REG_ON_STACK(19, 0); \ + CFI_REG_ON_STACK(20, 8); \ + stp x21, x22, [sp, #-16]!; \ + CFI_ADJUST_CFA_OFFSET(16); \ + CFI_REG_ON_STACK(21, 0); \ + CFI_REG_ON_STACK(22, 8); \ + stp x23, x24, [sp, #-16]!; \ + CFI_ADJUST_CFA_OFFSET(16); \ + CFI_REG_ON_STACK(23, 0); \ + CFI_REG_ON_STACK(24, 8); \ + stp x25, x26, [sp, #-16]!; \ + CFI_ADJUST_CFA_OFFSET(16); \ + CFI_REG_ON_STACK(25, 0); \ + CFI_REG_ON_STACK(26, 8); + +#define POLY1305_POP_REGS() \ + ldp x25, x26, [sp], #16; \ + CFI_ADJUST_CFA_OFFSET(-16); \ + CFI_RESTORE(x25); \ + CFI_RESTORE(x26); \ + ldp x23, x24, [sp], #16; \ + CFI_ADJUST_CFA_OFFSET(-16); \ + CFI_RESTORE(x23); \ + CFI_RESTORE(x24); \ + ldp x21, x22, [sp], #16; \ + CFI_ADJUST_CFA_OFFSET(-16); \ + CFI_RESTORE(x21); \ + CFI_RESTORE(x22); \ + ldp x19, x20, [sp], #16; \ + CFI_ADJUST_CFA_OFFSET(-16); \ + CFI_RESTORE(x19); \ + CFI_RESTORE(x20); + +#define POLY1305_LOAD_STATE() \ + ldr POLY_R_R1, [POLY_RSTATE, #(POLY_S_R1)]; \ + ldr POLY_R_H0, [POLY_RSTATE, #(POLY_S_H0)]; \ + ldr POLY_R_H1, [POLY_RSTATE, #(POLY_S_H1)]; \ + ldr POLY_R_H2d, [POLY_RSTATE, #(POLY_S_H2d)]; \ + ldr POLY_R_R0, [POLY_RSTATE, #(POLY_S_R0)]; \ + add POLY_R_R1_MUL5, POLY_R_R1, POLY_R_R1, lsr #2; \ + mov POLY_R_ONE, #1; + +#define POLY1305_STORE_STATE() \ + str POLY_R_H0, [POLY_RSTATE, #(POLY_S_H0)]; \ + str POLY_R_H1, [POLY_RSTATE, #(POLY_S_H1)]; \ + str POLY_R_H2d, [POLY_RSTATE, #(POLY_S_H2d)]; + +#define POLY1305_BLOCK_PART1(src_offset) \ + /* a = h + m */ \ + ldr POLY_TMP0, [POLY_RSRC, #((src_offset) + 0 * 8)]; +#define POLY1305_BLOCK_PART2(src_offset) \ + ldr POLY_TMP1, [POLY_RSRC, #((src_offset) + 1 * 8)]; +#define POLY1305_BLOCK_PART3() \ + le_to_host(POLY_TMP0); +#define POLY1305_BLOCK_PART4() \ + le_to_host(POLY_TMP1); +#define POLY1305_BLOCK_PART5() \ + adds POLY_R_H0, POLY_R_H0, POLY_TMP0; +#define POLY1305_BLOCK_PART6() \ + adcs POLY_R_H1, POLY_R_H1, POLY_TMP1; +#define POLY1305_BLOCK_PART7() \ + adc POLY_R_H2d, POLY_R_H2d, POLY_R_ONEd; + +#define POLY1305_BLOCK_PART8() \ + /* h = a * r (partial mod 2^130-5): */ \ + mul POLY_R_X1_LO, POLY_R_H0, POLY_R_R1; /* lo: h0 * r1 */ +#define POLY1305_BLOCK_PART9() \ + mul POLY_TMP0, POLY_R_H1, POLY_R_R0; /* lo: h1 * r0 */ +#define POLY1305_BLOCK_PART10() \ + mul POLY_R_X0_LO, POLY_R_H0, POLY_R_R0; /* lo: h0 * r0 */ +#define 
POLY1305_BLOCK_PART11() \ + umulh POLY_R_X1_HI, POLY_R_H0, POLY_R_R1; /* hi: h0 * r1 */ +#define POLY1305_BLOCK_PART12() \ + adds POLY_R_X1_LO, POLY_R_X1_LO, POLY_TMP0; +#define POLY1305_BLOCK_PART13() \ + umulh POLY_TMP1, POLY_R_H1, POLY_R_R0; /* hi: h1 * r0 */ +#define POLY1305_BLOCK_PART14() \ + mul POLY_TMP2, POLY_R_H1, POLY_R_R1_MUL5; /* lo: h1 * r1 mod 2^130-5 */ +#define POLY1305_BLOCK_PART15() \ + umulh POLY_R_X0_HI, POLY_R_H0, POLY_R_R0; /* hi: h0 * r0 */ +#define POLY1305_BLOCK_PART16() \ + adc POLY_R_X1_HI, POLY_R_X1_HI, POLY_TMP1; +#define POLY1305_BLOCK_PART17() \ + umulh POLY_TMP3, POLY_R_H1, POLY_R_R1_MUL5; /* hi: h1 * r1 mod 2^130-5 */ +#define POLY1305_BLOCK_PART18() \ + adds POLY_R_X0_LO, POLY_R_X0_LO, POLY_TMP2; +#define POLY1305_BLOCK_PART19() \ + mul POLY_R_H1, POLY_R_H2, POLY_R_R1_MUL5; /* h2 * r1 mod 2^130-5 */ +#define POLY1305_BLOCK_PART20() \ + adc POLY_R_X0_HI, POLY_R_X0_HI, POLY_TMP3; +#define POLY1305_BLOCK_PART21() \ + mul POLY_R_H2, POLY_R_H2, POLY_R_R0; /* h2 * r0 */ +#define POLY1305_BLOCK_PART22() \ + adds POLY_R_H1, POLY_R_H1, POLY_R_X1_LO; +#define POLY1305_BLOCK_PART23() \ + adc POLY_R_H0, POLY_R_H2, POLY_R_X1_HI; + +#define POLY1305_BLOCK_PART24() \ + /* carry propagation */ \ + and POLY_R_H2, POLY_R_H0, #3; +#define POLY1305_BLOCK_PART25() \ + mov POLY_R_H0, POLY_R_H0, lsr #2; +#define POLY1305_BLOCK_PART26() \ + add POLY_R_H0, POLY_R_H0, POLY_R_H0, lsl #2; +#define POLY1305_BLOCK_PART27() \ + adds POLY_R_H0, POLY_R_H0, POLY_R_X0_LO; +#define POLY1305_BLOCK_PART28() \ + adcs POLY_R_H1, POLY_R_H1, POLY_R_X0_HI; +#define POLY1305_BLOCK_PART29() \ + adc POLY_R_H2d, POLY_R_H2d, wzr; + +//#define TESTING_POLY1305_ASM +#ifdef TESTING_POLY1305_ASM +/* for testing only. */ +.align 3 +.globl _gcry_poly1305_aarch64_blocks1 +ELF(.type _gcry_poly1305_aarch64_blocks1,%function;) +_gcry_poly1305_aarch64_blocks1: + /* input: + * x0: poly1305-state + * x1: src + * x2: nblks + */ + CFI_STARTPROC() + POLY1305_PUSH_REGS(); + + mov POLY_RSTATE, x0; + mov POLY_RSRC, x1; + + POLY1305_LOAD_STATE(); + +.L_gcry_poly1305_aarch64_loop1: + POLY1305_BLOCK_PART1(0 * 16); + POLY1305_BLOCK_PART2(0 * 16); + add POLY_RSRC, POLY_RSRC, #16; + POLY1305_BLOCK_PART3(); + POLY1305_BLOCK_PART4(); + POLY1305_BLOCK_PART5(); + POLY1305_BLOCK_PART6(); + POLY1305_BLOCK_PART7(); + POLY1305_BLOCK_PART8(); + POLY1305_BLOCK_PART9(); + POLY1305_BLOCK_PART10(); + POLY1305_BLOCK_PART11(); + POLY1305_BLOCK_PART12(); + POLY1305_BLOCK_PART13(); + POLY1305_BLOCK_PART14(); + POLY1305_BLOCK_PART15(); + POLY1305_BLOCK_PART16(); + POLY1305_BLOCK_PART17(); + POLY1305_BLOCK_PART18(); + POLY1305_BLOCK_PART19(); + POLY1305_BLOCK_PART20(); + POLY1305_BLOCK_PART21(); + POLY1305_BLOCK_PART22(); + POLY1305_BLOCK_PART23(); + POLY1305_BLOCK_PART24(); + POLY1305_BLOCK_PART25(); + POLY1305_BLOCK_PART26(); + POLY1305_BLOCK_PART27(); + POLY1305_BLOCK_PART28(); + POLY1305_BLOCK_PART29(); + + subs x2, x2, #1; + b.ne .L_gcry_poly1305_aarch64_loop1; + + POLY1305_STORE_STATE(); + + mov x0, #0; + + POLY1305_POP_REGS(); + ret; + CFI_ENDPROC() +ELF(.size _gcry_poly1305_aarch64_blocks1, .-_gcry_poly1305_aarch64_blocks1;) +#endif + +#endif /* GCRY_ASM_POLY1305_AARCH64_H */ diff --git a/cipher/chacha20-aarch64.S b/cipher/chacha20-aarch64.S index 07b4bb5c..7ace023f 100644 --- a/cipher/chacha20-aarch64.S +++ b/cipher/chacha20-aarch64.S @@ -1,307 +1,616 @@ /* chacha20-aarch64.S - ARMv8/AArch64 accelerated chacha20 blocks function * - * Copyright (C) 2017,2018 Jussi Kivilinna + * Copyright (C) 2017-2019 Jussi Kivilinna * * This file is 
part of Libgcrypt.
 *
 * Libgcrypt is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * Libgcrypt is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, see <http://www.gnu.org/licenses/>.
 */

/*
 * Based on D. J. Bernstein reference implementation at
 * http://cr.yp.to/chacha.html:
 *
 * chacha-regs.c version 20080118
 * D. J. Bernstein
 * Public domain.
 */

#include "asm-common-aarch64.h"

#if defined(__AARCH64EL__) && \
    defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \
    defined(HAVE_GCC_INLINE_ASM_AARCH64_NEON) && \
    defined(USE_CHACHA20)

.cpu generic+simd

.text

+#include "asm-poly1305-aarch64.h"

/* register macros */
#define INPUT     x0
#define DST       x1
#define SRC       x2
#define NBLKS     x3
#define ROUND     x4
#define INPUT_CTR x5
#define INPUT_POS x6
#define CTR       x7

/* vector registers */
#define X0 v16
#define X1 v17
#define X2 v18
#define X3 v19
#define X4 v20
#define X5 v21
#define X6 v22
#define X7 v23
#define X8 v24
#define X9 v25
#define X10 v26
#define X11 v27
#define X12 v28
#define X13 v29
#define X14 v30
#define X15 v31

#define VCTR    v0
#define VTMP0   v1
#define VTMP1   v2
#define VTMP2   v3
#define VTMP3   v4
#define X12_TMP v5
#define X13_TMP v6
+#define ROT8    v7

/**********************************************************************
  helper macros
 **********************************************************************/

+#define _(...)
__VA_ARGS__ + #define vpunpckldq(s1, s2, dst) \ zip1 dst.4s, s2.4s, s1.4s; #define vpunpckhdq(s1, s2, dst) \ zip2 dst.4s, s2.4s, s1.4s; #define vpunpcklqdq(s1, s2, dst) \ zip1 dst.2d, s2.2d, s1.2d; #define vpunpckhqdq(s1, s2, dst) \ zip2 dst.2d, s2.2d, s1.2d; /* 4x4 32-bit integer matrix transpose */ #define transpose_4x4(x0, x1, x2, x3, t1, t2, t3) \ vpunpckhdq(x1, x0, t2); \ vpunpckldq(x1, x0, x0); \ \ vpunpckldq(x3, x2, t1); \ vpunpckhdq(x3, x2, x2); \ \ vpunpckhqdq(t1, x0, x1); \ vpunpcklqdq(t1, x0, x0); \ \ vpunpckhqdq(x2, t2, x3); \ vpunpcklqdq(x2, t2, x2); #define clear(x) \ eor x.16b, x.16b, x.16b; /********************************************************************** 4-way chacha20 **********************************************************************/ -#define ROTATE2(dst1,dst2,c,src1,src2) \ +#define ROTATE2(dst1,dst2,c,src1,src2,iop1) \ shl dst1.4s, src1.4s, #(c); \ shl dst2.4s, src2.4s, #(c); \ + iop1; \ sri dst1.4s, src1.4s, #(32 - (c)); \ sri dst2.4s, src2.4s, #(32 - (c)); +#define ROTATE2_8(dst1,dst2,src1,src2,iop1) \ + tbl dst1.16b, {src1.16b}, ROT8.16b; \ + iop1; \ + tbl dst2.16b, {src2.16b}, ROT8.16b; + #define ROTATE2_16(dst1,dst2,src1,src2) \ rev32 dst1.8h, src1.8h; \ rev32 dst2.8h, src2.8h; #define XOR(d,s1,s2) \ eor d.16b, s2.16b, s1.16b; #define PLUS(ds,s) \ add ds.4s, ds.4s, s.4s; -#define QUARTERROUND2(a1,b1,c1,d1,a2,b2,c2,d2,ign,tmp1,tmp2) \ - PLUS(a1,b1); PLUS(a2,b2); XOR(tmp1,d1,a1); XOR(tmp2,d2,a2); \ - ROTATE2_16(d1, d2, tmp1, tmp2); \ - PLUS(c1,d1); PLUS(c2,d2); XOR(tmp1,b1,c1); XOR(tmp2,b2,c2); \ - ROTATE2(b1, b2, 12, tmp1, tmp2); \ - PLUS(a1,b1); PLUS(a2,b2); XOR(tmp1,d1,a1); XOR(tmp2,d2,a2); \ - ROTATE2(d1, d2, 8, tmp1, tmp2); \ - PLUS(c1,d1); PLUS(c2,d2); XOR(tmp1,b1,c1); XOR(tmp2,b2,c2); \ - ROTATE2(b1, b2, 7, tmp1, tmp2); - -chacha20_data: +#define QUARTERROUND2(a1,b1,c1,d1,a2,b2,c2,d2,ign,tmp1,tmp2,iop1,iop2,iop3,iop4,iop5,iop6,iop7,iop8,iop9,iop10,iop11,iop12,iop13,iop14) \ + PLUS(a1,b1); PLUS(a2,b2); iop1; \ + XOR(tmp1,d1,a1); XOR(tmp2,d2,a2); iop2; \ + ROTATE2_16(d1, d2, tmp1, tmp2); iop3; \ + PLUS(c1,d1); PLUS(c2,d2); iop4; \ + XOR(tmp1,b1,c1); XOR(tmp2,b2,c2); iop5; \ + ROTATE2(b1, b2, 12, tmp1, tmp2, _(iop6)); iop7; \ + PLUS(a1,b1); PLUS(a2,b2); iop8; \ + XOR(tmp1,d1,a1); XOR(tmp2,d2,a2); iop9; \ + ROTATE2_8(d1, d2, tmp1, tmp2, _(iop10)); iop11; \ + PLUS(c1,d1); PLUS(c2,d2); iop12; \ + XOR(tmp1,b1,c1); XOR(tmp2,b2,c2); iop13; \ + ROTATE2(b1, b2, 7, tmp1, tmp2, _(iop14)); + .align 4 -.Linc_counter: +.globl _gcry_chacha20_aarch64_blocks4_data_inc_counter +_gcry_chacha20_aarch64_blocks4_data_inc_counter: .long 0,1,2,3 +.align 4 +.globl _gcry_chacha20_aarch64_blocks4_data_rot8 +_gcry_chacha20_aarch64_blocks4_data_rot8: + .byte 3,0,1,2 + .byte 7,4,5,6 + .byte 11,8,9,10 + .byte 15,12,13,14 + .align 3 .globl _gcry_chacha20_aarch64_blocks4 ELF(.type _gcry_chacha20_aarch64_blocks4,%function;) _gcry_chacha20_aarch64_blocks4: /* input: * x0: input * x1: dst * x2: src * x3: nblks (multiple of 4) */ CFI_STARTPROC() - GET_DATA_POINTER(CTR, .Linc_counter); + GET_DATA_POINTER(CTR, _gcry_chacha20_aarch64_blocks4_data_rot8); add INPUT_CTR, INPUT, #(12*4); + ld1 {ROT8.16b}, [CTR]; + GET_DATA_POINTER(CTR, _gcry_chacha20_aarch64_blocks4_data_inc_counter); mov INPUT_POS, INPUT; ld1 {VCTR.16b}, [CTR]; .Loop4: /* Construct counter vectors X12 and X13 */ ld1 {X15.16b}, [INPUT_CTR]; mov ROUND, #20; ld1 {VTMP1.16b-VTMP3.16b}, [INPUT_POS]; dup X12.4s, X15.s[0]; dup X13.4s, X15.s[1]; ldr CTR, [INPUT_CTR]; add X12.4s, X12.4s, VCTR.4s; dup X0.4s, VTMP1.s[0]; dup X1.4s, 
VTMP1.s[1]; dup X2.4s, VTMP1.s[2]; dup X3.4s, VTMP1.s[3]; dup X14.4s, X15.s[2]; cmhi VTMP0.4s, VCTR.4s, X12.4s; dup X15.4s, X15.s[3]; add CTR, CTR, #4; /* Update counter */ dup X4.4s, VTMP2.s[0]; dup X5.4s, VTMP2.s[1]; dup X6.4s, VTMP2.s[2]; dup X7.4s, VTMP2.s[3]; sub X13.4s, X13.4s, VTMP0.4s; dup X8.4s, VTMP3.s[0]; dup X9.4s, VTMP3.s[1]; dup X10.4s, VTMP3.s[2]; dup X11.4s, VTMP3.s[3]; mov X12_TMP.16b, X12.16b; mov X13_TMP.16b, X13.16b; str CTR, [INPUT_CTR]; .Lround2: subs ROUND, ROUND, #2 - QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,VTMP0,VTMP1) - QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,VTMP0,VTMP1) - QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,VTMP0,VTMP1) - QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,VTMP0,VTMP1) + QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,VTMP0,VTMP1, + ,,,,,,,,,,,,,) + QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,VTMP0,VTMP1, + ,,,,,,,,,,,,,) + QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,VTMP0,VTMP1, + ,,,,,,,,,,,,,) + QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,VTMP0,VTMP1, + ,,,,,,,,,,,,,) b.ne .Lround2; ld1 {VTMP0.16b, VTMP1.16b}, [INPUT_POS], #32; PLUS(X12, X12_TMP); /* INPUT + 12 * 4 + counter */ PLUS(X13, X13_TMP); /* INPUT + 13 * 4 + counter */ dup VTMP2.4s, VTMP0.s[0]; /* INPUT + 0 * 4 */ dup VTMP3.4s, VTMP0.s[1]; /* INPUT + 1 * 4 */ dup X12_TMP.4s, VTMP0.s[2]; /* INPUT + 2 * 4 */ dup X13_TMP.4s, VTMP0.s[3]; /* INPUT + 3 * 4 */ PLUS(X0, VTMP2); PLUS(X1, VTMP3); PLUS(X2, X12_TMP); PLUS(X3, X13_TMP); dup VTMP2.4s, VTMP1.s[0]; /* INPUT + 4 * 4 */ dup VTMP3.4s, VTMP1.s[1]; /* INPUT + 5 * 4 */ dup X12_TMP.4s, VTMP1.s[2]; /* INPUT + 6 * 4 */ dup X13_TMP.4s, VTMP1.s[3]; /* INPUT + 7 * 4 */ ld1 {VTMP0.16b, VTMP1.16b}, [INPUT_POS]; mov INPUT_POS, INPUT; PLUS(X4, VTMP2); PLUS(X5, VTMP3); PLUS(X6, X12_TMP); PLUS(X7, X13_TMP); dup VTMP2.4s, VTMP0.s[0]; /* INPUT + 8 * 4 */ dup VTMP3.4s, VTMP0.s[1]; /* INPUT + 9 * 4 */ dup X12_TMP.4s, VTMP0.s[2]; /* INPUT + 10 * 4 */ dup X13_TMP.4s, VTMP0.s[3]; /* INPUT + 11 * 4 */ dup VTMP0.4s, VTMP1.s[2]; /* INPUT + 14 * 4 */ dup VTMP1.4s, VTMP1.s[3]; /* INPUT + 15 * 4 */ PLUS(X8, VTMP2); PLUS(X9, VTMP3); PLUS(X10, X12_TMP); PLUS(X11, X13_TMP); PLUS(X14, VTMP0); PLUS(X15, VTMP1); transpose_4x4(X0, X1, X2, X3, VTMP0, VTMP1, VTMP2); transpose_4x4(X4, X5, X6, X7, VTMP0, VTMP1, VTMP2); transpose_4x4(X8, X9, X10, X11, VTMP0, VTMP1, VTMP2); transpose_4x4(X12, X13, X14, X15, VTMP0, VTMP1, VTMP2); subs NBLKS, NBLKS, #4; ld1 {VTMP0.16b-VTMP3.16b}, [SRC], #64; ld1 {X12_TMP.16b-X13_TMP.16b}, [SRC], #32; eor VTMP0.16b, X0.16b, VTMP0.16b; eor VTMP1.16b, X4.16b, VTMP1.16b; eor VTMP2.16b, X8.16b, VTMP2.16b; eor VTMP3.16b, X12.16b, VTMP3.16b; eor X12_TMP.16b, X1.16b, X12_TMP.16b; eor X13_TMP.16b, X5.16b, X13_TMP.16b; st1 {VTMP0.16b-VTMP3.16b}, [DST], #64; ld1 {VTMP0.16b-VTMP3.16b}, [SRC], #64; st1 {X12_TMP.16b-X13_TMP.16b}, [DST], #32; ld1 {X12_TMP.16b-X13_TMP.16b}, [SRC], #32; eor VTMP0.16b, X9.16b, VTMP0.16b; eor VTMP1.16b, X13.16b, VTMP1.16b; eor VTMP2.16b, X2.16b, VTMP2.16b; eor VTMP3.16b, X6.16b, VTMP3.16b; eor X12_TMP.16b, X10.16b, X12_TMP.16b; eor X13_TMP.16b, X14.16b, X13_TMP.16b; st1 {VTMP0.16b-VTMP3.16b}, [DST], #64; ld1 {VTMP0.16b-VTMP3.16b}, [SRC], #64; st1 {X12_TMP.16b-X13_TMP.16b}, [DST], #32; eor VTMP0.16b, X3.16b, VTMP0.16b; eor VTMP1.16b, X7.16b, VTMP1.16b; eor VTMP2.16b, X11.16b, VTMP2.16b; eor VTMP3.16b, X15.16b, VTMP3.16b; st1 {VTMP0.16b-VTMP3.16b}, [DST], #64; b.ne .Loop4; /* clear the used vector registers and stack */ clear(VTMP0); 
clear(VTMP1); clear(VTMP2); clear(VTMP3); clear(X12_TMP); clear(X13_TMP); clear(X0); clear(X1); clear(X2); clear(X3); clear(X4); clear(X5); clear(X6); clear(X7); clear(X8); clear(X9); clear(X10); clear(X11); clear(X12); clear(X13); clear(X14); clear(X15); eor x0, x0, x0 ret CFI_ENDPROC() ELF(.size _gcry_chacha20_aarch64_blocks4, .-_gcry_chacha20_aarch64_blocks4;) +/********************************************************************** + 4-way stitched chacha20-poly1305 + **********************************************************************/ + +.align 3 +.globl _gcry_chacha20_poly1305_aarch64_blocks4 +ELF(.type _gcry_chacha20_poly1305_aarch64_blocks4,%function;) + +_gcry_chacha20_poly1305_aarch64_blocks4: + /* input: + * x0: input + * x1: dst + * x2: src + * x3: nblks (multiple of 4) + * x4: poly1305-state + * x5: poly1305-src + */ + CFI_STARTPROC() + POLY1305_PUSH_REGS() + + mov POLY_RSTATE, x4; + mov POLY_RSRC, x5; + + GET_DATA_POINTER(CTR, _gcry_chacha20_aarch64_blocks4_data_rot8); + add INPUT_CTR, INPUT, #(12*4); + ld1 {ROT8.16b}, [CTR]; + GET_DATA_POINTER(CTR, _gcry_chacha20_aarch64_blocks4_data_inc_counter); + mov INPUT_POS, INPUT; + ld1 {VCTR.16b}, [CTR]; + + POLY1305_LOAD_STATE() + +.Loop_poly4: + /* Construct counter vectors X12 and X13 */ + + ld1 {X15.16b}, [INPUT_CTR]; + ld1 {VTMP1.16b-VTMP3.16b}, [INPUT_POS]; + + dup X12.4s, X15.s[0]; + dup X13.4s, X15.s[1]; + ldr CTR, [INPUT_CTR]; + add X12.4s, X12.4s, VCTR.4s; + dup X0.4s, VTMP1.s[0]; + dup X1.4s, VTMP1.s[1]; + dup X2.4s, VTMP1.s[2]; + dup X3.4s, VTMP1.s[3]; + dup X14.4s, X15.s[2]; + cmhi VTMP0.4s, VCTR.4s, X12.4s; + dup X15.4s, X15.s[3]; + add CTR, CTR, #4; /* Update counter */ + dup X4.4s, VTMP2.s[0]; + dup X5.4s, VTMP2.s[1]; + dup X6.4s, VTMP2.s[2]; + dup X7.4s, VTMP2.s[3]; + sub X13.4s, X13.4s, VTMP0.4s; + dup X8.4s, VTMP3.s[0]; + dup X9.4s, VTMP3.s[1]; + dup X10.4s, VTMP3.s[2]; + dup X11.4s, VTMP3.s[3]; + mov X12_TMP.16b, X12.16b; + mov X13_TMP.16b, X13.16b; + str CTR, [INPUT_CTR]; + + mov ROUND, #20 +.Lround4_with_poly1305_outer: + mov POLY_CHACHA_ROUND, #6; +.Lround4_with_poly1305_inner1: + POLY1305_BLOCK_PART1(0 * 16) + QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,VTMP0,VTMP1, + POLY1305_BLOCK_PART2(0 * 16), + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4(), + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART6(), + POLY1305_BLOCK_PART7(), + POLY1305_BLOCK_PART8(), + POLY1305_BLOCK_PART9(), + POLY1305_BLOCK_PART10(), + POLY1305_BLOCK_PART11(), + POLY1305_BLOCK_PART12(), + POLY1305_BLOCK_PART13(), + POLY1305_BLOCK_PART14(), + POLY1305_BLOCK_PART15()) + POLY1305_BLOCK_PART16() + QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,VTMP0,VTMP1, + POLY1305_BLOCK_PART17(), + POLY1305_BLOCK_PART18(), + POLY1305_BLOCK_PART19(), + POLY1305_BLOCK_PART20(), + POLY1305_BLOCK_PART21(), + POLY1305_BLOCK_PART22(), + POLY1305_BLOCK_PART23(), + POLY1305_BLOCK_PART24(), + POLY1305_BLOCK_PART25(), + POLY1305_BLOCK_PART26(), + POLY1305_BLOCK_PART27(), + POLY1305_BLOCK_PART28(), + POLY1305_BLOCK_PART29(), + POLY1305_BLOCK_PART1(1 * 16)) + POLY1305_BLOCK_PART2(1 * 16) + QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,VTMP0,VTMP1, + _(add POLY_RSRC, POLY_RSRC, #(2*16)), + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4(), + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART6(), + POLY1305_BLOCK_PART7(), + POLY1305_BLOCK_PART8(), + POLY1305_BLOCK_PART9(), + POLY1305_BLOCK_PART10(), + POLY1305_BLOCK_PART11(), + POLY1305_BLOCK_PART12(), + POLY1305_BLOCK_PART13(), + POLY1305_BLOCK_PART14(), + POLY1305_BLOCK_PART15()) + 
POLY1305_BLOCK_PART16() + QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,VTMP0,VTMP1, + POLY1305_BLOCK_PART17(), + POLY1305_BLOCK_PART18(), + POLY1305_BLOCK_PART19(), + POLY1305_BLOCK_PART20(), + POLY1305_BLOCK_PART21(), + POLY1305_BLOCK_PART22(), + POLY1305_BLOCK_PART23(), + POLY1305_BLOCK_PART24(), + POLY1305_BLOCK_PART25(), + POLY1305_BLOCK_PART26(), + POLY1305_BLOCK_PART27(), + POLY1305_BLOCK_PART28(), + POLY1305_BLOCK_PART29(), + _(subs POLY_CHACHA_ROUND, POLY_CHACHA_ROUND, #2)); + b.ne .Lround4_with_poly1305_inner1; + + mov POLY_CHACHA_ROUND, #4; +.Lround4_with_poly1305_inner2: + POLY1305_BLOCK_PART1(0 * 16) + QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,VTMP0,VTMP1,, + POLY1305_BLOCK_PART2(0 * 16),, + _(add POLY_RSRC, POLY_RSRC, #(1*16)),, + POLY1305_BLOCK_PART3(),, + POLY1305_BLOCK_PART4(),, + POLY1305_BLOCK_PART5(),, + POLY1305_BLOCK_PART6(),, + POLY1305_BLOCK_PART7()) + QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,VTMP0,VTMP1, + POLY1305_BLOCK_PART8(),, + POLY1305_BLOCK_PART9(),, + POLY1305_BLOCK_PART10(),, + POLY1305_BLOCK_PART11(),, + POLY1305_BLOCK_PART12(),, + POLY1305_BLOCK_PART13(),, + POLY1305_BLOCK_PART14(),) + POLY1305_BLOCK_PART15() + QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,VTMP0,VTMP1,, + POLY1305_BLOCK_PART16(),, + POLY1305_BLOCK_PART17(),, + POLY1305_BLOCK_PART18(),, + POLY1305_BLOCK_PART19(),, + POLY1305_BLOCK_PART20(),, + POLY1305_BLOCK_PART21(),, + POLY1305_BLOCK_PART22()) + QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,VTMP0,VTMP1, + POLY1305_BLOCK_PART23(),, + POLY1305_BLOCK_PART24(),, + POLY1305_BLOCK_PART25(),, + POLY1305_BLOCK_PART26(),, + POLY1305_BLOCK_PART27(),, + POLY1305_BLOCK_PART28(),, + POLY1305_BLOCK_PART29(), + _(subs POLY_CHACHA_ROUND, POLY_CHACHA_ROUND, #2)) + b.ne .Lround4_with_poly1305_inner2; + + subs ROUND, ROUND, #10 + b.ne .Lround4_with_poly1305_outer; + + ld1 {VTMP0.16b, VTMP1.16b}, [INPUT_POS], #32; + + PLUS(X12, X12_TMP); /* INPUT + 12 * 4 + counter */ + PLUS(X13, X13_TMP); /* INPUT + 13 * 4 + counter */ + + dup VTMP2.4s, VTMP0.s[0]; /* INPUT + 0 * 4 */ + dup VTMP3.4s, VTMP0.s[1]; /* INPUT + 1 * 4 */ + dup X12_TMP.4s, VTMP0.s[2]; /* INPUT + 2 * 4 */ + dup X13_TMP.4s, VTMP0.s[3]; /* INPUT + 3 * 4 */ + PLUS(X0, VTMP2); + PLUS(X1, VTMP3); + PLUS(X2, X12_TMP); + PLUS(X3, X13_TMP); + + dup VTMP2.4s, VTMP1.s[0]; /* INPUT + 4 * 4 */ + dup VTMP3.4s, VTMP1.s[1]; /* INPUT + 5 * 4 */ + dup X12_TMP.4s, VTMP1.s[2]; /* INPUT + 6 * 4 */ + dup X13_TMP.4s, VTMP1.s[3]; /* INPUT + 7 * 4 */ + ld1 {VTMP0.16b, VTMP1.16b}, [INPUT_POS]; + mov INPUT_POS, INPUT; + PLUS(X4, VTMP2); + PLUS(X5, VTMP3); + PLUS(X6, X12_TMP); + PLUS(X7, X13_TMP); + + dup VTMP2.4s, VTMP0.s[0]; /* INPUT + 8 * 4 */ + dup VTMP3.4s, VTMP0.s[1]; /* INPUT + 9 * 4 */ + dup X12_TMP.4s, VTMP0.s[2]; /* INPUT + 10 * 4 */ + dup X13_TMP.4s, VTMP0.s[3]; /* INPUT + 11 * 4 */ + dup VTMP0.4s, VTMP1.s[2]; /* INPUT + 14 * 4 */ + dup VTMP1.4s, VTMP1.s[3]; /* INPUT + 15 * 4 */ + PLUS(X8, VTMP2); + PLUS(X9, VTMP3); + PLUS(X10, X12_TMP); + PLUS(X11, X13_TMP); + PLUS(X14, VTMP0); + PLUS(X15, VTMP1); + + transpose_4x4(X0, X1, X2, X3, VTMP0, VTMP1, VTMP2); + transpose_4x4(X4, X5, X6, X7, VTMP0, VTMP1, VTMP2); + transpose_4x4(X8, X9, X10, X11, VTMP0, VTMP1, VTMP2); + transpose_4x4(X12, X13, X14, X15, VTMP0, VTMP1, VTMP2); + + subs NBLKS, NBLKS, #4; + + ld1 {VTMP0.16b-VTMP3.16b}, [SRC], #64; + ld1 {X12_TMP.16b-X13_TMP.16b}, [SRC], #32; + eor VTMP0.16b, X0.16b, VTMP0.16b; + eor VTMP1.16b, X4.16b, VTMP1.16b; + eor VTMP2.16b, X8.16b, VTMP2.16b; + eor VTMP3.16b, 
X12.16b, VTMP3.16b;
+	eor X12_TMP.16b, X1.16b, X12_TMP.16b;
+	eor X13_TMP.16b, X5.16b, X13_TMP.16b;
+	st1 {VTMP0.16b-VTMP3.16b}, [DST], #64;
+	ld1 {VTMP0.16b-VTMP3.16b}, [SRC], #64;
+	st1 {X12_TMP.16b-X13_TMP.16b}, [DST], #32;
+	ld1 {X12_TMP.16b-X13_TMP.16b}, [SRC], #32;
+	eor VTMP0.16b, X9.16b, VTMP0.16b;
+	eor VTMP1.16b, X13.16b, VTMP1.16b;
+	eor VTMP2.16b, X2.16b, VTMP2.16b;
+	eor VTMP3.16b, X6.16b, VTMP3.16b;
+	eor X12_TMP.16b, X10.16b, X12_TMP.16b;
+	eor X13_TMP.16b, X14.16b, X13_TMP.16b;
+	st1 {VTMP0.16b-VTMP3.16b}, [DST], #64;
+	ld1 {VTMP0.16b-VTMP3.16b}, [SRC], #64;
+	st1 {X12_TMP.16b-X13_TMP.16b}, [DST], #32;
+	eor VTMP0.16b, X3.16b, VTMP0.16b;
+	eor VTMP1.16b, X7.16b, VTMP1.16b;
+	eor VTMP2.16b, X11.16b, VTMP2.16b;
+	eor VTMP3.16b, X15.16b, VTMP3.16b;
+	st1 {VTMP0.16b-VTMP3.16b}, [DST], #64;
+
+	b.ne .Loop_poly4;
+
+	POLY1305_STORE_STATE()
+
+	/* clear the used vector registers and stack */
+	clear(VTMP0);
+	clear(VTMP1);
+	clear(VTMP2);
+	clear(VTMP3);
+	clear(X12_TMP);
+	clear(X13_TMP);
+	clear(X0);
+	clear(X1);
+	clear(X2);
+	clear(X3);
+	clear(X4);
+	clear(X5);
+	clear(X6);
+	clear(X7);
+	clear(X8);
+	clear(X9);
+	clear(X10);
+	clear(X11);
+	clear(X12);
+	clear(X13);
+	clear(X14);
+	clear(X15);
+
+	eor x0, x0, x0
+	POLY1305_POP_REGS()
+	ret
+	CFI_ENDPROC()
+ELF(.size _gcry_chacha20_poly1305_aarch64_blocks4, .-_gcry_chacha20_poly1305_aarch64_blocks4;)
+
#endif

diff --git a/cipher/chacha20.c b/cipher/chacha20.c
index b34d8d19..9d95723b 100644
--- a/cipher/chacha20.c
+++ b/cipher/chacha20.c
@@ -1,1084 +1,1137 @@
/* chacha20.c - Bernstein's ChaCha20 cipher
 * Copyright (C) 2014,2017-2019 Jussi Kivilinna
 *
 * This file is part of Libgcrypt.
 *
 * Libgcrypt is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * Libgcrypt is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, see <http://www.gnu.org/licenses/>.
 *
 * For a description of the algorithm, see:
 *   http://cr.yp.to/chacha.html
 */

/*
 * Based on D. J. Bernstein reference implementation at
 * http://cr.yp.to/chacha.html:
 *
 * chacha-regs.c version 20080118
 * D. J. Bernstein
 * Public domain.
 */

#include <config.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "types.h"
#include "g10lib.h"
#include "cipher.h"
#include "cipher-internal.h"
#include "bufhelp.h"

#define CHACHA20_MIN_KEY_SIZE 16   /* Bytes.  */
#define CHACHA20_MAX_KEY_SIZE 32   /* Bytes.  */
#define CHACHA20_BLOCK_SIZE   64   /* Bytes.  */
#define CHACHA20_MIN_IV_SIZE   8   /* Bytes.  */
#define CHACHA20_MAX_IV_SIZE  12   /* Bytes.  */
#define CHACHA20_CTR_SIZE     16   /* Bytes.  */

/* USE_SSSE3 indicates whether to compile with Intel SSSE3 code. */
#undef USE_SSSE3
#if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_SSSE3) && \
   (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
    defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
# define USE_SSSE3 1
#endif

/* USE_AVX2 indicates whether to compile with Intel AVX2 code.
*/ #undef USE_AVX2 #if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_AVX2) && \ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) # define USE_AVX2 1 #endif /* USE_ARMV7_NEON indicates whether to enable ARMv7 NEON assembly code. */ #undef USE_ARMV7_NEON #ifdef ENABLE_NEON_SUPPORT # if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) \ && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) \ && defined(HAVE_GCC_INLINE_ASM_NEON) # define USE_ARMV7_NEON 1 # endif #endif /* USE_AARCH64_SIMD indicates whether to enable ARMv8 SIMD assembly * code. */ #undef USE_AARCH64_SIMD #ifdef ENABLE_NEON_SUPPORT # if defined(__AARCH64EL__) \ && defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) \ && defined(HAVE_GCC_INLINE_ASM_AARCH64_NEON) # define USE_AARCH64_SIMD 1 # endif #endif /* USE_PPC_VEC indicates whether to enable PowerPC vector * accelerated code. */ #undef USE_PPC_VEC #ifdef ENABLE_PPC_CRYPTO_SUPPORT # if defined(HAVE_COMPATIBLE_CC_PPC_ALTIVEC) && \ defined(HAVE_GCC_INLINE_ASM_PPC_ALTIVEC) # if __GNUC__ >= 4 # define USE_PPC_VEC 1 # endif # endif #endif /* Assembly implementations use SystemV ABI, ABI conversion and additional * stack to store XMM6-XMM15 needed on Win64. */ #undef ASM_FUNC_ABI #undef ASM_EXTRA_STACK #if defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS) # define ASM_FUNC_ABI __attribute__((sysv_abi)) #else # define ASM_FUNC_ABI #endif typedef struct CHACHA20_context_s { u32 input[16]; unsigned char pad[CHACHA20_BLOCK_SIZE]; unsigned int unused; /* bytes in the pad. */ int use_ssse3:1; int use_avx2:1; int use_neon:1; int use_ppc:1; } CHACHA20_context_t; #ifdef USE_SSSE3 unsigned int _gcry_chacha20_amd64_ssse3_blocks4(u32 *state, byte *dst, const byte *src, size_t nblks) ASM_FUNC_ABI; unsigned int _gcry_chacha20_amd64_ssse3_blocks1(u32 *state, byte *dst, const byte *src, size_t nblks) ASM_FUNC_ABI; unsigned int _gcry_chacha20_poly1305_amd64_ssse3_blocks4( u32 *state, byte *dst, const byte *src, size_t nblks, void *poly1305_state, const byte *poly1305_src) ASM_FUNC_ABI; unsigned int _gcry_chacha20_poly1305_amd64_ssse3_blocks1( u32 *state, byte *dst, const byte *src, size_t nblks, void *poly1305_state, const byte *poly1305_src) ASM_FUNC_ABI; #endif /* USE_SSSE3 */ #ifdef USE_AVX2 unsigned int _gcry_chacha20_amd64_avx2_blocks8(u32 *state, byte *dst, const byte *src, size_t nblks) ASM_FUNC_ABI; unsigned int _gcry_chacha20_poly1305_amd64_avx2_blocks8( u32 *state, byte *dst, const byte *src, size_t nblks, void *poly1305_state, const byte *poly1305_src) ASM_FUNC_ABI; #endif /* USE_AVX2 */ #ifdef USE_PPC_VEC unsigned int _gcry_chacha20_ppc8_blocks4(u32 *state, byte *dst, const byte *src, size_t nblks); unsigned int _gcry_chacha20_ppc8_blocks1(u32 *state, byte *dst, const byte *src, size_t nblks); #undef USE_PPC_VEC_POLY1305 #if SIZEOF_UNSIGNED_LONG == 8 #define USE_PPC_VEC_POLY1305 1 unsigned int _gcry_chacha20_poly1305_ppc8_blocks4( u32 *state, byte *dst, const byte *src, size_t nblks, POLY1305_STATE *st, const byte *poly1305_src); #endif #endif /* USE_PPC_VEC */ #ifdef USE_ARMV7_NEON unsigned int _gcry_chacha20_armv7_neon_blocks4(u32 *state, byte *dst, const byte *src, size_t nblks); #endif /* USE_ARMV7_NEON */ #ifdef USE_AARCH64_SIMD unsigned int _gcry_chacha20_aarch64_blocks4(u32 *state, byte *dst, const byte *src, size_t nblks); +unsigned int _gcry_chacha20_poly1305_aarch64_blocks4( + u32 *state, byte *dst, const byte *src, size_t nblks, + void *poly1305_state, const byte *poly1305_src); + #endif /* USE_AARCH64_SIMD */ static const char 
*selftest (void); #define ROTATE(v,c) (rol(v,c)) #define XOR(v,w) ((v) ^ (w)) #define PLUS(v,w) ((u32)((v) + (w))) #define PLUSONE(v) (PLUS((v),1)) #define QUARTERROUND(a,b,c,d) \ a = PLUS(a,b); d = ROTATE(XOR(d,a),16); \ c = PLUS(c,d); b = ROTATE(XOR(b,c),12); \ a = PLUS(a,b); d = ROTATE(XOR(d,a), 8); \ c = PLUS(c,d); b = ROTATE(XOR(b,c), 7); #define BUF_XOR_LE32(dst, src, offset, x) \ buf_put_le32((dst) + (offset), buf_get_le32((src) + (offset)) ^ (x)) static unsigned int do_chacha20_blocks (u32 *input, byte *dst, const byte *src, size_t nblks) { u32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; unsigned int i; while (nblks) { x0 = input[0]; x1 = input[1]; x2 = input[2]; x3 = input[3]; x4 = input[4]; x5 = input[5]; x6 = input[6]; x7 = input[7]; x8 = input[8]; x9 = input[9]; x10 = input[10]; x11 = input[11]; x12 = input[12]; x13 = input[13]; x14 = input[14]; x15 = input[15]; for (i = 20; i > 0; i -= 2) { QUARTERROUND(x0, x4, x8, x12) QUARTERROUND(x1, x5, x9, x13) QUARTERROUND(x2, x6, x10, x14) QUARTERROUND(x3, x7, x11, x15) QUARTERROUND(x0, x5, x10, x15) QUARTERROUND(x1, x6, x11, x12) QUARTERROUND(x2, x7, x8, x13) QUARTERROUND(x3, x4, x9, x14) } x0 = PLUS(x0, input[0]); x1 = PLUS(x1, input[1]); x2 = PLUS(x2, input[2]); x3 = PLUS(x3, input[3]); x4 = PLUS(x4, input[4]); x5 = PLUS(x5, input[5]); x6 = PLUS(x6, input[6]); x7 = PLUS(x7, input[7]); x8 = PLUS(x8, input[8]); x9 = PLUS(x9, input[9]); x10 = PLUS(x10, input[10]); x11 = PLUS(x11, input[11]); x12 = PLUS(x12, input[12]); x13 = PLUS(x13, input[13]); x14 = PLUS(x14, input[14]); x15 = PLUS(x15, input[15]); input[12] = PLUSONE(input[12]); input[13] = PLUS(input[13], !input[12]); BUF_XOR_LE32(dst, src, 0, x0); BUF_XOR_LE32(dst, src, 4, x1); BUF_XOR_LE32(dst, src, 8, x2); BUF_XOR_LE32(dst, src, 12, x3); BUF_XOR_LE32(dst, src, 16, x4); BUF_XOR_LE32(dst, src, 20, x5); BUF_XOR_LE32(dst, src, 24, x6); BUF_XOR_LE32(dst, src, 28, x7); BUF_XOR_LE32(dst, src, 32, x8); BUF_XOR_LE32(dst, src, 36, x9); BUF_XOR_LE32(dst, src, 40, x10); BUF_XOR_LE32(dst, src, 44, x11); BUF_XOR_LE32(dst, src, 48, x12); BUF_XOR_LE32(dst, src, 52, x13); BUF_XOR_LE32(dst, src, 56, x14); BUF_XOR_LE32(dst, src, 60, x15); src += CHACHA20_BLOCK_SIZE; dst += CHACHA20_BLOCK_SIZE; nblks--; } /* burn_stack */ return (17 * sizeof(u32) + 6 * sizeof(void *)); } static unsigned int chacha20_blocks (CHACHA20_context_t *ctx, byte *dst, const byte *src, size_t nblks) { #ifdef USE_SSSE3 if (ctx->use_ssse3) { return _gcry_chacha20_amd64_ssse3_blocks1(ctx->input, dst, src, nblks); } #endif #ifdef USE_PPC_VEC if (ctx->use_ppc) { return _gcry_chacha20_ppc8_blocks1(ctx->input, dst, src, nblks); } #endif return do_chacha20_blocks (ctx->input, dst, src, nblks); } static void chacha20_keysetup (CHACHA20_context_t *ctx, const byte *key, unsigned int keylen) { static const char sigma[16] = "expand 32-byte k"; static const char tau[16] = "expand 16-byte k"; const char *constants; ctx->input[4] = buf_get_le32(key + 0); ctx->input[5] = buf_get_le32(key + 4); ctx->input[6] = buf_get_le32(key + 8); ctx->input[7] = buf_get_le32(key + 12); if (keylen == CHACHA20_MAX_KEY_SIZE) /* 256 bits */ { key += 16; constants = sigma; } else /* 128 bits */ { constants = tau; } ctx->input[8] = buf_get_le32(key + 0); ctx->input[9] = buf_get_le32(key + 4); ctx->input[10] = buf_get_le32(key + 8); ctx->input[11] = buf_get_le32(key + 12); ctx->input[0] = buf_get_le32(constants + 0); ctx->input[1] = buf_get_le32(constants + 4); ctx->input[2] = buf_get_le32(constants + 8); ctx->input[3] = 
buf_get_le32(constants + 12); } static void chacha20_ivsetup (CHACHA20_context_t * ctx, const byte *iv, size_t ivlen) { if (ivlen == CHACHA20_CTR_SIZE) { ctx->input[12] = buf_get_le32 (iv + 0); ctx->input[13] = buf_get_le32 (iv + 4); ctx->input[14] = buf_get_le32 (iv + 8); ctx->input[15] = buf_get_le32 (iv + 12); } else if (ivlen == CHACHA20_MAX_IV_SIZE) { ctx->input[12] = 0; ctx->input[13] = buf_get_le32 (iv + 0); ctx->input[14] = buf_get_le32 (iv + 4); ctx->input[15] = buf_get_le32 (iv + 8); } else if (ivlen == CHACHA20_MIN_IV_SIZE) { ctx->input[12] = 0; ctx->input[13] = 0; ctx->input[14] = buf_get_le32 (iv + 0); ctx->input[15] = buf_get_le32 (iv + 4); } else { ctx->input[12] = 0; ctx->input[13] = 0; ctx->input[14] = 0; ctx->input[15] = 0; } } static void chacha20_setiv (void *context, const byte *iv, size_t ivlen) { CHACHA20_context_t *ctx = (CHACHA20_context_t *) context; /* draft-nir-cfrg-chacha20-poly1305-02 defines 96-bit and 64-bit nonce. */ if (iv && ivlen != CHACHA20_MAX_IV_SIZE && ivlen != CHACHA20_MIN_IV_SIZE && ivlen != CHACHA20_CTR_SIZE) log_info ("WARNING: chacha20_setiv: bad ivlen=%u\n", (u32) ivlen); if (iv && (ivlen == CHACHA20_MAX_IV_SIZE || ivlen == CHACHA20_MIN_IV_SIZE || ivlen == CHACHA20_CTR_SIZE)) chacha20_ivsetup (ctx, iv, ivlen); else chacha20_ivsetup (ctx, NULL, 0); /* Reset the unused pad bytes counter. */ ctx->unused = 0; } static gcry_err_code_t chacha20_do_setkey (CHACHA20_context_t *ctx, const byte *key, unsigned int keylen) { static int initialized; static const char *selftest_failed; unsigned int features = _gcry_get_hw_features (); if (!initialized) { initialized = 1; selftest_failed = selftest (); if (selftest_failed) log_error ("CHACHA20 selftest failed (%s)\n", selftest_failed); } if (selftest_failed) return GPG_ERR_SELFTEST_FAILED; if (keylen != CHACHA20_MAX_KEY_SIZE && keylen != CHACHA20_MIN_KEY_SIZE) return GPG_ERR_INV_KEYLEN; #ifdef USE_SSSE3 ctx->use_ssse3 = (features & HWF_INTEL_SSSE3) != 0; #endif #ifdef USE_AVX2 ctx->use_avx2 = (features & HWF_INTEL_AVX2) != 0; #endif #ifdef USE_ARMV7_NEON ctx->use_neon = (features & HWF_ARM_NEON) != 0; #endif #ifdef USE_AARCH64_SIMD ctx->use_neon = (features & HWF_ARM_NEON) != 0; #endif #ifdef USE_PPC_VEC ctx->use_ppc = (features & HWF_PPC_ARCH_2_07) != 0; #endif (void)features; chacha20_keysetup (ctx, key, keylen); /* We default to a zero nonce. */ chacha20_setiv (ctx, NULL, 0); return 0; } static gcry_err_code_t chacha20_setkey (void *context, const byte *key, unsigned int keylen, gcry_cipher_hd_t hd) { CHACHA20_context_t *ctx = (CHACHA20_context_t *) context; gcry_err_code_t rc = chacha20_do_setkey (ctx, key, keylen); (void)hd; _gcry_burn_stack (4 + sizeof (void *) + 4 * sizeof (void *)); return rc; } static unsigned int do_chacha20_encrypt_stream_tail (CHACHA20_context_t *ctx, byte *outbuf, const byte *inbuf, size_t length) { static const unsigned char zero_pad[CHACHA20_BLOCK_SIZE] = { 0, }; unsigned int nburn, burn = 0; #ifdef USE_AVX2 if (ctx->use_avx2 && length >= CHACHA20_BLOCK_SIZE * 8) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nblocks -= nblocks % 8; nburn = _gcry_chacha20_amd64_avx2_blocks8(ctx->input, outbuf, inbuf, nblocks); burn = nburn > burn ? 
nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; } #endif #ifdef USE_SSSE3 if (ctx->use_ssse3 && length >= CHACHA20_BLOCK_SIZE * 4) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nblocks -= nblocks % 4; nburn = _gcry_chacha20_amd64_ssse3_blocks4(ctx->input, outbuf, inbuf, nblocks); burn = nburn > burn ? nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; } #endif #ifdef USE_ARMV7_NEON if (ctx->use_neon && length >= CHACHA20_BLOCK_SIZE * 4) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nblocks -= nblocks % 4; nburn = _gcry_chacha20_armv7_neon_blocks4(ctx->input, outbuf, inbuf, nblocks); burn = nburn > burn ? nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; } #endif #ifdef USE_AARCH64_SIMD if (ctx->use_neon && length >= CHACHA20_BLOCK_SIZE * 4) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nblocks -= nblocks % 4; nburn = _gcry_chacha20_aarch64_blocks4(ctx->input, outbuf, inbuf, nblocks); burn = nburn > burn ? nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; } #endif #ifdef USE_PPC_VEC if (ctx->use_ppc && length >= CHACHA20_BLOCK_SIZE * 4) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nblocks -= nblocks % 4; nburn = _gcry_chacha20_ppc8_blocks4(ctx->input, outbuf, inbuf, nblocks); burn = nburn > burn ? nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; } #endif if (length >= CHACHA20_BLOCK_SIZE) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nburn = chacha20_blocks(ctx, outbuf, inbuf, nblocks); burn = nburn > burn ? nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; } if (length > 0) { nburn = chacha20_blocks(ctx, ctx->pad, zero_pad, 1); burn = nburn > burn ? nburn : burn; buf_xor (outbuf, inbuf, ctx->pad, length); ctx->unused = CHACHA20_BLOCK_SIZE - length; } if (burn) burn += 5 * sizeof(void *); return burn; } static void chacha20_encrypt_stream (void *context, byte *outbuf, const byte *inbuf, size_t length) { CHACHA20_context_t *ctx = (CHACHA20_context_t *) context; unsigned int nburn, burn = 0; if (!length) return; if (ctx->unused) { unsigned char *p = ctx->pad; size_t n; gcry_assert (ctx->unused < CHACHA20_BLOCK_SIZE); n = ctx->unused; if (n > length) n = length; buf_xor (outbuf, inbuf, p + CHACHA20_BLOCK_SIZE - ctx->unused, n); length -= n; outbuf += n; inbuf += n; ctx->unused -= n; if (!length) return; gcry_assert (!ctx->unused); } nburn = do_chacha20_encrypt_stream_tail (ctx, outbuf, inbuf, length); burn = nburn > burn ? nburn : burn; if (burn) _gcry_burn_stack (burn); } gcry_err_code_t _gcry_chacha20_poly1305_encrypt(gcry_cipher_hd_t c, byte *outbuf, const byte *inbuf, size_t length) { CHACHA20_context_t *ctx = (void *) &c->context.c; unsigned int nburn, burn = 0; byte *authptr = NULL; if (!length) return 0; if (ctx->unused) { unsigned char *p = ctx->pad; size_t n; gcry_assert (ctx->unused < CHACHA20_BLOCK_SIZE); n = ctx->unused; if (n > length) n = length; buf_xor (outbuf, inbuf, p + CHACHA20_BLOCK_SIZE - ctx->unused, n); nburn = _gcry_poly1305_update_burn (&c->u_mode.poly1305.ctx, outbuf, n); burn = nburn > burn ? 
nburn : burn; length -= n; outbuf += n; inbuf += n; ctx->unused -= n; if (!length) { if (burn) _gcry_burn_stack (burn); return 0; } gcry_assert (!ctx->unused); } gcry_assert (c->u_mode.poly1305.ctx.leftover == 0); if (0) { } #ifdef USE_AVX2 else if (ctx->use_avx2 && length >= CHACHA20_BLOCK_SIZE * 8) { nburn = _gcry_chacha20_amd64_avx2_blocks8(ctx->input, outbuf, inbuf, 8); burn = nburn > burn ? nburn : burn; authptr = outbuf; length -= 8 * CHACHA20_BLOCK_SIZE; outbuf += 8 * CHACHA20_BLOCK_SIZE; inbuf += 8 * CHACHA20_BLOCK_SIZE; } #endif #ifdef USE_SSSE3 else if (ctx->use_ssse3 && length >= CHACHA20_BLOCK_SIZE * 4) { nburn = _gcry_chacha20_amd64_ssse3_blocks4(ctx->input, outbuf, inbuf, 4); burn = nburn > burn ? nburn : burn; authptr = outbuf; length -= 4 * CHACHA20_BLOCK_SIZE; outbuf += 4 * CHACHA20_BLOCK_SIZE; inbuf += 4 * CHACHA20_BLOCK_SIZE; } else if (ctx->use_ssse3 && length >= CHACHA20_BLOCK_SIZE * 2) { nburn = _gcry_chacha20_amd64_ssse3_blocks1(ctx->input, outbuf, inbuf, 2); burn = nburn > burn ? nburn : burn; authptr = outbuf; length -= 2 * CHACHA20_BLOCK_SIZE; outbuf += 2 * CHACHA20_BLOCK_SIZE; inbuf += 2 * CHACHA20_BLOCK_SIZE; } else if (ctx->use_ssse3 && length >= CHACHA20_BLOCK_SIZE) { nburn = _gcry_chacha20_amd64_ssse3_blocks1(ctx->input, outbuf, inbuf, 1); burn = nburn > burn ? nburn : burn; authptr = outbuf; length -= 1 * CHACHA20_BLOCK_SIZE; outbuf += 1 * CHACHA20_BLOCK_SIZE; inbuf += 1 * CHACHA20_BLOCK_SIZE; } #endif +#ifdef USE_AARCH64_SIMD + else if (ctx->use_neon && length >= CHACHA20_BLOCK_SIZE * 4) + { + nburn = _gcry_chacha20_aarch64_blocks4(ctx->input, outbuf, inbuf, 4); + burn = nburn > burn ? nburn : burn; + + authptr = outbuf; + length -= 4 * CHACHA20_BLOCK_SIZE; + outbuf += 4 * CHACHA20_BLOCK_SIZE; + inbuf += 4 * CHACHA20_BLOCK_SIZE; + } +#endif #ifdef USE_PPC_VEC_POLY1305 else if (ctx->use_ppc && length >= CHACHA20_BLOCK_SIZE * 4) { nburn = _gcry_chacha20_ppc8_blocks4(ctx->input, outbuf, inbuf, 4); burn = nburn > burn ? nburn : burn; authptr = outbuf; length -= 4 * CHACHA20_BLOCK_SIZE; outbuf += 4 * CHACHA20_BLOCK_SIZE; inbuf += 4 * CHACHA20_BLOCK_SIZE; } #endif if (authptr) { size_t authoffset = outbuf - authptr; #ifdef USE_AVX2 if (ctx->use_avx2 && length >= 8 * CHACHA20_BLOCK_SIZE && authoffset >= 8 * CHACHA20_BLOCK_SIZE) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nblocks -= nblocks % 8; nburn = _gcry_chacha20_poly1305_amd64_avx2_blocks8( ctx->input, outbuf, inbuf, nblocks, &c->u_mode.poly1305.ctx.state, authptr); burn = nburn > burn ? nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; authptr += nblocks * CHACHA20_BLOCK_SIZE; } #endif #ifdef USE_SSSE3 if (ctx->use_ssse3) { if (length >= 4 * CHACHA20_BLOCK_SIZE && authoffset >= 4 * CHACHA20_BLOCK_SIZE) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nblocks -= nblocks % 4; nburn = _gcry_chacha20_poly1305_amd64_ssse3_blocks4( ctx->input, outbuf, inbuf, nblocks, &c->u_mode.poly1305.ctx.state, authptr); burn = nburn > burn ? nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; authptr += nblocks * CHACHA20_BLOCK_SIZE; } if (length >= CHACHA20_BLOCK_SIZE && authoffset >= CHACHA20_BLOCK_SIZE) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nburn = _gcry_chacha20_poly1305_amd64_ssse3_blocks1( ctx->input, outbuf, inbuf, nblocks, &c->u_mode.poly1305.ctx.state, authptr); burn = nburn > burn ? 
nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; authptr += nblocks * CHACHA20_BLOCK_SIZE; } } #endif +#ifdef USE_AARCH64_SIMD + if (ctx->use_neon && + length >= 4 * CHACHA20_BLOCK_SIZE && + authoffset >= 4 * CHACHA20_BLOCK_SIZE) + { + size_t nblocks = length / CHACHA20_BLOCK_SIZE; + nblocks -= nblocks % 4; + + nburn = _gcry_chacha20_poly1305_aarch64_blocks4( + ctx->input, outbuf, inbuf, nblocks, + &c->u_mode.poly1305.ctx.state, authptr); + burn = nburn > burn ? nburn : burn; + + length -= nblocks * CHACHA20_BLOCK_SIZE; + outbuf += nblocks * CHACHA20_BLOCK_SIZE; + inbuf += nblocks * CHACHA20_BLOCK_SIZE; + authptr += nblocks * CHACHA20_BLOCK_SIZE; + } +#endif + #ifdef USE_PPC_VEC_POLY1305 if (ctx->use_ppc && length >= 4 * CHACHA20_BLOCK_SIZE && authoffset >= 4 * CHACHA20_BLOCK_SIZE) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nblocks -= nblocks % 4; nburn = _gcry_chacha20_poly1305_ppc8_blocks4( ctx->input, outbuf, inbuf, nblocks, &c->u_mode.poly1305.ctx.state, authptr); burn = nburn > burn ? nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; authptr += nblocks * CHACHA20_BLOCK_SIZE; } #endif if (authoffset > 0) { _gcry_poly1305_update (&c->u_mode.poly1305.ctx, authptr, authoffset); authptr += authoffset; authoffset = 0; } gcry_assert(authptr == outbuf); } while (length) { size_t currlen = length; /* Since checksumming is done after encryption, process input in 24KiB * chunks to keep data loaded in L1 cache for checksumming. */ if (currlen > 24 * 1024) currlen = 24 * 1024; nburn = do_chacha20_encrypt_stream_tail (ctx, outbuf, inbuf, currlen); burn = nburn > burn ? nburn : burn; nburn = _gcry_poly1305_update_burn (&c->u_mode.poly1305.ctx, outbuf, currlen); burn = nburn > burn ? nburn : burn; outbuf += currlen; inbuf += currlen; length -= currlen; } if (burn) _gcry_burn_stack (burn); return 0; } gcry_err_code_t _gcry_chacha20_poly1305_decrypt(gcry_cipher_hd_t c, byte *outbuf, const byte *inbuf, size_t length) { CHACHA20_context_t *ctx = (void *) &c->context.c; unsigned int nburn, burn = 0; if (!length) return 0; if (ctx->unused) { unsigned char *p = ctx->pad; size_t n; gcry_assert (ctx->unused < CHACHA20_BLOCK_SIZE); n = ctx->unused; if (n > length) n = length; nburn = _gcry_poly1305_update_burn (&c->u_mode.poly1305.ctx, inbuf, n); burn = nburn > burn ? nburn : burn; buf_xor (outbuf, inbuf, p + CHACHA20_BLOCK_SIZE - ctx->unused, n); length -= n; outbuf += n; inbuf += n; ctx->unused -= n; if (!length) { if (burn) _gcry_burn_stack (burn); return 0; } gcry_assert (!ctx->unused); } gcry_assert (c->u_mode.poly1305.ctx.leftover == 0); #ifdef USE_AVX2 if (ctx->use_avx2 && length >= 8 * CHACHA20_BLOCK_SIZE) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nblocks -= nblocks % 8; nburn = _gcry_chacha20_poly1305_amd64_avx2_blocks8( ctx->input, outbuf, inbuf, nblocks, &c->u_mode.poly1305.ctx.state, inbuf); burn = nburn > burn ? nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; } #endif #ifdef USE_SSSE3 if (ctx->use_ssse3) { if (length >= 4 * CHACHA20_BLOCK_SIZE) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nblocks -= nblocks % 4; nburn = _gcry_chacha20_poly1305_amd64_ssse3_blocks4( ctx->input, outbuf, inbuf, nblocks, &c->u_mode.poly1305.ctx.state, inbuf); burn = nburn > burn ? 
nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; } if (length >= CHACHA20_BLOCK_SIZE) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nburn = _gcry_chacha20_poly1305_amd64_ssse3_blocks1( ctx->input, outbuf, inbuf, nblocks, &c->u_mode.poly1305.ctx.state, inbuf); burn = nburn > burn ? nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; } } #endif +#ifdef USE_AARCH64_SIMD + if (ctx->use_neon && length >= 4 * CHACHA20_BLOCK_SIZE) + { + size_t nblocks = length / CHACHA20_BLOCK_SIZE; + nblocks -= nblocks % 4; + + nburn = _gcry_chacha20_poly1305_aarch64_blocks4( + ctx->input, outbuf, inbuf, nblocks, + &c->u_mode.poly1305.ctx.state, inbuf); + burn = nburn > burn ? nburn : burn; + + length -= nblocks * CHACHA20_BLOCK_SIZE; + outbuf += nblocks * CHACHA20_BLOCK_SIZE; + inbuf += nblocks * CHACHA20_BLOCK_SIZE; + } +#endif + #ifdef USE_PPC_VEC_POLY1305 if (ctx->use_ppc && length >= 4 * CHACHA20_BLOCK_SIZE) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nblocks -= nblocks % 4; nburn = _gcry_chacha20_poly1305_ppc8_blocks4( ctx->input, outbuf, inbuf, nblocks, &c->u_mode.poly1305.ctx.state, inbuf); burn = nburn > burn ? nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; } #endif while (length) { size_t currlen = length; /* Since checksumming is done before decryption, process input in 24KiB * chunks to keep data loaded in L1 cache for decryption. */ if (currlen > 24 * 1024) currlen = 24 * 1024; nburn = _gcry_poly1305_update_burn (&c->u_mode.poly1305.ctx, inbuf, currlen); burn = nburn > burn ? nburn : burn; nburn = do_chacha20_encrypt_stream_tail (ctx, outbuf, inbuf, currlen); burn = nburn > burn ? 
nburn : burn; outbuf += currlen; inbuf += currlen; length -= currlen; } if (burn) _gcry_burn_stack (burn); return 0; } static const char * selftest (void) { byte ctxbuf[sizeof(CHACHA20_context_t) + 15]; CHACHA20_context_t *ctx; byte scratch[127 + 1]; byte buf[512 + 64 + 4]; int i; /* From draft-strombergson-chacha-test-vectors */ static byte key_1[] = { 0xc4, 0x6e, 0xc1, 0xb1, 0x8c, 0xe8, 0xa8, 0x78, 0x72, 0x5a, 0x37, 0xe7, 0x80, 0xdf, 0xb7, 0x35, 0x1f, 0x68, 0xed, 0x2e, 0x19, 0x4c, 0x79, 0xfb, 0xc6, 0xae, 0xbe, 0xe1, 0xa6, 0x67, 0x97, 0x5d }; static const byte nonce_1[] = { 0x1a, 0xda, 0x31, 0xd5, 0xcf, 0x68, 0x82, 0x21 }; static const byte plaintext_1[127] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, }; static const byte ciphertext_1[127] = { 0xf6, 0x3a, 0x89, 0xb7, 0x5c, 0x22, 0x71, 0xf9, 0x36, 0x88, 0x16, 0x54, 0x2b, 0xa5, 0x2f, 0x06, 0xed, 0x49, 0x24, 0x17, 0x92, 0x30, 0x2b, 0x00, 0xb5, 0xe8, 0xf8, 0x0a, 0xe9, 0xa4, 0x73, 0xaf, 0xc2, 0x5b, 0x21, 0x8f, 0x51, 0x9a, 0xf0, 0xfd, 0xd4, 0x06, 0x36, 0x2e, 0x8d, 0x69, 0xde, 0x7f, 0x54, 0xc6, 0x04, 0xa6, 0xe0, 0x0f, 0x35, 0x3f, 0x11, 0x0f, 0x77, 0x1b, 0xdc, 0xa8, 0xab, 0x92, 0xe5, 0xfb, 0xc3, 0x4e, 0x60, 0xa1, 0xd9, 0xa9, 0xdb, 0x17, 0x34, 0x5b, 0x0a, 0x40, 0x27, 0x36, 0x85, 0x3b, 0xf9, 0x10, 0xb0, 0x60, 0xbd, 0xf1, 0xf8, 0x97, 0xb6, 0x29, 0x0f, 0x01, 0xd1, 0x38, 0xae, 0x2c, 0x4c, 0x90, 0x22, 0x5b, 0xa9, 0xea, 0x14, 0xd5, 0x18, 0xf5, 0x59, 0x29, 0xde, 0xa0, 0x98, 0xca, 0x7a, 0x6c, 0xcf, 0xe6, 0x12, 0x27, 0x05, 0x3c, 0x84, 0xe4, 0x9a, 0x4a, 0x33 }; /* 16-byte alignment required for amd64 implementation. 
*/ ctx = (CHACHA20_context_t *)((uintptr_t)(ctxbuf + 15) & ~(uintptr_t)15); chacha20_setkey (ctx, key_1, sizeof key_1, NULL); chacha20_setiv (ctx, nonce_1, sizeof nonce_1); scratch[sizeof (scratch) - 1] = 0; chacha20_encrypt_stream (ctx, scratch, plaintext_1, sizeof plaintext_1); if (memcmp (scratch, ciphertext_1, sizeof ciphertext_1)) return "ChaCha20 encryption test 1 failed."; if (scratch[sizeof (scratch) - 1]) return "ChaCha20 wrote too much."; chacha20_setkey (ctx, key_1, sizeof (key_1), NULL); chacha20_setiv (ctx, nonce_1, sizeof nonce_1); chacha20_encrypt_stream (ctx, scratch, scratch, sizeof plaintext_1); if (memcmp (scratch, plaintext_1, sizeof plaintext_1)) return "ChaCha20 decryption test 1 failed."; for (i = 0; i < sizeof buf; i++) buf[i] = i; chacha20_setkey (ctx, key_1, sizeof key_1, NULL); chacha20_setiv (ctx, nonce_1, sizeof nonce_1); /*encrypt */ chacha20_encrypt_stream (ctx, buf, buf, sizeof buf); /*decrypt */ chacha20_setkey (ctx, key_1, sizeof key_1, NULL); chacha20_setiv (ctx, nonce_1, sizeof nonce_1); chacha20_encrypt_stream (ctx, buf, buf, 1); chacha20_encrypt_stream (ctx, buf + 1, buf + 1, (sizeof buf) - 1 - 1); chacha20_encrypt_stream (ctx, buf + (sizeof buf) - 1, buf + (sizeof buf) - 1, 1); for (i = 0; i < sizeof buf; i++) if (buf[i] != (byte) i) return "ChaCha20 encryption test 2 failed."; chacha20_setkey (ctx, key_1, sizeof key_1, NULL); chacha20_setiv (ctx, nonce_1, sizeof nonce_1); /* encrypt */ for (i = 0; i < sizeof buf; i++) chacha20_encrypt_stream (ctx, &buf[i], &buf[i], 1); /* decrypt */ chacha20_setkey (ctx, key_1, sizeof key_1, NULL); chacha20_setiv (ctx, nonce_1, sizeof nonce_1); chacha20_encrypt_stream (ctx, buf, buf, sizeof buf); for (i = 0; i < sizeof buf; i++) if (buf[i] != (byte) i) return "ChaCha20 encryption test 3 failed."; return NULL; } gcry_cipher_spec_t _gcry_cipher_spec_chacha20 = { GCRY_CIPHER_CHACHA20, {0, 0}, /* flags */ "CHACHA20", /* name */ NULL, /* aliases */ NULL, /* oids */ 1, /* blocksize in bytes. */ CHACHA20_MAX_KEY_SIZE * 8, /* standard key length in bits. */ sizeof (CHACHA20_context_t), chacha20_setkey, NULL, NULL, chacha20_encrypt_stream, chacha20_encrypt_stream, NULL, NULL, chacha20_setiv };
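Note on the stitched Poly1305 macros above: asm-poly1305-aarch64.h does not define a new Poly1305 variant. POLY1305_BLOCK_PART1() through POLY1305_BLOCK_PART29() are one ordinary radix-2^64 block update, h = (h + m + 2^128) * r partially reduced mod 2^130 - 5, chopped into single-instruction steps so that _gcry_chacha20_poly1305_aarch64_blocks4 can interleave them with the NEON quarter-round instructions and keep the scalar and vector pipelines busy at the same time. The C sketch below shows the same computation in one piece. It is illustrative only; the struct, the function name and the use of the GCC/Clang unsigned __int128 extension are assumptions and not part of this patch, but the limb layout matches the POLY_R_H0/H1/H2, POLY_R_R0/R1 and POLY_R_R1_MUL5 registers used by the macros.

/* Hypothetical reference sketch, not part of Libgcrypt. */
#include <stdint.h>

typedef unsigned __int128 u128;   /* GCC/Clang extension on 64-bit targets */

struct poly1305_sketch
{
  uint64_t h0, h1, h2;   /* accumulator; h2 holds the few bits above 2^128 */
  uint64_t r0, r1;       /* clamped key; clamping makes r1 a multiple of 4 */
};

/* Absorb one full 16-byte block; m0 and m1 are its little-endian 64-bit
 * words, mirroring POLY1305_BLOCK_PART1..29.  The result is only partially
 * reduced; the final reduction happens once at the end of the MAC. */
static void
poly1305_block_sketch (struct poly1305_sketch *st, uint64_t m0, uint64_t m1)
{
  uint64_t r0 = st->r0, r1 = st->r1;
  uint64_t r1_mul5 = r1 + (r1 >> 2);   /* == 5*r1/4, exact since r1 % 4 == 0;
                                          POLY1305_LOAD_STATE keeps this in
                                          POLY_R_R1_MUL5 */
  uint64_t h0 = st->h0, h1 = st->h1, h2 = st->h2;
  u128 d0, d1;
  uint64_t d2;

  /* h += m, plus the 2^128 padding bit for a full block (PART1..PART7). */
  d0 = (u128)h0 + m0;
  h0 = (uint64_t)d0;
  d0 = (u128)h1 + m1 + (uint64_t)(d0 >> 64);
  h1 = (uint64_t)d0;
  h2 = h2 + 1 + (uint64_t)(d0 >> 64);

  /* h *= r, folding the 2^192 term and the h1*r1 part of the 2^128 term
   * back down through 2^130 == 5 (PART8..PART23).  d0, d1 and d2 are the
   * coefficients of 2^0, 2^64 and 2^128; h2 * r1_mul5 fits in 64 bits
   * because h2 stays small. */
  d0 = (u128)h0 * r0 + (u128)h1 * r1_mul5;
  d1 = (u128)h0 * r1 + (u128)h1 * r0 + (u128)(h2 * r1_mul5);
  d2 = h2 * r0 + (uint64_t)(d1 >> 64);

  /* Partial carry propagation: everything at 2^130 and above is multiplied
   * by 5 and added back at 2^0 (PART24..PART29). */
  h2 = d2 & 3;
  d0 += (u128)(d2 >> 2) * 5;
  h0 = (uint64_t)d0;
  d1 = (u128)(uint64_t)d1 + (uint64_t)(d0 >> 64);
  h1 = (uint64_t)d1;
  h2 += (uint64_t)(d1 >> 64);

  st->h0 = h0; st->h1 = h1; st->h2 = h2;
}

In the stitched loop itself, inner loop 1 absorbs two 16-byte Poly1305 blocks per iteration (hence the add of 2*16 to POLY_RSRC) and inner loop 2 absorbs one block per iteration (add of 1*16), so over the 20 ChaCha20 rounds the Poly1305 input pointer advances by exactly the 4 * 64 bytes of keystream produced per .Loop_poly4 iteration.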