diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index bf13c199..dc63a736 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -1,239 +1,240 @@
# Makefile for cipher modules
# Copyright (C) 1998, 1999, 2000, 2001, 2002,
# 2003, 2009 Free Software Foundation, Inc.
#
# This file is part of Libgcrypt.
#
# Libgcrypt is free software; you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as
# published by the Free Software Foundation; either version 2.1 of
# the License, or (at your option) any later version.
#
# Libgcrypt is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this program; if not, see <http://www.gnu.org/licenses/>.
# Process this file with automake to produce Makefile.in
# Need to include ../src in addition to top_srcdir because gcrypt.h is
# a built header.
AM_CPPFLAGS = -I../src -I$(top_srcdir)/src -I../mpi -I$(top_srcdir)/mpi
AM_CFLAGS = $(GPG_ERROR_CFLAGS)
AM_CCASFLAGS = $(NOEXECSTACK_FLAGS)
EXTRA_DIST = gost-s-box.c
CLEANFILES = gost-s-box
DISTCLEANFILES = gost-sb.h
noinst_LTLIBRARIES = libcipher.la
GCRYPT_MODULES = @GCRYPT_CIPHERS@ @GCRYPT_PUBKEY_CIPHERS@ \
@GCRYPT_DIGESTS@ @GCRYPT_KDFS@
libcipher_la_DEPENDENCIES = $(GCRYPT_MODULES)
libcipher_la_LIBADD = $(GCRYPT_MODULES)
libcipher_la_SOURCES = \
cipher.c cipher-internal.h \
cipher-cbc.c \
cipher-cfb.c \
cipher-ofb.c \
cipher-ctr.c \
cipher-aeswrap.c \
cipher-ccm.c \
cipher-cmac.c \
cipher-gcm.c cipher-gcm-intel-pclmul.c cipher-gcm-armv7-neon.S \
cipher-gcm-armv8-aarch32-ce.S cipher-gcm-armv8-aarch64-ce.S \
cipher-poly1305.c \
cipher-ocb.c \
cipher-xts.c \
cipher-eax.c \
cipher-selftest.c cipher-selftest.h \
pubkey.c pubkey-internal.h pubkey-util.c \
md.c \
mac.c mac-internal.h \
mac-hmac.c mac-cmac.c mac-gmac.c mac-poly1305.c \
poly1305.c poly1305-internal.h \
kdf.c kdf-internal.h \
hmac-tests.c \
bithelp.h \
bufhelp.h \
primegen.c \
hash-common.c hash-common.h \
dsa-common.c rsa-common.c \
sha1.h
EXTRA_libcipher_la_SOURCES = \
- asm-common-amd64.h \
asm-common-aarch64.h \
+ asm-common-amd64.h \
+ asm-poly1305-aarch64.h \
asm-poly1305-amd64.h \
arcfour.c arcfour-amd64.S \
blowfish.c blowfish-amd64.S blowfish-arm.S \
cast5.c cast5-amd64.S cast5-arm.S \
chacha20.c chacha20-amd64-ssse3.S chacha20-amd64-avx2.S \
chacha20-armv7-neon.S chacha20-aarch64.S \
chacha20-ppc.c \
crc.c crc-intel-pclmul.c crc-armv8-ce.c \
crc-armv8-aarch64-ce.S \
crc-ppc.c \
des.c des-amd64.S \
dsa.c \
elgamal.c \
ecc.c ecc-curves.c ecc-misc.c ecc-common.h \
ecc-ecdh.c ecc-ecdsa.c ecc-eddsa.c ecc-gost.c \
idea.c \
gost28147.c gost.h \
gostr3411-94.c \
md4.c \
md5.c \
rijndael.c rijndael-internal.h rijndael-tables.h \
rijndael-aesni.c rijndael-padlock.c \
rijndael-amd64.S rijndael-arm.S \
rijndael-ssse3-amd64.c rijndael-ssse3-amd64-asm.S \
rijndael-armv8-ce.c rijndael-armv8-aarch32-ce.S \
rijndael-armv8-aarch64-ce.S rijndael-aarch64.S \
rijndael-ppc.c \
rmd160.c \
rsa.c \
salsa20.c salsa20-amd64.S salsa20-armv7-neon.S \
scrypt.c \
seed.c \
serpent.c serpent-sse2-amd64.S \
serpent-avx2-amd64.S serpent-armv7-neon.S \
sha1.c sha1-ssse3-amd64.S sha1-avx-amd64.S sha1-avx-bmi2-amd64.S \
sha1-avx2-bmi2-amd64.S sha1-armv7-neon.S sha1-armv8-aarch32-ce.S \
sha1-armv8-aarch64-ce.S sha1-intel-shaext.c \
sha256.c sha256-ssse3-amd64.S sha256-avx-amd64.S \
sha256-avx2-bmi2-amd64.S \
sha256-armv8-aarch32-ce.S sha256-armv8-aarch64-ce.S \
sha256-intel-shaext.c sha256-ppc.c \
sha512.c sha512-ssse3-amd64.S sha512-avx-amd64.S \
sha512-avx2-bmi2-amd64.S \
sha512-armv7-neon.S sha512-arm.S \
sha512-ppc.c \
sm3.c \
keccak.c keccak_permute_32.h keccak_permute_64.h keccak-armv7-neon.S \
stribog.c \
tiger.c \
whirlpool.c whirlpool-sse2-amd64.S \
twofish.c twofish-amd64.S twofish-arm.S twofish-aarch64.S \
twofish-avx2-amd64.S \
rfc2268.c \
camellia.c camellia.h camellia-glue.c camellia-aesni-avx-amd64.S \
camellia-aesni-avx2-amd64.S camellia-arm.S camellia-aarch64.S \
blake2.c \
blake2b-amd64-avx2.S blake2s-amd64-avx.S
gost28147.lo: gost-sb.h
gost-sb.h: gost-s-box
	./gost-s-box $@
gost-s-box: gost-s-box.c
	$(CC_FOR_BUILD) $(CFLAGS_FOR_BUILD) $(LDFLAGS_FOR_BUILD) \
	    $(CPPFLAGS_FOR_BUILD) -o $@ $(srcdir)/gost-s-box.c
if ENABLE_O_FLAG_MUNGING
o_flag_munging = sed -e 's/-O\([2-9s][2-9s]*\)/-O1/' -e 's/-Ofast/-O1/g'
else
o_flag_munging = cat
endif
# We need to lower the optimization for this module.
tiger.o: $(srcdir)/tiger.c Makefile
	`echo $(COMPILE) -c $< | $(o_flag_munging) `
tiger.lo: $(srcdir)/tiger.c Makefile
	`echo $(LTCOMPILE) -c $< | $(o_flag_munging) `
# We need to disable instrumentation for these modules as they use cc as a
# thin assembly front-end and do not tolerate in-between function calls
# inserted by the compiler, as those functions may clobber the XMM registers.
if ENABLE_INSTRUMENTATION_MUNGING
instrumentation_munging = sed \
-e 's/-fsanitize[=,\-][=,a-z,A-Z,0-9,\,,\-]*//g' \
-e 's/-fprofile[=,\-][=,a-z,A-Z,0-9,\,,\-]*//g' \
-e 's/-fcoverage[=,\-][=,a-z,A-Z,0-9,\,,\-]*//g'
else
instrumentation_munging = cat
endif
rijndael-aesni.o: $(srcdir)/rijndael-aesni.c Makefile
	`echo $(COMPILE) -c $< | $(instrumentation_munging) `
rijndael-aesni.lo: $(srcdir)/rijndael-aesni.c Makefile
	`echo $(LTCOMPILE) -c $< | $(instrumentation_munging) `
rijndael-ssse3-amd64.o: $(srcdir)/rijndael-ssse3-amd64.c Makefile
	`echo $(COMPILE) -c $< | $(instrumentation_munging) `
rijndael-ssse3-amd64.lo: $(srcdir)/rijndael-ssse3-amd64.c Makefile
	`echo $(LTCOMPILE) -c $< | $(instrumentation_munging) `
cipher-gcm-intel-pclmul.o: $(srcdir)/cipher-gcm-intel-pclmul.c Makefile
	`echo $(COMPILE) -c $< | $(instrumentation_munging) `
cipher-gcm-intel-pclmul.lo: $(srcdir)/cipher-gcm-intel-pclmul.c Makefile
	`echo $(LTCOMPILE) -c $< | $(instrumentation_munging) `
sha1-intel-shaext.o: $(srcdir)/sha1-intel-shaext.c Makefile
	`echo $(COMPILE) -c $< | $(instrumentation_munging) `
sha1-intel-shaext.lo: $(srcdir)/sha1-intel-shaext.c Makefile
	`echo $(LTCOMPILE) -c $< | $(instrumentation_munging) `
sha256-intel-shaext.o: $(srcdir)/sha256-intel-shaext.c Makefile
	`echo $(COMPILE) -c $< | $(instrumentation_munging) `
sha256-intel-shaext.lo: $(srcdir)/sha256-intel-shaext.c Makefile
	`echo $(LTCOMPILE) -c $< | $(instrumentation_munging) `
crc-intel-pclmul.o: $(srcdir)/crc-intel-pclmul.c Makefile
	`echo $(COMPILE) -c $< | $(instrumentation_munging) `
crc-intel-pclmul.lo: $(srcdir)/crc-intel-pclmul.c Makefile
	`echo $(LTCOMPILE) -c $< | $(instrumentation_munging) `
if ENABLE_PPC_VCRYPTO_EXTRA_CFLAGS
ppc_vcrypto_cflags = -maltivec -mvsx -mcrypto
else
ppc_vcrypto_cflags =
endif
rijndael-ppc.o: $(srcdir)/rijndael-ppc.c Makefile
	`echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
rijndael-ppc.lo: $(srcdir)/rijndael-ppc.c Makefile
	`echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
sha256-ppc.o: $(srcdir)/sha256-ppc.c Makefile
	`echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
sha256-ppc.lo: $(srcdir)/sha256-ppc.c Makefile
	`echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
sha512-ppc.o: $(srcdir)/sha512-ppc.c Makefile
	`echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
sha512-ppc.lo: $(srcdir)/sha512-ppc.c Makefile
	`echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
chacha20-ppc.o: $(srcdir)/chacha20-ppc.c Makefile
	`echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
chacha20-ppc.lo: $(srcdir)/chacha20-ppc.c Makefile
	`echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
crc-ppc.o: $(srcdir)/crc-ppc.c Makefile
	`echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
crc-ppc.lo: $(srcdir)/crc-ppc.c Makefile
	`echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
diff --git a/cipher/asm-poly1305-aarch64.h b/cipher/asm-poly1305-aarch64.h
new file mode 100644
index 00000000..6c342bee
--- /dev/null
+++ b/cipher/asm-poly1305-aarch64.h
@@ -0,0 +1,245 @@
+/* asm-poly1305-aarch64.h - Poly1305 macros for ARMv8/AArch64 assembly
+ *
+ * Copyright (C) 2019 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef GCRY_ASM_POLY1305_AARCH64_H
+#define GCRY_ASM_POLY1305_AARCH64_H
+
+#include "asm-common-aarch64.h"
+
+#ifdef __AARCH64EL__
+ #define le_to_host(reg) /*_*/
+#else
+ #define le_to_host(reg) rev reg, reg;
+#endif
+
+/**********************************************************************
+  poly1305 for stitched chacha20-poly1305 AArch64 implementations
+ **********************************************************************/
+
+#define POLY_RSTATE x8
+#define POLY_RSRC x9
+
+#define POLY_R_H0 x10
+#define POLY_R_H1 x11
+#define POLY_R_H2 x12
+#define POLY_R_H2d w12
+#define POLY_R_R0 x13
+#define POLY_R_R1 x14
+#define POLY_R_R1_MUL5 x15
+#define POLY_R_X0_HI x16
+#define POLY_R_X0_LO x17
+#define POLY_R_X1_HI x19
+#define POLY_R_X1_LO x20
+#define POLY_R_ONE x21
+#define POLY_R_ONEd w21
+
+#define POLY_TMP0 x22
+#define POLY_TMP1 x23
+#define POLY_TMP2 x24
+#define POLY_TMP3 x25
+
+#define POLY_CHACHA_ROUND x26
+
+#define POLY_S_R0 (4 * 4 + 0 * 8)
+#define POLY_S_R1 (4 * 4 + 1 * 8)
+#define POLY_S_H0 (4 * 4 + 2 * 8 + 0 * 8)
+#define POLY_S_H1 (4 * 4 + 2 * 8 + 1 * 8)
+#define POLY_S_H2d (4 * 4 + 2 * 8 + 2 * 8)
+
+#define POLY1305_PUSH_REGS() \
+ stp x19, x20, [sp, #-16]!; \
+ CFI_ADJUST_CFA_OFFSET(16); \
+ CFI_REG_ON_STACK(19, 0); \
+ CFI_REG_ON_STACK(20, 8); \
+ stp x21, x22, [sp, #-16]!; \
+ CFI_ADJUST_CFA_OFFSET(16); \
+ CFI_REG_ON_STACK(21, 0); \
+ CFI_REG_ON_STACK(22, 8); \
+ stp x23, x24, [sp, #-16]!; \
+ CFI_ADJUST_CFA_OFFSET(16); \
+ CFI_REG_ON_STACK(23, 0); \
+ CFI_REG_ON_STACK(24, 8); \
+ stp x25, x26, [sp, #-16]!; \
+ CFI_ADJUST_CFA_OFFSET(16); \
+ CFI_REG_ON_STACK(25, 0); \
+ CFI_REG_ON_STACK(26, 8);
+
+#define POLY1305_POP_REGS() \
+ ldp x25, x26, [sp], #16; \
+ CFI_ADJUST_CFA_OFFSET(-16); \
+ CFI_RESTORE(x25); \
+ CFI_RESTORE(x26); \
+ ldp x23, x24, [sp], #16; \
+ CFI_ADJUST_CFA_OFFSET(-16); \
+ CFI_RESTORE(x23); \
+ CFI_RESTORE(x24); \
+ ldp x21, x22, [sp], #16; \
+ CFI_ADJUST_CFA_OFFSET(-16); \
+ CFI_RESTORE(x21); \
+ CFI_RESTORE(x22); \
+ ldp x19, x20, [sp], #16; \
+ CFI_ADJUST_CFA_OFFSET(-16); \
+ CFI_RESTORE(x19); \
+ CFI_RESTORE(x20);
+
+#define POLY1305_LOAD_STATE() \
+ ldr POLY_R_R1, [POLY_RSTATE, #(POLY_S_R1)]; \
+ ldr POLY_R_H0, [POLY_RSTATE, #(POLY_S_H0)]; \
+ ldr POLY_R_H1, [POLY_RSTATE, #(POLY_S_H1)]; \
+ ldr POLY_R_H2d, [POLY_RSTATE, #(POLY_S_H2d)]; \
+ ldr POLY_R_R0, [POLY_RSTATE, #(POLY_S_R0)]; \
+ add POLY_R_R1_MUL5, POLY_R_R1, POLY_R_R1, lsr #2; \
+ mov POLY_R_ONE, #1;
+
+#define POLY1305_STORE_STATE() \
+ str POLY_R_H0, [POLY_RSTATE, #(POLY_S_H0)]; \
+ str POLY_R_H1, [POLY_RSTATE, #(POLY_S_H1)]; \
+ str POLY_R_H2d, [POLY_RSTATE, #(POLY_S_H2d)];
+
+#define POLY1305_BLOCK_PART1(src_offset) \
+ /* a = h + m */ \
+ ldr POLY_TMP0, [POLY_RSRC, #((src_offset) + 0 * 8)];
+#define POLY1305_BLOCK_PART2(src_offset) \
+ ldr POLY_TMP1, [POLY_RSRC, #((src_offset) + 1 * 8)];
+#define POLY1305_BLOCK_PART3() \
+ le_to_host(POLY_TMP0);
+#define POLY1305_BLOCK_PART4() \
+ le_to_host(POLY_TMP1);
+#define POLY1305_BLOCK_PART5() \
+ adds POLY_R_H0, POLY_R_H0, POLY_TMP0;
+#define POLY1305_BLOCK_PART6() \
+ adcs POLY_R_H1, POLY_R_H1, POLY_TMP1;
+#define POLY1305_BLOCK_PART7() \
+ adc POLY_R_H2d, POLY_R_H2d, POLY_R_ONEd;
+
+#define POLY1305_BLOCK_PART8() \
+ /* h = a * r (partial mod 2^130-5): */ \
+ mul POLY_R_X1_LO, POLY_R_H0, POLY_R_R1; /* lo: h0 * r1 */
+#define POLY1305_BLOCK_PART9() \
+ mul POLY_TMP0, POLY_R_H1, POLY_R_R0; /* lo: h1 * r0 */
+#define POLY1305_BLOCK_PART10() \
+ mul POLY_R_X0_LO, POLY_R_H0, POLY_R_R0; /* lo: h0 * r0 */
+#define POLY1305_BLOCK_PART11() \
+ umulh POLY_R_X1_HI, POLY_R_H0, POLY_R_R1; /* hi: h0 * r1 */
+#define POLY1305_BLOCK_PART12() \
+ adds POLY_R_X1_LO, POLY_R_X1_LO, POLY_TMP0;
+#define POLY1305_BLOCK_PART13() \
+ umulh POLY_TMP1, POLY_R_H1, POLY_R_R0; /* hi: h1 * r0 */
+#define POLY1305_BLOCK_PART14() \
+ mul POLY_TMP2, POLY_R_H1, POLY_R_R1_MUL5; /* lo: h1 * r1 mod 2^130-5 */
+#define POLY1305_BLOCK_PART15() \
+ umulh POLY_R_X0_HI, POLY_R_H0, POLY_R_R0; /* hi: h0 * r0 */
+#define POLY1305_BLOCK_PART16() \
+ adc POLY_R_X1_HI, POLY_R_X1_HI, POLY_TMP1;
+#define POLY1305_BLOCK_PART17() \
+ umulh POLY_TMP3, POLY_R_H1, POLY_R_R1_MUL5; /* hi: h1 * r1 mod 2^130-5 */
+#define POLY1305_BLOCK_PART18() \
+ adds POLY_R_X0_LO, POLY_R_X0_LO, POLY_TMP2;
+#define POLY1305_BLOCK_PART19() \
+ mul POLY_R_H1, POLY_R_H2, POLY_R_R1_MUL5; /* h2 * r1 mod 2^130-5 */
+#define POLY1305_BLOCK_PART20() \
+ adc POLY_R_X0_HI, POLY_R_X0_HI, POLY_TMP3;
+#define POLY1305_BLOCK_PART21() \
+ mul POLY_R_H2, POLY_R_H2, POLY_R_R0; /* h2 * r0 */
+#define POLY1305_BLOCK_PART22() \
+ adds POLY_R_H1, POLY_R_H1, POLY_R_X1_LO;
+#define POLY1305_BLOCK_PART23() \
+ adc POLY_R_H0, POLY_R_H2, POLY_R_X1_HI;
+
+#define POLY1305_BLOCK_PART24() \
+ /* carry propagation */ \
+ and POLY_R_H2, POLY_R_H0, #3;
+#define POLY1305_BLOCK_PART25() \
+ mov POLY_R_H0, POLY_R_H0, lsr #2;
+#define POLY1305_BLOCK_PART26() \
+ add POLY_R_H0, POLY_R_H0, POLY_R_H0, lsl #2;
+#define POLY1305_BLOCK_PART27() \
+ adds POLY_R_H0, POLY_R_H0, POLY_R_X0_LO;
+#define POLY1305_BLOCK_PART28() \
+ adcs POLY_R_H1, POLY_R_H1, POLY_R_X0_HI;
+#define POLY1305_BLOCK_PART29() \
+ adc POLY_R_H2d, POLY_R_H2d, wzr;
+
+//#define TESTING_POLY1305_ASM
+#ifdef TESTING_POLY1305_ASM
+/* for testing only. */
+.align 3
+.globl _gcry_poly1305_aarch64_blocks1
+ELF(.type _gcry_poly1305_aarch64_blocks1,%function;)
+_gcry_poly1305_aarch64_blocks1:
+ /* input:
+ * x0: poly1305-state
+ * x1: src
+ * x2: nblks
+ */
+ CFI_STARTPROC()
+ POLY1305_PUSH_REGS();
+
+ mov POLY_RSTATE, x0;
+ mov POLY_RSRC, x1;
+
+ POLY1305_LOAD_STATE();
+
+.L_gcry_poly1305_aarch64_loop1:
+ POLY1305_BLOCK_PART1(0 * 16);
+ POLY1305_BLOCK_PART2(0 * 16);
+ add POLY_RSRC, POLY_RSRC, #16;
+ POLY1305_BLOCK_PART3();
+ POLY1305_BLOCK_PART4();
+ POLY1305_BLOCK_PART5();
+ POLY1305_BLOCK_PART6();
+ POLY1305_BLOCK_PART7();
+ POLY1305_BLOCK_PART8();
+ POLY1305_BLOCK_PART9();
+ POLY1305_BLOCK_PART10();
+ POLY1305_BLOCK_PART11();
+ POLY1305_BLOCK_PART12();
+ POLY1305_BLOCK_PART13();
+ POLY1305_BLOCK_PART14();
+ POLY1305_BLOCK_PART15();
+ POLY1305_BLOCK_PART16();
+ POLY1305_BLOCK_PART17();
+ POLY1305_BLOCK_PART18();
+ POLY1305_BLOCK_PART19();
+ POLY1305_BLOCK_PART20();
+ POLY1305_BLOCK_PART21();
+ POLY1305_BLOCK_PART22();
+ POLY1305_BLOCK_PART23();
+ POLY1305_BLOCK_PART24();
+ POLY1305_BLOCK_PART25();
+ POLY1305_BLOCK_PART26();
+ POLY1305_BLOCK_PART27();
+ POLY1305_BLOCK_PART28();
+ POLY1305_BLOCK_PART29();
+
+ subs x2, x2, #1;
+ b.ne .L_gcry_poly1305_aarch64_loop1;
+
+ POLY1305_STORE_STATE();
+
+ mov x0, #0;
+
+ POLY1305_POP_REGS();
+ ret;
+ CFI_ENDPROC()
+ELF(.size _gcry_poly1305_aarch64_blocks1, .-_gcry_poly1305_aarch64_blocks1;)
+#endif
+
+#endif /* GCRY_ASM_POLY1305_AARCH64_H */
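[Editor's note between files] The header above deliberately splits one
Poly1305 block update into 29 one-instruction macros
(POLY1305_BLOCK_PART1..PART29) so the stitched ChaCha20 loop can drop a
single scalar instruction into each latency gap between SIMD operations.
As a reading aid, here is a hedged C sketch of the arithmetic those steps
perform for one 16-byte block; it is not part of the patch, the function
name is invented, and it assumes a little-endian host (so le_to_host is a
no-op, as on __AARCH64EL__) and a compiler with unsigned __int128
(GCC/Clang).  State is h = h0 + 2^64*h1 + 2^128*h2, key r = r0 + 2^64*r1,
partially reduced via 2^130 == 5 (mod 2^130-5).

#include <stdint.h>
#include <string.h>

typedef unsigned __int128 u128;

static void poly1305_block_sketch(uint64_t h[3], const uint64_t r[2],
                                  const uint8_t m[16])
{
  uint64_t r0 = r[0], r1 = r[1];
  /* Key clamping clears r1's low two bits, so r1 + (r1 >> 2) is exactly
   * (5 * r1) / 4 -- the POLY_R_R1_MUL5 value set up in LOAD_STATE. */
  uint64_t r1_mul5 = r1 + (r1 >> 2);
  uint64_t m0, m1, h0 = h[0], h1 = h[1], h2 = h[2];
  u128 x0, x1, c;

  memcpy(&m0, m, 8);          /* PART1..PART4: loads plus le_to_host */
  memcpy(&m1, m + 8, 8);

  /* h += m || 1 (PART5..PART7; the 1 is the 2^128 pad bit, POLY_R_ONE) */
  c = (u128)h0 + m0;                        h0 = (uint64_t)c;
  c = (u128)h1 + m1 + (uint64_t)(c >> 64);  h1 = (uint64_t)c;
  h2 += 1 + (uint64_t)(c >> 64);

  /* h = h * r, partially reduced (PART8..PART23) */
  x1 = (u128)h0 * r1 + (u128)h1 * r0;       /* coefficient of 2^64 */
  x0 = (u128)h0 * r0 + (u128)h1 * r1_mul5;  /* coefficient of 2^0  */
  c  = (u128)(h2 * r1_mul5) + (uint64_t)x1; /* h2*r1 folds to 2^64 */
  h1 = (uint64_t)c;
  h0 = h2 * r0 + (uint64_t)(x1 >> 64) + (uint64_t)(c >> 64);

  /* carry propagation (PART24..PART29): split at bit 2, 2^130 == 5 */
  h2 = h0 & 3;
  h0 = (h0 >> 2) * 5;
  c = (u128)h0 + (uint64_t)x0;                               h0 = (uint64_t)c;
  c = (u128)h1 + (uint64_t)(x0 >> 64) + (uint64_t)(c >> 64); h1 = (uint64_t)c;
  h2 += (uint64_t)(c >> 64);

  h[0] = h0; h[1] = h1; h[2] = h2;
}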
diff --git a/cipher/chacha20-aarch64.S b/cipher/chacha20-aarch64.S
index 07b4bb5c..7ace023f 100644
--- a/cipher/chacha20-aarch64.S
+++ b/cipher/chacha20-aarch64.S
@@ -1,307 +1,616 @@
/* chacha20-aarch64.S - ARMv8/AArch64 accelerated chacha20 blocks function
*
- * Copyright (C) 2017,2018 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ * Copyright (C) 2017-2019 Jussi Kivilinna <jussi.kivilinna@iki.fi>
*
* This file is part of Libgcrypt.
*
* Libgcrypt is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1 of
* the License, or (at your option) any later version.
*
* Libgcrypt is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
/*
* Based on D. J. Bernstein reference implementation at
* http://cr.yp.to/chacha.html:
*
* chacha-regs.c version 20080118
* D. J. Bernstein
* Public domain.
*/
#include "asm-common-aarch64.h"
#if defined(__AARCH64EL__) && \
defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \
defined(HAVE_GCC_INLINE_ASM_AARCH64_NEON) && \
defined(USE_CHACHA20)
.cpu generic+simd
.text
+#include "asm-poly1305-aarch64.h"
/* register macros */
#define INPUT x0
#define DST x1
#define SRC x2
#define NBLKS x3
#define ROUND x4
#define INPUT_CTR x5
#define INPUT_POS x6
#define CTR x7
/* vector registers */
#define X0 v16
#define X1 v17
#define X2 v18
#define X3 v19
#define X4 v20
#define X5 v21
#define X6 v22
#define X7 v23
#define X8 v24
#define X9 v25
#define X10 v26
#define X11 v27
#define X12 v28
#define X13 v29
#define X14 v30
#define X15 v31
#define VCTR v0
#define VTMP0 v1
#define VTMP1 v2
#define VTMP2 v3
#define VTMP3 v4
#define X12_TMP v5
#define X13_TMP v6
+#define ROT8 v7
/**********************************************************************
helper macros
**********************************************************************/
+#define _(...) __VA_ARGS__
+
#define vpunpckldq(s1, s2, dst) \
zip1 dst.4s, s2.4s, s1.4s;
#define vpunpckhdq(s1, s2, dst) \
zip2 dst.4s, s2.4s, s1.4s;
#define vpunpcklqdq(s1, s2, dst) \
zip1 dst.2d, s2.2d, s1.2d;
#define vpunpckhqdq(s1, s2, dst) \
zip2 dst.2d, s2.2d, s1.2d;
/* 4x4 32-bit integer matrix transpose */
#define transpose_4x4(x0, x1, x2, x3, t1, t2, t3) \
vpunpckhdq(x1, x0, t2); \
vpunpckldq(x1, x0, x0); \
\
vpunpckldq(x3, x2, t1); \
vpunpckhdq(x3, x2, x2); \
\
vpunpckhqdq(t1, x0, x1); \
vpunpcklqdq(t1, x0, x0); \
\
vpunpckhqdq(x2, t2, x3); \
vpunpcklqdq(x2, t2, x2);
#define clear(x) \
eor x.16b, x.16b, x.16b;
/**********************************************************************
4-way chacha20
**********************************************************************/
-#define ROTATE2(dst1,dst2,c,src1,src2) \
+#define ROTATE2(dst1,dst2,c,src1,src2,iop1) \
shl dst1.4s, src1.4s, #(c); \
shl dst2.4s, src2.4s, #(c); \
+ iop1; \
sri dst1.4s, src1.4s, #(32 - (c)); \
sri dst2.4s, src2.4s, #(32 - (c));
+#define ROTATE2_8(dst1,dst2,src1,src2,iop1) \
+ tbl dst1.16b, {src1.16b}, ROT8.16b; \
+ iop1; \
+ tbl dst2.16b, {src2.16b}, ROT8.16b;
+
#define ROTATE2_16(dst1,dst2,src1,src2) \
rev32 dst1.8h, src1.8h; \
rev32 dst2.8h, src2.8h;
#define XOR(d,s1,s2) \
eor d.16b, s2.16b, s1.16b;
#define PLUS(ds,s) \
add ds.4s, ds.4s, s.4s;
-#define QUARTERROUND2(a1,b1,c1,d1,a2,b2,c2,d2,ign,tmp1,tmp2) \
- PLUS(a1,b1); PLUS(a2,b2); XOR(tmp1,d1,a1); XOR(tmp2,d2,a2); \
- ROTATE2_16(d1, d2, tmp1, tmp2); \
- PLUS(c1,d1); PLUS(c2,d2); XOR(tmp1,b1,c1); XOR(tmp2,b2,c2); \
- ROTATE2(b1, b2, 12, tmp1, tmp2); \
- PLUS(a1,b1); PLUS(a2,b2); XOR(tmp1,d1,a1); XOR(tmp2,d2,a2); \
- ROTATE2(d1, d2, 8, tmp1, tmp2); \
- PLUS(c1,d1); PLUS(c2,d2); XOR(tmp1,b1,c1); XOR(tmp2,b2,c2); \
- ROTATE2(b1, b2, 7, tmp1, tmp2);
-
-chacha20_data:
+#define QUARTERROUND2(a1,b1,c1,d1,a2,b2,c2,d2,ign,tmp1,tmp2,iop1,iop2,iop3,iop4,iop5,iop6,iop7,iop8,iop9,iop10,iop11,iop12,iop13,iop14) \
+ PLUS(a1,b1); PLUS(a2,b2); iop1; \
+ XOR(tmp1,d1,a1); XOR(tmp2,d2,a2); iop2; \
+ ROTATE2_16(d1, d2, tmp1, tmp2); iop3; \
+ PLUS(c1,d1); PLUS(c2,d2); iop4; \
+ XOR(tmp1,b1,c1); XOR(tmp2,b2,c2); iop5; \
+ ROTATE2(b1, b2, 12, tmp1, tmp2, _(iop6)); iop7; \
+ PLUS(a1,b1); PLUS(a2,b2); iop8; \
+ XOR(tmp1,d1,a1); XOR(tmp2,d2,a2); iop9; \
+ ROTATE2_8(d1, d2, tmp1, tmp2, _(iop10)); iop11; \
+ PLUS(c1,d1); PLUS(c2,d2); iop12; \
+ XOR(tmp1,b1,c1); XOR(tmp2,b2,c2); iop13; \
+ ROTATE2(b1, b2, 7, tmp1, tmp2, _(iop14));
+
.align 4
-.Linc_counter:
+.globl _gcry_chacha20_aarch64_blocks4_data_inc_counter
+_gcry_chacha20_aarch64_blocks4_data_inc_counter:
.long 0,1,2,3
+.align 4
+.globl _gcry_chacha20_aarch64_blocks4_data_rot8
+_gcry_chacha20_aarch64_blocks4_data_rot8:
+ .byte 3,0,1,2
+ .byte 7,4,5,6
+ .byte 11,8,9,10
+ .byte 15,12,13,14
+
.align 3
.globl _gcry_chacha20_aarch64_blocks4
ELF(.type _gcry_chacha20_aarch64_blocks4,%function;)
_gcry_chacha20_aarch64_blocks4:
/* input:
* x0: input
* x1: dst
* x2: src
* x3: nblks (multiple of 4)
*/
CFI_STARTPROC()
- GET_DATA_POINTER(CTR, .Linc_counter);
+ GET_DATA_POINTER(CTR, _gcry_chacha20_aarch64_blocks4_data_rot8);
add INPUT_CTR, INPUT, #(12*4);
+ ld1 {ROT8.16b}, [CTR];
+ GET_DATA_POINTER(CTR, _gcry_chacha20_aarch64_blocks4_data_inc_counter);
mov INPUT_POS, INPUT;
ld1 {VCTR.16b}, [CTR];
.Loop4:
/* Construct counter vectors X12 and X13 */
ld1 {X15.16b}, [INPUT_CTR];
mov ROUND, #20;
ld1 {VTMP1.16b-VTMP3.16b}, [INPUT_POS];
dup X12.4s, X15.s[0];
dup X13.4s, X15.s[1];
ldr CTR, [INPUT_CTR];
add X12.4s, X12.4s, VCTR.4s;
dup X0.4s, VTMP1.s[0];
dup X1.4s, VTMP1.s[1];
dup X2.4s, VTMP1.s[2];
dup X3.4s, VTMP1.s[3];
dup X14.4s, X15.s[2];
cmhi VTMP0.4s, VCTR.4s, X12.4s;
dup X15.4s, X15.s[3];
add CTR, CTR, #4; /* Update counter */
dup X4.4s, VTMP2.s[0];
dup X5.4s, VTMP2.s[1];
dup X6.4s, VTMP2.s[2];
dup X7.4s, VTMP2.s[3];
sub X13.4s, X13.4s, VTMP0.4s;
dup X8.4s, VTMP3.s[0];
dup X9.4s, VTMP3.s[1];
dup X10.4s, VTMP3.s[2];
dup X11.4s, VTMP3.s[3];
mov X12_TMP.16b, X12.16b;
mov X13_TMP.16b, X13.16b;
str CTR, [INPUT_CTR];
.Lround2:
subs ROUND, ROUND, #2
- QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,VTMP0,VTMP1)
- QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,VTMP0,VTMP1)
- QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,VTMP0,VTMP1)
- QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,VTMP0,VTMP1)
+ QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,VTMP0,VTMP1,
+ ,,,,,,,,,,,,,)
+ QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,VTMP0,VTMP1,
+ ,,,,,,,,,,,,,)
+ QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,VTMP0,VTMP1,
+ ,,,,,,,,,,,,,)
+ QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,VTMP0,VTMP1,
+ ,,,,,,,,,,,,,)
b.ne .Lround2;
ld1 {VTMP0.16b, VTMP1.16b}, [INPUT_POS], #32;
PLUS(X12, X12_TMP); /* INPUT + 12 * 4 + counter */
PLUS(X13, X13_TMP); /* INPUT + 13 * 4 + counter */
dup VTMP2.4s, VTMP0.s[0]; /* INPUT + 0 * 4 */
dup VTMP3.4s, VTMP0.s[1]; /* INPUT + 1 * 4 */
dup X12_TMP.4s, VTMP0.s[2]; /* INPUT + 2 * 4 */
dup X13_TMP.4s, VTMP0.s[3]; /* INPUT + 3 * 4 */
PLUS(X0, VTMP2);
PLUS(X1, VTMP3);
PLUS(X2, X12_TMP);
PLUS(X3, X13_TMP);
dup VTMP2.4s, VTMP1.s[0]; /* INPUT + 4 * 4 */
dup VTMP3.4s, VTMP1.s[1]; /* INPUT + 5 * 4 */
dup X12_TMP.4s, VTMP1.s[2]; /* INPUT + 6 * 4 */
dup X13_TMP.4s, VTMP1.s[3]; /* INPUT + 7 * 4 */
ld1 {VTMP0.16b, VTMP1.16b}, [INPUT_POS];
mov INPUT_POS, INPUT;
PLUS(X4, VTMP2);
PLUS(X5, VTMP3);
PLUS(X6, X12_TMP);
PLUS(X7, X13_TMP);
dup VTMP2.4s, VTMP0.s[0]; /* INPUT + 8 * 4 */
dup VTMP3.4s, VTMP0.s[1]; /* INPUT + 9 * 4 */
dup X12_TMP.4s, VTMP0.s[2]; /* INPUT + 10 * 4 */
dup X13_TMP.4s, VTMP0.s[3]; /* INPUT + 11 * 4 */
dup VTMP0.4s, VTMP1.s[2]; /* INPUT + 14 * 4 */
dup VTMP1.4s, VTMP1.s[3]; /* INPUT + 15 * 4 */
PLUS(X8, VTMP2);
PLUS(X9, VTMP3);
PLUS(X10, X12_TMP);
PLUS(X11, X13_TMP);
PLUS(X14, VTMP0);
PLUS(X15, VTMP1);
transpose_4x4(X0, X1, X2, X3, VTMP0, VTMP1, VTMP2);
transpose_4x4(X4, X5, X6, X7, VTMP0, VTMP1, VTMP2);
transpose_4x4(X8, X9, X10, X11, VTMP0, VTMP1, VTMP2);
transpose_4x4(X12, X13, X14, X15, VTMP0, VTMP1, VTMP2);
subs NBLKS, NBLKS, #4;
ld1 {VTMP0.16b-VTMP3.16b}, [SRC], #64;
ld1 {X12_TMP.16b-X13_TMP.16b}, [SRC], #32;
eor VTMP0.16b, X0.16b, VTMP0.16b;
eor VTMP1.16b, X4.16b, VTMP1.16b;
eor VTMP2.16b, X8.16b, VTMP2.16b;
eor VTMP3.16b, X12.16b, VTMP3.16b;
eor X12_TMP.16b, X1.16b, X12_TMP.16b;
eor X13_TMP.16b, X5.16b, X13_TMP.16b;
st1 {VTMP0.16b-VTMP3.16b}, [DST], #64;
ld1 {VTMP0.16b-VTMP3.16b}, [SRC], #64;
st1 {X12_TMP.16b-X13_TMP.16b}, [DST], #32;
ld1 {X12_TMP.16b-X13_TMP.16b}, [SRC], #32;
eor VTMP0.16b, X9.16b, VTMP0.16b;
eor VTMP1.16b, X13.16b, VTMP1.16b;
eor VTMP2.16b, X2.16b, VTMP2.16b;
eor VTMP3.16b, X6.16b, VTMP3.16b;
eor X12_TMP.16b, X10.16b, X12_TMP.16b;
eor X13_TMP.16b, X14.16b, X13_TMP.16b;
st1 {VTMP0.16b-VTMP3.16b}, [DST], #64;
ld1 {VTMP0.16b-VTMP3.16b}, [SRC], #64;
st1 {X12_TMP.16b-X13_TMP.16b}, [DST], #32;
eor VTMP0.16b, X3.16b, VTMP0.16b;
eor VTMP1.16b, X7.16b, VTMP1.16b;
eor VTMP2.16b, X11.16b, VTMP2.16b;
eor VTMP3.16b, X15.16b, VTMP3.16b;
st1 {VTMP0.16b-VTMP3.16b}, [DST], #64;
b.ne .Loop4;
/* clear the used vector registers and stack */
clear(VTMP0);
clear(VTMP1);
clear(VTMP2);
clear(VTMP3);
clear(X12_TMP);
clear(X13_TMP);
clear(X0);
clear(X1);
clear(X2);
clear(X3);
clear(X4);
clear(X5);
clear(X6);
clear(X7);
clear(X8);
clear(X9);
clear(X10);
clear(X11);
clear(X12);
clear(X13);
clear(X14);
clear(X15);
eor x0, x0, x0
ret
CFI_ENDPROC()
ELF(.size _gcry_chacha20_aarch64_blocks4, .-_gcry_chacha20_aarch64_blocks4;)
+/**********************************************************************
+ 4-way stitched chacha20-poly1305
+ **********************************************************************/
+
+.align 3
+.globl _gcry_chacha20_poly1305_aarch64_blocks4
+ELF(.type _gcry_chacha20_poly1305_aarch64_blocks4,%function;)
+
+_gcry_chacha20_poly1305_aarch64_blocks4:
+ /* input:
+ * x0: input
+ * x1: dst
+ * x2: src
+ * x3: nblks (multiple of 4)
+ * x4: poly1305-state
+ * x5: poly1305-src
+ */
+ CFI_STARTPROC()
+ POLY1305_PUSH_REGS()
+
+ mov POLY_RSTATE, x4;
+ mov POLY_RSRC, x5;
+
+ GET_DATA_POINTER(CTR, _gcry_chacha20_aarch64_blocks4_data_rot8);
+ add INPUT_CTR, INPUT, #(12*4);
+ ld1 {ROT8.16b}, [CTR];
+ GET_DATA_POINTER(CTR, _gcry_chacha20_aarch64_blocks4_data_inc_counter);
+ mov INPUT_POS, INPUT;
+ ld1 {VCTR.16b}, [CTR];
+
+ POLY1305_LOAD_STATE()
+
+.Loop_poly4:
+ /* Construct counter vectors X12 and X13 */
+
+ ld1 {X15.16b}, [INPUT_CTR];
+ ld1 {VTMP1.16b-VTMP3.16b}, [INPUT_POS];
+
+ dup X12.4s, X15.s[0];
+ dup X13.4s, X15.s[1];
+ ldr CTR, [INPUT_CTR];
+ add X12.4s, X12.4s, VCTR.4s;
+ dup X0.4s, VTMP1.s[0];
+ dup X1.4s, VTMP1.s[1];
+ dup X2.4s, VTMP1.s[2];
+ dup X3.4s, VTMP1.s[3];
+ dup X14.4s, X15.s[2];
+ cmhi VTMP0.4s, VCTR.4s, X12.4s;
+ dup X15.4s, X15.s[3];
+ add CTR, CTR, #4; /* Update counter */
+ dup X4.4s, VTMP2.s[0];
+ dup X5.4s, VTMP2.s[1];
+ dup X6.4s, VTMP2.s[2];
+ dup X7.4s, VTMP2.s[3];
+ sub X13.4s, X13.4s, VTMP0.4s;
+ dup X8.4s, VTMP3.s[0];
+ dup X9.4s, VTMP3.s[1];
+ dup X10.4s, VTMP3.s[2];
+ dup X11.4s, VTMP3.s[3];
+ mov X12_TMP.16b, X12.16b;
+ mov X13_TMP.16b, X13.16b;
+ str CTR, [INPUT_CTR];
+
+ mov ROUND, #20
+.Lround4_with_poly1305_outer:
+ mov POLY_CHACHA_ROUND, #6;
+.Lround4_with_poly1305_inner1:
+ POLY1305_BLOCK_PART1(0 * 16)
+ QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,VTMP0,VTMP1,
+ POLY1305_BLOCK_PART2(0 * 16),
+ POLY1305_BLOCK_PART3(),
+ POLY1305_BLOCK_PART4(),
+ POLY1305_BLOCK_PART5(),
+ POLY1305_BLOCK_PART6(),
+ POLY1305_BLOCK_PART7(),
+ POLY1305_BLOCK_PART8(),
+ POLY1305_BLOCK_PART9(),
+ POLY1305_BLOCK_PART10(),
+ POLY1305_BLOCK_PART11(),
+ POLY1305_BLOCK_PART12(),
+ POLY1305_BLOCK_PART13(),
+ POLY1305_BLOCK_PART14(),
+ POLY1305_BLOCK_PART15())
+ POLY1305_BLOCK_PART16()
+ QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,VTMP0,VTMP1,
+ POLY1305_BLOCK_PART17(),
+ POLY1305_BLOCK_PART18(),
+ POLY1305_BLOCK_PART19(),
+ POLY1305_BLOCK_PART20(),
+ POLY1305_BLOCK_PART21(),
+ POLY1305_BLOCK_PART22(),
+ POLY1305_BLOCK_PART23(),
+ POLY1305_BLOCK_PART24(),
+ POLY1305_BLOCK_PART25(),
+ POLY1305_BLOCK_PART26(),
+ POLY1305_BLOCK_PART27(),
+ POLY1305_BLOCK_PART28(),
+ POLY1305_BLOCK_PART29(),
+ POLY1305_BLOCK_PART1(1 * 16))
+ POLY1305_BLOCK_PART2(1 * 16)
+ QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,VTMP0,VTMP1,
+ _(add POLY_RSRC, POLY_RSRC, #(2*16)),
+ POLY1305_BLOCK_PART3(),
+ POLY1305_BLOCK_PART4(),
+ POLY1305_BLOCK_PART5(),
+ POLY1305_BLOCK_PART6(),
+ POLY1305_BLOCK_PART7(),
+ POLY1305_BLOCK_PART8(),
+ POLY1305_BLOCK_PART9(),
+ POLY1305_BLOCK_PART10(),
+ POLY1305_BLOCK_PART11(),
+ POLY1305_BLOCK_PART12(),
+ POLY1305_BLOCK_PART13(),
+ POLY1305_BLOCK_PART14(),
+ POLY1305_BLOCK_PART15())
+ POLY1305_BLOCK_PART16()
+ QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,VTMP0,VTMP1,
+ POLY1305_BLOCK_PART17(),
+ POLY1305_BLOCK_PART18(),
+ POLY1305_BLOCK_PART19(),
+ POLY1305_BLOCK_PART20(),
+ POLY1305_BLOCK_PART21(),
+ POLY1305_BLOCK_PART22(),
+ POLY1305_BLOCK_PART23(),
+ POLY1305_BLOCK_PART24(),
+ POLY1305_BLOCK_PART25(),
+ POLY1305_BLOCK_PART26(),
+ POLY1305_BLOCK_PART27(),
+ POLY1305_BLOCK_PART28(),
+ POLY1305_BLOCK_PART29(),
+ _(subs POLY_CHACHA_ROUND, POLY_CHACHA_ROUND, #2));
+ b.ne .Lround4_with_poly1305_inner1;
+
+ mov POLY_CHACHA_ROUND, #4;
+.Lround4_with_poly1305_inner2:
+ POLY1305_BLOCK_PART1(0 * 16)
+ QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,VTMP0,VTMP1,,
+ POLY1305_BLOCK_PART2(0 * 16),,
+ _(add POLY_RSRC, POLY_RSRC, #(1*16)),,
+ POLY1305_BLOCK_PART3(),,
+ POLY1305_BLOCK_PART4(),,
+ POLY1305_BLOCK_PART5(),,
+ POLY1305_BLOCK_PART6(),,
+ POLY1305_BLOCK_PART7())
+ QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,VTMP0,VTMP1,
+ POLY1305_BLOCK_PART8(),,
+ POLY1305_BLOCK_PART9(),,
+ POLY1305_BLOCK_PART10(),,
+ POLY1305_BLOCK_PART11(),,
+ POLY1305_BLOCK_PART12(),,
+ POLY1305_BLOCK_PART13(),,
+ POLY1305_BLOCK_PART14(),)
+ POLY1305_BLOCK_PART15()
+ QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,VTMP0,VTMP1,,
+ POLY1305_BLOCK_PART16(),,
+ POLY1305_BLOCK_PART17(),,
+ POLY1305_BLOCK_PART18(),,
+ POLY1305_BLOCK_PART19(),,
+ POLY1305_BLOCK_PART20(),,
+ POLY1305_BLOCK_PART21(),,
+ POLY1305_BLOCK_PART22())
+ QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,VTMP0,VTMP1,
+ POLY1305_BLOCK_PART23(),,
+ POLY1305_BLOCK_PART24(),,
+ POLY1305_BLOCK_PART25(),,
+ POLY1305_BLOCK_PART26(),,
+ POLY1305_BLOCK_PART27(),,
+ POLY1305_BLOCK_PART28(),,
+ POLY1305_BLOCK_PART29(),
+ _(subs POLY_CHACHA_ROUND, POLY_CHACHA_ROUND, #2))
+ b.ne .Lround4_with_poly1305_inner2;
+
+ subs ROUND, ROUND, #10
+ b.ne .Lround4_with_poly1305_outer;
+
+ ld1 {VTMP0.16b, VTMP1.16b}, [INPUT_POS], #32;
+
+ PLUS(X12, X12_TMP); /* INPUT + 12 * 4 + counter */
+ PLUS(X13, X13_TMP); /* INPUT + 13 * 4 + counter */
+
+ dup VTMP2.4s, VTMP0.s[0]; /* INPUT + 0 * 4 */
+ dup VTMP3.4s, VTMP0.s[1]; /* INPUT + 1 * 4 */
+ dup X12_TMP.4s, VTMP0.s[2]; /* INPUT + 2 * 4 */
+ dup X13_TMP.4s, VTMP0.s[3]; /* INPUT + 3 * 4 */
+ PLUS(X0, VTMP2);
+ PLUS(X1, VTMP3);
+ PLUS(X2, X12_TMP);
+ PLUS(X3, X13_TMP);
+
+ dup VTMP2.4s, VTMP1.s[0]; /* INPUT + 4 * 4 */
+ dup VTMP3.4s, VTMP1.s[1]; /* INPUT + 5 * 4 */
+ dup X12_TMP.4s, VTMP1.s[2]; /* INPUT + 6 * 4 */
+ dup X13_TMP.4s, VTMP1.s[3]; /* INPUT + 7 * 4 */
+ ld1 {VTMP0.16b, VTMP1.16b}, [INPUT_POS];
+ mov INPUT_POS, INPUT;
+ PLUS(X4, VTMP2);
+ PLUS(X5, VTMP3);
+ PLUS(X6, X12_TMP);
+ PLUS(X7, X13_TMP);
+
+ dup VTMP2.4s, VTMP0.s[0]; /* INPUT + 8 * 4 */
+ dup VTMP3.4s, VTMP0.s[1]; /* INPUT + 9 * 4 */
+ dup X12_TMP.4s, VTMP0.s[2]; /* INPUT + 10 * 4 */
+ dup X13_TMP.4s, VTMP0.s[3]; /* INPUT + 11 * 4 */
+ dup VTMP0.4s, VTMP1.s[2]; /* INPUT + 14 * 4 */
+ dup VTMP1.4s, VTMP1.s[3]; /* INPUT + 15 * 4 */
+ PLUS(X8, VTMP2);
+ PLUS(X9, VTMP3);
+ PLUS(X10, X12_TMP);
+ PLUS(X11, X13_TMP);
+ PLUS(X14, VTMP0);
+ PLUS(X15, VTMP1);
+
+ transpose_4x4(X0, X1, X2, X3, VTMP0, VTMP1, VTMP2);
+ transpose_4x4(X4, X5, X6, X7, VTMP0, VTMP1, VTMP2);
+ transpose_4x4(X8, X9, X10, X11, VTMP0, VTMP1, VTMP2);
+ transpose_4x4(X12, X13, X14, X15, VTMP0, VTMP1, VTMP2);
+
+ subs NBLKS, NBLKS, #4;
+
+ ld1 {VTMP0.16b-VTMP3.16b}, [SRC], #64;
+ ld1 {X12_TMP.16b-X13_TMP.16b}, [SRC], #32;
+ eor VTMP0.16b, X0.16b, VTMP0.16b;
+ eor VTMP1.16b, X4.16b, VTMP1.16b;
+ eor VTMP2.16b, X8.16b, VTMP2.16b;
+ eor VTMP3.16b, X12.16b, VTMP3.16b;
+ eor X12_TMP.16b, X1.16b, X12_TMP.16b;
+ eor X13_TMP.16b, X5.16b, X13_TMP.16b;
+ st1 {VTMP0.16b-VTMP3.16b}, [DST], #64;
+ ld1 {VTMP0.16b-VTMP3.16b}, [SRC], #64;
+ st1 {X12_TMP.16b-X13_TMP.16b}, [DST], #32;
+ ld1 {X12_TMP.16b-X13_TMP.16b}, [SRC], #32;
+ eor VTMP0.16b, X9.16b, VTMP0.16b;
+ eor VTMP1.16b, X13.16b, VTMP1.16b;
+ eor VTMP2.16b, X2.16b, VTMP2.16b;
+ eor VTMP3.16b, X6.16b, VTMP3.16b;
+ eor X12_TMP.16b, X10.16b, X12_TMP.16b;
+ eor X13_TMP.16b, X14.16b, X13_TMP.16b;
+ st1 {VTMP0.16b-VTMP3.16b}, [DST], #64;
+ ld1 {VTMP0.16b-VTMP3.16b}, [SRC], #64;
+ st1 {X12_TMP.16b-X13_TMP.16b}, [DST], #32;
+ eor VTMP0.16b, X3.16b, VTMP0.16b;
+ eor VTMP1.16b, X7.16b, VTMP1.16b;
+ eor VTMP2.16b, X11.16b, VTMP2.16b;
+ eor VTMP3.16b, X15.16b, VTMP3.16b;
+ st1 {VTMP0.16b-VTMP3.16b}, [DST], #64;
+
+ b.ne .Loop_poly4;
+
+ POLY1305_STORE_STATE()
+
+ /* clear the used vector registers and stack */
+ clear(VTMP0);
+ clear(VTMP1);
+ clear(VTMP2);
+ clear(VTMP3);
+ clear(X12_TMP);
+ clear(X13_TMP);
+ clear(X0);
+ clear(X1);
+ clear(X2);
+ clear(X3);
+ clear(X4);
+ clear(X5);
+ clear(X6);
+ clear(X7);
+ clear(X8);
+ clear(X9);
+ clear(X10);
+ clear(X11);
+ clear(X12);
+ clear(X13);
+ clear(X14);
+ clear(X15);
+
+ eor x0, x0, x0
+ POLY1305_POP_REGS()
+ ret
+ CFI_ENDPROC()
+ELF(.size _gcry_chacha20_poly1305_aarch64_blocks4, .-_gcry_chacha20_poly1305_aarch64_blocks4;)
+
#endif
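[Editor's note between files] Two implementation details above deserve a
gloss.  First, ROTATE2_8 replaces the usual shl+sri pair with a single
tbl byte shuffle against the ..._data_rot8 table, and QUARTERROUND2
grows fourteen iopN slots through which the stitched loop issues scalar
Poly1305 instructions between the SIMD ones.  The pacing works out
exactly: per outer pass, .Lround4_with_poly1305_inner1 runs three times
(two 16-byte Poly1305 blocks each) and inner2 runs twice (one block
each), so the two outer passes that make up 20 ChaCha rounds hash 16
Poly1305 blocks = 256 bytes, precisely the four 64-byte ChaCha blocks
encrypted per .Loop_poly4 iteration.  Second, the counter prologue
derives a 64-bit block-counter carry from 32-bit SIMD ops; a C sketch of
that trick (not part of the patch, names invented):

#include <stdint.h>

static void counter_lanes_sketch(uint32_t ctr_lo, uint32_t ctr_hi,
                                 uint32_t x12[4], uint32_t x13[4])
{
  /* _gcry_chacha20_aarch64_blocks4_data_inc_counter */
  static const uint32_t vctr[4] = { 0, 1, 2, 3 };
  int i;

  for (i = 0; i < 4; i++)
    {
      x12[i] = ctr_lo + vctr[i];  /* add X12.4s, X12.4s, VCTR.4s */
      /* cmhi VTMP0.4s, VCTR.4s, X12.4s: a lane becomes all-ones
       * exactly when the 32-bit add wrapped; subtracting that -1
       * mask adds the carry into the high counter word. */
      if (vctr[i] > x12[i])
        x13[i] = ctr_hi - UINT32_MAX;  /* == ctr_hi + 1 (mod 2^32) */
      else
        x13[i] = ctr_hi;               /* sub X13.4s, X13.4s, VTMP0.4s */
    }
}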
diff --git a/cipher/chacha20.c b/cipher/chacha20.c
index b34d8d19..9d95723b 100644
--- a/cipher/chacha20.c
+++ b/cipher/chacha20.c
@@ -1,1084 +1,1137 @@
/* chacha20.c - Bernstein's ChaCha20 cipher
* Copyright (C) 2014,2017-2019 Jussi Kivilinna <jussi.kivilinna@iki.fi>
*
* This file is part of Libgcrypt.
*
* Libgcrypt is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1 of
* the License, or (at your option) any later version.
*
* Libgcrypt is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this program; if not, see <http://www.gnu.org/licenses/>.
*
* For a description of the algorithm, see:
* http://cr.yp.to/chacha.html
*/
/*
* Based on D. J. Bernstein reference implementation at
* http://cr.yp.to/chacha.html:
*
* chacha-regs.c version 20080118
* D. J. Bernstein
* Public domain.
*/
#include <config.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "types.h"
#include "g10lib.h"
#include "cipher.h"
#include "cipher-internal.h"
#include "bufhelp.h"
#define CHACHA20_MIN_KEY_SIZE 16 /* Bytes. */
#define CHACHA20_MAX_KEY_SIZE 32 /* Bytes. */
#define CHACHA20_BLOCK_SIZE 64 /* Bytes. */
#define CHACHA20_MIN_IV_SIZE 8 /* Bytes. */
#define CHACHA20_MAX_IV_SIZE 12 /* Bytes. */
#define CHACHA20_CTR_SIZE 16 /* Bytes. */
/* USE_SSSE3 indicates whether to compile with Intel SSSE3 code. */
#undef USE_SSSE3
#if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_SSSE3) && \
(defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
# define USE_SSSE3 1
#endif
/* USE_AVX2 indicates whether to compile with Intel AVX2 code. */
#undef USE_AVX2
#if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_AVX2) && \
(defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
# define USE_AVX2 1
#endif
/* USE_ARMV7_NEON indicates whether to enable ARMv7 NEON assembly code. */
#undef USE_ARMV7_NEON
#ifdef ENABLE_NEON_SUPPORT
# if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) \
&& defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) \
&& defined(HAVE_GCC_INLINE_ASM_NEON)
# define USE_ARMV7_NEON 1
# endif
#endif
/* USE_AARCH64_SIMD indicates whether to enable ARMv8 SIMD assembly
* code. */
#undef USE_AARCH64_SIMD
#ifdef ENABLE_NEON_SUPPORT
# if defined(__AARCH64EL__) \
&& defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) \
&& defined(HAVE_GCC_INLINE_ASM_AARCH64_NEON)
# define USE_AARCH64_SIMD 1
# endif
#endif
/* USE_PPC_VEC indicates whether to enable PowerPC vector
* accelerated code. */
#undef USE_PPC_VEC
#ifdef ENABLE_PPC_CRYPTO_SUPPORT
# if defined(HAVE_COMPATIBLE_CC_PPC_ALTIVEC) && \
defined(HAVE_GCC_INLINE_ASM_PPC_ALTIVEC)
# if __GNUC__ >= 4
# define USE_PPC_VEC 1
# endif
# endif
#endif
/* Assembly implementations use SystemV ABI, ABI conversion and additional
* stack to store XMM6-XMM15 needed on Win64. */
#undef ASM_FUNC_ABI
#undef ASM_EXTRA_STACK
#if defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)
# define ASM_FUNC_ABI __attribute__((sysv_abi))
#else
# define ASM_FUNC_ABI
#endif
typedef struct CHACHA20_context_s
{
u32 input[16];
unsigned char pad[CHACHA20_BLOCK_SIZE];
unsigned int unused; /* bytes in the pad. */
unsigned int use_ssse3:1;
unsigned int use_avx2:1;
unsigned int use_neon:1;
unsigned int use_ppc:1;
} CHACHA20_context_t;
#ifdef USE_SSSE3
unsigned int _gcry_chacha20_amd64_ssse3_blocks4(u32 *state, byte *dst,
const byte *src,
size_t nblks) ASM_FUNC_ABI;
unsigned int _gcry_chacha20_amd64_ssse3_blocks1(u32 *state, byte *dst,
const byte *src,
size_t nblks) ASM_FUNC_ABI;
unsigned int _gcry_chacha20_poly1305_amd64_ssse3_blocks4(
u32 *state, byte *dst, const byte *src, size_t nblks,
void *poly1305_state, const byte *poly1305_src) ASM_FUNC_ABI;
unsigned int _gcry_chacha20_poly1305_amd64_ssse3_blocks1(
u32 *state, byte *dst, const byte *src, size_t nblks,
void *poly1305_state, const byte *poly1305_src) ASM_FUNC_ABI;
#endif /* USE_SSSE3 */
#ifdef USE_AVX2
unsigned int _gcry_chacha20_amd64_avx2_blocks8(u32 *state, byte *dst,
const byte *src,
size_t nblks) ASM_FUNC_ABI;
unsigned int _gcry_chacha20_poly1305_amd64_avx2_blocks8(
u32 *state, byte *dst, const byte *src, size_t nblks,
void *poly1305_state, const byte *poly1305_src) ASM_FUNC_ABI;
#endif /* USE_AVX2 */
#ifdef USE_PPC_VEC
unsigned int _gcry_chacha20_ppc8_blocks4(u32 *state, byte *dst,
const byte *src,
size_t nblks);
unsigned int _gcry_chacha20_ppc8_blocks1(u32 *state, byte *dst,
const byte *src,
size_t nblks);
#undef USE_PPC_VEC_POLY1305
#if SIZEOF_UNSIGNED_LONG == 8
#define USE_PPC_VEC_POLY1305 1
unsigned int _gcry_chacha20_poly1305_ppc8_blocks4(
u32 *state, byte *dst, const byte *src, size_t nblks,
POLY1305_STATE *st, const byte *poly1305_src);
#endif
#endif /* USE_PPC_VEC */
#ifdef USE_ARMV7_NEON
unsigned int _gcry_chacha20_armv7_neon_blocks4(u32 *state, byte *dst,
const byte *src,
size_t nblks);
#endif /* USE_ARMV7_NEON */
#ifdef USE_AARCH64_SIMD
unsigned int _gcry_chacha20_aarch64_blocks4(u32 *state, byte *dst,
const byte *src, size_t nblks);
+unsigned int _gcry_chacha20_poly1305_aarch64_blocks4(
+ u32 *state, byte *dst, const byte *src, size_t nblks,
+ void *poly1305_state, const byte *poly1305_src);
+
#endif /* USE_AARCH64_SIMD */
static const char *selftest (void);
#define ROTATE(v,c) (rol(v,c))
#define XOR(v,w) ((v) ^ (w))
#define PLUS(v,w) ((u32)((v) + (w)))
#define PLUSONE(v) (PLUS((v),1))
#define QUARTERROUND(a,b,c,d) \
a = PLUS(a,b); d = ROTATE(XOR(d,a),16); \
c = PLUS(c,d); b = ROTATE(XOR(b,c),12); \
a = PLUS(a,b); d = ROTATE(XOR(d,a), 8); \
c = PLUS(c,d); b = ROTATE(XOR(b,c), 7);
#define BUF_XOR_LE32(dst, src, offset, x) \
buf_put_le32((dst) + (offset), buf_get_le32((src) + (offset)) ^ (x))
static unsigned int
do_chacha20_blocks (u32 *input, byte *dst, const byte *src, size_t nblks)
{
u32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
unsigned int i;
while (nblks)
{
x0 = input[0];
x1 = input[1];
x2 = input[2];
x3 = input[3];
x4 = input[4];
x5 = input[5];
x6 = input[6];
x7 = input[7];
x8 = input[8];
x9 = input[9];
x10 = input[10];
x11 = input[11];
x12 = input[12];
x13 = input[13];
x14 = input[14];
x15 = input[15];
for (i = 20; i > 0; i -= 2)
{
QUARTERROUND(x0, x4, x8, x12)
QUARTERROUND(x1, x5, x9, x13)
QUARTERROUND(x2, x6, x10, x14)
QUARTERROUND(x3, x7, x11, x15)
QUARTERROUND(x0, x5, x10, x15)
QUARTERROUND(x1, x6, x11, x12)
QUARTERROUND(x2, x7, x8, x13)
QUARTERROUND(x3, x4, x9, x14)
}
x0 = PLUS(x0, input[0]);
x1 = PLUS(x1, input[1]);
x2 = PLUS(x2, input[2]);
x3 = PLUS(x3, input[3]);
x4 = PLUS(x4, input[4]);
x5 = PLUS(x5, input[5]);
x6 = PLUS(x6, input[6]);
x7 = PLUS(x7, input[7]);
x8 = PLUS(x8, input[8]);
x9 = PLUS(x9, input[9]);
x10 = PLUS(x10, input[10]);
x11 = PLUS(x11, input[11]);
x12 = PLUS(x12, input[12]);
x13 = PLUS(x13, input[13]);
x14 = PLUS(x14, input[14]);
x15 = PLUS(x15, input[15]);
input[12] = PLUSONE(input[12]);
input[13] = PLUS(input[13], !input[12]);
BUF_XOR_LE32(dst, src, 0, x0);
BUF_XOR_LE32(dst, src, 4, x1);
BUF_XOR_LE32(dst, src, 8, x2);
BUF_XOR_LE32(dst, src, 12, x3);
BUF_XOR_LE32(dst, src, 16, x4);
BUF_XOR_LE32(dst, src, 20, x5);
BUF_XOR_LE32(dst, src, 24, x6);
BUF_XOR_LE32(dst, src, 28, x7);
BUF_XOR_LE32(dst, src, 32, x8);
BUF_XOR_LE32(dst, src, 36, x9);
BUF_XOR_LE32(dst, src, 40, x10);
BUF_XOR_LE32(dst, src, 44, x11);
BUF_XOR_LE32(dst, src, 48, x12);
BUF_XOR_LE32(dst, src, 52, x13);
BUF_XOR_LE32(dst, src, 56, x14);
BUF_XOR_LE32(dst, src, 60, x15);
src += CHACHA20_BLOCK_SIZE;
dst += CHACHA20_BLOCK_SIZE;
nblks--;
}
/* burn_stack */
return (17 * sizeof(u32) + 6 * sizeof(void *));
}
static unsigned int
chacha20_blocks (CHACHA20_context_t *ctx, byte *dst, const byte *src,
size_t nblks)
{
#ifdef USE_SSSE3
if (ctx->use_ssse3)
{
return _gcry_chacha20_amd64_ssse3_blocks1(ctx->input, dst, src, nblks);
}
#endif
#ifdef USE_PPC_VEC
if (ctx->use_ppc)
{
return _gcry_chacha20_ppc8_blocks1(ctx->input, dst, src, nblks);
}
#endif
return do_chacha20_blocks (ctx->input, dst, src, nblks);
}
static void
chacha20_keysetup (CHACHA20_context_t *ctx, const byte *key,
unsigned int keylen)
{
static const char sigma[16] = "expand 32-byte k";
static const char tau[16] = "expand 16-byte k";
const char *constants;
ctx->input[4] = buf_get_le32(key + 0);
ctx->input[5] = buf_get_le32(key + 4);
ctx->input[6] = buf_get_le32(key + 8);
ctx->input[7] = buf_get_le32(key + 12);
if (keylen == CHACHA20_MAX_KEY_SIZE) /* 256 bits */
{
key += 16;
constants = sigma;
}
else /* 128 bits */
{
constants = tau;
}
ctx->input[8] = buf_get_le32(key + 0);
ctx->input[9] = buf_get_le32(key + 4);
ctx->input[10] = buf_get_le32(key + 8);
ctx->input[11] = buf_get_le32(key + 12);
ctx->input[0] = buf_get_le32(constants + 0);
ctx->input[1] = buf_get_le32(constants + 4);
ctx->input[2] = buf_get_le32(constants + 8);
ctx->input[3] = buf_get_le32(constants + 12);
}
static void
chacha20_ivsetup (CHACHA20_context_t * ctx, const byte *iv, size_t ivlen)
{
if (ivlen == CHACHA20_CTR_SIZE)
{
ctx->input[12] = buf_get_le32 (iv + 0);
ctx->input[13] = buf_get_le32 (iv + 4);
ctx->input[14] = buf_get_le32 (iv + 8);
ctx->input[15] = buf_get_le32 (iv + 12);
}
else if (ivlen == CHACHA20_MAX_IV_SIZE)
{
ctx->input[12] = 0;
ctx->input[13] = buf_get_le32 (iv + 0);
ctx->input[14] = buf_get_le32 (iv + 4);
ctx->input[15] = buf_get_le32 (iv + 8);
}
else if (ivlen == CHACHA20_MIN_IV_SIZE)
{
ctx->input[12] = 0;
ctx->input[13] = 0;
ctx->input[14] = buf_get_le32 (iv + 0);
ctx->input[15] = buf_get_le32 (iv + 4);
}
else
{
ctx->input[12] = 0;
ctx->input[13] = 0;
ctx->input[14] = 0;
ctx->input[15] = 0;
}
}
static void
chacha20_setiv (void *context, const byte *iv, size_t ivlen)
{
CHACHA20_context_t *ctx = (CHACHA20_context_t *) context;
/* draft-nir-cfrg-chacha20-poly1305-02 defines 96-bit and 64-bit nonce. */
if (iv && ivlen != CHACHA20_MAX_IV_SIZE && ivlen != CHACHA20_MIN_IV_SIZE
&& ivlen != CHACHA20_CTR_SIZE)
log_info ("WARNING: chacha20_setiv: bad ivlen=%u\n", (u32) ivlen);
if (iv && (ivlen == CHACHA20_MAX_IV_SIZE || ivlen == CHACHA20_MIN_IV_SIZE
|| ivlen == CHACHA20_CTR_SIZE))
chacha20_ivsetup (ctx, iv, ivlen);
else
chacha20_ivsetup (ctx, NULL, 0);
/* Reset the unused pad bytes counter. */
ctx->unused = 0;
}
static gcry_err_code_t
chacha20_do_setkey (CHACHA20_context_t *ctx,
const byte *key, unsigned int keylen)
{
static int initialized;
static const char *selftest_failed;
unsigned int features = _gcry_get_hw_features ();
if (!initialized)
{
initialized = 1;
selftest_failed = selftest ();
if (selftest_failed)
log_error ("CHACHA20 selftest failed (%s)\n", selftest_failed);
}
if (selftest_failed)
return GPG_ERR_SELFTEST_FAILED;
if (keylen != CHACHA20_MAX_KEY_SIZE && keylen != CHACHA20_MIN_KEY_SIZE)
return GPG_ERR_INV_KEYLEN;
#ifdef USE_SSSE3
ctx->use_ssse3 = (features & HWF_INTEL_SSSE3) != 0;
#endif
#ifdef USE_AVX2
ctx->use_avx2 = (features & HWF_INTEL_AVX2) != 0;
#endif
#ifdef USE_ARMV7_NEON
ctx->use_neon = (features & HWF_ARM_NEON) != 0;
#endif
#ifdef USE_AARCH64_SIMD
ctx->use_neon = (features & HWF_ARM_NEON) != 0;
#endif
#ifdef USE_PPC_VEC
ctx->use_ppc = (features & HWF_PPC_ARCH_2_07) != 0;
#endif
(void)features;
chacha20_keysetup (ctx, key, keylen);
/* We default to a zero nonce. */
chacha20_setiv (ctx, NULL, 0);
return 0;
}
static gcry_err_code_t
chacha20_setkey (void *context, const byte *key, unsigned int keylen,
gcry_cipher_hd_t hd)
{
CHACHA20_context_t *ctx = (CHACHA20_context_t *) context;
gcry_err_code_t rc = chacha20_do_setkey (ctx, key, keylen);
(void)hd;
_gcry_burn_stack (4 + sizeof (void *) + 4 * sizeof (void *));
return rc;
}
static unsigned int
do_chacha20_encrypt_stream_tail (CHACHA20_context_t *ctx, byte *outbuf,
const byte *inbuf, size_t length)
{
static const unsigned char zero_pad[CHACHA20_BLOCK_SIZE] = { 0, };
unsigned int nburn, burn = 0;
#ifdef USE_AVX2
if (ctx->use_avx2 && length >= CHACHA20_BLOCK_SIZE * 8)
{
size_t nblocks = length / CHACHA20_BLOCK_SIZE;
nblocks -= nblocks % 8;
nburn = _gcry_chacha20_amd64_avx2_blocks8(ctx->input, outbuf, inbuf,
nblocks);
burn = nburn > burn ? nburn : burn;
length -= nblocks * CHACHA20_BLOCK_SIZE;
outbuf += nblocks * CHACHA20_BLOCK_SIZE;
inbuf += nblocks * CHACHA20_BLOCK_SIZE;
}
#endif
#ifdef USE_SSSE3
if (ctx->use_ssse3 && length >= CHACHA20_BLOCK_SIZE * 4)
{
size_t nblocks = length / CHACHA20_BLOCK_SIZE;
nblocks -= nblocks % 4;
nburn = _gcry_chacha20_amd64_ssse3_blocks4(ctx->input, outbuf, inbuf,
nblocks);
burn = nburn > burn ? nburn : burn;
length -= nblocks * CHACHA20_BLOCK_SIZE;
outbuf += nblocks * CHACHA20_BLOCK_SIZE;
inbuf += nblocks * CHACHA20_BLOCK_SIZE;
}
#endif
#ifdef USE_ARMV7_NEON
if (ctx->use_neon && length >= CHACHA20_BLOCK_SIZE * 4)
{
size_t nblocks = length / CHACHA20_BLOCK_SIZE;
nblocks -= nblocks % 4;
nburn = _gcry_chacha20_armv7_neon_blocks4(ctx->input, outbuf, inbuf,
nblocks);
burn = nburn > burn ? nburn : burn;
length -= nblocks * CHACHA20_BLOCK_SIZE;
outbuf += nblocks * CHACHA20_BLOCK_SIZE;
inbuf += nblocks * CHACHA20_BLOCK_SIZE;
}
#endif
#ifdef USE_AARCH64_SIMD
if (ctx->use_neon && length >= CHACHA20_BLOCK_SIZE * 4)
{
size_t nblocks = length / CHACHA20_BLOCK_SIZE;
nblocks -= nblocks % 4;
nburn = _gcry_chacha20_aarch64_blocks4(ctx->input, outbuf, inbuf,
nblocks);
burn = nburn > burn ? nburn : burn;
length -= nblocks * CHACHA20_BLOCK_SIZE;
outbuf += nblocks * CHACHA20_BLOCK_SIZE;
inbuf += nblocks * CHACHA20_BLOCK_SIZE;
}
#endif
#ifdef USE_PPC_VEC
if (ctx->use_ppc && length >= CHACHA20_BLOCK_SIZE * 4)
{
size_t nblocks = length / CHACHA20_BLOCK_SIZE;
nblocks -= nblocks % 4;
nburn = _gcry_chacha20_ppc8_blocks4(ctx->input, outbuf, inbuf, nblocks);
burn = nburn > burn ? nburn : burn;
length -= nblocks * CHACHA20_BLOCK_SIZE;
outbuf += nblocks * CHACHA20_BLOCK_SIZE;
inbuf += nblocks * CHACHA20_BLOCK_SIZE;
}
#endif
if (length >= CHACHA20_BLOCK_SIZE)
{
size_t nblocks = length / CHACHA20_BLOCK_SIZE;
nburn = chacha20_blocks(ctx, outbuf, inbuf, nblocks);
burn = nburn > burn ? nburn : burn;
length -= nblocks * CHACHA20_BLOCK_SIZE;
outbuf += nblocks * CHACHA20_BLOCK_SIZE;
inbuf += nblocks * CHACHA20_BLOCK_SIZE;
}
if (length > 0)
{
nburn = chacha20_blocks(ctx, ctx->pad, zero_pad, 1);
burn = nburn > burn ? nburn : burn;
buf_xor (outbuf, inbuf, ctx->pad, length);
ctx->unused = CHACHA20_BLOCK_SIZE - length;
}
if (burn)
burn += 5 * sizeof(void *);
return burn;
}
static void
chacha20_encrypt_stream (void *context, byte *outbuf, const byte *inbuf,
size_t length)
{
CHACHA20_context_t *ctx = (CHACHA20_context_t *) context;
unsigned int nburn, burn = 0;
if (!length)
return;
if (ctx->unused)
{
unsigned char *p = ctx->pad;
size_t n;
gcry_assert (ctx->unused < CHACHA20_BLOCK_SIZE);
n = ctx->unused;
if (n > length)
n = length;
buf_xor (outbuf, inbuf, p + CHACHA20_BLOCK_SIZE - ctx->unused, n);
length -= n;
outbuf += n;
inbuf += n;
ctx->unused -= n;
if (!length)
return;
gcry_assert (!ctx->unused);
}
nburn = do_chacha20_encrypt_stream_tail (ctx, outbuf, inbuf, length);
burn = nburn > burn ? nburn : burn;
if (burn)
_gcry_burn_stack (burn);
}
gcry_err_code_t
_gcry_chacha20_poly1305_encrypt(gcry_cipher_hd_t c, byte *outbuf,
const byte *inbuf, size_t length)
{
CHACHA20_context_t *ctx = (void *) &c->context.c;
unsigned int nburn, burn = 0;
byte *authptr = NULL;
if (!length)
return 0;
if (ctx->unused)
{
unsigned char *p = ctx->pad;
size_t n;
gcry_assert (ctx->unused < CHACHA20_BLOCK_SIZE);
n = ctx->unused;
if (n > length)
n = length;
buf_xor (outbuf, inbuf, p + CHACHA20_BLOCK_SIZE - ctx->unused, n);
nburn = _gcry_poly1305_update_burn (&c->u_mode.poly1305.ctx, outbuf, n);
burn = nburn > burn ? nburn : burn;
length -= n;
outbuf += n;
inbuf += n;
ctx->unused -= n;
if (!length)
{
if (burn)
_gcry_burn_stack (burn);
return 0;
}
gcry_assert (!ctx->unused);
}
gcry_assert (c->u_mode.poly1305.ctx.leftover == 0);
if (0)
{ }
#ifdef USE_AVX2
else if (ctx->use_avx2 && length >= CHACHA20_BLOCK_SIZE * 8)
{
nburn = _gcry_chacha20_amd64_avx2_blocks8(ctx->input, outbuf, inbuf, 8);
burn = nburn > burn ? nburn : burn;
authptr = outbuf;
length -= 8 * CHACHA20_BLOCK_SIZE;
outbuf += 8 * CHACHA20_BLOCK_SIZE;
inbuf += 8 * CHACHA20_BLOCK_SIZE;
}
#endif
#ifdef USE_SSSE3
else if (ctx->use_ssse3 && length >= CHACHA20_BLOCK_SIZE * 4)
{
nburn = _gcry_chacha20_amd64_ssse3_blocks4(ctx->input, outbuf, inbuf, 4);
burn = nburn > burn ? nburn : burn;
authptr = outbuf;
length -= 4 * CHACHA20_BLOCK_SIZE;
outbuf += 4 * CHACHA20_BLOCK_SIZE;
inbuf += 4 * CHACHA20_BLOCK_SIZE;
}
else if (ctx->use_ssse3 && length >= CHACHA20_BLOCK_SIZE * 2)
{
nburn = _gcry_chacha20_amd64_ssse3_blocks1(ctx->input, outbuf, inbuf, 2);
burn = nburn > burn ? nburn : burn;
authptr = outbuf;
length -= 2 * CHACHA20_BLOCK_SIZE;
outbuf += 2 * CHACHA20_BLOCK_SIZE;
inbuf += 2 * CHACHA20_BLOCK_SIZE;
}
else if (ctx->use_ssse3 && length >= CHACHA20_BLOCK_SIZE)
{
nburn = _gcry_chacha20_amd64_ssse3_blocks1(ctx->input, outbuf, inbuf, 1);
burn = nburn > burn ? nburn : burn;
authptr = outbuf;
length -= 1 * CHACHA20_BLOCK_SIZE;
outbuf += 1 * CHACHA20_BLOCK_SIZE;
inbuf += 1 * CHACHA20_BLOCK_SIZE;
}
#endif
+#ifdef USE_AARCH64_SIMD
+ else if (ctx->use_neon && length >= CHACHA20_BLOCK_SIZE * 4)
+ {
+ nburn = _gcry_chacha20_aarch64_blocks4(ctx->input, outbuf, inbuf, 4);
+ burn = nburn > burn ? nburn : burn;
+
+ authptr = outbuf;
+ length -= 4 * CHACHA20_BLOCK_SIZE;
+ outbuf += 4 * CHACHA20_BLOCK_SIZE;
+ inbuf += 4 * CHACHA20_BLOCK_SIZE;
+ }
+#endif
#ifdef USE_PPC_VEC_POLY1305
else if (ctx->use_ppc && length >= CHACHA20_BLOCK_SIZE * 4)
{
nburn = _gcry_chacha20_ppc8_blocks4(ctx->input, outbuf, inbuf, 4);
burn = nburn > burn ? nburn : burn;
authptr = outbuf;
length -= 4 * CHACHA20_BLOCK_SIZE;
outbuf += 4 * CHACHA20_BLOCK_SIZE;
inbuf += 4 * CHACHA20_BLOCK_SIZE;
}
#endif
if (authptr)
{
size_t authoffset = outbuf - authptr;
#ifdef USE_AVX2
if (ctx->use_avx2 &&
length >= 8 * CHACHA20_BLOCK_SIZE &&
authoffset >= 8 * CHACHA20_BLOCK_SIZE)
{
size_t nblocks = length / CHACHA20_BLOCK_SIZE;
nblocks -= nblocks % 8;
nburn = _gcry_chacha20_poly1305_amd64_avx2_blocks8(
ctx->input, outbuf, inbuf, nblocks,
&c->u_mode.poly1305.ctx.state, authptr);
burn = nburn > burn ? nburn : burn;
length -= nblocks * CHACHA20_BLOCK_SIZE;
outbuf += nblocks * CHACHA20_BLOCK_SIZE;
inbuf += nblocks * CHACHA20_BLOCK_SIZE;
authptr += nblocks * CHACHA20_BLOCK_SIZE;
}
#endif
#ifdef USE_SSSE3
if (ctx->use_ssse3)
{
if (length >= 4 * CHACHA20_BLOCK_SIZE &&
authoffset >= 4 * CHACHA20_BLOCK_SIZE)
{
size_t nblocks = length / CHACHA20_BLOCK_SIZE;
nblocks -= nblocks % 4;
nburn = _gcry_chacha20_poly1305_amd64_ssse3_blocks4(
ctx->input, outbuf, inbuf, nblocks,
&c->u_mode.poly1305.ctx.state, authptr);
burn = nburn > burn ? nburn : burn;
length -= nblocks * CHACHA20_BLOCK_SIZE;
outbuf += nblocks * CHACHA20_BLOCK_SIZE;
inbuf += nblocks * CHACHA20_BLOCK_SIZE;
authptr += nblocks * CHACHA20_BLOCK_SIZE;
}
if (length >= CHACHA20_BLOCK_SIZE &&
authoffset >= CHACHA20_BLOCK_SIZE)
{
size_t nblocks = length / CHACHA20_BLOCK_SIZE;
nburn = _gcry_chacha20_poly1305_amd64_ssse3_blocks1(
ctx->input, outbuf, inbuf, nblocks,
&c->u_mode.poly1305.ctx.state, authptr);
burn = nburn > burn ? nburn : burn;
length -= nblocks * CHACHA20_BLOCK_SIZE;
outbuf += nblocks * CHACHA20_BLOCK_SIZE;
inbuf += nblocks * CHACHA20_BLOCK_SIZE;
authptr += nblocks * CHACHA20_BLOCK_SIZE;
}
}
#endif
+#ifdef USE_AARCH64_SIMD
+ if (ctx->use_neon &&
+ length >= 4 * CHACHA20_BLOCK_SIZE &&
+ authoffset >= 4 * CHACHA20_BLOCK_SIZE)
+ {
+ size_t nblocks = length / CHACHA20_BLOCK_SIZE;
+ nblocks -= nblocks % 4;
+
+ nburn = _gcry_chacha20_poly1305_aarch64_blocks4(
+ ctx->input, outbuf, inbuf, nblocks,
+ &c->u_mode.poly1305.ctx.state, authptr);
+ burn = nburn > burn ? nburn : burn;
+
+ length -= nblocks * CHACHA20_BLOCK_SIZE;
+ outbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ inbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ authptr += nblocks * CHACHA20_BLOCK_SIZE;
+ }
+#endif
+
#ifdef USE_PPC_VEC_POLY1305
if (ctx->use_ppc &&
length >= 4 * CHACHA20_BLOCK_SIZE &&
authoffset >= 4 * CHACHA20_BLOCK_SIZE)
{
size_t nblocks = length / CHACHA20_BLOCK_SIZE;
nblocks -= nblocks % 4;
nburn = _gcry_chacha20_poly1305_ppc8_blocks4(
ctx->input, outbuf, inbuf, nblocks,
&c->u_mode.poly1305.ctx.state, authptr);
burn = nburn > burn ? nburn : burn;
length -= nblocks * CHACHA20_BLOCK_SIZE;
outbuf += nblocks * CHACHA20_BLOCK_SIZE;
inbuf += nblocks * CHACHA20_BLOCK_SIZE;
authptr += nblocks * CHACHA20_BLOCK_SIZE;
}
#endif
if (authoffset > 0)
{
_gcry_poly1305_update (&c->u_mode.poly1305.ctx, authptr, authoffset);
authptr += authoffset;
authoffset = 0;
}
gcry_assert(authptr == outbuf);
}
while (length)
{
size_t currlen = length;
/* Since checksumming is done after encryption, process input in 24KiB
* chunks to keep data loaded in L1 cache for checksumming. */
if (currlen > 24 * 1024)
currlen = 24 * 1024;
nburn = do_chacha20_encrypt_stream_tail (ctx, outbuf, inbuf, currlen);
burn = nburn > burn ? nburn : burn;
nburn = _gcry_poly1305_update_burn (&c->u_mode.poly1305.ctx, outbuf,
currlen);
burn = nburn > burn ? nburn : burn;
outbuf += currlen;
inbuf += currlen;
length -= currlen;
}
if (burn)
_gcry_burn_stack (burn);
return 0;
}
gcry_err_code_t
_gcry_chacha20_poly1305_decrypt(gcry_cipher_hd_t c, byte *outbuf,
const byte *inbuf, size_t length)
{
CHACHA20_context_t *ctx = (void *) &c->context.c;
unsigned int nburn, burn = 0;
if (!length)
return 0;
if (ctx->unused)
{
unsigned char *p = ctx->pad;
size_t n;
gcry_assert (ctx->unused < CHACHA20_BLOCK_SIZE);
n = ctx->unused;
if (n > length)
n = length;
nburn = _gcry_poly1305_update_burn (&c->u_mode.poly1305.ctx, inbuf, n);
burn = nburn > burn ? nburn : burn;
buf_xor (outbuf, inbuf, p + CHACHA20_BLOCK_SIZE - ctx->unused, n);
length -= n;
outbuf += n;
inbuf += n;
ctx->unused -= n;
if (!length)
{
if (burn)
_gcry_burn_stack (burn);
return 0;
}
gcry_assert (!ctx->unused);
}
gcry_assert (c->u_mode.poly1305.ctx.leftover == 0);
#ifdef USE_AVX2
if (ctx->use_avx2 && length >= 8 * CHACHA20_BLOCK_SIZE)
{
size_t nblocks = length / CHACHA20_BLOCK_SIZE;
nblocks -= nblocks % 8;
nburn = _gcry_chacha20_poly1305_amd64_avx2_blocks8(
ctx->input, outbuf, inbuf, nblocks,
&c->u_mode.poly1305.ctx.state, inbuf);
burn = nburn > burn ? nburn : burn;
length -= nblocks * CHACHA20_BLOCK_SIZE;
outbuf += nblocks * CHACHA20_BLOCK_SIZE;
inbuf += nblocks * CHACHA20_BLOCK_SIZE;
}
#endif
#ifdef USE_SSSE3
if (ctx->use_ssse3)
{
if (length >= 4 * CHACHA20_BLOCK_SIZE)
{
size_t nblocks = length / CHACHA20_BLOCK_SIZE;
nblocks -= nblocks % 4;
nburn = _gcry_chacha20_poly1305_amd64_ssse3_blocks4(
ctx->input, outbuf, inbuf, nblocks,
&c->u_mode.poly1305.ctx.state, inbuf);
burn = nburn > burn ? nburn : burn;
length -= nblocks * CHACHA20_BLOCK_SIZE;
outbuf += nblocks * CHACHA20_BLOCK_SIZE;
inbuf += nblocks * CHACHA20_BLOCK_SIZE;
}
if (length >= CHACHA20_BLOCK_SIZE)
{
size_t nblocks = length / CHACHA20_BLOCK_SIZE;
nburn = _gcry_chacha20_poly1305_amd64_ssse3_blocks1(
ctx->input, outbuf, inbuf, nblocks,
&c->u_mode.poly1305.ctx.state, inbuf);
burn = nburn > burn ? nburn : burn;
length -= nblocks * CHACHA20_BLOCK_SIZE;
outbuf += nblocks * CHACHA20_BLOCK_SIZE;
inbuf += nblocks * CHACHA20_BLOCK_SIZE;
}
}
#endif
+#ifdef USE_AARCH64_SIMD
+ if (ctx->use_neon && length >= 4 * CHACHA20_BLOCK_SIZE)
+ {
+ size_t nblocks = length / CHACHA20_BLOCK_SIZE;
+ nblocks -= nblocks % 4;
+
+ nburn = _gcry_chacha20_poly1305_aarch64_blocks4(
+ ctx->input, outbuf, inbuf, nblocks,
+ &c->u_mode.poly1305.ctx.state, inbuf);
+ burn = nburn > burn ? nburn : burn;
+
+ length -= nblocks * CHACHA20_BLOCK_SIZE;
+ outbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ inbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ }
+#endif
+
#ifdef USE_PPC_VEC_POLY1305
if (ctx->use_ppc && length >= 4 * CHACHA20_BLOCK_SIZE)
{
size_t nblocks = length / CHACHA20_BLOCK_SIZE;
nblocks -= nblocks % 4;
nburn = _gcry_chacha20_poly1305_ppc8_blocks4(
ctx->input, outbuf, inbuf, nblocks,
&c->u_mode.poly1305.ctx.state, inbuf);
burn = nburn > burn ? nburn : burn;
length -= nblocks * CHACHA20_BLOCK_SIZE;
outbuf += nblocks * CHACHA20_BLOCK_SIZE;
inbuf += nblocks * CHACHA20_BLOCK_SIZE;
}
#endif
while (length)
{
size_t currlen = length;
/* Since checksumming is done before decryption, process input in 24KiB
* chunks to keep data loaded in L1 cache for decryption. */
if (currlen > 24 * 1024)
currlen = 24 * 1024;
nburn = _gcry_poly1305_update_burn (&c->u_mode.poly1305.ctx, inbuf,
currlen);
burn = nburn > burn ? nburn : burn;
nburn = do_chacha20_encrypt_stream_tail (ctx, outbuf, inbuf, currlen);
burn = nburn > burn ? nburn : burn;
outbuf += currlen;
inbuf += currlen;
length -= currlen;
}
if (burn)
_gcry_burn_stack (burn);
return 0;
}
static const char *
selftest (void)
{
byte ctxbuf[sizeof(CHACHA20_context_t) + 15];
CHACHA20_context_t *ctx;
byte scratch[127 + 1];
byte buf[512 + 64 + 4];
int i;
/* From draft-strombergson-chacha-test-vectors */
static byte key_1[] = {
0xc4, 0x6e, 0xc1, 0xb1, 0x8c, 0xe8, 0xa8, 0x78,
0x72, 0x5a, 0x37, 0xe7, 0x80, 0xdf, 0xb7, 0x35,
0x1f, 0x68, 0xed, 0x2e, 0x19, 0x4c, 0x79, 0xfb,
0xc6, 0xae, 0xbe, 0xe1, 0xa6, 0x67, 0x97, 0x5d
};
static const byte nonce_1[] =
{ 0x1a, 0xda, 0x31, 0xd5, 0xcf, 0x68, 0x82, 0x21 };
static const byte plaintext_1[127] = {
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
};
static const byte ciphertext_1[127] = {
0xf6, 0x3a, 0x89, 0xb7, 0x5c, 0x22, 0x71, 0xf9,
0x36, 0x88, 0x16, 0x54, 0x2b, 0xa5, 0x2f, 0x06,
0xed, 0x49, 0x24, 0x17, 0x92, 0x30, 0x2b, 0x00,
0xb5, 0xe8, 0xf8, 0x0a, 0xe9, 0xa4, 0x73, 0xaf,
0xc2, 0x5b, 0x21, 0x8f, 0x51, 0x9a, 0xf0, 0xfd,
0xd4, 0x06, 0x36, 0x2e, 0x8d, 0x69, 0xde, 0x7f,
0x54, 0xc6, 0x04, 0xa6, 0xe0, 0x0f, 0x35, 0x3f,
0x11, 0x0f, 0x77, 0x1b, 0xdc, 0xa8, 0xab, 0x92,
0xe5, 0xfb, 0xc3, 0x4e, 0x60, 0xa1, 0xd9, 0xa9,
0xdb, 0x17, 0x34, 0x5b, 0x0a, 0x40, 0x27, 0x36,
0x85, 0x3b, 0xf9, 0x10, 0xb0, 0x60, 0xbd, 0xf1,
0xf8, 0x97, 0xb6, 0x29, 0x0f, 0x01, 0xd1, 0x38,
0xae, 0x2c, 0x4c, 0x90, 0x22, 0x5b, 0xa9, 0xea,
0x14, 0xd5, 0x18, 0xf5, 0x59, 0x29, 0xde, 0xa0,
0x98, 0xca, 0x7a, 0x6c, 0xcf, 0xe6, 0x12, 0x27,
0x05, 0x3c, 0x84, 0xe4, 0x9a, 0x4a, 0x33
};
/* 16-byte alignment required for amd64 implementation. */
ctx = (CHACHA20_context_t *)((uintptr_t)(ctxbuf + 15) & ~(uintptr_t)15);
chacha20_setkey (ctx, key_1, sizeof key_1, NULL);
chacha20_setiv (ctx, nonce_1, sizeof nonce_1);
scratch[sizeof (scratch) - 1] = 0;
chacha20_encrypt_stream (ctx, scratch, plaintext_1, sizeof plaintext_1);
if (memcmp (scratch, ciphertext_1, sizeof ciphertext_1))
return "ChaCha20 encryption test 1 failed.";
if (scratch[sizeof (scratch) - 1])
return "ChaCha20 wrote too much.";
chacha20_setkey (ctx, key_1, sizeof (key_1), NULL);
chacha20_setiv (ctx, nonce_1, sizeof nonce_1);
chacha20_encrypt_stream (ctx, scratch, scratch, sizeof plaintext_1);
if (memcmp (scratch, plaintext_1, sizeof plaintext_1))
return "ChaCha20 decryption test 1 failed.";
for (i = 0; i < sizeof buf; i++)
buf[i] = i;
chacha20_setkey (ctx, key_1, sizeof key_1, NULL);
chacha20_setiv (ctx, nonce_1, sizeof nonce_1);
/*encrypt */
chacha20_encrypt_stream (ctx, buf, buf, sizeof buf);
/*decrypt */
chacha20_setkey (ctx, key_1, sizeof key_1, NULL);
chacha20_setiv (ctx, nonce_1, sizeof nonce_1);
chacha20_encrypt_stream (ctx, buf, buf, 1);
chacha20_encrypt_stream (ctx, buf + 1, buf + 1, (sizeof buf) - 1 - 1);
chacha20_encrypt_stream (ctx, buf + (sizeof buf) - 1,
buf + (sizeof buf) - 1, 1);
for (i = 0; i < sizeof buf; i++)
if (buf[i] != (byte) i)
return "ChaCha20 encryption test 2 failed.";
chacha20_setkey (ctx, key_1, sizeof key_1, NULL);
chacha20_setiv (ctx, nonce_1, sizeof nonce_1);
/* encrypt */
for (i = 0; i < sizeof buf; i++)
chacha20_encrypt_stream (ctx, &buf[i], &buf[i], 1);
/* decrypt */
chacha20_setkey (ctx, key_1, sizeof key_1, NULL);
chacha20_setiv (ctx, nonce_1, sizeof nonce_1);
chacha20_encrypt_stream (ctx, buf, buf, sizeof buf);
for (i = 0; i < sizeof buf; i++)
if (buf[i] != (byte) i)
return "ChaCha20 encryption test 3 failed.";
return NULL;
}
gcry_cipher_spec_t _gcry_cipher_spec_chacha20 = {
GCRY_CIPHER_CHACHA20,
{0, 0}, /* flags */
"CHACHA20", /* name */
NULL, /* aliases */
NULL, /* oids */
1, /* blocksize in bytes. */
CHACHA20_MAX_KEY_SIZE * 8, /* standard key length in bits. */
sizeof (CHACHA20_context_t),
chacha20_setkey,
NULL,
NULL,
chacha20_encrypt_stream,
chacha20_encrypt_stream,
NULL,
NULL,
chacha20_setiv
};
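[Editor's note] On the encrypt side Poly1305 must consume ciphertext,
which does not exist yet when stitching begins; that is what the
authptr/authoffset logic above arranges.  A MAC-free warm-up builds a
lag, the stitched kernel then MACs at that fixed lag behind the block it
is encrypting, and a final catch-up closes the gap.  Here is a hedged C
sketch of the scheduling for the AArch64 path (poly1305_catch_up is an
invented stand-in for _gcry_poly1305_update; the real code additionally
handles tails, different lags per arch, and 24 KiB chunking):

#include <stddef.h>
#include <stdint.h>

/* Prototypes as declared in chacha20.c above (u32/byte spelled as
 * stdint types here). */
unsigned int _gcry_chacha20_aarch64_blocks4(uint32_t *state, uint8_t *dst,
                                            const uint8_t *src, size_t nblks);
unsigned int _gcry_chacha20_poly1305_aarch64_blocks4(
    uint32_t *state, uint8_t *dst, const uint8_t *src, size_t nblks,
    void *poly1305_state, const uint8_t *poly1305_src);
void poly1305_catch_up(void *poly1305_state, const uint8_t *buf, size_t len);

enum { BS = 64, LAG = 4 };  /* 64-byte blocks; four-block warm-up lag */

/* Assumes nblks >= 2*LAG and nblks a multiple of 4. */
static void encrypt_then_mac_sketch(uint32_t *chacha_state, void *poly_state,
                                    uint8_t *out, const uint8_t *in,
                                    size_t nblks)
{
  /* 1. Warm-up: encrypt LAG blocks MAC-free so ciphertext exists. */
  _gcry_chacha20_aarch64_blocks4(chacha_state, out, in, LAG);

  /* 2. Stitched: block i is encrypted while block i-LAG is MACed;
   *    the Poly1305 source trails dst by authoffset = LAG*BS bytes. */
  _gcry_chacha20_poly1305_aarch64_blocks4(chacha_state,
                                          out + LAG * BS, in + LAG * BS,
                                          nblks - LAG, poly_state, out);

  /* 3. Catch up on the last LAG blocks of ciphertext. */
  poly1305_catch_up(poly_state, out + (nblks - LAG) * BS, LAG * BS);
}

(The decrypt path needs no lag: the ciphertext is already in hand, so
the stitched kernel simply MACs inbuf while decrypting it, as the
use_neon branch in _gcry_chacha20_poly1305_decrypt above does.)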