diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index 08baa7c4..a24b117c 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -1,130 +1,129 @@
 # Makefile for cipher modules
 # Copyright (C) 1998, 1999, 2000, 2001, 2002,
 #               2003, 2009 Free Software Foundation, Inc.
 #
 # This file is part of Libgcrypt.
 #
 # Libgcrypt is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as
 # published by the Free Software Foundation; either version 2.1 of
 # the License, or (at your option) any later version.
 #
 # Libgcrypt is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 # GNU Lesser General Public License for more details.
 #
 # You should have received a copy of the GNU Lesser General Public
 # License along with this program; if not, see <http://www.gnu.org/licenses/>.

 # Process this file with automake to produce Makefile.in

 # Need to include ../src in addition to top_srcdir because gcrypt.h is
 # a built header.
 AM_CPPFLAGS = -I../src -I$(top_srcdir)/src -I../mpi -I$(top_srcdir)/mpi
 AM_CFLAGS = $(GPG_ERROR_CFLAGS)
 AM_CCASFLAGS = $(NOEXECSTACK_FLAGS)

 EXTRA_DIST = gost-s-box.c

 CLEANFILES = gost-s-box
 DISTCLEANFILES = gost-sb.h

 noinst_LTLIBRARIES = libcipher.la

 GCRYPT_MODULES = @GCRYPT_CIPHERS@ @GCRYPT_PUBKEY_CIPHERS@ \
                  @GCRYPT_DIGESTS@ @GCRYPT_KDFS@

 libcipher_la_DEPENDENCIES = $(GCRYPT_MODULES)
 libcipher_la_LIBADD = $(GCRYPT_MODULES)

 libcipher_la_SOURCES = \
 cipher.c cipher-internal.h \
 cipher-cbc.c cipher-cfb.c cipher-ofb.c cipher-ctr.c cipher-aeswrap.c \
 cipher-ccm.c cipher-cmac.c cipher-gcm.c cipher-gcm-intel-pclmul.c \
 cipher-gcm-armv8-aarch32-ce.S cipher-gcm-armv8-aarch64-ce.S \
 cipher-poly1305.c cipher-ocb.c cipher-xts.c \
 cipher-selftest.c cipher-selftest.h \
 pubkey.c pubkey-internal.h pubkey-util.c \
 md.c \
 mac.c mac-internal.h \
 mac-hmac.c mac-cmac.c mac-gmac.c mac-poly1305.c \
 poly1305.c poly1305-internal.h \
 kdf.c kdf-internal.h \
 hmac-tests.c \
 bithelp.h \
 bufhelp.h \
 primegen.c \
 hash-common.c hash-common.h \
 dsa-common.c rsa-common.c \
 sha1.h

 EXTRA_libcipher_la_SOURCES = \
 arcfour.c arcfour-amd64.S \
 blowfish.c blowfish-amd64.S blowfish-arm.S \
 cast5.c cast5-amd64.S cast5-arm.S \
-chacha20.c chacha20-sse2-amd64.S chacha20-ssse3-amd64.S chacha20-avx2-amd64.S \
- chacha20-armv7-neon.S \
+chacha20.c chacha20-amd64-ssse3.S chacha20-amd64-avx2.S chacha20-armv7-neon.S \
 crc.c \
 crc-intel-pclmul.c \
 des.c des-amd64.S \
 dsa.c \
 elgamal.c \
 ecc.c ecc-curves.c ecc-misc.c ecc-common.h \
 ecc-ecdsa.c ecc-eddsa.c ecc-gost.c \
 idea.c \
 gost28147.c gost.h \
 gostr3411-94.c \
 md4.c \
 md5.c \
 rijndael.c rijndael-internal.h rijndael-tables.h rijndael-aesni.c \
 rijndael-padlock.c rijndael-amd64.S rijndael-arm.S \
 rijndael-ssse3-amd64.c rijndael-ssse3-amd64-asm.S \
 rijndael-armv8-ce.c rijndael-armv8-aarch32-ce.S rijndael-armv8-aarch64-ce.S \
 rijndael-aarch64.S \
 rmd160.c \
 rsa.c \
 salsa20.c salsa20-amd64.S salsa20-armv7-neon.S \
 scrypt.c \
 seed.c \
 serpent.c serpent-sse2-amd64.S serpent-avx2-amd64.S serpent-armv7-neon.S \
 sha1.c sha1-ssse3-amd64.S sha1-avx-amd64.S sha1-avx-bmi2-amd64.S \
 sha1-armv7-neon.S sha1-armv8-aarch32-ce.S sha1-armv8-aarch64-ce.S \
 sha256.c sha256-ssse3-amd64.S sha256-avx-amd64.S sha256-avx2-bmi2-amd64.S \
 sha256-armv8-aarch32-ce.S sha256-armv8-aarch64-ce.S \
 sha512.c sha512-ssse3-amd64.S sha512-avx-amd64.S sha512-avx2-bmi2-amd64.S \
 sha512-armv7-neon.S sha512-arm.S \
 sm3.c \
 keccak.c keccak_permute_32.h keccak_permute_64.h keccak-armv7-neon.S \
 stribog.c \
 tiger.c \
 whirlpool.c whirlpool-sse2-amd64.S \
 twofish.c twofish-amd64.S twofish-arm.S twofish-aarch64.S \
 twofish-avx2-amd64.S \
 rfc2268.c \
 camellia.c camellia.h camellia-glue.c camellia-aesni-avx-amd64.S \
 camellia-aesni-avx2-amd64.S camellia-arm.S camellia-aarch64.S \
 blake2.c

 gost28147.lo: gost-sb.h
 gost-sb.h: gost-s-box
 	./gost-s-box $@

 gost-s-box: gost-s-box.c
 	$(CC_FOR_BUILD) -o $@ $(srcdir)/gost-s-box.c

 if ENABLE_O_FLAG_MUNGING
 o_flag_munging = sed -e 's/-O\([2-9s][2-9s]*\)/-O1/' -e 's/-Ofast/-O1/g'
 else
 o_flag_munging = cat
 endif

 # We need to lower the optimization for this module.
 tiger.o: $(srcdir)/tiger.c
 	`echo $(COMPILE) -c $(srcdir)/tiger.c | $(o_flag_munging) `

 tiger.lo: $(srcdir)/tiger.c
 	`echo $(LTCOMPILE) -c $(srcdir)/tiger.c | $(o_flag_munging) `
diff --git a/cipher/chacha20-amd64-avx2.S b/cipher/chacha20-amd64-avx2.S
new file mode 100644
index 00000000..dad9e3e9
--- /dev/null
+++ b/cipher/chacha20-amd64-avx2.S
@@ -0,0 +1,323 @@
+/* chacha20-amd64-avx2.S  -  AVX2 implementation of ChaCha20 cipher
+ *
+ * Copyright (C) 2017,2018 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * Based on D. J. Bernstein reference implementation at
+ * http://cr.yp.to/chacha.html:
+ *
+ * chacha-regs.c version 20080118
+ * D. J. Bernstein
+ * Public domain.
+ */
+
+#ifdef __x86_64
+#include <config.h>
+#if defined(HAVE_GCC_INLINE_ASM_AVX2) && \
+   (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+    defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+
+.text
+
+#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS
+# define ELF(...) __VA_ARGS__
+#else
+# define ELF(...)
/*_*/ +#endif + +#ifdef __PIC__ +# define RIP (%rip) +#else +# define RIP +#endif + +/* register macros */ +#define INPUT %rdi +#define DST %rsi +#define SRC %rdx +#define NBLKS %rcx +#define ROUND %eax + +/* stack structure */ +#define STACK_VEC_X12 (32) +#define STACK_VEC_X13 (32 + STACK_VEC_X12) +#define STACK_TMP (32 + STACK_VEC_X13) +#define STACK_TMP1 (32 + STACK_TMP) +#define STACK_TMP2 (32 + STACK_TMP1) + +#define STACK_MAX (32 + STACK_TMP2) + +/* vector registers */ +#define X0 %ymm0 +#define X1 %ymm1 +#define X2 %ymm2 +#define X3 %ymm3 +#define X4 %ymm4 +#define X5 %ymm5 +#define X6 %ymm6 +#define X7 %ymm7 +#define X8 %ymm8 +#define X9 %ymm9 +#define X10 %ymm10 +#define X11 %ymm11 +#define X12 %ymm12 +#define X13 %ymm13 +#define X14 %ymm14 +#define X15 %ymm15 + +#define X0h %xmm0 +#define X1h %xmm1 +#define X2h %xmm2 +#define X3h %xmm3 +#define X4h %xmm4 +#define X5h %xmm5 +#define X6h %xmm6 +#define X7h %xmm7 +#define X8h %xmm8 +#define X9h %xmm9 +#define X10h %xmm10 +#define X11h %xmm11 +#define X12h %xmm12 +#define X13h %xmm13 +#define X14h %xmm14 +#define X15h %xmm15 + +/********************************************************************** + helper macros + **********************************************************************/ + +/* 4x4 32-bit integer matrix transpose */ +#define transpose_4x4(x0,x1,x2,x3,t1,t2) \ + vpunpckhdq x1, x0, t2; \ + vpunpckldq x1, x0, x0; \ + \ + vpunpckldq x3, x2, t1; \ + vpunpckhdq x3, x2, x2; \ + \ + vpunpckhqdq t1, x0, x1; \ + vpunpcklqdq t1, x0, x0; \ + \ + vpunpckhqdq x2, t2, x3; \ + vpunpcklqdq x2, t2, x2; + +/********************************************************************** + 8-way chacha20 + **********************************************************************/ + +#define ROTATE2(v1,v2,c,tmp) \ + vpsrld $(32 - (c)), v1, tmp; \ + vpslld $(c), v1, v1; \ + vpaddb tmp, v1, v1; \ + vpsrld $(32 - (c)), v2, tmp; \ + vpslld $(c), v2, v2; \ + vpaddb tmp, v2, v2; + +#define ROTATE_SHUF_2(v1,v2,shuf) \ + vpshufb shuf, v1, v1; \ + vpshufb shuf, v2, v2; + +#define XOR(ds,s) \ + vpxor s, ds, ds; + +#define PLUS(ds,s) \ + vpaddd s, ds, ds; + +#define QUARTERROUND2(a1,b1,c1,d1,a2,b2,c2,d2,ign,tmp1) \ + vbroadcasti128 .Lshuf_rol16 RIP, tmp1; \ + PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2); \ + ROTATE_SHUF_2(d1, d2, tmp1); \ + PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \ + ROTATE2(b1, b2, 12, tmp1); \ + vbroadcasti128 .Lshuf_rol8 RIP, tmp1; \ + PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2); \ + ROTATE_SHUF_2(d1, d2, tmp1); \ + PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \ + ROTATE2(b1, b2, 7, tmp1); + +#define BUF_XOR_256_TO_128(dst, src, offset_lo, offset_hi, yreg, tmp1) \ + vextracti128 $1, yreg, tmp1##h; \ + vpxor offset_lo(src), yreg##h, yreg##h; \ + vpxor offset_hi(src), tmp1##h, tmp1##h; \ + vmovdqu yreg##h, offset_lo(dst); \ + vmovdqu tmp1##h, offset_hi(dst); + +.align 32 +chacha20_data: +.Lshuf_rol16: + .byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 +.Lshuf_rol8: + .byte 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14 +.Linc_counter: + .byte 0,1,2,3,4,5,6,7 +.Lunsigned_cmp: + .long 0x80000000 + +.align 8 +.globl _gcry_chacha20_amd64_avx2_blocks8 +ELF(.type _gcry_chacha20_amd64_avx2_blocks8,@function;) + +_gcry_chacha20_amd64_avx2_blocks8: + /* input: + * %rdi: input + * %rsi: dst + * %rdx: src + * %rcx: nblks (multiple of 8) + */ + + vzeroupper; + + pushq %rbp; + movq %rsp, %rbp; + + subq $STACK_MAX, %rsp; + andq $~31, %rsp; + +.Loop4: + mov $20, ROUND; + + /* Construct counter vectors X12 and X13 */ + vpmovzxbd .Linc_counter RIP, X0; 
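
The QUARTERROUND2 macro defined above is the vector form of the scalar ChaCha20 quarter round: each ymm register holds the same state word for eight independent blocks, and two column groups are kept in flight at once. The 16-bit and 8-bit rotations go through vpshufb with the .Lshuf_rol16/.Lshuf_rol8 byte-shuffle masks, while the 12-bit and 7-bit ones use the shift pair in ROTATE2. As a reference point, a minimal C sketch of what one invocation computes per 32-bit lane (the helper names here are illustrative, not part of the patch):

#include <stdint.h>

static uint32_t rol32(uint32_t x, unsigned int c)
{
  return (x << c) | (x >> (32 - c));
}

/* One ChaCha20 quarter round; QUARTERROUND2 runs this on two column
 * groups at once, one 32-bit lane per block. */
static void quarterround(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
{
  *a += *b; *d = rol32(*d ^ *a, 16);  /* ROTATE_SHUF_2 with .Lshuf_rol16 */
  *c += *d; *b = rol32(*b ^ *c, 12);  /* ROTATE2(..., 12, ...) */
  *a += *b; *d = rol32(*d ^ *a, 8);   /* ROTATE_SHUF_2 with .Lshuf_rol8 */
  *c += *d; *b = rol32(*b ^ *c, 7);   /* ROTATE2(..., 7, ...) */
}

Note that ROTATE2 merges its two shift results with vpaddb rather than vpor: within every byte, (v << c) and (v >> (32 - c)) have no set bits in common, so the byte-wise add can never carry and produces exactly the OR that a rotate needs; the add form is presumably an instruction-scheduling choice.
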
+ vpbroadcastd .Lunsigned_cmp RIP, X2; + vpbroadcastd (12 * 4)(INPUT), X12; + vpbroadcastd (13 * 4)(INPUT), X13; + vpaddd X0, X12, X12; + vpxor X2, X0, X0; + vpxor X2, X12, X1; + vpcmpgtd X1, X0, X0; + vpsubd X0, X13, X13; + vmovdqa X12, (STACK_VEC_X12)(%rsp); + vmovdqa X13, (STACK_VEC_X13)(%rsp); + + /* Load vectors */ + vpbroadcastd (0 * 4)(INPUT), X0; + vpbroadcastd (1 * 4)(INPUT), X1; + vpbroadcastd (2 * 4)(INPUT), X2; + vpbroadcastd (3 * 4)(INPUT), X3; + vpbroadcastd (4 * 4)(INPUT), X4; + vpbroadcastd (5 * 4)(INPUT), X5; + vpbroadcastd (6 * 4)(INPUT), X6; + vpbroadcastd (7 * 4)(INPUT), X7; + vpbroadcastd (8 * 4)(INPUT), X8; + vpbroadcastd (9 * 4)(INPUT), X9; + vpbroadcastd (10 * 4)(INPUT), X10; + vpbroadcastd (11 * 4)(INPUT), X11; + vpbroadcastd (14 * 4)(INPUT), X14; + vpbroadcastd (15 * 4)(INPUT), X15; + vmovdqa X15, (STACK_TMP)(%rsp); + +.Lround2: + QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X15) + vmovdqa (STACK_TMP)(%rsp), X15; + vmovdqa X8, (STACK_TMP)(%rsp); + QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8) + QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8) + vmovdqa (STACK_TMP)(%rsp), X8; + vmovdqa X15, (STACK_TMP)(%rsp); + QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X15) + sub $2, ROUND; + jnz .Lround2; + + /* tmp := X15 */ + vpbroadcastd (0 * 4)(INPUT), X15; + PLUS(X0, X15); + vpbroadcastd (1 * 4)(INPUT), X15; + PLUS(X1, X15); + vpbroadcastd (2 * 4)(INPUT), X15; + PLUS(X2, X15); + vpbroadcastd (3 * 4)(INPUT), X15; + PLUS(X3, X15); + vpbroadcastd (4 * 4)(INPUT), X15; + PLUS(X4, X15); + vpbroadcastd (5 * 4)(INPUT), X15; + PLUS(X5, X15); + vpbroadcastd (6 * 4)(INPUT), X15; + PLUS(X6, X15); + vpbroadcastd (7 * 4)(INPUT), X15; + PLUS(X7, X15); + vpbroadcastd (8 * 4)(INPUT), X15; + PLUS(X8, X15); + vpbroadcastd (9 * 4)(INPUT), X15; + PLUS(X9, X15); + vpbroadcastd (10 * 4)(INPUT), X15; + PLUS(X10, X15); + vpbroadcastd (11 * 4)(INPUT), X15; + PLUS(X11, X15); + vmovdqa (STACK_VEC_X12)(%rsp), X15; + PLUS(X12, X15); + vmovdqa (STACK_VEC_X13)(%rsp), X15; + PLUS(X13, X15); + vmovdqa (STACK_TMP)(%rsp), X15; + vmovdqa X13, (STACK_TMP)(%rsp); + vpbroadcastd (14 * 4)(INPUT), X13; + PLUS(X14, X13); + vmovdqa X14, (STACK_TMP1)(%rsp); + vpbroadcastd (15 * 4)(INPUT), X13; + PLUS(X15, X13); + vmovdqa X15, (STACK_TMP2)(%rsp); + + /* Update counter */ + addq $8, (12 * 4)(INPUT); + + transpose_4x4(X0, X1, X2, X3, X13, X14); + transpose_4x4(X4, X5, X6, X7, X13, X14); + BUF_XOR_256_TO_128(DST, SRC, (64 * 0 + 16 * 0), (64 * 4 + 16 * 0), X0, X15); + BUF_XOR_256_TO_128(DST, SRC, (64 * 1 + 16 * 0), (64 * 5 + 16 * 0), X1, X15); + BUF_XOR_256_TO_128(DST, SRC, (64 * 2 + 16 * 0), (64 * 6 + 16 * 0), X2, X15); + BUF_XOR_256_TO_128(DST, SRC, (64 * 3 + 16 * 0), (64 * 7 + 16 * 0), X3, X15); + vmovdqa (STACK_TMP)(%rsp), X13; + vmovdqa (STACK_TMP1)(%rsp), X14; + vmovdqa (STACK_TMP2)(%rsp), X15; + transpose_4x4(X8, X9, X10, X11, X0, X1); + transpose_4x4(X12, X13, X14, X15, X0, X1); + BUF_XOR_256_TO_128(DST, SRC, (64 * 0 + 16 * 1), (64 * 4 + 16 * 1), X4, X0); + BUF_XOR_256_TO_128(DST, SRC, (64 * 1 + 16 * 1), (64 * 5 + 16 * 1), X5, X0); + BUF_XOR_256_TO_128(DST, SRC, (64 * 2 + 16 * 1), (64 * 6 + 16 * 1), X6, X0); + BUF_XOR_256_TO_128(DST, SRC, (64 * 3 + 16 * 1), (64 * 7 + 16 * 1), X7, X0); + BUF_XOR_256_TO_128(DST, SRC, (64 * 0 + 16 * 2), (64 * 4 + 16 * 2), X8, X0); + BUF_XOR_256_TO_128(DST, SRC, (64 * 1 + 16 * 2), (64 * 5 + 16 * 2), X9, X0); + BUF_XOR_256_TO_128(DST, SRC, (64 * 2 + 16 * 2), (64 * 6 + 16 * 2), X10, X0); + BUF_XOR_256_TO_128(DST, SRC, (64 * 3 + 16 * 2), (64 * 7 
+ 16 * 2), X11, X0);
+	BUF_XOR_256_TO_128(DST, SRC, (64 * 0 + 16 * 3), (64 * 4 + 16 * 3), X12, X0);
+	BUF_XOR_256_TO_128(DST, SRC, (64 * 1 + 16 * 3), (64 * 5 + 16 * 3), X13, X0);
+	BUF_XOR_256_TO_128(DST, SRC, (64 * 2 + 16 * 3), (64 * 6 + 16 * 3), X14, X0);
+	BUF_XOR_256_TO_128(DST, SRC, (64 * 3 + 16 * 3), (64 * 7 + 16 * 3), X15, X0);
+
+	sub $8, NBLKS;
+	lea (8 * 64)(DST), DST;
+	lea (8 * 64)(SRC), SRC;
+	jnz .Loop4;
+
+	/* clear the used vector registers and stack */
+	vpxor X0, X0, X0;
+	vmovdqa X0, (STACK_VEC_X12)(%rsp);
+	vmovdqa X0, (STACK_VEC_X13)(%rsp);
+	vmovdqa X0, (STACK_TMP)(%rsp);
+	vmovdqa X0, (STACK_TMP1)(%rsp);
+	vmovdqa X0, (STACK_TMP2)(%rsp);
+	vzeroall;
+
+	/* eax zeroed by round loop. */
+	leave;
+	ret;
+ELF(.size _gcry_chacha20_amd64_avx2_blocks8,
+	  .-_gcry_chacha20_amd64_avx2_blocks8;)
+
+#endif /*defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)*/
+#endif /*__x86_64*/
diff --git a/cipher/chacha20-amd64-ssse3.S b/cipher/chacha20-amd64-ssse3.S
new file mode 100644
index 00000000..7ad1c0ae
--- /dev/null
+++ b/cipher/chacha20-amd64-ssse3.S
@@ -0,0 +1,341 @@
+/* chacha20-amd64-ssse3.S  -  SSSE3 implementation of ChaCha20 cipher
+ *
+ * Copyright (C) 2017,2018 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * Based on D. J. Bernstein reference implementation at
+ * http://cr.yp.to/chacha.html:
+ *
+ * chacha-regs.c version 20080118
+ * D. J. Bernstein
+ * Public domain.
+ */
+
+#ifdef __x86_64
+#include <config.h>
+#if defined(HAVE_GCC_INLINE_ASM_SSSE3) && \
+   (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+    defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+
+.text
+
+#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS
+# define ELF(...) __VA_ARGS__
+#else
+# define ELF(...)
/*_*/ +#endif + +#ifdef __PIC__ +# define RIP (%rip) +#else +# define RIP +#endif + +/* register macros */ +#define INPUT %rdi +#define DST %rsi +#define SRC %rdx +#define NBLKS %rcx +#define ROUND %eax + +/* stack structure */ +#define STACK_VEC_X12 (16) +#define STACK_VEC_X13 (16 + STACK_VEC_X12) +#define STACK_TMP (16 + STACK_VEC_X13) +#define STACK_TMP1 (16 + STACK_TMP) +#define STACK_TMP2 (16 + STACK_TMP1) + +#define STACK_MAX (16 + STACK_TMP2) + +/* vector registers */ +#define X0 %xmm0 +#define X1 %xmm1 +#define X2 %xmm2 +#define X3 %xmm3 +#define X4 %xmm4 +#define X5 %xmm5 +#define X6 %xmm6 +#define X7 %xmm7 +#define X8 %xmm8 +#define X9 %xmm9 +#define X10 %xmm10 +#define X11 %xmm11 +#define X12 %xmm12 +#define X13 %xmm13 +#define X14 %xmm14 +#define X15 %xmm15 + +/********************************************************************** + helper macros + **********************************************************************/ + +/* 4x4 32-bit integer matrix transpose */ +#define transpose_4x4(x0, x1, x2, x3, t1, t2, t3) \ + movdqa x0, t2; \ + punpckhdq x1, t2; \ + punpckldq x1, x0; \ + \ + movdqa x2, t1; \ + punpckldq x3, t1; \ + punpckhdq x3, x2; \ + \ + movdqa x0, x1; \ + punpckhqdq t1, x1; \ + punpcklqdq t1, x0; \ + \ + movdqa t2, x3; \ + punpckhqdq x2, x3; \ + punpcklqdq x2, t2; \ + movdqa t2, x2; + +/* fill xmm register with 32-bit value from memory */ +#define pbroadcastd(mem32, xreg) \ + movd mem32, xreg; \ + pshufd $0, xreg, xreg; + +/* xor with unaligned memory operand */ +#define pxor_u(umem128, xreg, t) \ + movdqu umem128, t; \ + pxor t, xreg; + +/* xor register with unaligned src and save to unaligned dst */ +#define xor_src_dst(dst, src, offset, xreg, t) \ + pxor_u(offset(src), xreg, t); \ + movdqu xreg, offset(dst); + +#define clear(x) pxor x,x; + +/********************************************************************** + 4-way chacha20 + **********************************************************************/ + +#define ROTATE2(v1,v2,c,tmp1,tmp2) \ + movdqa v1, tmp1; \ + movdqa v2, tmp2; \ + psrld $(32 - (c)), v1; \ + pslld $(c), tmp1; \ + paddb tmp1, v1; \ + psrld $(32 - (c)), v2; \ + pslld $(c), tmp2; \ + paddb tmp2, v2; + +#define ROTATE_SHUF_2(v1,v2,shuf) \ + pshufb shuf, v1; \ + pshufb shuf, v2; + +#define XOR(ds,s) \ + pxor s, ds; + +#define PLUS(ds,s) \ + paddd s, ds; + +#define QUARTERROUND2(a1,b1,c1,d1,a2,b2,c2,d2,ign,tmp1,tmp2) \ + movdqa .Lshuf_rol16 RIP, tmp1; \ + PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2); \ + ROTATE_SHUF_2(d1, d2, tmp1); \ + PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \ + ROTATE2(b1, b2, 12, tmp1, tmp2); \ + movdqa .Lshuf_rol8 RIP, tmp1; \ + PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2); \ + ROTATE_SHUF_2(d1, d2, tmp1); \ + PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \ + ROTATE2(b1, b2, 7, tmp1, tmp2); + +chacha20_data: +.align 16 +.Lshuf_rol16: + .byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 +.Lshuf_rol8: + .byte 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14 +.Linc_counter: + .long 0,1,2,3 +.Lunsigned_cmp: + .long 0x80000000,0x80000000,0x80000000,0x80000000 + +.align 8 +.globl _gcry_chacha20_amd64_ssse3_blocks4 +ELF(.type _gcry_chacha20_amd64_ssse3_blocks4,@function;) + +_gcry_chacha20_amd64_ssse3_blocks4: + /* input: + * %rdi: input + * %rsi: dst + * %rdx: src + * %rcx: nblks (multiple of 4) + */ + + pushq %rbp; + movq %rsp, %rbp; + + subq $STACK_MAX, %rsp; + andq $~15, %rsp; + +.Loop4: + mov $20, ROUND; + + /* Construct counter vectors X12 and X13 */ + vmovdqa .Linc_counter RIP, X0; + vmovdqa .Lunsigned_cmp RIP, X2; + 
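
The .Linc_counter and .Lunsigned_cmp constants loaded here feed the counter setup that continues below: words 12 and 13 of the input block are the low and high halves of the 64-bit block counter, each lane gets lo + i, and the carry into the high word must be detected even though SSE and AVX2 provide only signed compares. Both this SSSE3 sequence and the AVX2 one earlier therefore bias both compare operands by 0x80000000 so that a signed pcmpgtd answers the unsigned overflow question. A C sketch of the per-lane computation (the function name is illustrative, not from the patch):

#include <stdint.h>

/* Per-lane 64-bit counter construction: lane i of X12 gets lo + i, and
 * lane i of X13 gets hi plus the carry out of that 32-bit addition. */
static void counter_lanes(const uint32_t input[16], int nlanes,
                          uint32_t x12[], uint32_t x13[])
{
  for (int i = 0; i < nlanes; i++)
    {
      uint32_t lo = input[12] + (uint32_t)i;
      /* A signed compare of sign-flipped values is an unsigned compare:
       * the addition wrapped exactly when lo < i holds unsigned. */
      int carry = (int32_t)((uint32_t)i ^ 0x80000000u)
                  > (int32_t)(lo ^ 0x80000000u);
      x12[i] = lo;
      x13[i] = input[13] + (uint32_t)carry;
    }
}

In the assembly the compare yields an all-ones mask in the lanes that wrapped, and psubd (vpsubd in the AVX2 file) subtracts that mask from the broadcast high word, adding exactly 1 where needed.
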
pbroadcastd((12 * 4)(INPUT), X12); + pbroadcastd((13 * 4)(INPUT), X13); + paddd X0, X12; + movdqa X12, X1; + pxor X2, X0; + pxor X2, X1; + pcmpgtd X1, X0; + psubd X0, X13; + movdqa X12, (STACK_VEC_X12)(%rsp); + movdqa X13, (STACK_VEC_X13)(%rsp); + + /* Load vectors */ + pbroadcastd((0 * 4)(INPUT), X0); + pbroadcastd((1 * 4)(INPUT), X1); + pbroadcastd((2 * 4)(INPUT), X2); + pbroadcastd((3 * 4)(INPUT), X3); + pbroadcastd((4 * 4)(INPUT), X4); + pbroadcastd((5 * 4)(INPUT), X5); + pbroadcastd((6 * 4)(INPUT), X6); + pbroadcastd((7 * 4)(INPUT), X7); + pbroadcastd((8 * 4)(INPUT), X8); + pbroadcastd((9 * 4)(INPUT), X9); + pbroadcastd((10 * 4)(INPUT), X10); + pbroadcastd((11 * 4)(INPUT), X11); + pbroadcastd((14 * 4)(INPUT), X14); + pbroadcastd((15 * 4)(INPUT), X15); + movdqa X11, (STACK_TMP)(%rsp); + movdqa X15, (STACK_TMP1)(%rsp); + +.Lround2: + QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15) + movdqa (STACK_TMP)(%rsp), X11; + movdqa (STACK_TMP1)(%rsp), X15; + movdqa X8, (STACK_TMP)(%rsp); + movdqa X9, (STACK_TMP1)(%rsp); + QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9) + QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9) + movdqa (STACK_TMP)(%rsp), X8; + movdqa (STACK_TMP1)(%rsp), X9; + movdqa X11, (STACK_TMP)(%rsp); + movdqa X15, (STACK_TMP1)(%rsp); + QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15) + sub $2, ROUND; + jnz .Lround2; + + /* tmp := X15 */ + movdqa (STACK_TMP)(%rsp), X11; + pbroadcastd((0 * 4)(INPUT), X15); + PLUS(X0, X15); + pbroadcastd((1 * 4)(INPUT), X15); + PLUS(X1, X15); + pbroadcastd((2 * 4)(INPUT), X15); + PLUS(X2, X15); + pbroadcastd((3 * 4)(INPUT), X15); + PLUS(X3, X15); + pbroadcastd((4 * 4)(INPUT), X15); + PLUS(X4, X15); + pbroadcastd((5 * 4)(INPUT), X15); + PLUS(X5, X15); + pbroadcastd((6 * 4)(INPUT), X15); + PLUS(X6, X15); + pbroadcastd((7 * 4)(INPUT), X15); + PLUS(X7, X15); + pbroadcastd((8 * 4)(INPUT), X15); + PLUS(X8, X15); + pbroadcastd((9 * 4)(INPUT), X15); + PLUS(X9, X15); + pbroadcastd((10 * 4)(INPUT), X15); + PLUS(X10, X15); + pbroadcastd((11 * 4)(INPUT), X15); + PLUS(X11, X15); + movdqa (STACK_VEC_X12)(%rsp), X15; + PLUS(X12, X15); + movdqa (STACK_VEC_X13)(%rsp), X15; + PLUS(X13, X15); + movdqa X13, (STACK_TMP)(%rsp); + pbroadcastd((14 * 4)(INPUT), X15); + PLUS(X14, X15); + movdqa (STACK_TMP1)(%rsp), X15; + movdqa X14, (STACK_TMP1)(%rsp); + pbroadcastd((15 * 4)(INPUT), X13); + PLUS(X15, X13); + movdqa X15, (STACK_TMP2)(%rsp); + + /* Update counter */ + addq $4, (12 * 4)(INPUT); + + transpose_4x4(X0, X1, X2, X3, X13, X14, X15); + xor_src_dst(DST, SRC, (64 * 0 + 16 * 0), X0, X15); + xor_src_dst(DST, SRC, (64 * 1 + 16 * 0), X1, X15); + xor_src_dst(DST, SRC, (64 * 2 + 16 * 0), X2, X15); + xor_src_dst(DST, SRC, (64 * 3 + 16 * 0), X3, X15); + transpose_4x4(X4, X5, X6, X7, X0, X1, X2); + movdqa (STACK_TMP)(%rsp), X13; + movdqa (STACK_TMP1)(%rsp), X14; + movdqa (STACK_TMP2)(%rsp), X15; + xor_src_dst(DST, SRC, (64 * 0 + 16 * 1), X4, X0); + xor_src_dst(DST, SRC, (64 * 1 + 16 * 1), X5, X0); + xor_src_dst(DST, SRC, (64 * 2 + 16 * 1), X6, X0); + xor_src_dst(DST, SRC, (64 * 3 + 16 * 1), X7, X0); + transpose_4x4(X8, X9, X10, X11, X0, X1, X2); + xor_src_dst(DST, SRC, (64 * 0 + 16 * 2), X8, X0); + xor_src_dst(DST, SRC, (64 * 1 + 16 * 2), X9, X0); + xor_src_dst(DST, SRC, (64 * 2 + 16 * 2), X10, X0); + xor_src_dst(DST, SRC, (64 * 3 + 16 * 2), X11, X0); + transpose_4x4(X12, X13, X14, X15, X0, X1, X2); + xor_src_dst(DST, SRC, (64 * 0 + 16 * 3), X12, X0); + xor_src_dst(DST, SRC, (64 * 1 + 16 * 3), X13, X0); + xor_src_dst(DST, 
SRC, (64 * 2 + 16 * 3), X14, X0); + xor_src_dst(DST, SRC, (64 * 3 + 16 * 3), X15, X0); + + sub $4, NBLKS; + lea (4 * 64)(DST), DST; + lea (4 * 64)(SRC), SRC; + jnz .Loop4; + + /* clear the used vector registers and stack */ + clear(X0); + movdqa X0, (STACK_VEC_X12)(%rsp); + movdqa X0, (STACK_VEC_X13)(%rsp); + movdqa X0, (STACK_TMP)(%rsp); + movdqa X0, (STACK_TMP1)(%rsp); + movdqa X0, (STACK_TMP2)(%rsp); + clear(X1); + clear(X2); + clear(X3); + clear(X4); + clear(X5); + clear(X6); + clear(X7); + clear(X8); + clear(X9); + clear(X10); + clear(X11); + clear(X12); + clear(X13); + clear(X14); + clear(X15); + + /* eax zeroed by round loop. */ + leave; + ret; +ELF(.size _gcry_chacha20_amd64_ssse3_blocks4, + .-_gcry_chacha20_amd64_ssse3_blocks4;) + +#endif /*defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)*/ +#endif /*__x86_64*/ diff --git a/cipher/chacha20-armv7-neon.S b/cipher/chacha20-armv7-neon.S index c1971fc7..33a43df1 100644 --- a/cipher/chacha20-armv7-neon.S +++ b/cipher/chacha20-armv7-neon.S @@ -1,750 +1,393 @@ -/* chacha20-armv7-neon.S - ARM/NEON accelerated chacha20 blocks function +/* chacha20-armv7-neon.S - ARMv7 NEON implementation of ChaCha20 cipher * - * Copyright (C) 2014 Jussi Kivilinna + * Copyright (C) 2017,2018 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . */ /* - * Based on public domain implementation by Andrew Moon at - * https://github.com/floodyberry/chacha-opt + * Based on D. J. Bernstein reference implementation at + * http://cr.yp.to/chacha.html: + * + * chacha-regs.c version 20080118 + * D. J. Bernstein + * Public domain. 
*/ #include #if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) && \ defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \ - defined(HAVE_GCC_INLINE_ASM_NEON) && defined(USE_CHACHA20) + defined(HAVE_GCC_INLINE_ASM_NEON) .syntax unified .fpu neon .arm -#define UNALIGNED_STMIA8(ptr, l0, l1, l2, l3, l4, l5, l6, l7) \ - tst ptr, #3; \ - beq 1f; \ - vpush {d0-d3}; \ - vmov s0, l0; \ - vmov s1, l1; \ - vmov s2, l2; \ - vmov s3, l3; \ - vmov s4, l4; \ - vmov s5, l5; \ - vmov s6, l6; \ - vmov s7, l7; \ - vst1.32 {d0-d3}, [ptr]; \ - add ptr, #32; \ - vpop {d0-d3}; \ - b 2f; \ - 1: stmia ptr!, {l0-l7}; \ - 2: ; - -#define UNALIGNED_LDMIA4(ptr, l0, l1, l2, l3) \ - tst ptr, #3; \ - beq 1f; \ - vpush {d0-d1}; \ - vld1.32 {d0-d1}, [ptr]; \ - add ptr, #16; \ - vmov l0, s0; \ - vmov l1, s1; \ - vmov l2, s2; \ - vmov l3, s3; \ - vpop {d0-d1}; \ - b 2f; \ - 1: ldmia ptr!, {l0-l3}; \ - 2: ; - .text -.globl _gcry_chacha20_armv7_neon_blocks -.type _gcry_chacha20_armv7_neon_blocks,%function; -_gcry_chacha20_armv7_neon_blocks: -.Lchacha_blocks_neon_local: - tst r3, r3 - beq .Lchacha_blocks_neon_nobytes - vstmdb sp!, {q4,q5,q6,q7} - stmfd sp!, {r4-r12, r14} - mov r8, sp - sub sp, sp, #196 - and sp, sp, #0xffffffe0 - str r0, [sp, #60] - str r1, [sp, #48] - str r2, [sp, #40] - str r3, [sp, #52] - str r8, [sp, #192] - add r1, sp, #64 - ldmia r0!, {r4-r11} - stmia r1!, {r4-r11} - ldmia r0!, {r4-r11} - stmia r1!, {r4-r11} - mov r4, #20 - str r4, [sp, #44] - cmp r3, #256 - blo .Lchacha_blocks_neon_mainloop2 -.Lchacha_blocks_neon_mainloop1: - ldr r0, [sp, #44] - str r0, [sp, #0] - add r1, sp, #(64) - mov r2, #1 - veor q12, q12 - vld1.32 {q0,q1}, [r1,:128]! - vld1.32 {q2,q3}, [r1,:128] - vmov.32 d24[0], r2 - vadd.u64 q3, q3, q12 - vmov q4, q0 - vmov q5, q1 - vmov q6, q2 - vadd.u64 q7, q3, q12 - vmov q8, q0 - vmov q9, q1 - vmov q10, q2 - vadd.u64 q11, q7, q12 - add r0, sp, #64 - ldm r0, {r0-r12} - ldr r14, [sp, #(64 +60)] - str r6, [sp, #8] - str r11, [sp, #12] - str r14, [sp, #28] - ldr r11, [sp, #(64 +52)] - ldr r14, [sp, #(64 +56)] -.Lchacha_blocks_neon_rounds1: - ldr r6, [sp, #0] - vadd.i32 q0, q0, q1 - add r0, r0, r4 - vadd.i32 q4, q4, q5 - add r1, r1, r5 - vadd.i32 q8, q8, q9 - eor r12, r12, r0 - veor q12, q3, q0 - eor r11, r11, r1 - veor q13, q7, q4 - ror r12, r12, #16 - veor q14, q11, q8 - ror r11, r11, #16 - vrev32.16 q3, q12 - subs r6, r6, #2 - vrev32.16 q7, q13 - add r8, r8, r12 - vrev32.16 q11, q14 - add r9, r9, r11 - vadd.i32 q2, q2, q3 - eor r4, r4, r8 - vadd.i32 q6, q6, q7 - eor r5, r5, r9 - vadd.i32 q10, q10, q11 - str r6, [sp, #0] - veor q12, q1, q2 - ror r4, r4, #20 - veor q13, q5, q6 - ror r5, r5, #20 - veor q14, q9, q10 - add r0, r0, r4 - vshl.i32 q1, q12, #12 - add r1, r1, r5 - vshl.i32 q5, q13, #12 - ldr r6, [sp, #8] - vshl.i32 q9, q14, #12 - eor r12, r12, r0 - vsri.u32 q1, q12, #20 - eor r11, r11, r1 - vsri.u32 q5, q13, #20 - ror r12, r12, #24 - vsri.u32 q9, q14, #20 - ror r11, r11, #24 - vadd.i32 q0, q0, q1 - add r8, r8, r12 - vadd.i32 q4, q4, q5 - add r9, r9, r11 - vadd.i32 q8, q8, q9 - eor r4, r4, r8 - veor q12, q3, q0 - eor r5, r5, r9 - veor q13, q7, q4 - str r11, [sp, #20] - veor q14, q11, q8 - ror r4, r4, #25 - vshl.i32 q3, q12, #8 - ror r5, r5, #25 - vshl.i32 q7, q13, #8 - str r4, [sp, #4] - vshl.i32 q11, q14, #8 - ldr r4, [sp, #28] - vsri.u32 q3, q12, #24 - add r2, r2, r6 - vsri.u32 q7, q13, #24 - add r3, r3, r7 - vsri.u32 q11, q14, #24 - ldr r11, [sp, #12] - vadd.i32 q2, q2, q3 - eor r14, r14, r2 - vadd.i32 q6, q6, q7 - eor r4, r4, r3 - vadd.i32 q10, q10, q11 - ror r14, r14, #16 - veor q12, q1, 
q2 - ror r4, r4, #16 - veor q13, q5, q6 - add r10, r10, r14 - veor q14, q9, q10 - add r11, r11, r4 - vshl.i32 q1, q12, #7 - eor r6, r6, r10 - vshl.i32 q5, q13, #7 - eor r7, r7, r11 - vshl.i32 q9, q14, #7 - ror r6, r6, #20 - vsri.u32 q1, q12, #25 - ror r7, r7, #20 - vsri.u32 q5, q13, #25 - add r2, r2, r6 - vsri.u32 q9, q14, #25 - add r3, r3, r7 - vext.32 q3, q3, q3, #3 - eor r14, r14, r2 - vext.32 q7, q7, q7, #3 - eor r4, r4, r3 - vext.32 q11, q11, q11, #3 - ror r14, r14, #24 - vext.32 q1, q1, q1, #1 - ror r4, r4, #24 - vext.32 q5, q5, q5, #1 - add r10, r10, r14 - vext.32 q9, q9, q9, #1 - add r11, r11, r4 - vext.32 q2, q2, q2, #2 - eor r6, r6, r10 - vext.32 q6, q6, q6, #2 - eor r7, r7, r11 - vext.32 q10, q10, q10, #2 - ror r6, r6, #25 - vadd.i32 q0, q0, q1 - ror r7, r7, #25 - vadd.i32 q4, q4, q5 - add r0, r0, r5 - vadd.i32 q8, q8, q9 - add r1, r1, r6 - veor q12, q3, q0 - eor r4, r4, r0 - veor q13, q7, q4 - eor r12, r12, r1 - veor q14, q11, q8 - ror r4, r4, #16 - vrev32.16 q3, q12 - ror r12, r12, #16 - vrev32.16 q7, q13 - add r10, r10, r4 - vrev32.16 q11, q14 - add r11, r11, r12 - vadd.i32 q2, q2, q3 - eor r5, r5, r10 - vadd.i32 q6, q6, q7 - eor r6, r6, r11 - vadd.i32 q10, q10, q11 - ror r5, r5, #20 - veor q12, q1, q2 - ror r6, r6, #20 - veor q13, q5, q6 - add r0, r0, r5 - veor q14, q9, q10 - add r1, r1, r6 - vshl.i32 q1, q12, #12 - eor r4, r4, r0 - vshl.i32 q5, q13, #12 - eor r12, r12, r1 - vshl.i32 q9, q14, #12 - ror r4, r4, #24 - vsri.u32 q1, q12, #20 - ror r12, r12, #24 - vsri.u32 q5, q13, #20 - add r10, r10, r4 - vsri.u32 q9, q14, #20 - add r11, r11, r12 - vadd.i32 q0, q0, q1 - eor r5, r5, r10 - vadd.i32 q4, q4, q5 - eor r6, r6, r11 - vadd.i32 q8, q8, q9 - str r11, [sp, #12] - veor q12, q3, q0 - ror r5, r5, #25 - veor q13, q7, q4 - ror r6, r6, #25 - veor q14, q11, q8 - str r4, [sp, #28] - vshl.i32 q3, q12, #8 - ldr r4, [sp, #4] - vshl.i32 q7, q13, #8 - add r2, r2, r7 - vshl.i32 q11, q14, #8 - add r3, r3, r4 - vsri.u32 q3, q12, #24 - ldr r11, [sp, #20] - vsri.u32 q7, q13, #24 - eor r11, r11, r2 - vsri.u32 q11, q14, #24 - eor r14, r14, r3 - vadd.i32 q2, q2, q3 - ror r11, r11, #16 - vadd.i32 q6, q6, q7 - ror r14, r14, #16 - vadd.i32 q10, q10, q11 - add r8, r8, r11 - veor q12, q1, q2 - add r9, r9, r14 - veor q13, q5, q6 - eor r7, r7, r8 - veor q14, q9, q10 - eor r4, r4, r9 - vshl.i32 q1, q12, #7 - ror r7, r7, #20 - vshl.i32 q5, q13, #7 - ror r4, r4, #20 - vshl.i32 q9, q14, #7 - str r6, [sp, #8] - vsri.u32 q1, q12, #25 - add r2, r2, r7 - vsri.u32 q5, q13, #25 - add r3, r3, r4 - vsri.u32 q9, q14, #25 - eor r11, r11, r2 - vext.32 q3, q3, q3, #1 - eor r14, r14, r3 - vext.32 q7, q7, q7, #1 - ror r11, r11, #24 - vext.32 q11, q11, q11, #1 - ror r14, r14, #24 - vext.32 q1, q1, q1, #3 - add r8, r8, r11 - vext.32 q5, q5, q5, #3 - add r9, r9, r14 - vext.32 q9, q9, q9, #3 - eor r7, r7, r8 - vext.32 q2, q2, q2, #2 - eor r4, r4, r9 - vext.32 q6, q6, q6, #2 - ror r7, r7, #25 - vext.32 q10, q10, q10, #2 - ror r4, r4, #25 - bne .Lchacha_blocks_neon_rounds1 - str r8, [sp, #0] - str r9, [sp, #4] - str r10, [sp, #8] - str r12, [sp, #16] - str r11, [sp, #20] - str r14, [sp, #24] - add r9, sp, #64 - vld1.32 {q12,q13}, [r9,:128]! 
- ldr r12, [sp, #48] - vld1.32 {q14,q15}, [r9,:128] - ldr r14, [sp, #40] - vadd.i32 q0, q0, q12 - ldr r8, [sp, #(64 +0)] - vadd.i32 q4, q4, q12 - ldr r9, [sp, #(64 +4)] - vadd.i32 q8, q8, q12 - ldr r10, [sp, #(64 +8)] - vadd.i32 q1, q1, q13 - ldr r11, [sp, #(64 +12)] - vadd.i32 q5, q5, q13 - add r0, r0, r8 - vadd.i32 q9, q9, q13 - add r1, r1, r9 - vadd.i32 q2, q2, q14 - add r2, r2, r10 - vadd.i32 q6, q6, q14 - ldr r8, [sp, #(64 +16)] - vadd.i32 q10, q10, q14 - add r3, r3, r11 - veor q14, q14, q14 - ldr r9, [sp, #(64 +20)] - mov r11, #1 - add r4, r4, r8 - vmov.32 d28[0], r11 - ldr r10, [sp, #(64 +24)] - vadd.u64 q12, q14, q15 - add r5, r5, r9 - vadd.u64 q13, q14, q12 - ldr r11, [sp, #(64 +28)] - vadd.u64 q14, q14, q13 - add r6, r6, r10 - vadd.i32 q3, q3, q12 - tst r12, r12 - vadd.i32 q7, q7, q13 - add r7, r7, r11 - vadd.i32 q11, q11, q14 - beq .Lchacha_blocks_neon_nomessage11 - UNALIGNED_LDMIA4(r12, r8, r9, r10, r11) - tst r12, r12 - eor r0, r0, r8 - eor r1, r1, r9 - eor r2, r2, r10 - ldr r8, [r12, #0] - eor r3, r3, r11 - ldr r9, [r12, #4] - eor r4, r4, r8 - ldr r10, [r12, #8] - eor r5, r5, r9 - ldr r11, [r12, #12] - eor r6, r6, r10 - add r12, r12, #16 - eor r7, r7, r11 -.Lchacha_blocks_neon_nomessage11: - UNALIGNED_STMIA8(r14, r0, r1, r2, r3, r4, r5, r6, r7) - tst r12, r12 - ldm sp, {r0-r7} - ldr r8, [sp, #(64 +32)] - ldr r9, [sp, #(64 +36)] - ldr r10, [sp, #(64 +40)] - ldr r11, [sp, #(64 +44)] - add r0, r0, r8 - add r1, r1, r9 - add r2, r2, r10 - ldr r8, [sp, #(64 +48)] - add r3, r3, r11 - ldr r9, [sp, #(64 +52)] - add r4, r4, r8 - ldr r10, [sp, #(64 +56)] - add r5, r5, r9 - ldr r11, [sp, #(64 +60)] - add r6, r6, r10 - adds r8, r8, #4 - add r7, r7, r11 - adc r9, r9, #0 - str r8, [sp, #(64 +48)] - tst r12, r12 - str r9, [sp, #(64 +52)] - beq .Lchacha_blocks_neon_nomessage12 - UNALIGNED_LDMIA4(r12, r8, r9, r10, r11) - tst r12, r12 - eor r0, r0, r8 - eor r1, r1, r9 - eor r2, r2, r10 - ldr r8, [r12, #0] - eor r3, r3, r11 - ldr r9, [r12, #4] - eor r4, r4, r8 - ldr r10, [r12, #8] - eor r5, r5, r9 - ldr r11, [r12, #12] - eor r6, r6, r10 - add r12, r12, #16 - eor r7, r7, r11 -.Lchacha_blocks_neon_nomessage12: - UNALIGNED_STMIA8(r14, r0, r1, r2, r3, r4, r5, r6, r7) - tst r12, r12 - beq .Lchacha_blocks_neon_nomessage13 - vld1.32 {q12,q13}, [r12]! - vld1.32 {q14,q15}, [r12]! - veor q0, q0, q12 - veor q1, q1, q13 - veor q2, q2, q14 - veor q3, q3, q15 -.Lchacha_blocks_neon_nomessage13: - vst1.32 {q0,q1}, [r14]! - vst1.32 {q2,q3}, [r14]! - beq .Lchacha_blocks_neon_nomessage14 - vld1.32 {q12,q13}, [r12]! - vld1.32 {q14,q15}, [r12]! - veor q4, q4, q12 - veor q5, q5, q13 - veor q6, q6, q14 - veor q7, q7, q15 -.Lchacha_blocks_neon_nomessage14: - vst1.32 {q4,q5}, [r14]! - vst1.32 {q6,q7}, [r14]! - beq .Lchacha_blocks_neon_nomessage15 - vld1.32 {q12,q13}, [r12]! - vld1.32 {q14,q15}, [r12]! - veor q8, q8, q12 - veor q9, q9, q13 - veor q10, q10, q14 - veor q11, q11, q15 -.Lchacha_blocks_neon_nomessage15: - vst1.32 {q8,q9}, [r14]! - vst1.32 {q10,q11}, [r14]! 
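
The routine being deleted around this point handled arbitrary byte counts itself (note the .Lchacha_blocks_neon_copyinput loops that stage partial blocks through a stack buffer), whereas every replacement kernel in this patch takes nblks as a strict multiple of its vector width, 4 or 8 blocks, and leaves the tail to the C glue in chacha20.c, which this excerpt does not show. A hypothetical sketch of that caller-side split, assuming the kernel updates the counter words in the state itself; the names and exact signature are mine, not from the patch:

#include <stddef.h>
#include <stdint.h>

typedef void (*chacha20_blocks_fn)(uint32_t *state, uint8_t *dst,
                                   const uint8_t *src, size_t nblks);

/* Run the SIMD kernel on the largest multiple-of-`way` prefix and
 * report how many blocks remain for the generic implementation. */
static size_t run_simd_prefix(chacha20_blocks_fn fn, size_t way,
                              uint32_t *state, uint8_t *dst,
                              const uint8_t *src, size_t nblks)
{
  size_t n = nblks - (nblks % way);
  if (n > 0)
    fn(state, dst, src, n);   /* advances state[12]/state[13] by n */
  return nblks - n;
}
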
- str r12, [sp, #48] - str r14, [sp, #40] - ldr r3, [sp, #52] - sub r3, r3, #256 - cmp r3, #256 - str r3, [sp, #52] - bhs .Lchacha_blocks_neon_mainloop1 - tst r3, r3 - beq .Lchacha_blocks_neon_done -.Lchacha_blocks_neon_mainloop2: - ldr r3, [sp, #52] - ldr r1, [sp, #48] - cmp r3, #64 - bhs .Lchacha_blocks_neon_noswap1 - add r4, sp, #128 - mov r5, r4 - tst r1, r1 - beq .Lchacha_blocks_neon_nocopy1 -.Lchacha_blocks_neon_copyinput1: - subs r3, r3, #1 - ldrb r0, [r1], #1 - strb r0, [r4], #1 - bne .Lchacha_blocks_neon_copyinput1 - str r5, [sp, #48] -.Lchacha_blocks_neon_nocopy1: - ldr r4, [sp, #40] - str r5, [sp, #40] - str r4, [sp, #56] -.Lchacha_blocks_neon_noswap1: - ldr r0, [sp, #44] - str r0, [sp, #0] - add r0, sp, #64 - ldm r0, {r0-r12} - ldr r14, [sp, #(64 +60)] - str r6, [sp, #8] - str r11, [sp, #12] - str r14, [sp, #28] - ldr r11, [sp, #(64 +52)] - ldr r14, [sp, #(64 +56)] -.Lchacha_blocks_neon_rounds2: - ldr r6, [sp, #0] - add r0, r0, r4 - add r1, r1, r5 - eor r12, r12, r0 - eor r11, r11, r1 - ror r12, r12, #16 - ror r11, r11, #16 - subs r6, r6, #2 - add r8, r8, r12 - add r9, r9, r11 - eor r4, r4, r8 - eor r5, r5, r9 - str r6, [sp, #0] - ror r4, r4, #20 - ror r5, r5, #20 - add r0, r0, r4 - add r1, r1, r5 - ldr r6, [sp, #8] - eor r12, r12, r0 - eor r11, r11, r1 - ror r12, r12, #24 - ror r11, r11, #24 - add r8, r8, r12 - add r9, r9, r11 - eor r4, r4, r8 - eor r5, r5, r9 - str r11, [sp, #20] - ror r4, r4, #25 - ror r5, r5, #25 - str r4, [sp, #4] - ldr r4, [sp, #28] - add r2, r2, r6 - add r3, r3, r7 - ldr r11, [sp, #12] - eor r14, r14, r2 - eor r4, r4, r3 - ror r14, r14, #16 - ror r4, r4, #16 - add r10, r10, r14 - add r11, r11, r4 - eor r6, r6, r10 - eor r7, r7, r11 - ror r6, r6, #20 - ror r7, r7, #20 - add r2, r2, r6 - add r3, r3, r7 - eor r14, r14, r2 - eor r4, r4, r3 - ror r14, r14, #24 - ror r4, r4, #24 - add r10, r10, r14 - add r11, r11, r4 - eor r6, r6, r10 - eor r7, r7, r11 - ror r6, r6, #25 - ror r7, r7, #25 - add r0, r0, r5 - add r1, r1, r6 - eor r4, r4, r0 - eor r12, r12, r1 - ror r4, r4, #16 - ror r12, r12, #16 - add r10, r10, r4 - add r11, r11, r12 - eor r5, r5, r10 - eor r6, r6, r11 - ror r5, r5, #20 - ror r6, r6, #20 - add r0, r0, r5 - add r1, r1, r6 - eor r4, r4, r0 - eor r12, r12, r1 - ror r4, r4, #24 - ror r12, r12, #24 - add r10, r10, r4 - add r11, r11, r12 - eor r5, r5, r10 - eor r6, r6, r11 - str r11, [sp, #12] - ror r5, r5, #25 - ror r6, r6, #25 - str r4, [sp, #28] - ldr r4, [sp, #4] - add r2, r2, r7 - add r3, r3, r4 - ldr r11, [sp, #20] - eor r11, r11, r2 - eor r14, r14, r3 - ror r11, r11, #16 - ror r14, r14, #16 - add r8, r8, r11 - add r9, r9, r14 - eor r7, r7, r8 - eor r4, r4, r9 - ror r7, r7, #20 - ror r4, r4, #20 - str r6, [sp, #8] - add r2, r2, r7 - add r3, r3, r4 - eor r11, r11, r2 - eor r14, r14, r3 - ror r11, r11, #24 - ror r14, r14, #24 - add r8, r8, r11 - add r9, r9, r14 - eor r7, r7, r8 - eor r4, r4, r9 - ror r7, r7, #25 - ror r4, r4, #25 - bne .Lchacha_blocks_neon_rounds2 - str r8, [sp, #0] - str r9, [sp, #4] - str r10, [sp, #8] - str r12, [sp, #16] - str r11, [sp, #20] - str r14, [sp, #24] - ldr r12, [sp, #48] - ldr r14, [sp, #40] - ldr r8, [sp, #(64 +0)] - ldr r9, [sp, #(64 +4)] - ldr r10, [sp, #(64 +8)] - ldr r11, [sp, #(64 +12)] - add r0, r0, r8 - add r1, r1, r9 - add r2, r2, r10 - ldr r8, [sp, #(64 +16)] - add r3, r3, r11 - ldr r9, [sp, #(64 +20)] - add r4, r4, r8 - ldr r10, [sp, #(64 +24)] - add r5, r5, r9 - ldr r11, [sp, #(64 +28)] - add r6, r6, r10 - tst r12, r12 - add r7, r7, r11 - beq .Lchacha_blocks_neon_nomessage21 - UNALIGNED_LDMIA4(r12, r8, 
r9, r10, r11) - tst r12, r12 - eor r0, r0, r8 - eor r1, r1, r9 - eor r2, r2, r10 - ldr r8, [r12, #0] - eor r3, r3, r11 - ldr r9, [r12, #4] - eor r4, r4, r8 - ldr r10, [r12, #8] - eor r5, r5, r9 - ldr r11, [r12, #12] - eor r6, r6, r10 - add r12, r12, #16 - eor r7, r7, r11 -.Lchacha_blocks_neon_nomessage21: - UNALIGNED_STMIA8(r14, r0, r1, r2, r3, r4, r5, r6, r7) - ldm sp, {r0-r7} - ldr r8, [sp, #(64 +32)] - ldr r9, [sp, #(64 +36)] - ldr r10, [sp, #(64 +40)] - ldr r11, [sp, #(64 +44)] - add r0, r0, r8 - add r1, r1, r9 - add r2, r2, r10 - ldr r8, [sp, #(64 +48)] - add r3, r3, r11 - ldr r9, [sp, #(64 +52)] - add r4, r4, r8 - ldr r10, [sp, #(64 +56)] - add r5, r5, r9 - ldr r11, [sp, #(64 +60)] - add r6, r6, r10 - adds r8, r8, #1 - add r7, r7, r11 - adc r9, r9, #0 - str r8, [sp, #(64 +48)] - tst r12, r12 - str r9, [sp, #(64 +52)] - beq .Lchacha_blocks_neon_nomessage22 - UNALIGNED_LDMIA4(r12, r8, r9, r10, r11) - tst r12, r12 - eor r0, r0, r8 - eor r1, r1, r9 - eor r2, r2, r10 - ldr r8, [r12, #0] - eor r3, r3, r11 - ldr r9, [r12, #4] - eor r4, r4, r8 - ldr r10, [r12, #8] - eor r5, r5, r9 - ldr r11, [r12, #12] - eor r6, r6, r10 - add r12, r12, #16 - eor r7, r7, r11 -.Lchacha_blocks_neon_nomessage22: - UNALIGNED_STMIA8(r14, r0, r1, r2, r3, r4, r5, r6, r7) - str r12, [sp, #48] - str r14, [sp, #40] - ldr r3, [sp, #52] - cmp r3, #64 - sub r4, r3, #64 - str r4, [sp, #52] - bhi .Lchacha_blocks_neon_mainloop2 - cmp r3, #64 - beq .Lchacha_blocks_neon_nocopy2 - ldr r1, [sp, #56] - sub r14, r14, #64 -.Lchacha_blocks_neon_copyinput2: - subs r3, r3, #1 - ldrb r0, [r14], #1 - strb r0, [r1], #1 - bne .Lchacha_blocks_neon_copyinput2 -.Lchacha_blocks_neon_nocopy2: -.Lchacha_blocks_neon_done: - ldr r7, [sp, #60] - ldr r8, [sp, #(64 +48)] - ldr r9, [sp, #(64 +52)] - str r8, [r7, #(48 + 0)] - str r9, [r7, #(48 + 4)] +#ifdef __PIC__ +# define GET_DATA_POINTER(reg, name, rtmp) \ + ldr reg, 1f; \ + ldr rtmp, 2f; \ + b 3f; \ + 1: .word _GLOBAL_OFFSET_TABLE_-(3f+8); \ + 2: .word name(GOT); \ + 3: add reg, pc, reg; \ + ldr reg, [reg, rtmp]; +#else +# define GET_DATA_POINTER(reg, name, rtmp) ldr reg, =name +#endif + +/* register macros */ +#define INPUT r0 +#define DST r1 +#define SRC r2 +#define NBLKS r3 +#define ROUND r4 + +/* stack structure */ +#define STACK_VEC_X12 (16) +#define STACK_VEC_X13 (STACK_VEC_X12 + 16) +#define STACK_TMP (STACK_VEC_X13 + 16) +#define STACK_TMP1 (16 + STACK_TMP) +#define STACK_TMP2 (16 + STACK_TMP1) + +#define STACK_MAX (16 + STACK_TMP2) + +/* vector registers */ +#define X0 q0 +#define X1 q1 +#define X2 q2 +#define X3 q3 +#define X4 q4 +#define X5 q5 +#define X6 q6 +#define X7 q7 +#define X8 q8 +#define X9 q9 +#define X10 q10 +#define X11 q11 +#define X12 q12 +#define X13 q13 +#define X14 q14 +#define X15 q15 + +#define X0l d0 +#define X1l d2 +#define X2l d4 +#define X3l d6 +#define X4l d8 +#define X5l d10 +#define X6l d12 +#define X7l d14 +#define X8l d16 +#define X9l d18 +#define X10l d20 +#define X11l d22 +#define X12l d24 +#define X13l d26 +#define X14l d28 +#define X15l d30 + +#define X0h d1 +#define X1h d3 +#define X2h d5 +#define X3h d7 +#define X4h d9 +#define X5h d11 +#define X6h d13 +#define X7h d15 +#define X8h d17 +#define X9h d19 +#define X10h d21 +#define X11h d23 +#define X12h d25 +#define X13h d27 +#define X14h d29 +#define X15h d31 + +/********************************************************************** + helper macros + **********************************************************************/ + +/* 4x4 32-bit integer matrix transpose */ +#define transpose_4x4_part1(_q0, _q1, 
_q2, _q3) \ + vtrn.32 _q0, _q1; \ + vtrn.32 _q2, _q3; +#define transpose_4x4_part2(_q0, _q1, _q2, _q3) \ + vswp _q0##h, _q2##l; \ + vswp _q1##h, _q3##l; + +#define clear(x) veor x,x,x; + +/********************************************************************** + 4-way chacha20 + **********************************************************************/ + +#define ROTATE2(dst1,dst2,c,src1,src2) \ + vshl.u32 dst1, src1, #(c); \ + vshl.u32 dst2, src2, #(c); \ + vsri.u32 dst1, src1, #(32 - (c)); \ + vsri.u32 dst2, src2, #(32 - (c)); + +#define ROTATE2_16(dst1,dst2,src1,src2) \ + vrev32.16 dst1, src1; \ + vrev32.16 dst2, src2; + +#define XOR(d,s1,s2) \ + veor d, s2, s1; + +#define PLUS(ds,s) \ + vadd.u32 ds, ds, s; + +#define QUARTERROUND2(a1,b1,c1,d1,a2,b2,c2,d2,ign,tmp1,tmp2) \ + PLUS(a1,b1); PLUS(a2,b2); XOR(tmp1,d1,a1); XOR(tmp2,d2,a2); \ + ROTATE2_16(d1, d2, tmp1, tmp2); \ + PLUS(c1,d1); PLUS(c2,d2); XOR(tmp1,b1,c1); XOR(tmp2,b2,c2); \ + ROTATE2(b1, b2, 12, tmp1, tmp2); \ + PLUS(a1,b1); PLUS(a2,b2); XOR(tmp1,d1,a1); XOR(tmp2,d2,a2); \ + ROTATE2(d1, d2, 8, tmp1, tmp2); \ + PLUS(c1,d1); PLUS(c2,d2); XOR(tmp1,b1,c1); XOR(tmp2,b2,c2); \ + ROTATE2(b1, b2, 7, tmp1, tmp2); + +chacha20_data: +.align 4 +.Linc_counter: + .long 0,1,2,3 + +.align 3 +.globl _gcry_chacha20_armv7_neon_blocks4 +.type _gcry_chacha20_armv7_neon_blocks4,%function; + +_gcry_chacha20_armv7_neon_blocks4: + /* input: + * r0: input + * r1: dst + * r2: src + * r3: nblks (multiple of 4) + */ + + vpush {q4-q7}; + push {r4-r12,lr}; + mov r12, sp - stmia r12!, {r0-r7} - add r12, r12, #48 - stmia r12!, {r0-r7} - sub r0, sp, #8 - ldr sp, [sp, #192] - ldmfd sp!, {r4-r12, r14} - vldm sp!, {q4-q7} - sub r0, sp, r0 - bx lr -.Lchacha_blocks_neon_nobytes: - mov r0, #0; + + mov r6, sp; + sub r6, r6, #(STACK_MAX); + and r6, r6, #(~15); + mov sp, r6; + GET_DATA_POINTER(r9, .Linc_counter, lr); + add lr, INPUT, #(12*4); + add r8, sp, #STACK_VEC_X12; + +.Loop4: + mov ROUND, #20; + + /* Construct counter vectors X12 and X13 */ + + vld1.8 {X15}, [lr]; + mov lr, INPUT; + vld1.8 {X8}, [r9]; + vdup.32 X12, X15l[0]; + vdup.32 X13, X15l[1]; + vld1.8 {X3}, [lr]!; + vadd.u32 X12, X12, X8; + vdup.32 X0, X3l[0]; + vdup.32 X1, X3l[1]; + vdup.32 X2, X3h[0]; + vcgt.u32 X8, X8, X12; + vdup.32 X3, X3h[1]; + vdup.32 X14, X15h[0]; + vdup.32 X15, X15h[1]; + vsub.u32 X13, X13, X8; + vld1.8 {X7}, [lr]!; + vld1.8 {X11}, [lr]; + vst1.8 {X12, X13}, [r8]; + vdup.32 X4, X7l[0]; + vdup.32 X5, X7l[1]; + vdup.32 X6, X7h[0]; + vdup.32 X7, X7h[1]; + vdup.32 X8, X11l[0]; + vdup.32 X9, X11l[1]; + vdup.32 X10, X11h[0]; + vdup.32 X11, X11h[1]; + + add r7, sp, #STACK_TMP2; + add r6, sp, #STACK_TMP1; + add r5, sp, #STACK_TMP; + vst1.8 {X15}, [r6]; + vst1.8 {X11}, [r5]; + + mov lr, INPUT; +.Lround2: + subs ROUND, ROUND, #2 + QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15) + vld1.8 {X11}, [r5]; + vld1.8 {X15}, [r6]; + vst1.8 {X8}, [r5]; + vst1.8 {X9}, [r6]; + QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9) + QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9) + vld1.8 {X8}, [r5]; + vld1.8 {X9}, [r6]; + vst1.8 {X11}, [r5]; + vst1.8 {X15}, [r6]; + QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15) + bne .Lround2; + + vld1.8 {X11}, [lr]!; + vst1.8 {X14}, [r7]; + + vdup.32 X14, X11l[0]; /* INPUT + 0 * 4 */ + vdup.32 X15, X11l[1]; /* INPUT + 1 * 4 */ + PLUS(X0, X14); + PLUS(X1, X15); + vdup.32 X14, X11h[0]; /* INPUT + 2 * 4 */ + vdup.32 X15, X11h[1]; /* INPUT + 3 * 4 */ + PLUS(X2, X14); + PLUS(X3, X15); + + vld1.8 {X11}, [r5]; + vld1.8 {X15}, 
[r6]; + vst1.8 {X0}, [r5]; + vld1.8 {X0}, [lr]!; + vst1.8 {X1}, [r6]; + + vdup.32 X14, X0l[0]; /* INPUT + 4 * 4 */ + vdup.32 X1, X0l[1]; /* INPUT + 5 * 4 */ + PLUS(X4, X14); + PLUS(X5, X1); + vdup.32 X14, X0h[0]; /* INPUT + 6 * 4 */ + vdup.32 X1, X0h[1]; /* INPUT + 7 * 4 */ + PLUS(X6, X14); + PLUS(X7, X1); + + vld1.8 {X0}, [lr]!; + + vdup.32 X14, X0l[0]; /* INPUT + 8 * 4 */ + vdup.32 X1, X0l[1]; /* INPUT + 9 * 4 */ + PLUS(X8, X14); + PLUS(X9, X1); + vdup.32 X14, X0h[0]; /* INPUT + 10 * 4 */ + vdup.32 X1, X0h[1]; /* INPUT + 11 * 4 */ + PLUS(X10, X14); + PLUS(X11, X1); + + vld1.8 {X0}, [lr]; + add lr, INPUT, #(12*4) + vld1.8 {X14}, [r7]; + + vdup.32 X1, X0h[0]; /* INPUT + 10 * 4 */ + ldm lr, {r10, r11}; /* Update counter */ + vdup.32 X0, X0h[1]; /* INPUT + 11 * 4 */ + PLUS(X14, X1); + PLUS(X15, X0); + adds r10, r10, #4; /* Update counter */ + vld1.8 {X0, X1}, [r8]; + + PLUS(X12, X0); + vld1.8 {X0}, [r5]; + PLUS(X13, X1); + adc r11, r11, #0; /* Update counter */ + + vld1.8 {X1}, [r6]; + stm lr, {r10, r11}; /* Update counter */ + transpose_4x4_part1(X0, X1, X2, X3); + transpose_4x4_part1(X4, X5, X6, X7); + transpose_4x4_part1(X8, X9, X10, X11); + transpose_4x4_part1(X12, X13, X14, X15); + transpose_4x4_part2(X0, X1, X2, X3); + transpose_4x4_part2(X4, X5, X6, X7); + transpose_4x4_part2(X8, X9, X10, X11); + transpose_4x4_part2(X12, X13, X14, X15); + + subs NBLKS, NBLKS, #4; + + vst1.8 {X10}, [r5]; + add lr, INPUT, #(12*4) + vst1.8 {X11}, [r6]; + vld1.8 {X10, X11}, [SRC]!; + veor X10, X0, X10; + vld1.8 {X0}, [SRC]!; + veor X11, X4, X11; + vld1.8 {X4}, [SRC]!; + vst1.8 {X10, X11}, [DST]!; + vld1.8 {X10, X11}, [SRC]!; + veor X0, X8, X0; + veor X4, X12, X4; + veor X10, X1, X10; + veor X11, X5, X11; + vst1.8 {X0}, [DST]!; + vld1.8 {X0, X1}, [SRC]!; + vst1.8 {X4}, [DST]!; + vld1.8 {X4, X5}, [SRC]!; + vst1.8 {X10, X11}, [DST]!; + vld1.8 {X10}, [r5]; + vld1.8 {X11}, [r6]; + veor X0, X9, X0; + vld1.8 {X8, X9}, [SRC]!; + veor X1, X13, X1; + vld1.8 {X12, X13}, [SRC]!; + veor X4, X2, X4; + veor X5, X6, X5; + vst1.8 {X0, X1}, [DST]!; + vld1.8 {X0, X1}, [SRC]!; + vst1.8 {X4, X5}, [DST]!; + veor X8, X10, X8; + veor X9, X14, X9; + veor X12, X3, X12; + veor X13, X7, X13; + veor X0, X11, X0; + veor X1, X15, X1; + vst1.8 {X8, X9}, [DST]!; + vst1.8 {X12, X13}, [DST]!; + vst1.8 {X0, X1}, [DST]!; + + bne .Loop4; + + /* clear the used vector registers and stack */ + clear(X0); + vst1.8 {X0}, [r5]; + vst1.8 {X0}, [r6]; + vst1.8 {X0}, [r7]; + vst1.8 {X0}, [r8]!; + vst1.8 {X0}, [r8]; + + mov sp, r12 + clear(X1); + clear(X2); + clear(X3); + clear(X4); + clear(X5); + clear(X6); + clear(X7); + clear(X8); + clear(X9); + clear(X10); + clear(X11); + clear(X12); + clear(X13); + clear(X14); + clear(X15); + + pop {r4-r12,lr} + vpop {q4-q7} + eor r0, r0, r0 bx lr -.ltorg -.size _gcry_chacha20_armv7_neon_blocks,.-_gcry_chacha20_armv7_neon_blocks; +.size _gcry_chacha20_armv7_neon_blocks4, .-_gcry_chacha20_armv7_neon_blocks4; #endif diff --git a/cipher/chacha20-avx2-amd64.S b/cipher/chacha20-avx2-amd64.S deleted file mode 100644 index 8c085bad..00000000 --- a/cipher/chacha20-avx2-amd64.S +++ /dev/null @@ -1,956 +0,0 @@ -/* chacha20-avx2-amd64.S - AMD64/AVX2 implementation of ChaCha20 - * - * Copyright (C) 2014 Jussi Kivilinna - * - * This file is part of Libgcrypt. - * - * Libgcrypt is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as - * published by the Free Software Foundation; either version 2.1 of - * the License, or (at your option) any later version. 
- * - * Libgcrypt is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this program; if not, see . - */ - -/* - * Based on public domain implementation by Andrew Moon at - * https://github.com/floodyberry/chacha-opt - */ - -#ifdef __x86_64__ -#include - -#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ - defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ - defined(ENABLE_AVX2_SUPPORT) && USE_CHACHA20 - -#ifdef __PIC__ -# define RIP (%rip) -#else -# define RIP -#endif - -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) /*_*/ -#endif - -.text - -.align 8 -.globl _gcry_chacha20_amd64_avx2_blocks -ELF(.type _gcry_chacha20_amd64_avx2_blocks,@function;) -_gcry_chacha20_amd64_avx2_blocks: -.Lchacha_blocks_avx2_local: - vzeroupper - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - movq %rsp, %rbp - andq $~63, %rsp - subq $512, %rsp - leaq .LC RIP, %rax - vmovdqu 0(%rax), %xmm6 - vmovdqu 16(%rax), %xmm7 - vmovdqu 0(%rdi), %xmm8 - vmovdqu 16(%rdi), %xmm9 - vmovdqu 32(%rdi), %xmm10 - vmovdqu 48(%rdi), %xmm11 - movl $20, %eax - movq $1, %r9 - vmovdqa %xmm8, 0(%rsp) - vmovdqa %xmm9, 16(%rsp) - vmovdqa %xmm10, 32(%rsp) - vmovdqa %xmm11, 48(%rsp) - movq %rax, 64(%rsp) - vmovdqa %xmm6, 448(%rsp) - vmovdqa %xmm6, 464(%rsp) - vmovdqa %xmm7, 480(%rsp) - vmovdqa %xmm7, 496(%rsp) - cmpq $512, %rcx - jae .Lchacha_blocks_avx2_atleast512 - cmp $256, %rcx - jae .Lchacha_blocks_avx2_atleast256 - jmp .Lchacha_blocks_avx2_below256 - .p2align 6,,63 -.Lchacha_blocks_avx2_atleast512: - movq 48(%rsp), %rax - leaq 1(%rax), %r8 - leaq 2(%rax), %r9 - leaq 3(%rax), %r10 - leaq 4(%rax), %rbx - leaq 5(%rax), %r11 - leaq 6(%rax), %r12 - leaq 7(%rax), %r13 - leaq 8(%rax), %r14 - movl %eax, 128(%rsp) - movl %r8d, 4+128(%rsp) - movl %r9d, 8+128(%rsp) - movl %r10d, 12+128(%rsp) - movl %ebx, 16+128(%rsp) - movl %r11d, 20+128(%rsp) - movl %r12d, 24+128(%rsp) - movl %r13d, 28+128(%rsp) - shrq $32, %rax - shrq $32, %r8 - shrq $32, %r9 - shrq $32, %r10 - shrq $32, %rbx - shrq $32, %r11 - shrq $32, %r12 - shrq $32, %r13 - movl %eax, 160(%rsp) - movl %r8d, 4+160(%rsp) - movl %r9d, 8+160(%rsp) - movl %r10d, 12+160(%rsp) - movl %ebx, 16+160(%rsp) - movl %r11d, 20+160(%rsp) - movl %r12d, 24+160(%rsp) - movl %r13d, 28+160(%rsp) - movq %r14, 48(%rsp) - movq 64(%rsp), %rax - vpbroadcastd 0(%rsp), %ymm0 - vpbroadcastd 4+0(%rsp), %ymm1 - vpbroadcastd 8+0(%rsp), %ymm2 - vpbroadcastd 12+0(%rsp), %ymm3 - vpbroadcastd 16(%rsp), %ymm4 - vpbroadcastd 4+16(%rsp), %ymm5 - vpbroadcastd 8+16(%rsp), %ymm6 - vpbroadcastd 12+16(%rsp), %ymm7 - vpbroadcastd 32(%rsp), %ymm8 - vpbroadcastd 4+32(%rsp), %ymm9 - vpbroadcastd 8+32(%rsp), %ymm10 - vpbroadcastd 12+32(%rsp), %ymm11 - vpbroadcastd 8+48(%rsp), %ymm14 - vpbroadcastd 12+48(%rsp), %ymm15 - vmovdqa 128(%rsp), %ymm12 - vmovdqa 160(%rsp), %ymm13 -.Lchacha_blocks_avx2_mainloop1: - vpaddd %ymm0, %ymm4, %ymm0 - vpaddd %ymm1, %ymm5, %ymm1 - vpxor %ymm12, %ymm0, %ymm12 - vpxor %ymm13, %ymm1, %ymm13 - vpaddd %ymm2, %ymm6, %ymm2 - vpaddd %ymm3, %ymm7, %ymm3 - vpxor %ymm14, %ymm2, %ymm14 - vpxor %ymm15, %ymm3, %ymm15 - vpshufb 448(%rsp), %ymm12, %ymm12 - vpshufb 448(%rsp), %ymm13, %ymm13 - vpaddd %ymm8, %ymm12, %ymm8 - vpaddd %ymm9, %ymm13, %ymm9 - vpshufb 
448(%rsp), %ymm14, %ymm14 - vpshufb 448(%rsp), %ymm15, %ymm15 - vpaddd %ymm10, %ymm14, %ymm10 - vpaddd %ymm11, %ymm15, %ymm11 - vmovdqa %ymm12, 96(%rsp) - vpxor %ymm4, %ymm8, %ymm4 - vpxor %ymm5, %ymm9, %ymm5 - vpslld $ 12, %ymm4, %ymm12 - vpsrld $20, %ymm4, %ymm4 - vpxor %ymm4, %ymm12, %ymm4 - vpslld $ 12, %ymm5, %ymm12 - vpsrld $20, %ymm5, %ymm5 - vpxor %ymm5, %ymm12, %ymm5 - vpxor %ymm6, %ymm10, %ymm6 - vpxor %ymm7, %ymm11, %ymm7 - vpslld $ 12, %ymm6, %ymm12 - vpsrld $20, %ymm6, %ymm6 - vpxor %ymm6, %ymm12, %ymm6 - vpslld $ 12, %ymm7, %ymm12 - vpsrld $20, %ymm7, %ymm7 - vpxor %ymm7, %ymm12, %ymm7 - vpaddd %ymm0, %ymm4, %ymm0 - vpaddd %ymm1, %ymm5, %ymm1 - vpxor 96(%rsp), %ymm0, %ymm12 - vpxor %ymm13, %ymm1, %ymm13 - vpaddd %ymm2, %ymm6, %ymm2 - vpaddd %ymm3, %ymm7, %ymm3 - vpxor %ymm14, %ymm2, %ymm14 - vpxor %ymm15, %ymm3, %ymm15 - vpshufb 480(%rsp), %ymm12, %ymm12 - vpshufb 480(%rsp), %ymm13, %ymm13 - vpaddd %ymm8, %ymm12, %ymm8 - vpaddd %ymm9, %ymm13, %ymm9 - vpshufb 480(%rsp), %ymm14, %ymm14 - vpshufb 480(%rsp), %ymm15, %ymm15 - vpaddd %ymm10, %ymm14, %ymm10 - vpaddd %ymm11, %ymm15, %ymm11 - vmovdqa %ymm12, 96(%rsp) - vpxor %ymm4, %ymm8, %ymm4 - vpxor %ymm5, %ymm9, %ymm5 - vpslld $ 7, %ymm4, %ymm12 - vpsrld $25, %ymm4, %ymm4 - vpxor %ymm4, %ymm12, %ymm4 - vpslld $ 7, %ymm5, %ymm12 - vpsrld $25, %ymm5, %ymm5 - vpxor %ymm5, %ymm12, %ymm5 - vpxor %ymm6, %ymm10, %ymm6 - vpxor %ymm7, %ymm11, %ymm7 - vpslld $ 7, %ymm6, %ymm12 - vpsrld $25, %ymm6, %ymm6 - vpxor %ymm6, %ymm12, %ymm6 - vpslld $ 7, %ymm7, %ymm12 - vpsrld $25, %ymm7, %ymm7 - vpxor %ymm7, %ymm12, %ymm7 - vpaddd %ymm0, %ymm5, %ymm0 - vpaddd %ymm1, %ymm6, %ymm1 - vpxor %ymm15, %ymm0, %ymm15 - vpxor 96(%rsp), %ymm1, %ymm12 - vpaddd %ymm2, %ymm7, %ymm2 - vpaddd %ymm3, %ymm4, %ymm3 - vpxor %ymm13, %ymm2, %ymm13 - vpxor %ymm14, %ymm3, %ymm14 - vpshufb 448(%rsp), %ymm15, %ymm15 - vpshufb 448(%rsp), %ymm12, %ymm12 - vpaddd %ymm10, %ymm15, %ymm10 - vpaddd %ymm11, %ymm12, %ymm11 - vpshufb 448(%rsp), %ymm13, %ymm13 - vpshufb 448(%rsp), %ymm14, %ymm14 - vpaddd %ymm8, %ymm13, %ymm8 - vpaddd %ymm9, %ymm14, %ymm9 - vmovdqa %ymm15, 96(%rsp) - vpxor %ymm5, %ymm10, %ymm5 - vpxor %ymm6, %ymm11, %ymm6 - vpslld $ 12, %ymm5, %ymm15 - vpsrld $20, %ymm5, %ymm5 - vpxor %ymm5, %ymm15, %ymm5 - vpslld $ 12, %ymm6, %ymm15 - vpsrld $20, %ymm6, %ymm6 - vpxor %ymm6, %ymm15, %ymm6 - vpxor %ymm7, %ymm8, %ymm7 - vpxor %ymm4, %ymm9, %ymm4 - vpslld $ 12, %ymm7, %ymm15 - vpsrld $20, %ymm7, %ymm7 - vpxor %ymm7, %ymm15, %ymm7 - vpslld $ 12, %ymm4, %ymm15 - vpsrld $20, %ymm4, %ymm4 - vpxor %ymm4, %ymm15, %ymm4 - vpaddd %ymm0, %ymm5, %ymm0 - vpaddd %ymm1, %ymm6, %ymm1 - vpxor 96(%rsp), %ymm0, %ymm15 - vpxor %ymm12, %ymm1, %ymm12 - vpaddd %ymm2, %ymm7, %ymm2 - vpaddd %ymm3, %ymm4, %ymm3 - vpxor %ymm13, %ymm2, %ymm13 - vpxor %ymm14, %ymm3, %ymm14 - vpshufb 480(%rsp), %ymm15, %ymm15 - vpshufb 480(%rsp), %ymm12, %ymm12 - vpaddd %ymm10, %ymm15, %ymm10 - vpaddd %ymm11, %ymm12, %ymm11 - vpshufb 480(%rsp), %ymm13, %ymm13 - vpshufb 480(%rsp), %ymm14, %ymm14 - vpaddd %ymm8, %ymm13, %ymm8 - vpaddd %ymm9, %ymm14, %ymm9 - vmovdqa %ymm15, 96(%rsp) - vpxor %ymm5, %ymm10, %ymm5 - vpxor %ymm6, %ymm11, %ymm6 - vpslld $ 7, %ymm5, %ymm15 - vpsrld $25, %ymm5, %ymm5 - vpxor %ymm5, %ymm15, %ymm5 - vpslld $ 7, %ymm6, %ymm15 - vpsrld $25, %ymm6, %ymm6 - vpxor %ymm6, %ymm15, %ymm6 - vpxor %ymm7, %ymm8, %ymm7 - vpxor %ymm4, %ymm9, %ymm4 - vpslld $ 7, %ymm7, %ymm15 - vpsrld $25, %ymm7, %ymm7 - vpxor %ymm7, %ymm15, %ymm7 - vpslld $ 7, %ymm4, %ymm15 - vpsrld $25, %ymm4, %ymm4 - vpxor %ymm4, %ymm15, 
%ymm4 - vmovdqa 96(%rsp), %ymm15 - subq $2, %rax - jnz .Lchacha_blocks_avx2_mainloop1 - vmovdqa %ymm8, 192(%rsp) - vmovdqa %ymm9, 224(%rsp) - vmovdqa %ymm10, 256(%rsp) - vmovdqa %ymm11, 288(%rsp) - vmovdqa %ymm12, 320(%rsp) - vmovdqa %ymm13, 352(%rsp) - vmovdqa %ymm14, 384(%rsp) - vmovdqa %ymm15, 416(%rsp) - vpbroadcastd 0(%rsp), %ymm8 - vpbroadcastd 4+0(%rsp), %ymm9 - vpbroadcastd 8+0(%rsp), %ymm10 - vpbroadcastd 12+0(%rsp), %ymm11 - vpbroadcastd 16(%rsp), %ymm12 - vpbroadcastd 4+16(%rsp), %ymm13 - vpbroadcastd 8+16(%rsp), %ymm14 - vpbroadcastd 12+16(%rsp), %ymm15 - vpaddd %ymm8, %ymm0, %ymm0 - vpaddd %ymm9, %ymm1, %ymm1 - vpaddd %ymm10, %ymm2, %ymm2 - vpaddd %ymm11, %ymm3, %ymm3 - vpaddd %ymm12, %ymm4, %ymm4 - vpaddd %ymm13, %ymm5, %ymm5 - vpaddd %ymm14, %ymm6, %ymm6 - vpaddd %ymm15, %ymm7, %ymm7 - vpunpckldq %ymm1, %ymm0, %ymm8 - vpunpckldq %ymm3, %ymm2, %ymm9 - vpunpckhdq %ymm1, %ymm0, %ymm12 - vpunpckhdq %ymm3, %ymm2, %ymm13 - vpunpckldq %ymm5, %ymm4, %ymm10 - vpunpckldq %ymm7, %ymm6, %ymm11 - vpunpckhdq %ymm5, %ymm4, %ymm14 - vpunpckhdq %ymm7, %ymm6, %ymm15 - vpunpcklqdq %ymm9, %ymm8, %ymm0 - vpunpcklqdq %ymm11, %ymm10, %ymm1 - vpunpckhqdq %ymm9, %ymm8, %ymm2 - vpunpckhqdq %ymm11, %ymm10, %ymm3 - vpunpcklqdq %ymm13, %ymm12, %ymm4 - vpunpcklqdq %ymm15, %ymm14, %ymm5 - vpunpckhqdq %ymm13, %ymm12, %ymm6 - vpunpckhqdq %ymm15, %ymm14, %ymm7 - vperm2i128 $0x20, %ymm1, %ymm0, %ymm8 - vperm2i128 $0x20, %ymm3, %ymm2, %ymm9 - vperm2i128 $0x31, %ymm1, %ymm0, %ymm12 - vperm2i128 $0x31, %ymm3, %ymm2, %ymm13 - vperm2i128 $0x20, %ymm5, %ymm4, %ymm10 - vperm2i128 $0x20, %ymm7, %ymm6, %ymm11 - vperm2i128 $0x31, %ymm5, %ymm4, %ymm14 - vperm2i128 $0x31, %ymm7, %ymm6, %ymm15 - andq %rsi, %rsi - jz .Lchacha_blocks_avx2_noinput1 - vpxor 0(%rsi), %ymm8, %ymm8 - vpxor 64(%rsi), %ymm9, %ymm9 - vpxor 128(%rsi), %ymm10, %ymm10 - vpxor 192(%rsi), %ymm11, %ymm11 - vpxor 256(%rsi), %ymm12, %ymm12 - vpxor 320(%rsi), %ymm13, %ymm13 - vpxor 384(%rsi), %ymm14, %ymm14 - vpxor 448(%rsi), %ymm15, %ymm15 - vmovdqu %ymm8, 0(%rdx) - vmovdqu %ymm9, 64(%rdx) - vmovdqu %ymm10, 128(%rdx) - vmovdqu %ymm11, 192(%rdx) - vmovdqu %ymm12, 256(%rdx) - vmovdqu %ymm13, 320(%rdx) - vmovdqu %ymm14, 384(%rdx) - vmovdqu %ymm15, 448(%rdx) - vmovdqa 192(%rsp), %ymm0 - vmovdqa 224(%rsp), %ymm1 - vmovdqa 256(%rsp), %ymm2 - vmovdqa 288(%rsp), %ymm3 - vmovdqa 320(%rsp), %ymm4 - vmovdqa 352(%rsp), %ymm5 - vmovdqa 384(%rsp), %ymm6 - vmovdqa 416(%rsp), %ymm7 - vpbroadcastd 32(%rsp), %ymm8 - vpbroadcastd 4+32(%rsp), %ymm9 - vpbroadcastd 8+32(%rsp), %ymm10 - vpbroadcastd 12+32(%rsp), %ymm11 - vmovdqa 128(%rsp), %ymm12 - vmovdqa 160(%rsp), %ymm13 - vpbroadcastd 8+48(%rsp), %ymm14 - vpbroadcastd 12+48(%rsp), %ymm15 - vpaddd %ymm8, %ymm0, %ymm0 - vpaddd %ymm9, %ymm1, %ymm1 - vpaddd %ymm10, %ymm2, %ymm2 - vpaddd %ymm11, %ymm3, %ymm3 - vpaddd %ymm12, %ymm4, %ymm4 - vpaddd %ymm13, %ymm5, %ymm5 - vpaddd %ymm14, %ymm6, %ymm6 - vpaddd %ymm15, %ymm7, %ymm7 - vpunpckldq %ymm1, %ymm0, %ymm8 - vpunpckldq %ymm3, %ymm2, %ymm9 - vpunpckhdq %ymm1, %ymm0, %ymm12 - vpunpckhdq %ymm3, %ymm2, %ymm13 - vpunpckldq %ymm5, %ymm4, %ymm10 - vpunpckldq %ymm7, %ymm6, %ymm11 - vpunpckhdq %ymm5, %ymm4, %ymm14 - vpunpckhdq %ymm7, %ymm6, %ymm15 - vpunpcklqdq %ymm9, %ymm8, %ymm0 - vpunpcklqdq %ymm11, %ymm10, %ymm1 - vpunpckhqdq %ymm9, %ymm8, %ymm2 - vpunpckhqdq %ymm11, %ymm10, %ymm3 - vpunpcklqdq %ymm13, %ymm12, %ymm4 - vpunpcklqdq %ymm15, %ymm14, %ymm5 - vpunpckhqdq %ymm13, %ymm12, %ymm6 - vpunpckhqdq %ymm15, %ymm14, %ymm7 - vperm2i128 $0x20, %ymm1, %ymm0, %ymm8 - vperm2i128 $0x20, 
%ymm3, %ymm2, %ymm9 - vperm2i128 $0x31, %ymm1, %ymm0, %ymm12 - vperm2i128 $0x31, %ymm3, %ymm2, %ymm13 - vperm2i128 $0x20, %ymm5, %ymm4, %ymm10 - vperm2i128 $0x20, %ymm7, %ymm6, %ymm11 - vperm2i128 $0x31, %ymm5, %ymm4, %ymm14 - vperm2i128 $0x31, %ymm7, %ymm6, %ymm15 - vpxor 32(%rsi), %ymm8, %ymm8 - vpxor 96(%rsi), %ymm9, %ymm9 - vpxor 160(%rsi), %ymm10, %ymm10 - vpxor 224(%rsi), %ymm11, %ymm11 - vpxor 288(%rsi), %ymm12, %ymm12 - vpxor 352(%rsi), %ymm13, %ymm13 - vpxor 416(%rsi), %ymm14, %ymm14 - vpxor 480(%rsi), %ymm15, %ymm15 - vmovdqu %ymm8, 32(%rdx) - vmovdqu %ymm9, 96(%rdx) - vmovdqu %ymm10, 160(%rdx) - vmovdqu %ymm11, 224(%rdx) - vmovdqu %ymm12, 288(%rdx) - vmovdqu %ymm13, 352(%rdx) - vmovdqu %ymm14, 416(%rdx) - vmovdqu %ymm15, 480(%rdx) - addq $512, %rsi - jmp .Lchacha_blocks_avx2_mainloop1_cont -.Lchacha_blocks_avx2_noinput1: - vmovdqu %ymm8, 0(%rdx) - vmovdqu %ymm9, 64(%rdx) - vmovdqu %ymm10, 128(%rdx) - vmovdqu %ymm11, 192(%rdx) - vmovdqu %ymm12, 256(%rdx) - vmovdqu %ymm13, 320(%rdx) - vmovdqu %ymm14, 384(%rdx) - vmovdqu %ymm15, 448(%rdx) - vmovdqa 192(%rsp), %ymm0 - vmovdqa 224(%rsp), %ymm1 - vmovdqa 256(%rsp), %ymm2 - vmovdqa 288(%rsp), %ymm3 - vmovdqa 320(%rsp), %ymm4 - vmovdqa 352(%rsp), %ymm5 - vmovdqa 384(%rsp), %ymm6 - vmovdqa 416(%rsp), %ymm7 - vpbroadcastd 32(%rsp), %ymm8 - vpbroadcastd 4+32(%rsp), %ymm9 - vpbroadcastd 8+32(%rsp), %ymm10 - vpbroadcastd 12+32(%rsp), %ymm11 - vmovdqa 128(%rsp), %ymm12 - vmovdqa 160(%rsp), %ymm13 - vpbroadcastd 8+48(%rsp), %ymm14 - vpbroadcastd 12+48(%rsp), %ymm15 - vpaddd %ymm8, %ymm0, %ymm0 - vpaddd %ymm9, %ymm1, %ymm1 - vpaddd %ymm10, %ymm2, %ymm2 - vpaddd %ymm11, %ymm3, %ymm3 - vpaddd %ymm12, %ymm4, %ymm4 - vpaddd %ymm13, %ymm5, %ymm5 - vpaddd %ymm14, %ymm6, %ymm6 - vpaddd %ymm15, %ymm7, %ymm7 - vpunpckldq %ymm1, %ymm0, %ymm8 - vpunpckldq %ymm3, %ymm2, %ymm9 - vpunpckhdq %ymm1, %ymm0, %ymm12 - vpunpckhdq %ymm3, %ymm2, %ymm13 - vpunpckldq %ymm5, %ymm4, %ymm10 - vpunpckldq %ymm7, %ymm6, %ymm11 - vpunpckhdq %ymm5, %ymm4, %ymm14 - vpunpckhdq %ymm7, %ymm6, %ymm15 - vpunpcklqdq %ymm9, %ymm8, %ymm0 - vpunpcklqdq %ymm11, %ymm10, %ymm1 - vpunpckhqdq %ymm9, %ymm8, %ymm2 - vpunpckhqdq %ymm11, %ymm10, %ymm3 - vpunpcklqdq %ymm13, %ymm12, %ymm4 - vpunpcklqdq %ymm15, %ymm14, %ymm5 - vpunpckhqdq %ymm13, %ymm12, %ymm6 - vpunpckhqdq %ymm15, %ymm14, %ymm7 - vperm2i128 $0x20, %ymm1, %ymm0, %ymm8 - vperm2i128 $0x20, %ymm3, %ymm2, %ymm9 - vperm2i128 $0x31, %ymm1, %ymm0, %ymm12 - vperm2i128 $0x31, %ymm3, %ymm2, %ymm13 - vperm2i128 $0x20, %ymm5, %ymm4, %ymm10 - vperm2i128 $0x20, %ymm7, %ymm6, %ymm11 - vperm2i128 $0x31, %ymm5, %ymm4, %ymm14 - vperm2i128 $0x31, %ymm7, %ymm6, %ymm15 - vmovdqu %ymm8, 32(%rdx) - vmovdqu %ymm9, 96(%rdx) - vmovdqu %ymm10, 160(%rdx) - vmovdqu %ymm11, 224(%rdx) - vmovdqu %ymm12, 288(%rdx) - vmovdqu %ymm13, 352(%rdx) - vmovdqu %ymm14, 416(%rdx) - vmovdqu %ymm15, 480(%rdx) -.Lchacha_blocks_avx2_mainloop1_cont: - addq $512, %rdx - subq $512, %rcx - cmp $512, %rcx - jae .Lchacha_blocks_avx2_atleast512 - cmp $256, %rcx - jb .Lchacha_blocks_avx2_below256_fixup -.Lchacha_blocks_avx2_atleast256: - movq 48(%rsp), %rax - leaq 1(%rax), %r8 - leaq 2(%rax), %r9 - leaq 3(%rax), %r10 - leaq 4(%rax), %rbx - movl %eax, 128(%rsp) - movl %r8d, 4+128(%rsp) - movl %r9d, 8+128(%rsp) - movl %r10d, 12+128(%rsp) - shrq $32, %rax - shrq $32, %r8 - shrq $32, %r9 - shrq $32, %r10 - movl %eax, 160(%rsp) - movl %r8d, 4+160(%rsp) - movl %r9d, 8+160(%rsp) - movl %r10d, 12+160(%rsp) - movq %rbx, 48(%rsp) - movq 64(%rsp), %rax - vpbroadcastd 0(%rsp), %xmm0 - vpbroadcastd 
4+0(%rsp), %xmm1 - vpbroadcastd 8+0(%rsp), %xmm2 - vpbroadcastd 12+0(%rsp), %xmm3 - vpbroadcastd 16(%rsp), %xmm4 - vpbroadcastd 4+16(%rsp), %xmm5 - vpbroadcastd 8+16(%rsp), %xmm6 - vpbroadcastd 12+16(%rsp), %xmm7 - vpbroadcastd 32(%rsp), %xmm8 - vpbroadcastd 4+32(%rsp), %xmm9 - vpbroadcastd 8+32(%rsp), %xmm10 - vpbroadcastd 12+32(%rsp), %xmm11 - vmovdqa 128(%rsp), %xmm12 - vmovdqa 160(%rsp), %xmm13 - vpbroadcastd 8+48(%rsp), %xmm14 - vpbroadcastd 12+48(%rsp), %xmm15 -.Lchacha_blocks_avx2_mainloop2: - vpaddd %xmm0, %xmm4, %xmm0 - vpaddd %xmm1, %xmm5, %xmm1 - vpxor %xmm12, %xmm0, %xmm12 - vpxor %xmm13, %xmm1, %xmm13 - vpaddd %xmm2, %xmm6, %xmm2 - vpaddd %xmm3, %xmm7, %xmm3 - vpxor %xmm14, %xmm2, %xmm14 - vpxor %xmm15, %xmm3, %xmm15 - vpshufb 448(%rsp), %xmm12, %xmm12 - vpshufb 448(%rsp), %xmm13, %xmm13 - vpaddd %xmm8, %xmm12, %xmm8 - vpaddd %xmm9, %xmm13, %xmm9 - vpshufb 448(%rsp), %xmm14, %xmm14 - vpshufb 448(%rsp), %xmm15, %xmm15 - vpaddd %xmm10, %xmm14, %xmm10 - vpaddd %xmm11, %xmm15, %xmm11 - vmovdqa %xmm12, 96(%rsp) - vpxor %xmm4, %xmm8, %xmm4 - vpxor %xmm5, %xmm9, %xmm5 - vpslld $ 12, %xmm4, %xmm12 - vpsrld $20, %xmm4, %xmm4 - vpxor %xmm4, %xmm12, %xmm4 - vpslld $ 12, %xmm5, %xmm12 - vpsrld $20, %xmm5, %xmm5 - vpxor %xmm5, %xmm12, %xmm5 - vpxor %xmm6, %xmm10, %xmm6 - vpxor %xmm7, %xmm11, %xmm7 - vpslld $ 12, %xmm6, %xmm12 - vpsrld $20, %xmm6, %xmm6 - vpxor %xmm6, %xmm12, %xmm6 - vpslld $ 12, %xmm7, %xmm12 - vpsrld $20, %xmm7, %xmm7 - vpxor %xmm7, %xmm12, %xmm7 - vpaddd %xmm0, %xmm4, %xmm0 - vpaddd %xmm1, %xmm5, %xmm1 - vpxor 96(%rsp), %xmm0, %xmm12 - vpxor %xmm13, %xmm1, %xmm13 - vpaddd %xmm2, %xmm6, %xmm2 - vpaddd %xmm3, %xmm7, %xmm3 - vpxor %xmm14, %xmm2, %xmm14 - vpxor %xmm15, %xmm3, %xmm15 - vpshufb 480(%rsp), %xmm12, %xmm12 - vpshufb 480(%rsp), %xmm13, %xmm13 - vpaddd %xmm8, %xmm12, %xmm8 - vpaddd %xmm9, %xmm13, %xmm9 - vpshufb 480(%rsp), %xmm14, %xmm14 - vpshufb 480(%rsp), %xmm15, %xmm15 - vpaddd %xmm10, %xmm14, %xmm10 - vpaddd %xmm11, %xmm15, %xmm11 - vmovdqa %xmm12, 96(%rsp) - vpxor %xmm4, %xmm8, %xmm4 - vpxor %xmm5, %xmm9, %xmm5 - vpslld $ 7, %xmm4, %xmm12 - vpsrld $25, %xmm4, %xmm4 - vpxor %xmm4, %xmm12, %xmm4 - vpslld $ 7, %xmm5, %xmm12 - vpsrld $25, %xmm5, %xmm5 - vpxor %xmm5, %xmm12, %xmm5 - vpxor %xmm6, %xmm10, %xmm6 - vpxor %xmm7, %xmm11, %xmm7 - vpslld $ 7, %xmm6, %xmm12 - vpsrld $25, %xmm6, %xmm6 - vpxor %xmm6, %xmm12, %xmm6 - vpslld $ 7, %xmm7, %xmm12 - vpsrld $25, %xmm7, %xmm7 - vpxor %xmm7, %xmm12, %xmm7 - vpaddd %xmm0, %xmm5, %xmm0 - vpaddd %xmm1, %xmm6, %xmm1 - vpxor %xmm15, %xmm0, %xmm15 - vpxor 96(%rsp), %xmm1, %xmm12 - vpaddd %xmm2, %xmm7, %xmm2 - vpaddd %xmm3, %xmm4, %xmm3 - vpxor %xmm13, %xmm2, %xmm13 - vpxor %xmm14, %xmm3, %xmm14 - vpshufb 448(%rsp), %xmm15, %xmm15 - vpshufb 448(%rsp), %xmm12, %xmm12 - vpaddd %xmm10, %xmm15, %xmm10 - vpaddd %xmm11, %xmm12, %xmm11 - vpshufb 448(%rsp), %xmm13, %xmm13 - vpshufb 448(%rsp), %xmm14, %xmm14 - vpaddd %xmm8, %xmm13, %xmm8 - vpaddd %xmm9, %xmm14, %xmm9 - vmovdqa %xmm15, 96(%rsp) - vpxor %xmm5, %xmm10, %xmm5 - vpxor %xmm6, %xmm11, %xmm6 - vpslld $ 12, %xmm5, %xmm15 - vpsrld $20, %xmm5, %xmm5 - vpxor %xmm5, %xmm15, %xmm5 - vpslld $ 12, %xmm6, %xmm15 - vpsrld $20, %xmm6, %xmm6 - vpxor %xmm6, %xmm15, %xmm6 - vpxor %xmm7, %xmm8, %xmm7 - vpxor %xmm4, %xmm9, %xmm4 - vpslld $ 12, %xmm7, %xmm15 - vpsrld $20, %xmm7, %xmm7 - vpxor %xmm7, %xmm15, %xmm7 - vpslld $ 12, %xmm4, %xmm15 - vpsrld $20, %xmm4, %xmm4 - vpxor %xmm4, %xmm15, %xmm4 - vpaddd %xmm0, %xmm5, %xmm0 - vpaddd %xmm1, %xmm6, %xmm1 - vpxor 96(%rsp), %xmm0, %xmm15 - 
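/* A side note on the vpslld/vpsrld/vpxor triples that dominate this
 * loop: SSE/AVX before AVX-512 has no 32-bit rotate instruction, so
 * each ChaCha rotation is built from two shifts (this loop only needs
 * the 12- and 7-bit rotations this way; the 16- and 8-bit ones go
 * through the vpshufb masks at 448(%rsp)/480(%rsp)).  A minimal C
 * model of the idiom; rotl32 is an illustrative name, not an
 * identifier from this patch: */
#include <stdint.h>

static inline uint32_t rotl32 (uint32_t x, unsigned int n)
{
  /* The two shifted halves occupy disjoint bits, so XOR acts as OR,
   * which is why the assembly can use vpxor; valid for 0 < n < 32. */
  return (x << n) ^ (x >> (32 - n));
}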
vpxor %xmm12, %xmm1, %xmm12 - vpaddd %xmm2, %xmm7, %xmm2 - vpaddd %xmm3, %xmm4, %xmm3 - vpxor %xmm13, %xmm2, %xmm13 - vpxor %xmm14, %xmm3, %xmm14 - vpshufb 480(%rsp), %xmm15, %xmm15 - vpshufb 480(%rsp), %xmm12, %xmm12 - vpaddd %xmm10, %xmm15, %xmm10 - vpaddd %xmm11, %xmm12, %xmm11 - vpshufb 480(%rsp), %xmm13, %xmm13 - vpshufb 480(%rsp), %xmm14, %xmm14 - vpaddd %xmm8, %xmm13, %xmm8 - vpaddd %xmm9, %xmm14, %xmm9 - vmovdqa %xmm15, 96(%rsp) - vpxor %xmm5, %xmm10, %xmm5 - vpxor %xmm6, %xmm11, %xmm6 - vpslld $ 7, %xmm5, %xmm15 - vpsrld $25, %xmm5, %xmm5 - vpxor %xmm5, %xmm15, %xmm5 - vpslld $ 7, %xmm6, %xmm15 - vpsrld $25, %xmm6, %xmm6 - vpxor %xmm6, %xmm15, %xmm6 - vpxor %xmm7, %xmm8, %xmm7 - vpxor %xmm4, %xmm9, %xmm4 - vpslld $ 7, %xmm7, %xmm15 - vpsrld $25, %xmm7, %xmm7 - vpxor %xmm7, %xmm15, %xmm7 - vpslld $ 7, %xmm4, %xmm15 - vpsrld $25, %xmm4, %xmm4 - vpxor %xmm4, %xmm15, %xmm4 - vmovdqa 96(%rsp), %xmm15 - subq $2, %rax - jnz .Lchacha_blocks_avx2_mainloop2 - vmovdqa %xmm8, 192(%rsp) - vmovdqa %xmm9, 208(%rsp) - vmovdqa %xmm10, 224(%rsp) - vmovdqa %xmm11, 240(%rsp) - vmovdqa %xmm12, 256(%rsp) - vmovdqa %xmm13, 272(%rsp) - vmovdqa %xmm14, 288(%rsp) - vmovdqa %xmm15, 304(%rsp) - vpbroadcastd 0(%rsp), %xmm8 - vpbroadcastd 4+0(%rsp), %xmm9 - vpbroadcastd 8+0(%rsp), %xmm10 - vpbroadcastd 12+0(%rsp), %xmm11 - vpbroadcastd 16(%rsp), %xmm12 - vpbroadcastd 4+16(%rsp), %xmm13 - vpbroadcastd 8+16(%rsp), %xmm14 - vpbroadcastd 12+16(%rsp), %xmm15 - vpaddd %xmm8, %xmm0, %xmm0 - vpaddd %xmm9, %xmm1, %xmm1 - vpaddd %xmm10, %xmm2, %xmm2 - vpaddd %xmm11, %xmm3, %xmm3 - vpaddd %xmm12, %xmm4, %xmm4 - vpaddd %xmm13, %xmm5, %xmm5 - vpaddd %xmm14, %xmm6, %xmm6 - vpaddd %xmm15, %xmm7, %xmm7 - vpunpckldq %xmm1, %xmm0, %xmm8 - vpunpckldq %xmm3, %xmm2, %xmm9 - vpunpckhdq %xmm1, %xmm0, %xmm12 - vpunpckhdq %xmm3, %xmm2, %xmm13 - vpunpckldq %xmm5, %xmm4, %xmm10 - vpunpckldq %xmm7, %xmm6, %xmm11 - vpunpckhdq %xmm5, %xmm4, %xmm14 - vpunpckhdq %xmm7, %xmm6, %xmm15 - vpunpcklqdq %xmm9, %xmm8, %xmm0 - vpunpcklqdq %xmm11, %xmm10, %xmm1 - vpunpckhqdq %xmm9, %xmm8, %xmm2 - vpunpckhqdq %xmm11, %xmm10, %xmm3 - vpunpcklqdq %xmm13, %xmm12, %xmm4 - vpunpcklqdq %xmm15, %xmm14, %xmm5 - vpunpckhqdq %xmm13, %xmm12, %xmm6 - vpunpckhqdq %xmm15, %xmm14, %xmm7 - andq %rsi, %rsi - jz .Lchacha_blocks_avx2_noinput2 - vpxor 0(%rsi), %xmm0, %xmm0 - vpxor 16(%rsi), %xmm1, %xmm1 - vpxor 64(%rsi), %xmm2, %xmm2 - vpxor 80(%rsi), %xmm3, %xmm3 - vpxor 128(%rsi), %xmm4, %xmm4 - vpxor 144(%rsi), %xmm5, %xmm5 - vpxor 192(%rsi), %xmm6, %xmm6 - vpxor 208(%rsi), %xmm7, %xmm7 - vmovdqu %xmm0, 0(%rdx) - vmovdqu %xmm1, 16(%rdx) - vmovdqu %xmm2, 64(%rdx) - vmovdqu %xmm3, 80(%rdx) - vmovdqu %xmm4, 128(%rdx) - vmovdqu %xmm5, 144(%rdx) - vmovdqu %xmm6, 192(%rdx) - vmovdqu %xmm7, 208(%rdx) - vmovdqa 192(%rsp), %xmm0 - vmovdqa 208(%rsp), %xmm1 - vmovdqa 224(%rsp), %xmm2 - vmovdqa 240(%rsp), %xmm3 - vmovdqa 256(%rsp), %xmm4 - vmovdqa 272(%rsp), %xmm5 - vmovdqa 288(%rsp), %xmm6 - vmovdqa 304(%rsp), %xmm7 - vpbroadcastd 32(%rsp), %xmm8 - vpbroadcastd 4+32(%rsp), %xmm9 - vpbroadcastd 8+32(%rsp), %xmm10 - vpbroadcastd 12+32(%rsp), %xmm11 - vmovdqa 128(%rsp), %xmm12 - vmovdqa 160(%rsp), %xmm13 - vpbroadcastd 8+48(%rsp), %xmm14 - vpbroadcastd 12+48(%rsp), %xmm15 - vpaddd %xmm8, %xmm0, %xmm0 - vpaddd %xmm9, %xmm1, %xmm1 - vpaddd %xmm10, %xmm2, %xmm2 - vpaddd %xmm11, %xmm3, %xmm3 - vpaddd %xmm12, %xmm4, %xmm4 - vpaddd %xmm13, %xmm5, %xmm5 - vpaddd %xmm14, %xmm6, %xmm6 - vpaddd %xmm15, %xmm7, %xmm7 - vpunpckldq %xmm1, %xmm0, %xmm8 - vpunpckldq %xmm3, %xmm2, %xmm9 - vpunpckhdq 
%xmm1, %xmm0, %xmm12 - vpunpckhdq %xmm3, %xmm2, %xmm13 - vpunpckldq %xmm5, %xmm4, %xmm10 - vpunpckldq %xmm7, %xmm6, %xmm11 - vpunpckhdq %xmm5, %xmm4, %xmm14 - vpunpckhdq %xmm7, %xmm6, %xmm15 - vpunpcklqdq %xmm9, %xmm8, %xmm0 - vpunpcklqdq %xmm11, %xmm10, %xmm1 - vpunpckhqdq %xmm9, %xmm8, %xmm2 - vpunpckhqdq %xmm11, %xmm10, %xmm3 - vpunpcklqdq %xmm13, %xmm12, %xmm4 - vpunpcklqdq %xmm15, %xmm14, %xmm5 - vpunpckhqdq %xmm13, %xmm12, %xmm6 - vpunpckhqdq %xmm15, %xmm14, %xmm7 - vpxor 32(%rsi), %xmm0, %xmm0 - vpxor 48(%rsi), %xmm1, %xmm1 - vpxor 96(%rsi), %xmm2, %xmm2 - vpxor 112(%rsi), %xmm3, %xmm3 - vpxor 160(%rsi), %xmm4, %xmm4 - vpxor 176(%rsi), %xmm5, %xmm5 - vpxor 224(%rsi), %xmm6, %xmm6 - vpxor 240(%rsi), %xmm7, %xmm7 - vmovdqu %xmm0, 32(%rdx) - vmovdqu %xmm1, 48(%rdx) - vmovdqu %xmm2, 96(%rdx) - vmovdqu %xmm3, 112(%rdx) - vmovdqu %xmm4, 160(%rdx) - vmovdqu %xmm5, 176(%rdx) - vmovdqu %xmm6, 224(%rdx) - vmovdqu %xmm7, 240(%rdx) - addq $256, %rsi - jmp .Lchacha_blocks_avx2_mainloop2_cont -.Lchacha_blocks_avx2_noinput2: - vmovdqu %xmm0, 0(%rdx) - vmovdqu %xmm1, 16(%rdx) - vmovdqu %xmm2, 64(%rdx) - vmovdqu %xmm3, 80(%rdx) - vmovdqu %xmm4, 128(%rdx) - vmovdqu %xmm5, 144(%rdx) - vmovdqu %xmm6, 192(%rdx) - vmovdqu %xmm7, 208(%rdx) - vmovdqa 192(%rsp), %xmm0 - vmovdqa 208(%rsp), %xmm1 - vmovdqa 224(%rsp), %xmm2 - vmovdqa 240(%rsp), %xmm3 - vmovdqa 256(%rsp), %xmm4 - vmovdqa 272(%rsp), %xmm5 - vmovdqa 288(%rsp), %xmm6 - vmovdqa 304(%rsp), %xmm7 - vpbroadcastd 32(%rsp), %xmm8 - vpbroadcastd 4+32(%rsp), %xmm9 - vpbroadcastd 8+32(%rsp), %xmm10 - vpbroadcastd 12+32(%rsp), %xmm11 - vmovdqa 128(%rsp), %xmm12 - vmovdqa 160(%rsp), %xmm13 - vpbroadcastd 8+48(%rsp), %xmm14 - vpbroadcastd 12+48(%rsp), %xmm15 - vpaddd %xmm8, %xmm0, %xmm0 - vpaddd %xmm9, %xmm1, %xmm1 - vpaddd %xmm10, %xmm2, %xmm2 - vpaddd %xmm11, %xmm3, %xmm3 - vpaddd %xmm12, %xmm4, %xmm4 - vpaddd %xmm13, %xmm5, %xmm5 - vpaddd %xmm14, %xmm6, %xmm6 - vpaddd %xmm15, %xmm7, %xmm7 - vpunpckldq %xmm1, %xmm0, %xmm8 - vpunpckldq %xmm3, %xmm2, %xmm9 - vpunpckhdq %xmm1, %xmm0, %xmm12 - vpunpckhdq %xmm3, %xmm2, %xmm13 - vpunpckldq %xmm5, %xmm4, %xmm10 - vpunpckldq %xmm7, %xmm6, %xmm11 - vpunpckhdq %xmm5, %xmm4, %xmm14 - vpunpckhdq %xmm7, %xmm6, %xmm15 - vpunpcklqdq %xmm9, %xmm8, %xmm0 - vpunpcklqdq %xmm11, %xmm10, %xmm1 - vpunpckhqdq %xmm9, %xmm8, %xmm2 - vpunpckhqdq %xmm11, %xmm10, %xmm3 - vpunpcklqdq %xmm13, %xmm12, %xmm4 - vpunpcklqdq %xmm15, %xmm14, %xmm5 - vpunpckhqdq %xmm13, %xmm12, %xmm6 - vpunpckhqdq %xmm15, %xmm14, %xmm7 - vmovdqu %xmm0, 32(%rdx) - vmovdqu %xmm1, 48(%rdx) - vmovdqu %xmm2, 96(%rdx) - vmovdqu %xmm3, 112(%rdx) - vmovdqu %xmm4, 160(%rdx) - vmovdqu %xmm5, 176(%rdx) - vmovdqu %xmm6, 224(%rdx) - vmovdqu %xmm7, 240(%rdx) -.Lchacha_blocks_avx2_mainloop2_cont: - addq $256, %rdx - subq $256, %rcx - cmp $256, %rcx - jae .Lchacha_blocks_avx2_atleast256 -.Lchacha_blocks_avx2_below256_fixup: - vmovdqa 448(%rsp), %xmm6 - vmovdqa 480(%rsp), %xmm7 - vmovdqa 0(%rsp), %xmm8 - vmovdqa 16(%rsp), %xmm9 - vmovdqa 32(%rsp), %xmm10 - vmovdqa 48(%rsp), %xmm11 - movq $1, %r9 -.Lchacha_blocks_avx2_below256: - vmovq %r9, %xmm5 - andq %rcx, %rcx - jz .Lchacha_blocks_avx2_done - cmpq $64, %rcx - jae .Lchacha_blocks_avx2_above63 - movq %rdx, %r9 - andq %rsi, %rsi - jz .Lchacha_blocks_avx2_noinput3 - movq %rcx, %r10 - movq %rsp, %rdx - addq %r10, %rsi - addq %r10, %rdx - negq %r10 -.Lchacha_blocks_avx2_copyinput: - movb (%rsi, %r10), %al - movb %al, (%rdx, %r10) - incq %r10 - jnz .Lchacha_blocks_avx2_copyinput - movq %rsp, %rsi -.Lchacha_blocks_avx2_noinput3: - 
movq %rsp, %rdx -.Lchacha_blocks_avx2_above63: - vmovdqa %xmm8, %xmm0 - vmovdqa %xmm9, %xmm1 - vmovdqa %xmm10, %xmm2 - vmovdqa %xmm11, %xmm3 - movq 64(%rsp), %rax -.Lchacha_blocks_avx2_mainloop3: - vpaddd %xmm0, %xmm1, %xmm0 - vpxor %xmm3, %xmm0, %xmm3 - vpshufb %xmm6, %xmm3, %xmm3 - vpaddd %xmm2, %xmm3, %xmm2 - vpxor %xmm1, %xmm2, %xmm1 - vpslld $12, %xmm1, %xmm4 - vpsrld $20, %xmm1, %xmm1 - vpxor %xmm1, %xmm4, %xmm1 - vpaddd %xmm0, %xmm1, %xmm0 - vpxor %xmm3, %xmm0, %xmm3 - vpshufb %xmm7, %xmm3, %xmm3 - vpshufd $0x93, %xmm0, %xmm0 - vpaddd %xmm2, %xmm3, %xmm2 - vpshufd $0x4e, %xmm3, %xmm3 - vpxor %xmm1, %xmm2, %xmm1 - vpshufd $0x39, %xmm2, %xmm2 - vpslld $7, %xmm1, %xmm4 - vpsrld $25, %xmm1, %xmm1 - vpxor %xmm1, %xmm4, %xmm1 - vpaddd %xmm0, %xmm1, %xmm0 - vpxor %xmm3, %xmm0, %xmm3 - vpshufb %xmm6, %xmm3, %xmm3 - vpaddd %xmm2, %xmm3, %xmm2 - vpxor %xmm1, %xmm2, %xmm1 - vpslld $12, %xmm1, %xmm4 - vpsrld $20, %xmm1, %xmm1 - vpxor %xmm1, %xmm4, %xmm1 - vpaddd %xmm0, %xmm1, %xmm0 - vpxor %xmm3, %xmm0, %xmm3 - vpshufb %xmm7, %xmm3, %xmm3 - vpshufd $0x39, %xmm0, %xmm0 - vpaddd %xmm2, %xmm3, %xmm2 - vpshufd $0x4e, %xmm3, %xmm3 - vpxor %xmm1, %xmm2, %xmm1 - vpshufd $0x93, %xmm2, %xmm2 - vpslld $7, %xmm1, %xmm4 - vpsrld $25, %xmm1, %xmm1 - vpxor %xmm1, %xmm4, %xmm1 - subq $2, %rax - jnz .Lchacha_blocks_avx2_mainloop3 - vpaddd %xmm0, %xmm8, %xmm0 - vpaddd %xmm1, %xmm9, %xmm1 - vpaddd %xmm2, %xmm10, %xmm2 - vpaddd %xmm3, %xmm11, %xmm3 - andq %rsi, %rsi - jz .Lchacha_blocks_avx2_noinput4 - vpxor 0(%rsi), %xmm0, %xmm0 - vpxor 16(%rsi), %xmm1, %xmm1 - vpxor 32(%rsi), %xmm2, %xmm2 - vpxor 48(%rsi), %xmm3, %xmm3 - addq $64, %rsi -.Lchacha_blocks_avx2_noinput4: - vmovdqu %xmm0, 0(%rdx) - vmovdqu %xmm1, 16(%rdx) - vmovdqu %xmm2, 32(%rdx) - vmovdqu %xmm3, 48(%rdx) - vpaddq %xmm11, %xmm5, %xmm11 - cmpq $64, %rcx - jbe .Lchacha_blocks_avx2_mainloop3_finishup - addq $64, %rdx - subq $64, %rcx - jmp .Lchacha_blocks_avx2_below256 -.Lchacha_blocks_avx2_mainloop3_finishup: - cmpq $64, %rcx - je .Lchacha_blocks_avx2_done - addq %rcx, %r9 - addq %rcx, %rdx - negq %rcx -.Lchacha_blocks_avx2_copyoutput: - movb (%rdx, %rcx), %al - movb %al, (%r9, %rcx) - incq %rcx - jnz .Lchacha_blocks_avx2_copyoutput -.Lchacha_blocks_avx2_done: - vmovdqu %xmm11, 48(%rdi) - movq %rbp, %rsp - popq %r14 - popq %r13 - popq %r12 - popq %rbp - popq %rbx - vzeroall - movl $(63 + 512), %eax - ret -ELF(.size _gcry_chacha20_amd64_avx2_blocks,.-_gcry_chacha20_amd64_avx2_blocks;) - -.align 16 -.LC: -.byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 /* pshufb rotate by 16 */ -.byte 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14 /* pshufb rotate by 8 */ - -#endif /*defined(USE_CHACHA20)*/ -#endif /*__x86_64*/ diff --git a/cipher/chacha20-sse2-amd64.S b/cipher/chacha20-sse2-amd64.S deleted file mode 100644 index 2b9842c1..00000000 --- a/cipher/chacha20-sse2-amd64.S +++ /dev/null @@ -1,659 +0,0 @@ -/* chacha20-sse2-amd64.S - AMD64/SSE2 implementation of ChaCha20 - * - * Copyright (C) 2014 Jussi Kivilinna - * - * This file is part of Libgcrypt. - * - * Libgcrypt is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as - * published by the Free Software Foundation; either version 2.1 of - * the License, or (at your option) any later version. - * - * Libgcrypt is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with this program; if not, see . - */ - -/* - * Based on public domain implementation by Andrew Moon at - * https://github.com/floodyberry/chacha-opt - */ - -#ifdef __x86_64__ -#include - -#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ - defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && USE_CHACHA20 - -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) /*_*/ -#endif - -.text - -.align 8 -.globl _gcry_chacha20_amd64_sse2_blocks -ELF(.type _gcry_chacha20_amd64_sse2_blocks,@function;) -_gcry_chacha20_amd64_sse2_blocks: -.Lchacha_blocks_sse2_local: - pushq %rbx - pushq %rbp - movq %rsp, %rbp - andq $~63, %rsp - subq $512, %rsp - movdqu (%rdi), %xmm8 - movdqu 16(%rdi), %xmm9 - movdqu 32(%rdi), %xmm10 - movdqu 48(%rdi), %xmm11 - movq $20, %rax - movq $1, %r9 - movdqa %xmm8, 0(%rsp) - movdqa %xmm9, 16(%rsp) - movdqa %xmm10, 32(%rsp) - movdqa %xmm11, 48(%rsp) - movq %rax, 64(%rsp) - cmpq $256, %rcx - jb .Lchacha_blocks_sse2_below256 - pshufd $0x00, %xmm8, %xmm0 - pshufd $0x55, %xmm8, %xmm1 - pshufd $0xaa, %xmm8, %xmm2 - pshufd $0xff, %xmm8, %xmm3 - movdqa %xmm0, 128(%rsp) - movdqa %xmm1, 144(%rsp) - movdqa %xmm2, 160(%rsp) - movdqa %xmm3, 176(%rsp) - pshufd $0x00, %xmm9, %xmm0 - pshufd $0x55, %xmm9, %xmm1 - pshufd $0xaa, %xmm9, %xmm2 - pshufd $0xff, %xmm9, %xmm3 - movdqa %xmm0, 192(%rsp) - movdqa %xmm1, 208(%rsp) - movdqa %xmm2, 224(%rsp) - movdqa %xmm3, 240(%rsp) - pshufd $0x00, %xmm10, %xmm0 - pshufd $0x55, %xmm10, %xmm1 - pshufd $0xaa, %xmm10, %xmm2 - pshufd $0xff, %xmm10, %xmm3 - movdqa %xmm0, 256(%rsp) - movdqa %xmm1, 272(%rsp) - movdqa %xmm2, 288(%rsp) - movdqa %xmm3, 304(%rsp) - pshufd $0xaa, %xmm11, %xmm0 - pshufd $0xff, %xmm11, %xmm1 - movdqa %xmm0, 352(%rsp) - movdqa %xmm1, 368(%rsp) - jmp .Lchacha_blocks_sse2_atleast256 -.p2align 6,,63 -.Lchacha_blocks_sse2_atleast256: - movq 48(%rsp), %rax - leaq 1(%rax), %r8 - leaq 2(%rax), %r9 - leaq 3(%rax), %r10 - leaq 4(%rax), %rbx - movl %eax, 320(%rsp) - movl %r8d, 4+320(%rsp) - movl %r9d, 8+320(%rsp) - movl %r10d, 12+320(%rsp) - shrq $32, %rax - shrq $32, %r8 - shrq $32, %r9 - shrq $32, %r10 - movl %eax, 336(%rsp) - movl %r8d, 4+336(%rsp) - movl %r9d, 8+336(%rsp) - movl %r10d, 12+336(%rsp) - movq %rbx, 48(%rsp) - movq 64(%rsp), %rax - movdqa 128(%rsp), %xmm0 - movdqa 144(%rsp), %xmm1 - movdqa 160(%rsp), %xmm2 - movdqa 176(%rsp), %xmm3 - movdqa 192(%rsp), %xmm4 - movdqa 208(%rsp), %xmm5 - movdqa 224(%rsp), %xmm6 - movdqa 240(%rsp), %xmm7 - movdqa 256(%rsp), %xmm8 - movdqa 272(%rsp), %xmm9 - movdqa 288(%rsp), %xmm10 - movdqa 304(%rsp), %xmm11 - movdqa 320(%rsp), %xmm12 - movdqa 336(%rsp), %xmm13 - movdqa 352(%rsp), %xmm14 - movdqa 368(%rsp), %xmm15 -.Lchacha_blocks_sse2_mainloop1: - paddd %xmm4, %xmm0 - paddd %xmm5, %xmm1 - pxor %xmm0, %xmm12 - pxor %xmm1, %xmm13 - paddd %xmm6, %xmm2 - paddd %xmm7, %xmm3 - movdqa %xmm6, 96(%rsp) - pxor %xmm2, %xmm14 - pxor %xmm3, %xmm15 - pshuflw $0xb1,%xmm12,%xmm12 - pshufhw $0xb1,%xmm12,%xmm12 - pshuflw $0xb1,%xmm13,%xmm13 - pshufhw $0xb1,%xmm13,%xmm13 - pshuflw $0xb1,%xmm14,%xmm14 - pshufhw $0xb1,%xmm14,%xmm14 - pshuflw $0xb1,%xmm15,%xmm15 - pshufhw $0xb1,%xmm15,%xmm15 - paddd %xmm12, %xmm8 - paddd %xmm13, %xmm9 - paddd %xmm14, %xmm10 - paddd %xmm15, %xmm11 - movdqa %xmm12, 112(%rsp) - pxor %xmm8, %xmm4 - pxor %xmm9, %xmm5 - movdqa 96(%rsp), %xmm6 - movdqa %xmm4, %xmm12 - pslld $ 12, %xmm4 - psrld $20, %xmm12 - pxor %xmm12, %xmm4 - movdqa %xmm5, %xmm12 - pslld 
$ 12, %xmm5 - psrld $20, %xmm12 - pxor %xmm12, %xmm5 - pxor %xmm10, %xmm6 - pxor %xmm11, %xmm7 - movdqa %xmm6, %xmm12 - pslld $ 12, %xmm6 - psrld $20, %xmm12 - pxor %xmm12, %xmm6 - movdqa %xmm7, %xmm12 - pslld $ 12, %xmm7 - psrld $20, %xmm12 - pxor %xmm12, %xmm7 - movdqa 112(%rsp), %xmm12 - paddd %xmm4, %xmm0 - paddd %xmm5, %xmm1 - pxor %xmm0, %xmm12 - pxor %xmm1, %xmm13 - paddd %xmm6, %xmm2 - paddd %xmm7, %xmm3 - movdqa %xmm6, 96(%rsp) - pxor %xmm2, %xmm14 - pxor %xmm3, %xmm15 - movdqa %xmm12, %xmm6 - pslld $ 8, %xmm12 - psrld $24, %xmm6 - pxor %xmm6, %xmm12 - movdqa %xmm13, %xmm6 - pslld $ 8, %xmm13 - psrld $24, %xmm6 - pxor %xmm6, %xmm13 - paddd %xmm12, %xmm8 - paddd %xmm13, %xmm9 - movdqa %xmm14, %xmm6 - pslld $ 8, %xmm14 - psrld $24, %xmm6 - pxor %xmm6, %xmm14 - movdqa %xmm15, %xmm6 - pslld $ 8, %xmm15 - psrld $24, %xmm6 - pxor %xmm6, %xmm15 - paddd %xmm14, %xmm10 - paddd %xmm15, %xmm11 - movdqa %xmm12, 112(%rsp) - pxor %xmm8, %xmm4 - pxor %xmm9, %xmm5 - movdqa 96(%rsp), %xmm6 - movdqa %xmm4, %xmm12 - pslld $ 7, %xmm4 - psrld $25, %xmm12 - pxor %xmm12, %xmm4 - movdqa %xmm5, %xmm12 - pslld $ 7, %xmm5 - psrld $25, %xmm12 - pxor %xmm12, %xmm5 - pxor %xmm10, %xmm6 - pxor %xmm11, %xmm7 - movdqa %xmm6, %xmm12 - pslld $ 7, %xmm6 - psrld $25, %xmm12 - pxor %xmm12, %xmm6 - movdqa %xmm7, %xmm12 - pslld $ 7, %xmm7 - psrld $25, %xmm12 - pxor %xmm12, %xmm7 - movdqa 112(%rsp), %xmm12 - paddd %xmm5, %xmm0 - paddd %xmm6, %xmm1 - pxor %xmm0, %xmm15 - pxor %xmm1, %xmm12 - paddd %xmm7, %xmm2 - paddd %xmm4, %xmm3 - movdqa %xmm7, 96(%rsp) - pxor %xmm2, %xmm13 - pxor %xmm3, %xmm14 - pshuflw $0xb1,%xmm15,%xmm15 - pshufhw $0xb1,%xmm15,%xmm15 - pshuflw $0xb1,%xmm12,%xmm12 - pshufhw $0xb1,%xmm12,%xmm12 - pshuflw $0xb1,%xmm13,%xmm13 - pshufhw $0xb1,%xmm13,%xmm13 - pshuflw $0xb1,%xmm14,%xmm14 - pshufhw $0xb1,%xmm14,%xmm14 - paddd %xmm15, %xmm10 - paddd %xmm12, %xmm11 - paddd %xmm13, %xmm8 - paddd %xmm14, %xmm9 - movdqa %xmm15, 112(%rsp) - pxor %xmm10, %xmm5 - pxor %xmm11, %xmm6 - movdqa 96(%rsp), %xmm7 - movdqa %xmm5, %xmm15 - pslld $ 12, %xmm5 - psrld $20, %xmm15 - pxor %xmm15, %xmm5 - movdqa %xmm6, %xmm15 - pslld $ 12, %xmm6 - psrld $20, %xmm15 - pxor %xmm15, %xmm6 - pxor %xmm8, %xmm7 - pxor %xmm9, %xmm4 - movdqa %xmm7, %xmm15 - pslld $ 12, %xmm7 - psrld $20, %xmm15 - pxor %xmm15, %xmm7 - movdqa %xmm4, %xmm15 - pslld $ 12, %xmm4 - psrld $20, %xmm15 - pxor %xmm15, %xmm4 - movdqa 112(%rsp), %xmm15 - paddd %xmm5, %xmm0 - paddd %xmm6, %xmm1 - pxor %xmm0, %xmm15 - pxor %xmm1, %xmm12 - paddd %xmm7, %xmm2 - paddd %xmm4, %xmm3 - movdqa %xmm7, 96(%rsp) - pxor %xmm2, %xmm13 - pxor %xmm3, %xmm14 - movdqa %xmm15, %xmm7 - pslld $ 8, %xmm15 - psrld $24, %xmm7 - pxor %xmm7, %xmm15 - movdqa %xmm12, %xmm7 - pslld $ 8, %xmm12 - psrld $24, %xmm7 - pxor %xmm7, %xmm12 - paddd %xmm15, %xmm10 - paddd %xmm12, %xmm11 - movdqa %xmm13, %xmm7 - pslld $ 8, %xmm13 - psrld $24, %xmm7 - pxor %xmm7, %xmm13 - movdqa %xmm14, %xmm7 - pslld $ 8, %xmm14 - psrld $24, %xmm7 - pxor %xmm7, %xmm14 - paddd %xmm13, %xmm8 - paddd %xmm14, %xmm9 - movdqa %xmm15, 112(%rsp) - pxor %xmm10, %xmm5 - pxor %xmm11, %xmm6 - movdqa 96(%rsp), %xmm7 - movdqa %xmm5, %xmm15 - pslld $ 7, %xmm5 - psrld $25, %xmm15 - pxor %xmm15, %xmm5 - movdqa %xmm6, %xmm15 - pslld $ 7, %xmm6 - psrld $25, %xmm15 - pxor %xmm15, %xmm6 - pxor %xmm8, %xmm7 - pxor %xmm9, %xmm4 - movdqa %xmm7, %xmm15 - pslld $ 7, %xmm7 - psrld $25, %xmm15 - pxor %xmm15, %xmm7 - movdqa %xmm4, %xmm15 - pslld $ 7, %xmm4 - psrld $25, %xmm15 - pxor %xmm15, %xmm4 - movdqa 112(%rsp), %xmm15 - subq $2, %rax - jnz 
.Lchacha_blocks_sse2_mainloop1 - paddd 128(%rsp), %xmm0 - paddd 144(%rsp), %xmm1 - paddd 160(%rsp), %xmm2 - paddd 176(%rsp), %xmm3 - paddd 192(%rsp), %xmm4 - paddd 208(%rsp), %xmm5 - paddd 224(%rsp), %xmm6 - paddd 240(%rsp), %xmm7 - paddd 256(%rsp), %xmm8 - paddd 272(%rsp), %xmm9 - paddd 288(%rsp), %xmm10 - paddd 304(%rsp), %xmm11 - paddd 320(%rsp), %xmm12 - paddd 336(%rsp), %xmm13 - paddd 352(%rsp), %xmm14 - paddd 368(%rsp), %xmm15 - movdqa %xmm8, 384(%rsp) - movdqa %xmm9, 400(%rsp) - movdqa %xmm10, 416(%rsp) - movdqa %xmm11, 432(%rsp) - movdqa %xmm12, 448(%rsp) - movdqa %xmm13, 464(%rsp) - movdqa %xmm14, 480(%rsp) - movdqa %xmm15, 496(%rsp) - movdqa %xmm0, %xmm8 - movdqa %xmm2, %xmm9 - movdqa %xmm4, %xmm10 - movdqa %xmm6, %xmm11 - punpckhdq %xmm1, %xmm0 - punpckhdq %xmm3, %xmm2 - punpckhdq %xmm5, %xmm4 - punpckhdq %xmm7, %xmm6 - punpckldq %xmm1, %xmm8 - punpckldq %xmm3, %xmm9 - punpckldq %xmm5, %xmm10 - punpckldq %xmm7, %xmm11 - movdqa %xmm0, %xmm1 - movdqa %xmm4, %xmm3 - movdqa %xmm8, %xmm5 - movdqa %xmm10, %xmm7 - punpckhqdq %xmm2, %xmm0 - punpckhqdq %xmm6, %xmm4 - punpckhqdq %xmm9, %xmm8 - punpckhqdq %xmm11, %xmm10 - punpcklqdq %xmm2, %xmm1 - punpcklqdq %xmm6, %xmm3 - punpcklqdq %xmm9, %xmm5 - punpcklqdq %xmm11, %xmm7 - andq %rsi, %rsi - jz .Lchacha_blocks_sse2_noinput1 - movdqu 0(%rsi), %xmm2 - movdqu 16(%rsi), %xmm6 - movdqu 64(%rsi), %xmm9 - movdqu 80(%rsi), %xmm11 - movdqu 128(%rsi), %xmm12 - movdqu 144(%rsi), %xmm13 - movdqu 192(%rsi), %xmm14 - movdqu 208(%rsi), %xmm15 - pxor %xmm2, %xmm5 - pxor %xmm6, %xmm7 - pxor %xmm9, %xmm8 - pxor %xmm11, %xmm10 - pxor %xmm12, %xmm1 - pxor %xmm13, %xmm3 - pxor %xmm14, %xmm0 - pxor %xmm15, %xmm4 - movdqu %xmm5, 0(%rdx) - movdqu %xmm7, 16(%rdx) - movdqu %xmm8, 64(%rdx) - movdqu %xmm10, 80(%rdx) - movdqu %xmm1, 128(%rdx) - movdqu %xmm3, 144(%rdx) - movdqu %xmm0, 192(%rdx) - movdqu %xmm4, 208(%rdx) - movdqa 384(%rsp), %xmm0 - movdqa 400(%rsp), %xmm1 - movdqa 416(%rsp), %xmm2 - movdqa 432(%rsp), %xmm3 - movdqa 448(%rsp), %xmm4 - movdqa 464(%rsp), %xmm5 - movdqa 480(%rsp), %xmm6 - movdqa 496(%rsp), %xmm7 - movdqa %xmm0, %xmm8 - movdqa %xmm2, %xmm9 - movdqa %xmm4, %xmm10 - movdqa %xmm6, %xmm11 - punpckldq %xmm1, %xmm8 - punpckldq %xmm3, %xmm9 - punpckhdq %xmm1, %xmm0 - punpckhdq %xmm3, %xmm2 - punpckldq %xmm5, %xmm10 - punpckldq %xmm7, %xmm11 - punpckhdq %xmm5, %xmm4 - punpckhdq %xmm7, %xmm6 - movdqa %xmm8, %xmm1 - movdqa %xmm0, %xmm3 - movdqa %xmm10, %xmm5 - movdqa %xmm4, %xmm7 - punpcklqdq %xmm9, %xmm1 - punpcklqdq %xmm11, %xmm5 - punpckhqdq %xmm9, %xmm8 - punpckhqdq %xmm11, %xmm10 - punpcklqdq %xmm2, %xmm3 - punpcklqdq %xmm6, %xmm7 - punpckhqdq %xmm2, %xmm0 - punpckhqdq %xmm6, %xmm4 - movdqu 32(%rsi), %xmm2 - movdqu 48(%rsi), %xmm6 - movdqu 96(%rsi), %xmm9 - movdqu 112(%rsi), %xmm11 - movdqu 160(%rsi), %xmm12 - movdqu 176(%rsi), %xmm13 - movdqu 224(%rsi), %xmm14 - movdqu 240(%rsi), %xmm15 - pxor %xmm2, %xmm1 - pxor %xmm6, %xmm5 - pxor %xmm9, %xmm8 - pxor %xmm11, %xmm10 - pxor %xmm12, %xmm3 - pxor %xmm13, %xmm7 - pxor %xmm14, %xmm0 - pxor %xmm15, %xmm4 - movdqu %xmm1, 32(%rdx) - movdqu %xmm5, 48(%rdx) - movdqu %xmm8, 96(%rdx) - movdqu %xmm10, 112(%rdx) - movdqu %xmm3, 160(%rdx) - movdqu %xmm7, 176(%rdx) - movdqu %xmm0, 224(%rdx) - movdqu %xmm4, 240(%rdx) - addq $256, %rsi - jmp .Lchacha_blocks_sse2_mainloop_cont -.Lchacha_blocks_sse2_noinput1: - movdqu %xmm5, 0(%rdx) - movdqu %xmm7, 16(%rdx) - movdqu %xmm8, 64(%rdx) - movdqu %xmm10, 80(%rdx) - movdqu %xmm1, 128(%rdx) - movdqu %xmm3, 144(%rdx) - movdqu %xmm0, 192(%rdx) - movdqu %xmm4, 208(%rdx) - 
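/* For orientation, the punpck{l,h}dq / punpck{l,h}qdq block above is a
 * transpose of 4x4 matrices of 32-bit words: during the rounds each
 * register holds one state word across four parallel blocks, and the
 * transpose regroups the words so that every output block is
 * contiguous before the XOR/store sequence.  A scalar sketch of the
 * same data movement; transpose_4x4 is an illustrative name: */
#include <stdint.h>

static void transpose_4x4 (uint32_t m[4][4])
{
  unsigned int r, c;
  uint32_t t;

  for (r = 0; r < 4; r++)
    for (c = r + 1; c < 4; c++)
      {
        t = m[r][c];
        m[r][c] = m[c][r];
        m[c][r] = t;
      }
}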
movdqa 384(%rsp), %xmm0 - movdqa 400(%rsp), %xmm1 - movdqa 416(%rsp), %xmm2 - movdqa 432(%rsp), %xmm3 - movdqa 448(%rsp), %xmm4 - movdqa 464(%rsp), %xmm5 - movdqa 480(%rsp), %xmm6 - movdqa 496(%rsp), %xmm7 - movdqa %xmm0, %xmm8 - movdqa %xmm2, %xmm9 - movdqa %xmm4, %xmm10 - movdqa %xmm6, %xmm11 - punpckldq %xmm1, %xmm8 - punpckldq %xmm3, %xmm9 - punpckhdq %xmm1, %xmm0 - punpckhdq %xmm3, %xmm2 - punpckldq %xmm5, %xmm10 - punpckldq %xmm7, %xmm11 - punpckhdq %xmm5, %xmm4 - punpckhdq %xmm7, %xmm6 - movdqa %xmm8, %xmm1 - movdqa %xmm0, %xmm3 - movdqa %xmm10, %xmm5 - movdqa %xmm4, %xmm7 - punpcklqdq %xmm9, %xmm1 - punpcklqdq %xmm11, %xmm5 - punpckhqdq %xmm9, %xmm8 - punpckhqdq %xmm11, %xmm10 - punpcklqdq %xmm2, %xmm3 - punpcklqdq %xmm6, %xmm7 - punpckhqdq %xmm2, %xmm0 - punpckhqdq %xmm6, %xmm4 - movdqu %xmm1, 32(%rdx) - movdqu %xmm5, 48(%rdx) - movdqu %xmm8, 96(%rdx) - movdqu %xmm10, 112(%rdx) - movdqu %xmm3, 160(%rdx) - movdqu %xmm7, 176(%rdx) - movdqu %xmm0, 224(%rdx) - movdqu %xmm4, 240(%rdx) -.Lchacha_blocks_sse2_mainloop_cont: - addq $256, %rdx - subq $256, %rcx - cmp $256, %rcx - jae .Lchacha_blocks_sse2_atleast256 - movdqa 0(%rsp), %xmm8 - movdqa 16(%rsp), %xmm9 - movdqa 32(%rsp), %xmm10 - movdqa 48(%rsp), %xmm11 - movq $1, %r9 -.Lchacha_blocks_sse2_below256: - movq %r9, %xmm5 - andq %rcx, %rcx - jz .Lchacha_blocks_sse2_done - cmpq $64, %rcx - jae .Lchacha_blocks_sse2_above63 - movq %rdx, %r9 - andq %rsi, %rsi - jz .Lchacha_blocks_sse2_noinput2 - movq %rcx, %r10 - movq %rsp, %rdx - addq %r10, %rsi - addq %r10, %rdx - negq %r10 -.Lchacha_blocks_sse2_copyinput: - movb (%rsi, %r10), %al - movb %al, (%rdx, %r10) - incq %r10 - jnz .Lchacha_blocks_sse2_copyinput - movq %rsp, %rsi -.Lchacha_blocks_sse2_noinput2: - movq %rsp, %rdx -.Lchacha_blocks_sse2_above63: - movdqa %xmm8, %xmm0 - movdqa %xmm9, %xmm1 - movdqa %xmm10, %xmm2 - movdqa %xmm11, %xmm3 - movq 64(%rsp), %rax -.Lchacha_blocks_sse2_mainloop2: - paddd %xmm1, %xmm0 - pxor %xmm0, %xmm3 - pshuflw $0xb1,%xmm3,%xmm3 - pshufhw $0xb1,%xmm3,%xmm3 - paddd %xmm3, %xmm2 - pxor %xmm2, %xmm1 - movdqa %xmm1,%xmm4 - pslld $12, %xmm1 - psrld $20, %xmm4 - pxor %xmm4, %xmm1 - paddd %xmm1, %xmm0 - pxor %xmm0, %xmm3 - movdqa %xmm3,%xmm4 - pslld $8, %xmm3 - psrld $24, %xmm4 - pshufd $0x93,%xmm0,%xmm0 - pxor %xmm4, %xmm3 - paddd %xmm3, %xmm2 - pshufd $0x4e,%xmm3,%xmm3 - pxor %xmm2, %xmm1 - pshufd $0x39,%xmm2,%xmm2 - movdqa %xmm1,%xmm4 - pslld $7, %xmm1 - psrld $25, %xmm4 - pxor %xmm4, %xmm1 - subq $2, %rax - paddd %xmm1, %xmm0 - pxor %xmm0, %xmm3 - pshuflw $0xb1,%xmm3,%xmm3 - pshufhw $0xb1,%xmm3,%xmm3 - paddd %xmm3, %xmm2 - pxor %xmm2, %xmm1 - movdqa %xmm1,%xmm4 - pslld $12, %xmm1 - psrld $20, %xmm4 - pxor %xmm4, %xmm1 - paddd %xmm1, %xmm0 - pxor %xmm0, %xmm3 - movdqa %xmm3,%xmm4 - pslld $8, %xmm3 - psrld $24, %xmm4 - pshufd $0x39,%xmm0,%xmm0 - pxor %xmm4, %xmm3 - paddd %xmm3, %xmm2 - pshufd $0x4e,%xmm3,%xmm3 - pxor %xmm2, %xmm1 - pshufd $0x93,%xmm2,%xmm2 - movdqa %xmm1,%xmm4 - pslld $7, %xmm1 - psrld $25, %xmm4 - pxor %xmm4, %xmm1 - jnz .Lchacha_blocks_sse2_mainloop2 - paddd %xmm8, %xmm0 - paddd %xmm9, %xmm1 - paddd %xmm10, %xmm2 - paddd %xmm11, %xmm3 - andq %rsi, %rsi - jz .Lchacha_blocks_sse2_noinput3 - movdqu 0(%rsi), %xmm12 - movdqu 16(%rsi), %xmm13 - movdqu 32(%rsi), %xmm14 - movdqu 48(%rsi), %xmm15 - pxor %xmm12, %xmm0 - pxor %xmm13, %xmm1 - pxor %xmm14, %xmm2 - pxor %xmm15, %xmm3 - addq $64, %rsi -.Lchacha_blocks_sse2_noinput3: - movdqu %xmm0, 0(%rdx) - movdqu %xmm1, 16(%rdx) - movdqu %xmm2, 32(%rdx) - movdqu %xmm3, 48(%rdx) - paddq %xmm5, %xmm11 - 
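/* Around the paddq above: %xmm5 holds the constant 1 loaded from %r9,
 * and adding it as a quadword to %xmm11 steps the 64-bit block counter
 * kept in state words 12 and 13, carry included.  The copyinput /
 * copyoutput loops that follow stage a short final chunk through the
 * aligned stack buffer so the round code always sees a full 64-byte
 * block.  A sketch of that tail path; tail_block and chacha20_block
 * are hypothetical names, assuming a helper that produces one
 * keystream block: */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* hypothetical: writes one 64-byte block, dst may alias src */
extern void chacha20_block (uint32_t state[16], unsigned char *dst,
                            const unsigned char *src);

static void tail_block (uint32_t state[16], unsigned char *dst,
                        const unsigned char *src, size_t len)
{
  unsigned char buf[64] = { 0 };

  if (src)
    memcpy (buf, src, len);          /* the ..._copyinput loop */
  chacha20_block (state, buf, buf);  /* full 64-byte block on stack */
  memcpy (dst, buf, len);            /* the ..._copyoutput loop */
}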
cmpq $64, %rcx - jbe .Lchacha_blocks_sse2_mainloop2_finishup - addq $64, %rdx - subq $64, %rcx - jmp .Lchacha_blocks_sse2_below256 -.Lchacha_blocks_sse2_mainloop2_finishup: - cmpq $64, %rcx - je .Lchacha_blocks_sse2_done - addq %rcx, %r9 - addq %rcx, %rdx - negq %rcx -.Lchacha_blocks_sse2_copyoutput: - movb (%rdx, %rcx), %al - movb %al, (%r9, %rcx) - incq %rcx - jnz .Lchacha_blocks_sse2_copyoutput -.Lchacha_blocks_sse2_done: - movdqu %xmm11, 48(%rdi) - movq %rbp, %rsp - pxor %xmm15, %xmm15 - pxor %xmm7, %xmm7 - pxor %xmm14, %xmm14 - pxor %xmm6, %xmm6 - pxor %xmm13, %xmm13 - pxor %xmm5, %xmm5 - pxor %xmm12, %xmm12 - pxor %xmm4, %xmm4 - popq %rbp - popq %rbx - movl $(63 + 512 + 16), %eax - pxor %xmm11, %xmm11 - pxor %xmm3, %xmm3 - pxor %xmm10, %xmm10 - pxor %xmm2, %xmm2 - pxor %xmm9, %xmm9 - pxor %xmm1, %xmm1 - pxor %xmm8, %xmm8 - pxor %xmm0, %xmm0 - ret -ELF(.size _gcry_chacha20_amd64_sse2_blocks,.-_gcry_chacha20_amd64_sse2_blocks;) - -#endif /*defined(USE_CHACHA20)*/ -#endif /*__x86_64*/ diff --git a/cipher/chacha20-ssse3-amd64.S b/cipher/chacha20-ssse3-amd64.S deleted file mode 100644 index c04010e7..00000000 --- a/cipher/chacha20-ssse3-amd64.S +++ /dev/null @@ -1,632 +0,0 @@ -/* chacha20-ssse3-amd64.S - AMD64/SSSE3 implementation of ChaCha20 - * - * Copyright (C) 2014 Jussi Kivilinna - * - * This file is part of Libgcrypt. - * - * Libgcrypt is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as - * published by the Free Software Foundation; either version 2.1 of - * the License, or (at your option) any later version. - * - * Libgcrypt is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this program; if not, see . - */ - -/* - * Based on public domain implementation by Andrew Moon at - * https://github.com/floodyberry/chacha-opt - */ - -#ifdef __x86_64__ -#include - -#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ - defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ - defined(HAVE_GCC_INLINE_ASM_SSSE3) && USE_CHACHA20 - -#ifdef __PIC__ -# define RIP (%rip) -#else -# define RIP -#endif - -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) 
/*_*/ -#endif - -.text - -.align 8 -.globl _gcry_chacha20_amd64_ssse3_blocks -ELF(.type _gcry_chacha20_amd64_ssse3_blocks,@function;) -_gcry_chacha20_amd64_ssse3_blocks: -.Lchacha_blocks_ssse3_local: - pushq %rbx - pushq %rbp - movq %rsp, %rbp - andq $~63, %rsp - subq $512, %rsp - leaq .LC RIP, %rax - movdqa 0(%rax), %xmm6 - movdqa 16(%rax), %xmm7 - movdqu 0(%rdi), %xmm8 - movdqu 16(%rdi), %xmm9 - movdqu 32(%rdi), %xmm10 - movdqu 48(%rdi), %xmm11 - movl $20, %eax - movq $1, %r9 - movdqa %xmm8, 0(%rsp) - movdqa %xmm9, 16(%rsp) - movdqa %xmm10, 32(%rsp) - movdqa %xmm11, 48(%rsp) - movdqa %xmm6, 80(%rsp) - movdqa %xmm7, 96(%rsp) - movq %rax, 64(%rsp) - cmpq $256, %rcx - jb .Lchacha_blocks_ssse3_below256 - pshufd $0x00, %xmm8, %xmm0 - pshufd $0x55, %xmm8, %xmm1 - pshufd $0xaa, %xmm8, %xmm2 - pshufd $0xff, %xmm8, %xmm3 - movdqa %xmm0, 128(%rsp) - movdqa %xmm1, 144(%rsp) - movdqa %xmm2, 160(%rsp) - movdqa %xmm3, 176(%rsp) - pshufd $0x00, %xmm9, %xmm0 - pshufd $0x55, %xmm9, %xmm1 - pshufd $0xaa, %xmm9, %xmm2 - pshufd $0xff, %xmm9, %xmm3 - movdqa %xmm0, 192(%rsp) - movdqa %xmm1, 208(%rsp) - movdqa %xmm2, 224(%rsp) - movdqa %xmm3, 240(%rsp) - pshufd $0x00, %xmm10, %xmm0 - pshufd $0x55, %xmm10, %xmm1 - pshufd $0xaa, %xmm10, %xmm2 - pshufd $0xff, %xmm10, %xmm3 - movdqa %xmm0, 256(%rsp) - movdqa %xmm1, 272(%rsp) - movdqa %xmm2, 288(%rsp) - movdqa %xmm3, 304(%rsp) - pshufd $0xaa, %xmm11, %xmm0 - pshufd $0xff, %xmm11, %xmm1 - movdqa %xmm0, 352(%rsp) - movdqa %xmm1, 368(%rsp) - jmp .Lchacha_blocks_ssse3_atleast256 -.p2align 6,,63 - # align to 4 mod 64 - nop;nop;nop;nop; -.Lchacha_blocks_ssse3_atleast256: - movq 48(%rsp), %rax - leaq 1(%rax), %r8 - leaq 2(%rax), %r9 - leaq 3(%rax), %r10 - leaq 4(%rax), %rbx - movl %eax, 320(%rsp) - movl %r8d, 4+320(%rsp) - movl %r9d, 8+320(%rsp) - movl %r10d, 12+320(%rsp) - shrq $32, %rax - shrq $32, %r8 - shrq $32, %r9 - shrq $32, %r10 - movl %eax, 336(%rsp) - movl %r8d, 4+336(%rsp) - movl %r9d, 8+336(%rsp) - movl %r10d, 12+336(%rsp) - movq %rbx, 48(%rsp) - movq 64(%rsp), %rax - movdqa 128(%rsp), %xmm0 - movdqa 144(%rsp), %xmm1 - movdqa 160(%rsp), %xmm2 - movdqa 176(%rsp), %xmm3 - movdqa 192(%rsp), %xmm4 - movdqa 208(%rsp), %xmm5 - movdqa 224(%rsp), %xmm6 - movdqa 240(%rsp), %xmm7 - movdqa 256(%rsp), %xmm8 - movdqa 272(%rsp), %xmm9 - movdqa 288(%rsp), %xmm10 - movdqa 304(%rsp), %xmm11 - movdqa 320(%rsp), %xmm12 - movdqa 336(%rsp), %xmm13 - movdqa 352(%rsp), %xmm14 - movdqa 368(%rsp), %xmm15 -.Lchacha_blocks_ssse3_mainloop1: - paddd %xmm4, %xmm0 - paddd %xmm5, %xmm1 - pxor %xmm0, %xmm12 - pxor %xmm1, %xmm13 - paddd %xmm6, %xmm2 - paddd %xmm7, %xmm3 - pxor %xmm2, %xmm14 - pxor %xmm3, %xmm15 - pshufb 80(%rsp), %xmm12 - pshufb 80(%rsp), %xmm13 - paddd %xmm12, %xmm8 - paddd %xmm13, %xmm9 - pshufb 80(%rsp), %xmm14 - pshufb 80(%rsp), %xmm15 - paddd %xmm14, %xmm10 - paddd %xmm15, %xmm11 - movdqa %xmm12, 112(%rsp) - pxor %xmm8, %xmm4 - pxor %xmm9, %xmm5 - movdqa %xmm4, %xmm12 - pslld $ 12, %xmm4 - psrld $20, %xmm12 - pxor %xmm12, %xmm4 - movdqa %xmm5, %xmm12 - pslld $ 12, %xmm5 - psrld $20, %xmm12 - pxor %xmm12, %xmm5 - pxor %xmm10, %xmm6 - pxor %xmm11, %xmm7 - movdqa %xmm6, %xmm12 - pslld $ 12, %xmm6 - psrld $20, %xmm12 - pxor %xmm12, %xmm6 - movdqa %xmm7, %xmm12 - pslld $ 12, %xmm7 - psrld $20, %xmm12 - pxor %xmm12, %xmm7 - movdqa 112(%rsp), %xmm12 - paddd %xmm4, %xmm0 - paddd %xmm5, %xmm1 - pxor %xmm0, %xmm12 - pxor %xmm1, %xmm13 - paddd %xmm6, %xmm2 - paddd %xmm7, %xmm3 - pxor %xmm2, %xmm14 - pxor %xmm3, %xmm15 - pshufb 96(%rsp), %xmm12 - pshufb 96(%rsp), %xmm13 - paddd 
%xmm12, %xmm8 - paddd %xmm13, %xmm9 - pshufb 96(%rsp), %xmm14 - pshufb 96(%rsp), %xmm15 - paddd %xmm14, %xmm10 - paddd %xmm15, %xmm11 - movdqa %xmm12, 112(%rsp) - pxor %xmm8, %xmm4 - pxor %xmm9, %xmm5 - movdqa %xmm4, %xmm12 - pslld $ 7, %xmm4 - psrld $25, %xmm12 - pxor %xmm12, %xmm4 - movdqa %xmm5, %xmm12 - pslld $ 7, %xmm5 - psrld $25, %xmm12 - pxor %xmm12, %xmm5 - pxor %xmm10, %xmm6 - pxor %xmm11, %xmm7 - movdqa %xmm6, %xmm12 - pslld $ 7, %xmm6 - psrld $25, %xmm12 - pxor %xmm12, %xmm6 - movdqa %xmm7, %xmm12 - pslld $ 7, %xmm7 - psrld $25, %xmm12 - pxor %xmm12, %xmm7 - movdqa 112(%rsp), %xmm12 - paddd %xmm5, %xmm0 - paddd %xmm6, %xmm1 - pxor %xmm0, %xmm15 - pxor %xmm1, %xmm12 - paddd %xmm7, %xmm2 - paddd %xmm4, %xmm3 - pxor %xmm2, %xmm13 - pxor %xmm3, %xmm14 - pshufb 80(%rsp), %xmm15 - pshufb 80(%rsp), %xmm12 - paddd %xmm15, %xmm10 - paddd %xmm12, %xmm11 - pshufb 80(%rsp), %xmm13 - pshufb 80(%rsp), %xmm14 - paddd %xmm13, %xmm8 - paddd %xmm14, %xmm9 - movdqa %xmm15, 112(%rsp) - pxor %xmm10, %xmm5 - pxor %xmm11, %xmm6 - movdqa %xmm5, %xmm15 - pslld $ 12, %xmm5 - psrld $20, %xmm15 - pxor %xmm15, %xmm5 - movdqa %xmm6, %xmm15 - pslld $ 12, %xmm6 - psrld $20, %xmm15 - pxor %xmm15, %xmm6 - pxor %xmm8, %xmm7 - pxor %xmm9, %xmm4 - movdqa %xmm7, %xmm15 - pslld $ 12, %xmm7 - psrld $20, %xmm15 - pxor %xmm15, %xmm7 - movdqa %xmm4, %xmm15 - pslld $ 12, %xmm4 - psrld $20, %xmm15 - pxor %xmm15, %xmm4 - movdqa 112(%rsp), %xmm15 - paddd %xmm5, %xmm0 - paddd %xmm6, %xmm1 - pxor %xmm0, %xmm15 - pxor %xmm1, %xmm12 - paddd %xmm7, %xmm2 - paddd %xmm4, %xmm3 - pxor %xmm2, %xmm13 - pxor %xmm3, %xmm14 - pshufb 96(%rsp), %xmm15 - pshufb 96(%rsp), %xmm12 - paddd %xmm15, %xmm10 - paddd %xmm12, %xmm11 - pshufb 96(%rsp), %xmm13 - pshufb 96(%rsp), %xmm14 - paddd %xmm13, %xmm8 - paddd %xmm14, %xmm9 - movdqa %xmm15, 112(%rsp) - pxor %xmm10, %xmm5 - pxor %xmm11, %xmm6 - movdqa %xmm5, %xmm15 - pslld $ 7, %xmm5 - psrld $25, %xmm15 - pxor %xmm15, %xmm5 - movdqa %xmm6, %xmm15 - pslld $ 7, %xmm6 - psrld $25, %xmm15 - pxor %xmm15, %xmm6 - pxor %xmm8, %xmm7 - pxor %xmm9, %xmm4 - movdqa %xmm7, %xmm15 - pslld $ 7, %xmm7 - psrld $25, %xmm15 - pxor %xmm15, %xmm7 - movdqa %xmm4, %xmm15 - pslld $ 7, %xmm4 - psrld $25, %xmm15 - pxor %xmm15, %xmm4 - subq $2, %rax - movdqa 112(%rsp), %xmm15 - jnz .Lchacha_blocks_ssse3_mainloop1 - paddd 128(%rsp), %xmm0 - paddd 144(%rsp), %xmm1 - paddd 160(%rsp), %xmm2 - paddd 176(%rsp), %xmm3 - paddd 192(%rsp), %xmm4 - paddd 208(%rsp), %xmm5 - paddd 224(%rsp), %xmm6 - paddd 240(%rsp), %xmm7 - paddd 256(%rsp), %xmm8 - paddd 272(%rsp), %xmm9 - paddd 288(%rsp), %xmm10 - paddd 304(%rsp), %xmm11 - paddd 320(%rsp), %xmm12 - paddd 336(%rsp), %xmm13 - paddd 352(%rsp), %xmm14 - paddd 368(%rsp), %xmm15 - movdqa %xmm8, 384(%rsp) - movdqa %xmm9, 400(%rsp) - movdqa %xmm10, 416(%rsp) - movdqa %xmm11, 432(%rsp) - movdqa %xmm12, 448(%rsp) - movdqa %xmm13, 464(%rsp) - movdqa %xmm14, 480(%rsp) - movdqa %xmm15, 496(%rsp) - movdqa %xmm0, %xmm8 - movdqa %xmm2, %xmm9 - movdqa %xmm4, %xmm10 - movdqa %xmm6, %xmm11 - punpckhdq %xmm1, %xmm0 - punpckhdq %xmm3, %xmm2 - punpckhdq %xmm5, %xmm4 - punpckhdq %xmm7, %xmm6 - punpckldq %xmm1, %xmm8 - punpckldq %xmm3, %xmm9 - punpckldq %xmm5, %xmm10 - punpckldq %xmm7, %xmm11 - movdqa %xmm0, %xmm1 - movdqa %xmm4, %xmm3 - movdqa %xmm8, %xmm5 - movdqa %xmm10, %xmm7 - punpckhqdq %xmm2, %xmm0 - punpckhqdq %xmm6, %xmm4 - punpckhqdq %xmm9, %xmm8 - punpckhqdq %xmm11, %xmm10 - punpcklqdq %xmm2, %xmm1 - punpcklqdq %xmm6, %xmm3 - punpcklqdq %xmm9, %xmm5 - punpcklqdq %xmm11, %xmm7 - andq %rsi, %rsi - jz 
.Lchacha_blocks_ssse3_noinput1 - movdqu 0(%rsi), %xmm2 - movdqu 16(%rsi), %xmm6 - movdqu 64(%rsi), %xmm9 - movdqu 80(%rsi), %xmm11 - movdqu 128(%rsi), %xmm12 - movdqu 144(%rsi), %xmm13 - movdqu 192(%rsi), %xmm14 - movdqu 208(%rsi), %xmm15 - pxor %xmm2, %xmm5 - pxor %xmm6, %xmm7 - pxor %xmm9, %xmm8 - pxor %xmm11, %xmm10 - pxor %xmm12, %xmm1 - pxor %xmm13, %xmm3 - pxor %xmm14, %xmm0 - pxor %xmm15, %xmm4 - movdqu %xmm5, 0(%rdx) - movdqu %xmm7, 16(%rdx) - movdqu %xmm8, 64(%rdx) - movdqu %xmm10, 80(%rdx) - movdqu %xmm1, 128(%rdx) - movdqu %xmm3, 144(%rdx) - movdqu %xmm0, 192(%rdx) - movdqu %xmm4, 208(%rdx) - movdqa 384(%rsp), %xmm0 - movdqa 400(%rsp), %xmm1 - movdqa 416(%rsp), %xmm2 - movdqa 432(%rsp), %xmm3 - movdqa 448(%rsp), %xmm4 - movdqa 464(%rsp), %xmm5 - movdqa 480(%rsp), %xmm6 - movdqa 496(%rsp), %xmm7 - movdqa %xmm0, %xmm8 - movdqa %xmm2, %xmm9 - movdqa %xmm4, %xmm10 - movdqa %xmm6, %xmm11 - punpckldq %xmm1, %xmm8 - punpckldq %xmm3, %xmm9 - punpckhdq %xmm1, %xmm0 - punpckhdq %xmm3, %xmm2 - punpckldq %xmm5, %xmm10 - punpckldq %xmm7, %xmm11 - punpckhdq %xmm5, %xmm4 - punpckhdq %xmm7, %xmm6 - movdqa %xmm8, %xmm1 - movdqa %xmm0, %xmm3 - movdqa %xmm10, %xmm5 - movdqa %xmm4, %xmm7 - punpcklqdq %xmm9, %xmm1 - punpcklqdq %xmm11, %xmm5 - punpckhqdq %xmm9, %xmm8 - punpckhqdq %xmm11, %xmm10 - punpcklqdq %xmm2, %xmm3 - punpcklqdq %xmm6, %xmm7 - punpckhqdq %xmm2, %xmm0 - punpckhqdq %xmm6, %xmm4 - movdqu 32(%rsi), %xmm2 - movdqu 48(%rsi), %xmm6 - movdqu 96(%rsi), %xmm9 - movdqu 112(%rsi), %xmm11 - movdqu 160(%rsi), %xmm12 - movdqu 176(%rsi), %xmm13 - movdqu 224(%rsi), %xmm14 - movdqu 240(%rsi), %xmm15 - pxor %xmm2, %xmm1 - pxor %xmm6, %xmm5 - pxor %xmm9, %xmm8 - pxor %xmm11, %xmm10 - pxor %xmm12, %xmm3 - pxor %xmm13, %xmm7 - pxor %xmm14, %xmm0 - pxor %xmm15, %xmm4 - movdqu %xmm1, 32(%rdx) - movdqu %xmm5, 48(%rdx) - movdqu %xmm8, 96(%rdx) - movdqu %xmm10, 112(%rdx) - movdqu %xmm3, 160(%rdx) - movdqu %xmm7, 176(%rdx) - movdqu %xmm0, 224(%rdx) - movdqu %xmm4, 240(%rdx) - addq $256, %rsi - jmp .Lchacha_blocks_ssse3_mainloop_cont -.Lchacha_blocks_ssse3_noinput1: - movdqu %xmm5, 0(%rdx) - movdqu %xmm7, 16(%rdx) - movdqu %xmm8, 64(%rdx) - movdqu %xmm10, 80(%rdx) - movdqu %xmm1, 128(%rdx) - movdqu %xmm3, 144(%rdx) - movdqu %xmm0, 192(%rdx) - movdqu %xmm4, 208(%rdx) - movdqa 384(%rsp), %xmm0 - movdqa 400(%rsp), %xmm1 - movdqa 416(%rsp), %xmm2 - movdqa 432(%rsp), %xmm3 - movdqa 448(%rsp), %xmm4 - movdqa 464(%rsp), %xmm5 - movdqa 480(%rsp), %xmm6 - movdqa 496(%rsp), %xmm7 - movdqa %xmm0, %xmm8 - movdqa %xmm2, %xmm9 - movdqa %xmm4, %xmm10 - movdqa %xmm6, %xmm11 - punpckldq %xmm1, %xmm8 - punpckldq %xmm3, %xmm9 - punpckhdq %xmm1, %xmm0 - punpckhdq %xmm3, %xmm2 - punpckldq %xmm5, %xmm10 - punpckldq %xmm7, %xmm11 - punpckhdq %xmm5, %xmm4 - punpckhdq %xmm7, %xmm6 - movdqa %xmm8, %xmm1 - movdqa %xmm0, %xmm3 - movdqa %xmm10, %xmm5 - movdqa %xmm4, %xmm7 - punpcklqdq %xmm9, %xmm1 - punpcklqdq %xmm11, %xmm5 - punpckhqdq %xmm9, %xmm8 - punpckhqdq %xmm11, %xmm10 - punpcklqdq %xmm2, %xmm3 - punpcklqdq %xmm6, %xmm7 - punpckhqdq %xmm2, %xmm0 - punpckhqdq %xmm6, %xmm4 - movdqu %xmm1, 32(%rdx) - movdqu %xmm5, 48(%rdx) - movdqu %xmm8, 96(%rdx) - movdqu %xmm10, 112(%rdx) - movdqu %xmm3, 160(%rdx) - movdqu %xmm7, 176(%rdx) - movdqu %xmm0, 224(%rdx) - movdqu %xmm4, 240(%rdx) -.Lchacha_blocks_ssse3_mainloop_cont: - addq $256, %rdx - subq $256, %rcx - cmp $256, %rcx - jae .Lchacha_blocks_ssse3_atleast256 - movdqa 80(%rsp), %xmm6 - movdqa 96(%rsp), %xmm7 - movdqa 0(%rsp), %xmm8 - movdqa 16(%rsp), %xmm9 - movdqa 32(%rsp), %xmm10 - 
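/* The pshufb operands at 80(%rsp)/96(%rsp) are the .LC masks from the
 * end of this file; they implement the 16- and 8-bit ChaCha rotations
 * as byte shuffles, leaving only the 12- and 7-bit rotations to the
 * shift pairs.  In a little-endian 32-bit lane, selecting source bytes
 * 2,3,0,1 is a rotate by 16 and 3,0,1,2 a rotate-left by 8.  Minimal
 * check of the first claim; shuffle_rot16 is an illustrative name: */
#include <stdint.h>
#include <string.h>

static uint32_t shuffle_rot16 (uint32_t x)
{
  unsigned char b[4], r[4];

  memcpy (b, &x, 4);             /* little-endian byte view */
  r[0] = b[2]; r[1] = b[3];      /* source bytes 2,3,0,1 ... */
  r[2] = b[0]; r[3] = b[1];
  memcpy (&x, r, 4);
  return x;                      /* equals (x << 16) | (x >> 16) */
}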
movdqa 48(%rsp), %xmm11 - movq $1, %r9 -.Lchacha_blocks_ssse3_below256: - movq %r9, %xmm5 - andq %rcx, %rcx - jz .Lchacha_blocks_ssse3_done - cmpq $64, %rcx - jae .Lchacha_blocks_ssse3_above63 - movq %rdx, %r9 - andq %rsi, %rsi - jz .Lchacha_blocks_ssse3_noinput2 - movq %rcx, %r10 - movq %rsp, %rdx - addq %r10, %rsi - addq %r10, %rdx - negq %r10 -.Lchacha_blocks_ssse3_copyinput: - movb (%rsi, %r10), %al - movb %al, (%rdx, %r10) - incq %r10 - jnz .Lchacha_blocks_ssse3_copyinput - movq %rsp, %rsi -.Lchacha_blocks_ssse3_noinput2: - movq %rsp, %rdx -.Lchacha_blocks_ssse3_above63: - movdqa %xmm8, %xmm0 - movdqa %xmm9, %xmm1 - movdqa %xmm10, %xmm2 - movdqa %xmm11, %xmm3 - movq 64(%rsp), %rax -.Lchacha_blocks_ssse3_mainloop2: - paddd %xmm1, %xmm0 - pxor %xmm0, %xmm3 - pshufb %xmm6, %xmm3 - paddd %xmm3, %xmm2 - pxor %xmm2, %xmm1 - movdqa %xmm1, %xmm4 - pslld $12, %xmm4 - psrld $20, %xmm1 - pxor %xmm4, %xmm1 - paddd %xmm1, %xmm0 - pxor %xmm0, %xmm3 - pshufb %xmm7, %xmm3 - pshufd $0x93, %xmm0, %xmm0 - paddd %xmm3, %xmm2 - pshufd $0x4e, %xmm3, %xmm3 - pxor %xmm2, %xmm1 - pshufd $0x39, %xmm2, %xmm2 - movdqa %xmm1, %xmm4 - pslld $7, %xmm4 - psrld $25, %xmm1 - pxor %xmm4, %xmm1 - paddd %xmm1, %xmm0 - pxor %xmm0, %xmm3 - pshufb %xmm6, %xmm3 - paddd %xmm3, %xmm2 - pxor %xmm2, %xmm1 - movdqa %xmm1, %xmm4 - pslld $12, %xmm4 - psrld $20, %xmm1 - pxor %xmm4, %xmm1 - paddd %xmm1, %xmm0 - pxor %xmm0, %xmm3 - pshufb %xmm7, %xmm3 - pshufd $0x39, %xmm0, %xmm0 - paddd %xmm3, %xmm2 - pshufd $0x4e, %xmm3, %xmm3 - pxor %xmm2, %xmm1 - pshufd $0x93, %xmm2, %xmm2 - movdqa %xmm1, %xmm4 - pslld $7, %xmm4 - psrld $25, %xmm1 - pxor %xmm4, %xmm1 - subq $2, %rax - jnz .Lchacha_blocks_ssse3_mainloop2 - paddd %xmm8, %xmm0 - paddd %xmm9, %xmm1 - paddd %xmm10, %xmm2 - paddd %xmm11, %xmm3 - andq %rsi, %rsi - jz .Lchacha_blocks_ssse3_noinput3 - movdqu 0(%rsi), %xmm12 - movdqu 16(%rsi), %xmm13 - movdqu 32(%rsi), %xmm14 - movdqu 48(%rsi), %xmm15 - pxor %xmm12, %xmm0 - pxor %xmm13, %xmm1 - pxor %xmm14, %xmm2 - pxor %xmm15, %xmm3 - addq $64, %rsi -.Lchacha_blocks_ssse3_noinput3: - movdqu %xmm0, 0(%rdx) - movdqu %xmm1, 16(%rdx) - movdqu %xmm2, 32(%rdx) - movdqu %xmm3, 48(%rdx) - paddq %xmm5, %xmm11 - cmpq $64, %rcx - jbe .Lchacha_blocks_ssse3_mainloop2_finishup - addq $64, %rdx - subq $64, %rcx - jmp .Lchacha_blocks_ssse3_below256 -.Lchacha_blocks_ssse3_mainloop2_finishup: - cmpq $64, %rcx - je .Lchacha_blocks_ssse3_done - addq %rcx, %r9 - addq %rcx, %rdx - negq %rcx -.Lchacha_blocks_ssse3_copyoutput: - movb (%rdx, %rcx), %al - movb %al, (%r9, %rcx) - incq %rcx - jnz .Lchacha_blocks_ssse3_copyoutput -.Lchacha_blocks_ssse3_done: - movdqu %xmm11, 48(%rdi) - movq %rbp, %rsp - pxor %xmm15, %xmm15 - pxor %xmm7, %xmm7 - pxor %xmm14, %xmm14 - pxor %xmm6, %xmm6 - pxor %xmm13, %xmm13 - pxor %xmm5, %xmm5 - pxor %xmm12, %xmm12 - pxor %xmm4, %xmm4 - popq %rbp - popq %rbx - movl $(63 + 512 + 16), %eax - pxor %xmm11, %xmm11 - pxor %xmm3, %xmm3 - pxor %xmm10, %xmm10 - pxor %xmm2, %xmm2 - pxor %xmm9, %xmm9 - pxor %xmm1, %xmm1 - pxor %xmm8, %xmm8 - pxor %xmm0, %xmm0 - ret -ELF(.size _gcry_chacha20_amd64_ssse3_blocks,.-_gcry_chacha20_amd64_ssse3_blocks;) - -.align 16; -.LC: -.byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 /* pshufb rotate by 16 */ -.byte 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14 /* pshufb rotate by 8 */ - -#endif /*defined(USE_CHACHA20)*/ -#endif /*__x86_64*/ diff --git a/cipher/chacha20.c b/cipher/chacha20.c index 613fa82a..ac6cc29e 100644 --- a/cipher/chacha20.c +++ b/cipher/chacha20.c @@ -1,637 +1,583 @@ /* chacha20.c - Bernstein's ChaCha20 
cipher - * Copyright (C) 2014 Jussi Kivilinna + * Copyright (C) 2014,2017,2018 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see <http://www.gnu.org/licenses/>. * * For a description of the algorithm, see: * http://cr.yp.to/chacha.html */ -/* The code is based on salsa20.c and public-domain ChaCha implementations: - * chacha-ref.c version 20080118 - * D. J. Bernstein - * Public domain. - * and - * Andrew Moon - * https://github.com/floodyberry/chacha-opt +/* + * Based on D. J. Bernstein reference implementation at + * http://cr.yp.to/chacha.html: + * + * chacha-regs.c version 20080118 + * D. J. Bernstein + * Public domain. */ - #include <config.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include "types.h" #include "g10lib.h" #include "cipher.h" #include "bufhelp.h" #define CHACHA20_MIN_KEY_SIZE 16 /* Bytes. */ #define CHACHA20_MAX_KEY_SIZE 32 /* Bytes. */ #define CHACHA20_BLOCK_SIZE 64 /* Bytes. */ #define CHACHA20_MIN_IV_SIZE 8 /* Bytes. */ #define CHACHA20_MAX_IV_SIZE 12 /* Bytes. */ #define CHACHA20_CTR_SIZE 16 /* Bytes. */ -#define CHACHA20_INPUT_LENGTH (CHACHA20_BLOCK_SIZE / 4) -/* USE_SSE2 indicates whether to compile with Intel SSE2 code. */ -#undef USE_SSE2 -#if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ - defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) -# define USE_SSE2 1 -#endif /* USE_SSSE3 indicates whether to compile with Intel SSSE3 code. */ #undef USE_SSSE3 -#if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ - defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ - defined(HAVE_GCC_INLINE_ASM_SSSE3) +#if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_SSSE3) && \ + (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ + defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) # define USE_SSSE3 1 #endif /* USE_AVX2 indicates whether to compile with Intel AVX2 code. */ #undef USE_AVX2 -#if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ - defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ - defined(ENABLE_AVX2_SUPPORT) +#if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_AVX2) && \ + (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ + defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) # define USE_AVX2 1 #endif -/* USE_NEON indicates whether to enable ARM NEON assembly code. */ -#undef USE_NEON +/* USE_ARMV7_NEON indicates whether to enable ARMv7 NEON assembly code. */ +#undef USE_ARMV7_NEON #ifdef ENABLE_NEON_SUPPORT # if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) \ && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) \ && defined(HAVE_GCC_INLINE_ASM_NEON) -# define USE_NEON 1 +# define USE_ARMV7_NEON 1 # endif -#endif /*ENABLE_NEON_SUPPORT*/ - - -struct CHACHA20_context_s; - +#endif /* Assembly implementations use SystemV ABI, ABI conversion and additional * stack to store XMM6-XMM15 needed on Win64.
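/* For context on the Win64 remark above: the Microsoft x64 ABI passes
 * the first arguments in RCX/RDX/R8/R9 and treats XMM6-XMM15 as
 * callee-saved, whereas the SysV AMD64 ABI uses RDI/RSI/RDX/RCX and
 * leaves all XMM registers call-clobbered.  Tagging the assembly
 * prototypes with sysv_abi makes a Win64 compiler emit SysV-style
 * calls to them; the "additional stack" is what the assembly uses to
 * preserve XMM6-XMM15 in that configuration.  Sketch of such a
 * declaration; example_blocks is a placeholder, not a symbol from
 * this patch: */
unsigned int example_blocks (u32 *state, byte *dst, const byte *src,
                             size_t nblks) ASM_FUNC_ABI;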
*/ #undef ASM_FUNC_ABI #undef ASM_EXTRA_STACK -#if (defined(USE_SSE2) || defined(USE_SSSE3) || defined(USE_AVX2)) && \ - defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS) +#if defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS) # define ASM_FUNC_ABI __attribute__((sysv_abi)) -# define ASM_EXTRA_STACK (10 * 16) #else # define ASM_FUNC_ABI -# define ASM_EXTRA_STACK 0 #endif -typedef unsigned int (* chacha20_blocks_t)(u32 *state, const byte *src, - byte *dst, - size_t bytes) ASM_FUNC_ABI; - typedef struct CHACHA20_context_s { - u32 input[CHACHA20_INPUT_LENGTH]; - u32 pad[CHACHA20_INPUT_LENGTH]; - chacha20_blocks_t blocks; + u32 input[16]; + unsigned char pad[CHACHA20_BLOCK_SIZE]; unsigned int unused; /* bytes in the pad. */ + int use_ssse3:1; + int use_avx2:1; + int use_neon:1; } CHACHA20_context_t; -#ifdef USE_SSE2 - -unsigned int _gcry_chacha20_amd64_sse2_blocks(u32 *state, const byte *in, - byte *out, - size_t bytes) ASM_FUNC_ABI; - -#endif /* USE_SSE2 */ - #ifdef USE_SSSE3 -unsigned int _gcry_chacha20_amd64_ssse3_blocks(u32 *state, const byte *in, - byte *out, - size_t bytes) ASM_FUNC_ABI; +unsigned int _gcry_chacha20_amd64_ssse3_blocks4(u32 *state, byte *dst, + const byte *src, + size_t nblks) ASM_FUNC_ABI; #endif /* USE_SSSE3 */ #ifdef USE_AVX2 -unsigned int _gcry_chacha20_amd64_avx2_blocks(u32 *state, const byte *in, - byte *out, - size_t bytes) ASM_FUNC_ABI; +unsigned int _gcry_chacha20_amd64_avx2_blocks8(u32 *state, byte *dst, + const byte *src, + size_t nblks) ASM_FUNC_ABI; #endif /* USE_AVX2 */ -#ifdef USE_NEON +#ifdef USE_ARMV7_NEON -unsigned int _gcry_chacha20_armv7_neon_blocks(u32 *state, const byte *in, - byte *out, - size_t bytes) ASM_FUNC_ABI; +unsigned int _gcry_chacha20_armv7_neon_blocks4(u32 *state, byte *dst, + const byte *src, + size_t nblks); -#endif /* USE_NEON */ +#endif /* USE_ARMV7_NEON */ -static void chacha20_setiv (void *context, const byte * iv, size_t ivlen); static const char *selftest (void); +#define ROTATE(v,c) (rol(v,c)) +#define XOR(v,w) ((v) ^ (w)) +#define PLUS(v,w) ((u32)((v) + (w))) +#define PLUSONE(v) (PLUS((v),1)) -#define QROUND(a,b,c,d) \ - do { \ - a += b; d = rol(d ^ a, 16); \ - c += d; b = rol(b ^ c, 12); \ - a += b; d = rol(d ^ a, 8); \ - c += d; b = rol(b ^ c, 7); \ - } while (0) +#define QUARTERROUND(a,b,c,d) \ + a = PLUS(a,b); d = ROTATE(XOR(d,a),16); \ + c = PLUS(c,d); b = ROTATE(XOR(b,c),12); \ + a = PLUS(a,b); d = ROTATE(XOR(d,a), 8); \ + c = PLUS(c,d); b = ROTATE(XOR(b,c), 7); -#define QOUT(ai, bi, ci, di) \ - DO_OUT(ai); DO_OUT(bi); DO_OUT(ci); DO_OUT(di) +#define BUF_XOR_LE32(dst, src, offset, x) \ + buf_put_le32((dst) + (offset), buf_get_le32((src) + (offset)) ^ (x)) - -#ifndef USE_SSE2 -ASM_FUNC_ABI static unsigned int -chacha20_blocks (u32 *state, const byte *src, byte *dst, size_t bytes) +static unsigned int +chacha20_blocks (u32 *input, byte *dst, const byte *src, size_t nblks) { - u32 pad[CHACHA20_INPUT_LENGTH]; - u32 inp[CHACHA20_INPUT_LENGTH]; + u32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; unsigned int i; - /* Note: 'bytes' must be multiple of 64 and not zero. */ - - inp[0] = state[0]; - inp[1] = state[1]; - inp[2] = state[2]; - inp[3] = state[3]; - inp[4] = state[4]; - inp[5] = state[5]; - inp[6] = state[6]; - inp[7] = state[7]; - inp[8] = state[8]; - inp[9] = state[9]; - inp[10] = state[10]; - inp[11] = state[11]; - inp[12] = state[12]; - inp[13] = state[13]; - inp[14] = state[14]; - inp[15] = state[15]; - - do + while (nblks) { - /* First round. 
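/* The QUARTERROUND defined above is the standard ChaCha quarter round
 * and can be sanity-checked against the test vector in RFC 7539,
 * section 2.1.1.  A standalone sketch reusing the macros;
 * quarterround_check is an illustrative name: */
#include <assert.h>

static void quarterround_check (void)
{
  u32 a = 0x11111111, b = 0x01020304, c = 0x9b8d6f43, d = 0x01234567;

  QUARTERROUND (a, b, c, d)
  assert (a == 0xea2a92f4 && b == 0xcb1cf8ce
          && c == 0x4581472e && d == 0x5881c4bb);
}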
*/ - pad[0] = inp[0]; - pad[4] = inp[4]; - pad[8] = inp[8]; - pad[12] = inp[12]; - QROUND (pad[0], pad[4], pad[8], pad[12]); - pad[1] = inp[1]; - pad[5] = inp[5]; - pad[9] = inp[9]; - pad[13] = inp[13]; - QROUND (pad[1], pad[5], pad[9], pad[13]); - pad[2] = inp[2]; - pad[6] = inp[6]; - pad[10] = inp[10]; - pad[14] = inp[14]; - QROUND (pad[2], pad[6], pad[10], pad[14]); - pad[3] = inp[3]; - pad[7] = inp[7]; - pad[11] = inp[11]; - pad[15] = inp[15]; - QROUND (pad[3], pad[7], pad[11], pad[15]); - - QROUND (pad[0], pad[5], pad[10], pad[15]); - QROUND (pad[1], pad[6], pad[11], pad[12]); - QROUND (pad[2], pad[7], pad[8], pad[13]); - QROUND (pad[3], pad[4], pad[9], pad[14]); - - for (i = 2; i < 20 - 2; i += 2) - { - QROUND (pad[0], pad[4], pad[8], pad[12]); - QROUND (pad[1], pad[5], pad[9], pad[13]); - QROUND (pad[2], pad[6], pad[10], pad[14]); - QROUND (pad[3], pad[7], pad[11], pad[15]); - - QROUND (pad[0], pad[5], pad[10], pad[15]); - QROUND (pad[1], pad[6], pad[11], pad[12]); - QROUND (pad[2], pad[7], pad[8], pad[13]); - QROUND (pad[3], pad[4], pad[9], pad[14]); - } - - QROUND (pad[0], pad[4], pad[8], pad[12]); - QROUND (pad[1], pad[5], pad[9], pad[13]); - QROUND (pad[2], pad[6], pad[10], pad[14]); - QROUND (pad[3], pad[7], pad[11], pad[15]); - - if (src) - { -#define DO_OUT(idx) buf_put_le32(dst + (idx) * 4, \ - (pad[idx] + inp[idx]) ^ \ - buf_get_le32(src + (idx) * 4)) - /* Last round. */ - QROUND (pad[0], pad[5], pad[10], pad[15]); - QOUT(0, 5, 10, 15); - QROUND (pad[1], pad[6], pad[11], pad[12]); - QOUT(1, 6, 11, 12); - QROUND (pad[2], pad[7], pad[8], pad[13]); - QOUT(2, 7, 8, 13); - QROUND (pad[3], pad[4], pad[9], pad[14]); - QOUT(3, 4, 9, 14); -#undef DO_OUT - } - else - { -#define DO_OUT(idx) buf_put_le32(dst + (idx) * 4, pad[idx] + inp[idx]) - /* Last round. */ - QROUND (pad[0], pad[5], pad[10], pad[15]); - QOUT(0, 5, 10, 15); - QROUND (pad[1], pad[6], pad[11], pad[12]); - QOUT(1, 6, 11, 12); - QROUND (pad[2], pad[7], pad[8], pad[13]); - QOUT(2, 7, 8, 13); - QROUND (pad[3], pad[4], pad[9], pad[14]); - QOUT(3, 4, 9, 14); -#undef DO_OUT - } - - /* Update counter. 
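/* The counter update just below treats input[12..13] as one 64-bit
 * little-endian block counter: after PLUSONE, !input[12] is 1 exactly
 * when word 12 wrapped to zero, which propagates the carry.
 * Equivalent u64 view (u32/u64 as in types.h); counter_step is an
 * illustrative name: */
static void counter_step (u32 input[16])
{
  u64 ctr = ((u64) input[13] << 32) | input[12];

  ctr++;
  input[12] = (u32) ctr;
  input[13] = (u32) (ctr >> 32);
}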
*/ - inp[13] += (!++inp[12]); - - bytes -= CHACHA20_BLOCK_SIZE; + x0 = input[0]; + x1 = input[1]; + x2 = input[2]; + x3 = input[3]; + x4 = input[4]; + x5 = input[5]; + x6 = input[6]; + x7 = input[7]; + x8 = input[8]; + x9 = input[9]; + x10 = input[10]; + x11 = input[11]; + x12 = input[12]; + x13 = input[13]; + x14 = input[14]; + x15 = input[15]; + + for (i = 20; i > 0; i -= 2) + { + QUARTERROUND(x0, x4, x8, x12) + QUARTERROUND(x1, x5, x9, x13) + QUARTERROUND(x2, x6, x10, x14) + QUARTERROUND(x3, x7, x11, x15) + QUARTERROUND(x0, x5, x10, x15) + QUARTERROUND(x1, x6, x11, x12) + QUARTERROUND(x2, x7, x8, x13) + QUARTERROUND(x3, x4, x9, x14) + } + + x0 = PLUS(x0, input[0]); + x1 = PLUS(x1, input[1]); + x2 = PLUS(x2, input[2]); + x3 = PLUS(x3, input[3]); + x4 = PLUS(x4, input[4]); + x5 = PLUS(x5, input[5]); + x6 = PLUS(x6, input[6]); + x7 = PLUS(x7, input[7]); + x8 = PLUS(x8, input[8]); + x9 = PLUS(x9, input[9]); + x10 = PLUS(x10, input[10]); + x11 = PLUS(x11, input[11]); + x12 = PLUS(x12, input[12]); + x13 = PLUS(x13, input[13]); + x14 = PLUS(x14, input[14]); + x15 = PLUS(x15, input[15]); + + input[12] = PLUSONE(input[12]); + input[13] = PLUS(input[13], !input[12]); + + BUF_XOR_LE32(dst, src, 0, x0); + BUF_XOR_LE32(dst, src, 4, x1); + BUF_XOR_LE32(dst, src, 8, x2); + BUF_XOR_LE32(dst, src, 12, x3); + BUF_XOR_LE32(dst, src, 16, x4); + BUF_XOR_LE32(dst, src, 20, x5); + BUF_XOR_LE32(dst, src, 24, x6); + BUF_XOR_LE32(dst, src, 28, x7); + BUF_XOR_LE32(dst, src, 32, x8); + BUF_XOR_LE32(dst, src, 36, x9); + BUF_XOR_LE32(dst, src, 40, x10); + BUF_XOR_LE32(dst, src, 44, x11); + BUF_XOR_LE32(dst, src, 48, x12); + BUF_XOR_LE32(dst, src, 52, x13); + BUF_XOR_LE32(dst, src, 56, x14); + BUF_XOR_LE32(dst, src, 60, x15); + + src += CHACHA20_BLOCK_SIZE; dst += CHACHA20_BLOCK_SIZE; - src += (src) ? CHACHA20_BLOCK_SIZE : 0; + nblks--; } - while (bytes >= CHACHA20_BLOCK_SIZE); - - state[12] = inp[12]; - state[13] = inp[13]; /* burn_stack */ - return (2 * CHACHA20_INPUT_LENGTH * sizeof(u32) + 6 * sizeof(void *)); -} -#endif /*!USE_SSE2*/ - -#undef QROUND -#undef QOUT - - -static unsigned int -chacha20_core(u32 *dst, struct CHACHA20_context_s *ctx) -{ - return ctx->blocks(ctx->input, NULL, (byte *)dst, CHACHA20_BLOCK_SIZE) - + ASM_EXTRA_STACK; + return (17 * sizeof(u32) + 6 * sizeof(void *)); } static void -chacha20_keysetup (CHACHA20_context_t * ctx, const byte * key, +chacha20_keysetup (CHACHA20_context_t *ctx, const byte *key, unsigned int keylen) { - /* These constants are the little endian encoding of the string - "expand 32-byte k". For the 128 bit variant, the "32" in that - string will be fixed up to "16". 
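Two quick standalone checks of the rewritten core may help the reader (scaffolding only, not library code): the new QUARTERROUND macro with PLUS/ROTATE/XOR spelled out as plain C, verified against the quarter-round test vector from RFC 8439 (formerly RFC 7539), section 2.1.1, plus a re-derivation of the constant words that the old code hard-coded as 0x61707865 and friends and that the new keysetup reads out of the sigma/tau strings. The get_le32 helper is a stand-in for the library's buf_get_le32.

#include <stdint.h>
#include <stdio.h>

#define ROL(v,c)  (((v) << (c)) | ((v) >> (32 - (c))))
#define QUARTERROUND(a,b,c,d) \
  a += b; d = ROL(d ^ a, 16); \
  c += d; b = ROL(b ^ c, 12); \
  a += b; d = ROL(d ^ a,  8); \
  c += d; b = ROL(b ^ c,  7);

static uint32_t get_le32 (const unsigned char *p)
{
  /* Little-endian 32-bit load, as buf_get_le32 does. */
  return (uint32_t)p[0] | ((uint32_t)p[1] << 8)
         | ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
}

int main (void)
{
  static const unsigned char sigma[16] = "expand 32-byte k";
  uint32_t a = 0x11111111, b = 0x01020304, c = 0x9b8d6f43, d = 0x01234567;
  int i;

  /* RFC 8439, 2.1.1 expects ea2a92f4 cb1cf8ce 4581472e 5881c4bb. */
  QUARTERROUND (a, b, c, d);
  printf ("%08x %08x %08x %08x\n",
          (unsigned) a, (unsigned) b, (unsigned) c, (unsigned) d);

  /* Prints 0x61707865 0x3320646e 0x79622d32 0x6b206574, i.e. the four
   * constant words documented in the removed comment here. */
  for (i = 0; i < 4; i++)
    printf ("0x%08x ", (unsigned) get_le32 (sigma + 4 * i));
  printf ("\n");
  return 0;
}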
*/ - ctx->input[0] = 0x61707865; /* "apxe" */ - ctx->input[1] = 0x3320646e; /* "3 dn" */ - ctx->input[2] = 0x79622d32; /* "yb-2" */ - ctx->input[3] = 0x6b206574; /* "k et" */ - - ctx->input[4] = buf_get_le32 (key + 0); - ctx->input[5] = buf_get_le32 (key + 4); - ctx->input[6] = buf_get_le32 (key + 8); - ctx->input[7] = buf_get_le32 (key + 12); - + static const char sigma[16] = "expand 32-byte k"; + static const char tau[16] = "expand 16-byte k"; + const char *constants; + + ctx->input[4] = buf_get_le32(key + 0); + ctx->input[5] = buf_get_le32(key + 4); + ctx->input[6] = buf_get_le32(key + 8); + ctx->input[7] = buf_get_le32(key + 12); if (keylen == CHACHA20_MAX_KEY_SIZE) /* 256 bits */ { - ctx->input[8] = buf_get_le32 (key + 16); - ctx->input[9] = buf_get_le32 (key + 20); - ctx->input[10] = buf_get_le32 (key + 24); - ctx->input[11] = buf_get_le32 (key + 28); + key += 16; + constants = sigma; } else /* 128 bits */ { - ctx->input[8] = ctx->input[4]; - ctx->input[9] = ctx->input[5]; - ctx->input[10] = ctx->input[6]; - ctx->input[11] = ctx->input[7]; - - ctx->input[1] -= 0x02000000; /* Change to "1 dn". */ - ctx->input[2] += 0x00000004; /* Change to "yb-6". */ + constants = tau; } + ctx->input[8] = buf_get_le32(key + 0); + ctx->input[9] = buf_get_le32(key + 4); + ctx->input[10] = buf_get_le32(key + 8); + ctx->input[11] = buf_get_le32(key + 12); + ctx->input[0] = buf_get_le32(constants + 0); + ctx->input[1] = buf_get_le32(constants + 4); + ctx->input[2] = buf_get_le32(constants + 8); + ctx->input[3] = buf_get_le32(constants + 12); } static void -chacha20_ivsetup (CHACHA20_context_t * ctx, const byte * iv, size_t ivlen) +chacha20_ivsetup (CHACHA20_context_t * ctx, const byte *iv, size_t ivlen) { if (ivlen == CHACHA20_CTR_SIZE) { ctx->input[12] = buf_get_le32 (iv + 0); ctx->input[13] = buf_get_le32 (iv + 4); ctx->input[14] = buf_get_le32 (iv + 8); ctx->input[15] = buf_get_le32 (iv + 12); } else if (ivlen == CHACHA20_MAX_IV_SIZE) { ctx->input[12] = 0; ctx->input[13] = buf_get_le32 (iv + 0); ctx->input[14] = buf_get_le32 (iv + 4); ctx->input[15] = buf_get_le32 (iv + 8); } else if (ivlen == CHACHA20_MIN_IV_SIZE) { ctx->input[12] = 0; ctx->input[13] = 0; ctx->input[14] = buf_get_le32 (iv + 0); ctx->input[15] = buf_get_le32 (iv + 4); } else { ctx->input[12] = 0; ctx->input[13] = 0; ctx->input[14] = 0; ctx->input[15] = 0; } } +static void +chacha20_setiv (void *context, const byte *iv, size_t ivlen) +{ + CHACHA20_context_t *ctx = (CHACHA20_context_t *) context; + + /* draft-nir-cfrg-chacha20-poly1305-02 defines 96-bit and 64-bit nonce. */ + if (iv && ivlen != CHACHA20_MAX_IV_SIZE && ivlen != CHACHA20_MIN_IV_SIZE + && ivlen != CHACHA20_CTR_SIZE) + log_info ("WARNING: chacha20_setiv: bad ivlen=%u\n", (u32) ivlen); + + if (iv && (ivlen == CHACHA20_MAX_IV_SIZE || ivlen == CHACHA20_MIN_IV_SIZE + || ivlen == CHACHA20_CTR_SIZE)) + chacha20_ivsetup (ctx, iv, ivlen); + else + chacha20_ivsetup (ctx, NULL, 0); + + /* Reset the unused pad bytes counter. 
*/ + ctx->unused = 0; +} + + static gcry_err_code_t -chacha20_do_setkey (CHACHA20_context_t * ctx, - const byte * key, unsigned int keylen) +chacha20_do_setkey (CHACHA20_context_t *ctx, + const byte *key, unsigned int keylen) { static int initialized; static const char *selftest_failed; unsigned int features = _gcry_get_hw_features (); if (!initialized) { initialized = 1; selftest_failed = selftest (); if (selftest_failed) log_error ("CHACHA20 selftest failed (%s)\n", selftest_failed); } if (selftest_failed) return GPG_ERR_SELFTEST_FAILED; if (keylen != CHACHA20_MAX_KEY_SIZE && keylen != CHACHA20_MIN_KEY_SIZE) return GPG_ERR_INV_KEYLEN; -#ifdef USE_SSE2 - ctx->blocks = _gcry_chacha20_amd64_sse2_blocks; -#else - ctx->blocks = chacha20_blocks; -#endif - #ifdef USE_SSSE3 - if (features & HWF_INTEL_SSSE3) - ctx->blocks = _gcry_chacha20_amd64_ssse3_blocks; + ctx->use_ssse3 = (features & HWF_INTEL_SSSE3) != 0; #endif #ifdef USE_AVX2 - if (features & HWF_INTEL_AVX2) - ctx->blocks = _gcry_chacha20_amd64_avx2_blocks; + ctx->use_avx2 = (features & HWF_INTEL_AVX2) != 0; #endif -#ifdef USE_NEON - if (features & HWF_ARM_NEON) - ctx->blocks = _gcry_chacha20_armv7_neon_blocks; +#ifdef USE_ARMV7_NEON + ctx->use_neon = (features & HWF_ARM_NEON) != 0; #endif - (void)features; chacha20_keysetup (ctx, key, keylen); /* We default to a zero nonce. */ chacha20_setiv (ctx, NULL, 0); return 0; } static gcry_err_code_t -chacha20_setkey (void *context, const byte * key, unsigned int keylen) +chacha20_setkey (void *context, const byte *key, unsigned int keylen) { CHACHA20_context_t *ctx = (CHACHA20_context_t *) context; gcry_err_code_t rc = chacha20_do_setkey (ctx, key, keylen); _gcry_burn_stack (4 + sizeof (void *) + 4 * sizeof (void *)); return rc; } static void -chacha20_setiv (void *context, const byte * iv, size_t ivlen) +chacha20_encrypt_stream (void *context, byte *outbuf, const byte *inbuf, + size_t length) { + static const unsigned char zero_pad[CHACHA20_BLOCK_SIZE] = { 0, }; CHACHA20_context_t *ctx = (CHACHA20_context_t *) context; - - /* draft-nir-cfrg-chacha20-poly1305-02 defines 96-bit and 64-bit nonce. */ - if (iv && ivlen != CHACHA20_MAX_IV_SIZE && ivlen != CHACHA20_MIN_IV_SIZE - && ivlen != CHACHA20_CTR_SIZE) - log_info ("WARNING: chacha20_setiv: bad ivlen=%u\n", (u32) ivlen); - - if (iv && (ivlen == CHACHA20_MAX_IV_SIZE || ivlen == CHACHA20_MIN_IV_SIZE - || ivlen == CHACHA20_CTR_SIZE)) - chacha20_ivsetup (ctx, iv, ivlen); - else - chacha20_ivsetup (ctx, NULL, 0); - - /* Reset the unused pad bytes counter. */ - ctx->unused = 0; -} - - - -/* Note: This function requires LENGTH > 0. */ -static void -chacha20_do_encrypt_stream (CHACHA20_context_t * ctx, - byte * outbuf, const byte * inbuf, size_t length) -{ unsigned int nburn, burn = 0; + if (!length) + return; + if (ctx->unused) { - unsigned char *p = (void *) ctx->pad; + unsigned char *p = ctx->pad; size_t n; gcry_assert (ctx->unused < CHACHA20_BLOCK_SIZE); n = ctx->unused; if (n > length) n = length; + buf_xor (outbuf, inbuf, p + CHACHA20_BLOCK_SIZE - ctx->unused, n); length -= n; outbuf += n; inbuf += n; ctx->unused -= n; + if (!length) return; gcry_assert (!ctx->unused); } +#ifdef USE_AVX2 + if (ctx->use_avx2 && length >= CHACHA20_BLOCK_SIZE * 8) + { + size_t nblocks = length / CHACHA20_BLOCK_SIZE; + nblocks -= nblocks % 8; + nburn = _gcry_chacha20_amd64_avx2_blocks8(ctx->input, outbuf, inbuf, + nblocks); + burn = nburn > burn ? 
nburn : burn; + length -= nblocks * CHACHA20_BLOCK_SIZE; + outbuf += nblocks * CHACHA20_BLOCK_SIZE; + inbuf += nblocks * CHACHA20_BLOCK_SIZE; + } +#endif + +#ifdef USE_SSSE3 + if (ctx->use_ssse3 && length >= CHACHA20_BLOCK_SIZE * 4) + { + size_t nblocks = length / CHACHA20_BLOCK_SIZE; + nblocks -= nblocks % 4; + nburn = _gcry_chacha20_amd64_ssse3_blocks4(ctx->input, outbuf, inbuf, + nblocks); + burn = nburn > burn ? nburn : burn; + length -= nblocks * CHACHA20_BLOCK_SIZE; + outbuf += nblocks * CHACHA20_BLOCK_SIZE; + inbuf += nblocks * CHACHA20_BLOCK_SIZE; + } +#endif + +#ifdef USE_ARMV7_NEON + if (ctx->use_neon && length >= CHACHA20_BLOCK_SIZE * 4) + { + size_t nblocks = length / CHACHA20_BLOCK_SIZE; + nblocks -= nblocks % 4; + nburn = _gcry_chacha20_armv7_neon_blocks4(ctx->input, outbuf, inbuf, + nblocks); + burn = nburn > burn ? nburn : burn; + length -= nblocks * CHACHA20_BLOCK_SIZE; + outbuf += nblocks * CHACHA20_BLOCK_SIZE; + inbuf += nblocks * CHACHA20_BLOCK_SIZE; + } +#endif + if (length >= CHACHA20_BLOCK_SIZE) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; - size_t bytes = nblocks * CHACHA20_BLOCK_SIZE; - burn = ctx->blocks(ctx->input, inbuf, outbuf, bytes); - length -= bytes; - outbuf += bytes; - inbuf += bytes; + nburn = chacha20_blocks(ctx->input, outbuf, inbuf, nblocks); + burn = nburn > burn ? nburn : burn; + length -= nblocks * CHACHA20_BLOCK_SIZE; + outbuf += nblocks * CHACHA20_BLOCK_SIZE; + inbuf += nblocks * CHACHA20_BLOCK_SIZE; } if (length > 0) { - nburn = chacha20_core (ctx->pad, ctx); + nburn = chacha20_blocks(ctx->input, ctx->pad, zero_pad, 1); burn = nburn > burn ? nburn : burn; buf_xor (outbuf, inbuf, ctx->pad, length); ctx->unused = CHACHA20_BLOCK_SIZE - length; } _gcry_burn_stack (burn); } -static void -chacha20_encrypt_stream (void *context, byte * outbuf, const byte * inbuf, - size_t length) -{ - CHACHA20_context_t *ctx = (CHACHA20_context_t *) context; - - if (length) - chacha20_do_encrypt_stream (ctx, outbuf, inbuf, length); -} - - static const char * selftest (void) { byte ctxbuf[sizeof(CHACHA20_context_t) + 15]; CHACHA20_context_t *ctx; byte scratch[127 + 1]; byte buf[512 + 64 + 4]; int i; /* From draft-strombergson-chacha-test-vectors */ static byte key_1[] = { 0xc4, 0x6e, 0xc1, 0xb1, 0x8c, 0xe8, 0xa8, 0x78, 0x72, 0x5a, 0x37, 0xe7, 0x80, 0xdf, 0xb7, 0x35, 0x1f, 0x68, 0xed, 0x2e, 0x19, 0x4c, 0x79, 0xfb, 0xc6, 0xae, 0xbe, 0xe1, 0xa6, 0x67, 0x97, 0x5d }; static const byte nonce_1[] = { 0x1a, 0xda, 0x31, 0xd5, 0xcf, 0x68, 0x82, 0x21 }; static const byte plaintext_1[127] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, }; static const byte ciphertext_1[127] = { 0xf6, 0x3a, 0x89, 0xb7, 0x5c, 0x22, 0x71, 0xf9, 0x36, 0x88, 0x16, 0x54, 0x2b, 0xa5, 0x2f, 0x06, 0xed, 0x49, 0x24, 0x17, 0x92, 0x30, 0x2b, 0x00, 0xb5, 0xe8, 0xf8, 0x0a, 0xe9, 0xa4, 0x73, 0xaf, 0xc2, 
0x5b, 0x21, 0x8f, 0x51, 0x9a, 0xf0, 0xfd, 0xd4, 0x06, 0x36, 0x2e, 0x8d, 0x69, 0xde, 0x7f, 0x54, 0xc6, 0x04, 0xa6, 0xe0, 0x0f, 0x35, 0x3f, 0x11, 0x0f, 0x77, 0x1b, 0xdc, 0xa8, 0xab, 0x92, 0xe5, 0xfb, 0xc3, 0x4e, 0x60, 0xa1, 0xd9, 0xa9, 0xdb, 0x17, 0x34, 0x5b, 0x0a, 0x40, 0x27, 0x36, 0x85, 0x3b, 0xf9, 0x10, 0xb0, 0x60, 0xbd, 0xf1, 0xf8, 0x97, 0xb6, 0x29, 0x0f, 0x01, 0xd1, 0x38, 0xae, 0x2c, 0x4c, 0x90, 0x22, 0x5b, 0xa9, 0xea, 0x14, 0xd5, 0x18, 0xf5, 0x59, 0x29, 0xde, 0xa0, 0x98, 0xca, 0x7a, 0x6c, 0xcf, 0xe6, 0x12, 0x27, 0x05, 0x3c, 0x84, 0xe4, 0x9a, 0x4a, 0x33 }; /* 16-byte alignment required for amd64 implementation. */ ctx = (CHACHA20_context_t *)((uintptr_t)(ctxbuf + 15) & ~(uintptr_t)15); chacha20_setkey (ctx, key_1, sizeof key_1); chacha20_setiv (ctx, nonce_1, sizeof nonce_1); scratch[sizeof (scratch) - 1] = 0; chacha20_encrypt_stream (ctx, scratch, plaintext_1, sizeof plaintext_1); if (memcmp (scratch, ciphertext_1, sizeof ciphertext_1)) return "ChaCha20 encryption test 1 failed."; if (scratch[sizeof (scratch) - 1]) return "ChaCha20 wrote too much."; chacha20_setkey (ctx, key_1, sizeof (key_1)); chacha20_setiv (ctx, nonce_1, sizeof nonce_1); chacha20_encrypt_stream (ctx, scratch, scratch, sizeof plaintext_1); if (memcmp (scratch, plaintext_1, sizeof plaintext_1)) return "ChaCha20 decryption test 1 failed."; for (i = 0; i < sizeof buf; i++) buf[i] = i; chacha20_setkey (ctx, key_1, sizeof key_1); chacha20_setiv (ctx, nonce_1, sizeof nonce_1); /*encrypt */ chacha20_encrypt_stream (ctx, buf, buf, sizeof buf); /*decrypt */ chacha20_setkey (ctx, key_1, sizeof key_1); chacha20_setiv (ctx, nonce_1, sizeof nonce_1); chacha20_encrypt_stream (ctx, buf, buf, 1); chacha20_encrypt_stream (ctx, buf + 1, buf + 1, (sizeof buf) - 1 - 1); chacha20_encrypt_stream (ctx, buf + (sizeof buf) - 1, buf + (sizeof buf) - 1, 1); for (i = 0; i < sizeof buf; i++) if (buf[i] != (byte) i) return "ChaCha20 encryption test 2 failed."; chacha20_setkey (ctx, key_1, sizeof key_1); chacha20_setiv (ctx, nonce_1, sizeof nonce_1); /* encrypt */ for (i = 0; i < sizeof buf; i++) chacha20_encrypt_stream (ctx, &buf[i], &buf[i], 1); /* decrypt */ chacha20_setkey (ctx, key_1, sizeof key_1); chacha20_setiv (ctx, nonce_1, sizeof nonce_1); chacha20_encrypt_stream (ctx, buf, buf, sizeof buf); for (i = 0; i < sizeof buf; i++) if (buf[i] != (byte) i) return "ChaCha20 encryption test 3 failed."; return NULL; } gcry_cipher_spec_t _gcry_cipher_spec_chacha20 = { GCRY_CIPHER_CHACHA20, {0, 0}, /* flags */ "CHACHA20", /* name */ NULL, /* aliases */ NULL, /* oids */ 1, /* blocksize in bytes. */ CHACHA20_MAX_KEY_SIZE * 8, /* standard key length in bits. */ sizeof (CHACHA20_context_t), chacha20_setkey, NULL, NULL, chacha20_encrypt_stream, chacha20_encrypt_stream, NULL, NULL, chacha20_setiv }; diff --git a/configure.ac b/configure.ac index c4b59f4d..a5aba144 100644 --- a/configure.ac +++ b/configure.ac @@ -1,2652 +1,2651 @@ # Configure.ac script for Libgcrypt # Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2006, # 2007, 2008, 2009, 2011 Free Software Foundation, Inc. # Copyright (C) 2012-2017 g10 Code GmbH # # This file is part of Libgcrypt. # # Libgcrypt is free software; you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public License as # published by the Free Software Foundation; either version 2.1 of # the License, or (at your option) any later version. 
# # Libgcrypt is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public # License along with this program; if not, see . # (Process this file with autoconf to produce a configure script.) AC_REVISION($Revision$) AC_PREREQ(2.60) min_automake_version="1.14" # To build a release you need to create a tag with the version number # (git tag -s libgcrypt-n.m.k) and run "./autogen.sh --force". Please # bump the version number immediately after the release and do another # commit and push so that the git magic is able to work. See below # for the LT versions. m4_define(mym4_version_major, [1]) m4_define(mym4_version_minor, [9]) m4_define(mym4_version_micro, [0]) # Below is m4 magic to extract and compute the revision number, the # decimalized short revision number, a beta version string, and a flag # indicating a development version (mym4_isgit). Note that the m4 # processing is done by autoconf and not during the configure run. m4_define(mym4_version, [mym4_version_major.mym4_version_minor.mym4_version_micro]) m4_define([mym4_revision], m4_esyscmd([git rev-parse --short HEAD | tr -d '\n\r'])) m4_define([mym4_revision_dec], m4_esyscmd_s([echo $((0x$(echo ]mym4_revision[|head -c 4)))])) m4_define([mym4_betastring], m4_esyscmd_s([git describe --match 'libgcrypt-[0-9].*[0-9]' --long|\ awk -F- '$3!=0{print"-beta"$3}'])) m4_define([mym4_isgit],m4_if(mym4_betastring,[],[no],[yes])) m4_define([mym4_full_version],[mym4_version[]mym4_betastring]) AC_INIT([libgcrypt],[mym4_full_version],[http://bugs.gnupg.org]) # LT Version numbers, remember to change them just *before* a release. # (Interfaces removed: CURRENT++, AGE=0, REVISION=0) # (Interfaces added: CURRENT++, AGE++, REVISION=0) # (No interfaces changed: REVISION++) LIBGCRYPT_LT_CURRENT=23 LIBGCRYPT_LT_AGE=3 LIBGCRYPT_LT_REVISION=0 # If the API is changed in an incompatible way: increment the next counter. # # 1.6: ABI and API change but the change is to most users irrelevant # and thus the API version number has not been incremented. LIBGCRYPT_CONFIG_API_VERSION=1 # If you change the required gpg-error version, please remove # unnecessary error code defines in src/gcrypt-int.h. NEED_GPG_ERROR_VERSION=1.25 PACKAGE=$PACKAGE_NAME VERSION=$PACKAGE_VERSION AC_CONFIG_AUX_DIR([build-aux]) AC_CONFIG_SRCDIR([src/libgcrypt.vers]) AM_INIT_AUTOMAKE([serial-tests dist-bzip2]) AC_CONFIG_HEADER(config.h) AC_CONFIG_MACRO_DIR([m4]) AC_CONFIG_LIBOBJ_DIR([compat]) AC_CANONICAL_HOST AM_MAINTAINER_MODE AM_SILENT_RULES AC_ARG_VAR(SYSROOT,[locate config scripts also below that directory]) AH_TOP([ #ifndef _GCRYPT_CONFIG_H_INCLUDED #define _GCRYPT_CONFIG_H_INCLUDED /* Enable gpg-error's strerror macro for W32CE. */ #define GPG_ERR_ENABLE_ERRNO_MACROS 1 ]) AH_BOTTOM([ #define _GCRYPT_IN_LIBGCRYPT 1 /* If the configure check for endianness has been disabled, get it from OS macros. This is intended for making fat binary builds on OS X. */ #ifdef DISABLED_ENDIAN_CHECK # if defined(__BIG_ENDIAN__) # define WORDS_BIGENDIAN 1 # elif defined(__LITTLE_ENDIAN__) # undef WORDS_BIGENDIAN # else # error "No endianness found" # endif #endif /*DISABLED_ENDIAN_CHECK*/ /* We basically use the original Camellia source. Make sure the symbols properly prefixed. 
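As an aside on the VERSION_NUMBER substitution above: the m4_esyscmd printf packs major/minor/micro into one hex constant, which is plain shift-and-or arithmetic. A throwaway C check (values taken from the mym4_version_* defines above):

#include <stdio.h>

int main (void)
{
  unsigned int major = 1, minor = 9, micro = 0;   /* mym4_version_* */
  unsigned int version_number = (major << 16) | (minor << 8) | micro;

  printf ("0x%06x\n", version_number);   /* 0x010900 for 1.9.0 */
  return 0;
}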
*/ #define CAMELLIA_EXT_SYM_PREFIX _gcry_ #endif /*_GCRYPT_CONFIG_H_INCLUDED*/ ]) AH_VERBATIM([_REENTRANT], [/* To allow the use of Libgcrypt in multithreaded programs we have to use special features from the library. */ #ifndef _REENTRANT # define _REENTRANT 1 #endif ]) AC_SUBST(LIBGCRYPT_LT_CURRENT) AC_SUBST(LIBGCRYPT_LT_AGE) AC_SUBST(LIBGCRYPT_LT_REVISION) AC_SUBST(PACKAGE) AC_SUBST(VERSION) AC_DEFINE_UNQUOTED(PACKAGE, "$PACKAGE", [Name of this package]) AC_DEFINE_UNQUOTED(VERSION, "$VERSION", [Version of this package]) VERSION_NUMBER=m4_esyscmd(printf "0x%02x%02x%02x" mym4_version_major \ mym4_version_minor mym4_version_micro) AC_SUBST(VERSION_NUMBER) ###################### ## Basic checks. ### (we need some results later on (e.g. $GCC) ###################### AC_PROG_MAKE_SET missing_dir=`cd $ac_aux_dir && pwd` AM_MISSING_PROG(ACLOCAL, aclocal, $missing_dir) AM_MISSING_PROG(AUTOCONF, autoconf, $missing_dir) AM_MISSING_PROG(AUTOMAKE, automake, $missing_dir) AM_MISSING_PROG(AUTOHEADER, autoheader, $missing_dir) # AM_MISSING_PROG(MAKEINFO, makeinfo, $missing_dir) AC_PROG_CC AC_PROG_CPP AM_PROG_CC_C_O AM_PROG_AS AC_ISC_POSIX AC_PROG_INSTALL AC_PROG_AWK AC_GNU_SOURCE # We need to compile and run a program on the build machine. A # comment in libgpg-error says that the AC_PROG_CC_FOR_BUILD macro in # the AC archive is broken for autoconf 2.57. Given that there is no # newer version of that macro, we assume that it is also broken for # autoconf 2.61 and thus we use a simple but usually sufficient # approach. AC_MSG_CHECKING(for cc for build) if test "$cross_compiling" = "yes"; then CC_FOR_BUILD="${CC_FOR_BUILD-cc}" else CC_FOR_BUILD="${CC_FOR_BUILD-$CC}" fi AC_MSG_RESULT($CC_FOR_BUILD) AC_ARG_VAR(CC_FOR_BUILD,[build system C compiler]) LT_PREREQ([2.2.6]) LT_INIT([win32-dll disable-static]) LT_LANG([Windows Resource]) ########################## ## General definitions. ## ########################## # Used by libgcrypt-config LIBGCRYPT_CONFIG_LIBS="-lgcrypt" LIBGCRYPT_CONFIG_CFLAGS="" LIBGCRYPT_CONFIG_HOST="$host" # Definitions for symmetric ciphers. available_ciphers="arcfour blowfish cast5 des aes twofish serpent rfc2268 seed" available_ciphers="$available_ciphers camellia idea salsa20 gost28147 chacha20" enabled_ciphers="" # Definitions for public-key ciphers. available_pubkey_ciphers="dsa elgamal rsa ecc" enabled_pubkey_ciphers="" # Definitions for message digests. available_digests="crc gostr3411-94 md2 md4 md5 rmd160 sha1 sha256 sha512" available_digests="$available_digests sha3 tiger whirlpool stribog blake2" available_digests="$available_digests sm3" enabled_digests="" # Definitions for kdfs (optional ones) available_kdfs="s2k pkdf2 scrypt" enabled_kdfs="" # Definitions for random modules. available_random_modules="linux egd unix" auto_random_modules="$available_random_modules" # Supported thread backends. LIBGCRYPT_THREAD_MODULES="" # Other definitions. have_w32_system=no have_w32ce_system=no have_pthread=no # Setup some stuff depending on host. case "${host}" in *-*-mingw32*) ac_cv_have_dev_random=no have_w32_system=yes case "${host}" in *-mingw32ce*) have_w32ce_system=yes available_random_modules="w32ce" ;; *) available_random_modules="w32" ;; esac AC_DEFINE(USE_ONLY_8DOT3,1, [set this to limit filenames to the 8.3 format]) AC_DEFINE(HAVE_DRIVE_LETTERS,1, [defined if we must run on a stupid file system]) AC_DEFINE(HAVE_DOSISH_SYSTEM,1, [defined if we run on some of the PCDOS like systems (DOS, Windoze. 
OS/2) with special properties like no file modes]) ;; i?86-emx-os2 | i?86-*-os2*emx) # OS/2 with the EMX environment ac_cv_have_dev_random=no AC_DEFINE(HAVE_DRIVE_LETTERS) AC_DEFINE(HAVE_DOSISH_SYSTEM) ;; i?86-*-msdosdjgpp*) # DOS with the DJGPP environment ac_cv_have_dev_random=no AC_DEFINE(HAVE_DRIVE_LETTERS) AC_DEFINE(HAVE_DOSISH_SYSTEM) ;; *-*-hpux*) if test -z "$GCC" ; then CFLAGS="$CFLAGS -Ae -D_HPUX_SOURCE" fi ;; *-dec-osf4*) if test -z "$GCC" ; then # Suppress all warnings # to get rid of the unsigned/signed char mismatch warnings. CFLAGS="$CFLAGS -w" fi ;; m68k-atari-mint) ;; *-apple-darwin*) AC_DEFINE(_DARWIN_C_SOURCE, 900000L, Expose all libc features (__DARWIN_C_FULL).) ;; *) ;; esac if test "$have_w32_system" = yes; then AC_DEFINE(HAVE_W32_SYSTEM,1, [Defined if we run on a W32 API based system]) if test "$have_w32ce_system" = yes; then AC_DEFINE(HAVE_W32CE_SYSTEM,1,[Defined if we run on WindowsCE]) fi fi AM_CONDITIONAL(HAVE_W32_SYSTEM, test "$have_w32_system" = yes) AM_CONDITIONAL(HAVE_W32CE_SYSTEM, test "$have_w32ce_system" = yes) # A printable OS Name is sometimes useful. case "${host}" in *-*-mingw32ce*) PRINTABLE_OS_NAME="W32CE" ;; *-*-mingw32*) PRINTABLE_OS_NAME="W32" ;; i?86-emx-os2 | i?86-*-os2*emx ) PRINTABLE_OS_NAME="OS/2" ;; i?86-*-msdosdjgpp*) PRINTABLE_OS_NAME="MSDOS/DJGPP" ;; *-linux*) PRINTABLE_OS_NAME="GNU/Linux" ;; *) PRINTABLE_OS_NAME=`uname -s || echo "Unknown"` ;; esac NAME_OF_DEV_RANDOM="/dev/random" NAME_OF_DEV_URANDOM="/dev/urandom" AC_ARG_ENABLE(endian-check, AC_HELP_STRING([--disable-endian-check], [disable the endian check and trust the OS provided macros]), endiancheck=$enableval,endiancheck=yes) if test x"$endiancheck" = xyes ; then AC_C_BIGENDIAN else AC_DEFINE(DISABLED_ENDIAN_CHECK,1,[configure did not test for endianness]) fi AC_CHECK_SIZEOF(unsigned short, 2) AC_CHECK_SIZEOF(unsigned int, 4) AC_CHECK_SIZEOF(unsigned long, 4) AC_CHECK_SIZEOF(unsigned long long, 0) AC_CHECK_SIZEOF(void *, 0) AC_TYPE_UINTPTR_T if test "$ac_cv_sizeof_unsigned_short" = "0" \ || test "$ac_cv_sizeof_unsigned_int" = "0" \ || test "$ac_cv_sizeof_unsigned_long" = "0"; then AC_MSG_WARN([Hmmm, something is wrong with the sizes - using defaults]); fi # Ensure that we have UINT64_C before we bother to check for uint64_t AC_CACHE_CHECK([for UINT64_C],[gnupg_cv_uint64_c_works], AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[#include ]], [[uint64_t foo=UINT64_C(42);]])], gnupg_cv_uint64_c_works=yes,gnupg_cv_uint64_c_works=no)) if test "$gnupg_cv_uint64_c_works" = "yes" ; then AC_CHECK_SIZEOF(uint64_t) fi # Do we have any 64-bit data types? if test "$ac_cv_sizeof_unsigned_int" != "8" \ && test "$ac_cv_sizeof_unsigned_long" != "8" \ && test "$ac_cv_sizeof_unsigned_long_long" != "8" \ && test "$ac_cv_sizeof_uint64_t" != "8"; then AC_MSG_ERROR([[ *** *** No 64-bit integer type available. *** It is not possible to build Libgcrypt on this platform. ***]]) fi # If not specified otherwise, all available algorithms will be # included. default_ciphers="$available_ciphers" default_pubkey_ciphers="$available_pubkey_ciphers" default_digests="$available_digests" default_kdfs="$available_kdfs" # Blacklist MD2 by default default_digests=`echo $default_digests | sed -e 's/md2//g'` # Substitutions to set generated files in a Emacs buffer to read-only. AC_SUBST(emacs_local_vars_begin, ['Local Variables:']) AC_SUBST(emacs_local_vars_read_only, ['buffer-read-only: t']) AC_SUBST(emacs_local_vars_end, ['End:']) ############################ ## Command line switches. 
## ############################
# Implementation of the --enable-ciphers switch.
AC_ARG_ENABLE(ciphers,
   AC_HELP_STRING([--enable-ciphers=ciphers],
                  [select the symmetric ciphers to include]),
   [enabled_ciphers=`echo $enableval | tr ',:' ' ' | tr '[A-Z]' '[a-z]'`],
   [enabled_ciphers=""])
if test "x$enabled_ciphers" = "x" \
   -o "$enabled_ciphers" = "yes" \
   -o "$enabled_ciphers" = "no"; then
   enabled_ciphers=$default_ciphers
fi
AC_MSG_CHECKING([which symmetric ciphers to include])
for cipher in $enabled_ciphers; do
    LIST_MEMBER($cipher, $available_ciphers)
    if test "$found" = "0"; then
       AC_MSG_ERROR([unsupported cipher "$cipher" specified])
    fi
done
AC_MSG_RESULT([$enabled_ciphers])

# Implementation of the --enable-pubkey-ciphers switch.
AC_ARG_ENABLE(pubkey-ciphers,
   AC_HELP_STRING([--enable-pubkey-ciphers=ciphers],
                  [select the public-key ciphers to include]),
   [enabled_pubkey_ciphers=`echo $enableval | tr ',:' ' ' | tr '[A-Z]' '[a-z]'`],
   [enabled_pubkey_ciphers=""])
if test "x$enabled_pubkey_ciphers" = "x" \
   -o "$enabled_pubkey_ciphers" = "yes" \
   -o "$enabled_pubkey_ciphers" = "no"; then
   enabled_pubkey_ciphers=$default_pubkey_ciphers
fi
AC_MSG_CHECKING([which public-key ciphers to include])
for cipher in $enabled_pubkey_ciphers; do
    LIST_MEMBER($cipher, $available_pubkey_ciphers)
    if test "$found" = "0"; then
       AC_MSG_ERROR([unsupported public-key cipher specified])
    fi
done
AC_MSG_RESULT([$enabled_pubkey_ciphers])

# Implementation of the --enable-digests switch.
AC_ARG_ENABLE(digests,
   AC_HELP_STRING([--enable-digests=digests],
                  [select the message digests to include]),
   [enabled_digests=`echo $enableval | tr ',:' ' ' | tr '[A-Z]' '[a-z]'`],
   [enabled_digests=""])
if test "x$enabled_digests" = "x" \
   -o "$enabled_digests" = "yes" \
   -o "$enabled_digests" = "no"; then
   enabled_digests=$default_digests
fi
AC_MSG_CHECKING([which message digests to include])
for digest in $enabled_digests; do
    LIST_MEMBER($digest, $available_digests)
    if test "$found" = "0"; then
       AC_MSG_ERROR([unsupported message digest specified])
    fi
done
AC_MSG_RESULT([$enabled_digests])

# Implementation of the --enable-kdfs switch.
AC_ARG_ENABLE(kdfs,
   AC_HELP_STRING([--enable-kdfs=kdfs],
                  [select the KDFs to include]),
   [enabled_kdfs=`echo $enableval | tr ',:' ' ' | tr '[A-Z]' '[a-z]'`],
   [enabled_kdfs=""])
if test "x$enabled_kdfs" = "x" \
   -o "$enabled_kdfs" = "yes" \
   -o "$enabled_kdfs" = "no"; then
   enabled_kdfs=$default_kdfs
fi
AC_MSG_CHECKING([which key derivation functions to include])
for kdf in $enabled_kdfs; do
    LIST_MEMBER($kdf, $available_kdfs)
    if test "$found" = "0"; then
       AC_MSG_ERROR([unsupported key derivation function specified])
    fi
done
AC_MSG_RESULT([$enabled_kdfs])

# Implementation of the --enable-random switch.
AC_ARG_ENABLE(random,
   AC_HELP_STRING([--enable-random=name],
                  [select which random number generator to use]),
   [random=`echo $enableval | tr '[A-Z]' '[a-z]'`],
   [])
if test "x$random" = "x" -o "$random" = "yes" -o "$random" = "no"; then
    random=default
fi
AC_MSG_CHECKING([which random module to use])
if test "$random" != "default" -a "$random" != "auto"; then
    LIST_MEMBER($random, $available_random_modules)
    if test "$found" = "0"; then
       AC_MSG_ERROR([unsupported random module specified])
    fi
fi
AC_MSG_RESULT($random)

# Implementation of the --disable-dev-random switch.
AC_MSG_CHECKING([whether use of /dev/random is requested])
AC_ARG_ENABLE(dev-random,
   [  --disable-dev-random    disable the use of dev random],
   try_dev_random=$enableval, try_dev_random=yes)
AC_MSG_RESULT($try_dev_random)
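Every list-valued switch above is validated the same way: the value is split on ',' or ':', lowercased, and each token must be a LIST_MEMBER of the matching available_* list or configure aborts. A rough C rendering of that loop, for readers who do not speak m4 (hypothetical and abbreviated; the available set here is a truncated stand-in for $available_ciphers):

#include <ctype.h>
#include <stdio.h>
#include <string.h>

static const char *available[] = { "arcfour", "blowfish", "chacha20", NULL };

static int list_member (const char *name)
{
  int i;
  for (i = 0; available[i]; i++)
    if (!strcmp (name, available[i]))
      return 1;
  return 0;
}

int main (void)
{
  char buf[] = "CHACHA20,arcfour";  /* e.g. --enable-ciphers=CHACHA20,arcfour */
  char *tok, *p;

  for (tok = strtok (buf, ",: "); tok; tok = strtok (NULL, ",: "))
    {
      /* Lowercase the token, as the tr '[A-Z]' '[a-z]' step does. */
      for (p = tok; *p; p++)
        *p = tolower ((unsigned char)*p);
      if (!list_member (tok))
        {
          fprintf (stderr, "unsupported cipher \"%s\" specified\n", tok);
          return 1;
        }
      printf ("enabled: %s\n", tok);
    }
  return 0;
}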
# Implementation of the --with-egd-socket switch.
AC_ARG_WITH(egd-socket,
   [  --with-egd-socket=NAME  Use NAME for the EGD socket],
   egd_socket_name="$withval", egd_socket_name="")
AC_DEFINE_UNQUOTED(EGD_SOCKET_NAME, "$egd_socket_name",
   [Define if you don't want the default EGD socket name.
    For details see cipher/rndegd.c])

# Implementation of the --enable-random-daemon
AC_MSG_CHECKING([whether the experimental random daemon is requested])
AC_ARG_ENABLE([random-daemon],
   AC_HELP_STRING([--enable-random-daemon],
                  [Build and support the experimental gcryptrnd]),
   [use_random_daemon=$enableval],
   [use_random_daemon=no])
AC_MSG_RESULT($use_random_daemon)
if test x$use_random_daemon = xyes ; then
    AC_DEFINE(USE_RANDOM_DAEMON,1,
              [Define to support the experimental random daemon])
fi
AM_CONDITIONAL(USE_RANDOM_DAEMON, test x$use_random_daemon = xyes)

# Implementation of --disable-asm.
AC_MSG_CHECKING([whether MPI assembler modules are requested])
AC_ARG_ENABLE([asm],
   AC_HELP_STRING([--disable-asm],
                  [Disable MPI assembler modules]),
   [try_asm_modules=$enableval],
   [try_asm_modules=yes])
AC_MSG_RESULT($try_asm_modules)

# Implementation of the --enable-m-guard switch.
AC_MSG_CHECKING([whether memory guard is requested])
AC_ARG_ENABLE(m-guard,
   AC_HELP_STRING([--enable-m-guard],
                  [Enable memory guard facility]),
   [use_m_guard=$enableval], [use_m_guard=no])
AC_MSG_RESULT($use_m_guard)
if test "$use_m_guard" = yes ; then
    AC_DEFINE(M_GUARD,1,[Define to use the (obsolete) malloc guarding feature])
fi

# Implementation of the --enable-large-data-tests switch.
AC_MSG_CHECKING([whether to run large data tests])
AC_ARG_ENABLE(large-data-tests,
   AC_HELP_STRING([--enable-large-data-tests],
                  [Enable the real long running large data tests]),
   large_data_tests=$enableval, large_data_tests=no)
AC_MSG_RESULT($large_data_tests)
AC_SUBST(RUN_LARGE_DATA_TESTS, $large_data_tests)

# Implementation of the --with-capabilities switch.
# Check whether we want to use Linux capabilities
AC_MSG_CHECKING([whether use of capabilities is requested])
AC_ARG_WITH(capabilities,
   AC_HELP_STRING([--with-capabilities],
                  [Use linux capabilities [default=no]]),
   [use_capabilities="$withval"], [use_capabilities=no])
AC_MSG_RESULT($use_capabilities)

# Implementation of the --enable-hmac-binary-check.
AC_MSG_CHECKING([whether an HMAC binary check is requested])
AC_ARG_ENABLE(hmac-binary-check,
   AC_HELP_STRING([--enable-hmac-binary-check],
                  [Enable library integrity check]),
   [use_hmac_binary_check=$enableval],
   [use_hmac_binary_check=no])
AC_MSG_RESULT($use_hmac_binary_check)
if test "$use_hmac_binary_check" = yes ; then
    AC_DEFINE(ENABLE_HMAC_BINARY_CHECK,1,
              [Define to support an HMAC based integrity check])
fi

# Implementation of the --disable-jent-support switch.
AC_MSG_CHECKING([whether jitter entropy support is requested])
AC_ARG_ENABLE(jent-support,
   AC_HELP_STRING([--disable-jent-support],
                  [Disable support for the Jitter entropy collector]),
   jentsupport=$enableval, jentsupport=yes)
AC_MSG_RESULT($jentsupport)

# Implementation of the --disable-padlock-support switch.
AC_MSG_CHECKING([whether padlock support is requested])
AC_ARG_ENABLE(padlock-support,
   AC_HELP_STRING([--disable-padlock-support],
                  [Disable support for the PadLock Engine of VIA processors]),
   padlocksupport=$enableval, padlocksupport=yes)
AC_MSG_RESULT($padlocksupport)

# Implementation of the --disable-aesni-support switch.
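This switch and the PCLMUL/SSE4.1/DRNG/AVX/AVX2/NEON ones that follow only decide whether support is compiled in; the cipher code still checks the CPU at run time, exactly as the chacha20_do_setkey hunk above does with HWF_INTEL_AVX2. A self-contained sketch of that two-level gate (the macro value and the get_hw_features stand-in are made up for illustration; the real names come from config.h and _gcry_get_hw_features):

#include <stdio.h>

#define USE_AVX2 1                    /* configure-generated, illustrative */
#define HWF_INTEL_AVX2 (1u << 3)      /* flag value made up for the sketch */

static unsigned int get_hw_features (void)
{
  return HWF_INTEL_AVX2;   /* pretend the CPU reports AVX2 */
}

int main (void)
{
  unsigned int features = get_hw_features ();
  int use_avx2 = 0;

#ifdef USE_AVX2
  /* Run-time gate layered on the compile-time one. */
  use_avx2 = (features & HWF_INTEL_AVX2) != 0;
#endif

  printf ("AVX2 path enabled: %d\n", use_avx2);
  return 0;
}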
AC_MSG_CHECKING([whether AESNI support is requested]) AC_ARG_ENABLE(aesni-support, AC_HELP_STRING([--disable-aesni-support], [Disable support for the Intel AES-NI instructions]), aesnisupport=$enableval,aesnisupport=yes) AC_MSG_RESULT($aesnisupport) # Implementation of the --disable-pclmul-support switch. AC_MSG_CHECKING([whether PCLMUL support is requested]) AC_ARG_ENABLE(pclmul-support, AC_HELP_STRING([--disable-pclmul-support], [Disable support for the Intel PCLMUL instructions]), pclmulsupport=$enableval,pclmulsupport=yes) AC_MSG_RESULT($pclmulsupport) # Implementation of the --disable-sse41-support switch. AC_MSG_CHECKING([whether SSE4.1 support is requested]) AC_ARG_ENABLE(sse41-support, AC_HELP_STRING([--disable-sse41-support], [Disable support for the Intel SSE4.1 instructions]), sse41support=$enableval,sse41support=yes) AC_MSG_RESULT($sse41support) # Implementation of the --disable-drng-support switch. AC_MSG_CHECKING([whether DRNG support is requested]) AC_ARG_ENABLE(drng-support, AC_HELP_STRING([--disable-drng-support], [Disable support for the Intel DRNG (RDRAND instruction)]), drngsupport=$enableval,drngsupport=yes) AC_MSG_RESULT($drngsupport) # Implementation of the --disable-avx-support switch. AC_MSG_CHECKING([whether AVX support is requested]) AC_ARG_ENABLE(avx-support, AC_HELP_STRING([--disable-avx-support], [Disable support for the Intel AVX instructions]), avxsupport=$enableval,avxsupport=yes) AC_MSG_RESULT($avxsupport) # Implementation of the --disable-avx2-support switch. AC_MSG_CHECKING([whether AVX2 support is requested]) AC_ARG_ENABLE(avx2-support, AC_HELP_STRING([--disable-avx2-support], [Disable support for the Intel AVX2 instructions]), avx2support=$enableval,avx2support=yes) AC_MSG_RESULT($avx2support) # Implementation of the --disable-neon-support switch. AC_MSG_CHECKING([whether NEON support is requested]) AC_ARG_ENABLE(neon-support, AC_HELP_STRING([--disable-neon-support], [Disable support for the ARM NEON instructions]), neonsupport=$enableval,neonsupport=yes) AC_MSG_RESULT($neonsupport) # Implementation of the --disable-arm-crypto-support switch. AC_MSG_CHECKING([whether ARMv8 Crypto Extension support is requested]) AC_ARG_ENABLE(arm-crypto-support, AC_HELP_STRING([--disable-arm-crypto-support], [Disable support for the ARMv8 Crypto Extension instructions]), armcryptosupport=$enableval,armcryptosupport=yes) AC_MSG_RESULT($armcryptosupport) # Implementation of the --disable-O-flag-munging switch. AC_MSG_CHECKING([whether a -O flag munging is requested]) AC_ARG_ENABLE([O-flag-munging], AC_HELP_STRING([--disable-O-flag-munging], [Disable modification of the cc -O flag]), [enable_o_flag_munging=$enableval], [enable_o_flag_munging=yes]) AC_MSG_RESULT($enable_o_flag_munging) AM_CONDITIONAL(ENABLE_O_FLAG_MUNGING, test "$enable_o_flag_munging" = "yes") # Implementation of the --disable-amd64-as-feature-detection switch. AC_MSG_CHECKING([whether to enable AMD64 as(1) feature detection]) AC_ARG_ENABLE(amd64-as-feature-detection, AC_HELP_STRING([--disable-amd64-as-feature-detection], [Disable the auto-detection of AMD64 as(1) features]), amd64_as_feature_detection=$enableval, amd64_as_feature_detection=yes) AC_MSG_RESULT($amd64_as_feature_detection) AC_DEFINE_UNQUOTED(PRINTABLE_OS_NAME, "$PRINTABLE_OS_NAME", [A human readable text with the name of the OS]) # For some systems we know that we have ld_version scripts. # Use it then as default. 
have_ld_version_script=no case "${host}" in *-*-linux*) have_ld_version_script=yes ;; *-*-gnu*) have_ld_version_script=yes ;; esac AC_ARG_ENABLE([ld-version-script], AC_HELP_STRING([--enable-ld-version-script], [enable/disable use of linker version script. (default is system dependent)]), [have_ld_version_script=$enableval], [ : ] ) AM_CONDITIONAL(HAVE_LD_VERSION_SCRIPT, test "$have_ld_version_script" = "yes") AC_DEFINE_UNQUOTED(NAME_OF_DEV_RANDOM, "$NAME_OF_DEV_RANDOM", [defined to the name of the strong random device]) AC_DEFINE_UNQUOTED(NAME_OF_DEV_URANDOM, "$NAME_OF_DEV_URANDOM", [defined to the name of the weaker random device]) ############################### #### Checks for libraries. #### ############################### # # gpg-error is required. # AM_PATH_GPG_ERROR("$NEED_GPG_ERROR_VERSION") if test "x$GPG_ERROR_LIBS" = "x"; then AC_MSG_ERROR([libgpg-error is needed. See ftp://ftp.gnupg.org/gcrypt/libgpg-error/ .]) fi AC_DEFINE(GPG_ERR_SOURCE_DEFAULT, GPG_ERR_SOURCE_GCRYPT, [The default error source for libgcrypt.]) # # Check whether the GNU Pth library is available. We require this # to build the optional gcryptrnd program. # AC_ARG_WITH(pth-prefix, AC_HELP_STRING([--with-pth-prefix=PFX], [prefix where GNU Pth is installed (optional)]), pth_config_prefix="$withval", pth_config_prefix="") if test x$pth_config_prefix != x ; then PTH_CONFIG="$pth_config_prefix/bin/pth-config" fi if test "$use_random_daemon" = "yes"; then AC_PATH_PROG(PTH_CONFIG, pth-config, no) if test "$PTH_CONFIG" = "no"; then AC_MSG_WARN([[ *** *** To build the Libgcrypt's random number daemon *** we need the support of the GNU Portable Threads Library. *** Download it from ftp://ftp.gnu.org/gnu/pth/ *** On a Debian GNU/Linux system you might want to try *** apt-get install libpth-dev ***]]) else GNUPG_PTH_VERSION_CHECK([1.3.7]) if test $have_pth = yes; then PTH_CFLAGS=`$PTH_CONFIG --cflags` PTH_LIBS=`$PTH_CONFIG --ldflags` PTH_LIBS="$PTH_LIBS `$PTH_CONFIG --libs --all`" AC_DEFINE(USE_GNU_PTH, 1, [Defined if the GNU Portable Thread Library should be used]) AC_DEFINE(HAVE_PTH, 1, [Defined if the GNU Pth is available]) fi fi fi AC_SUBST(PTH_CFLAGS) AC_SUBST(PTH_LIBS) # # Check whether pthreads is available # if test "$have_w32_system" != yes; then AC_CHECK_LIB(pthread,pthread_create,have_pthread=yes) if test "$have_pthread" = yes; then AC_DEFINE(HAVE_PTHREAD, 1 ,[Define if we have pthread.]) fi fi # Solaris needs -lsocket and -lnsl. Unisys system includes # gethostbyname in libsocket but needs libnsl for socket. AC_SEARCH_LIBS(setsockopt, [socket], , [AC_SEARCH_LIBS(setsockopt, [socket], , , [-lnsl])]) AC_SEARCH_LIBS(setsockopt, [nsl]) ################################## #### Checks for header files. #### ################################## AC_HEADER_STDC AC_CHECK_HEADERS(unistd.h sys/select.h sys/msg.h) INSERT_SYS_SELECT_H= if test x"$ac_cv_header_sys_select_h" = xyes; then INSERT_SYS_SELECT_H=" include " fi AC_SUBST(INSERT_SYS_SELECT_H) ########################################## #### Checks for typedefs, structures, #### #### and compiler characteristics. 
#### ########################################## AC_C_CONST AC_C_INLINE AC_TYPE_SIZE_T AC_TYPE_SIGNAL AC_DECL_SYS_SIGLIST AC_TYPE_PID_T GNUPG_CHECK_TYPEDEF(byte, HAVE_BYTE_TYPEDEF) GNUPG_CHECK_TYPEDEF(ushort, HAVE_USHORT_TYPEDEF) GNUPG_CHECK_TYPEDEF(ulong, HAVE_ULONG_TYPEDEF) GNUPG_CHECK_TYPEDEF(u16, HAVE_U16_TYPEDEF) GNUPG_CHECK_TYPEDEF(u32, HAVE_U32_TYPEDEF) gl_TYPE_SOCKLEN_T case "${host}" in *-*-mingw32*) # socklen_t may or may not be defined depending on what headers # are included. To be safe we use int as this is the actual type. FALLBACK_SOCKLEN_T="typedef int gcry_socklen_t;" ;; *) if test ".$gl_cv_socklen_t_equiv" = "."; then FALLBACK_SOCKLEN_T="typedef socklen_t gcry_socklen_t;" else FALLBACK_SOCKLEN_T="typedef ${gl_cv_socklen_t_equiv} gcry_socklen_t;" fi esac AC_SUBST(FALLBACK_SOCKLEN_T) # # Check for __builtin_bswap32 intrinsic. # AC_CACHE_CHECK(for __builtin_bswap32, [gcry_cv_have_builtin_bswap32], [gcry_cv_have_builtin_bswap32=no AC_LINK_IFELSE([AC_LANG_PROGRAM([], [int x = 0; int y = __builtin_bswap32(x); return y;])], [gcry_cv_have_builtin_bswap32=yes])]) if test "$gcry_cv_have_builtin_bswap32" = "yes" ; then AC_DEFINE(HAVE_BUILTIN_BSWAP32,1, [Defined if compiler has '__builtin_bswap32' intrinsic]) fi # # Check for __builtin_bswap64 intrinsic. # AC_CACHE_CHECK(for __builtin_bswap64, [gcry_cv_have_builtin_bswap64], [gcry_cv_have_builtin_bswap64=no AC_LINK_IFELSE([AC_LANG_PROGRAM([], [long long x = 0; long long y = __builtin_bswap64(x); return y;])], [gcry_cv_have_builtin_bswap64=yes])]) if test "$gcry_cv_have_builtin_bswap64" = "yes" ; then AC_DEFINE(HAVE_BUILTIN_BSWAP64,1, [Defined if compiler has '__builtin_bswap64' intrinsic]) fi # # Check for __builtin_ctz intrinsic. # AC_CACHE_CHECK(for __builtin_ctz, [gcry_cv_have_builtin_ctz], [gcry_cv_have_builtin_ctz=no AC_LINK_IFELSE([AC_LANG_PROGRAM([], [unsigned int x = 0; int y = __builtin_ctz(x); return y;])], [gcry_cv_have_builtin_ctz=yes])]) if test "$gcry_cv_have_builtin_ctz" = "yes" ; then AC_DEFINE(HAVE_BUILTIN_CTZ, 1, [Defined if compiler has '__builtin_ctz' intrinsic]) fi # # Check for VLA support (variable length arrays). # AC_CACHE_CHECK(whether the variable length arrays are supported, [gcry_cv_have_vla], [gcry_cv_have_vla=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[void f1(char *, int); char foo(int i) { char b[(i < 0 ? 0 : i) + 1]; f1(b, sizeof b); return b[0];}]])], [gcry_cv_have_vla=yes])]) if test "$gcry_cv_have_vla" = "yes" ; then AC_DEFINE(HAVE_VLA,1, [Defined if variable length arrays are supported]) fi # # Check for ELF visibility support. 
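The __builtin_bswap32/64 probes above matter because the portable fallback is a shift-and-mask ladder; the resulting HAVE_BUILTIN_BSWAP32 define is typically consumed like this (helper name hypothetical, not a libgcrypt function):

#include <stdint.h>
#include <stdio.h>

static uint32_t my_bswap32 (uint32_t x)
{
#ifdef HAVE_BUILTIN_BSWAP32
  return __builtin_bswap32 (x);          /* single bswap instruction */
#else
  /* Portable fallback when the intrinsic is unavailable. */
  return (x << 24) | ((x << 8) & 0x00ff0000U)
         | ((x >> 8) & 0x0000ff00U) | (x >> 24);
#endif
}

int main (void)
{
  printf ("0x%08x\n", (unsigned) my_bswap32 (0x11223344));   /* 0x44332211 */
  return 0;
}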
# AC_CACHE_CHECK(whether the visibility attribute is supported, gcry_cv_visibility_attribute, [gcry_cv_visibility_attribute=no AC_LANG_CONFTEST([AC_LANG_SOURCE( [[int foo __attribute__ ((visibility ("hidden"))) = 1; int bar __attribute__ ((visibility ("protected"))) = 1; ]])]) if ${CC-cc} -Werror -S conftest.c -o conftest.s \ 1>&AS_MESSAGE_LOG_FD 2>&AS_MESSAGE_LOG_FD ; then if grep '\.hidden.*foo' conftest.s >/dev/null 2>&1 ; then if grep '\.protected.*bar' conftest.s >/dev/null 2>&1; then gcry_cv_visibility_attribute=yes fi fi fi ]) if test "$gcry_cv_visibility_attribute" = "yes"; then AC_CACHE_CHECK(for broken visibility attribute, gcry_cv_broken_visibility_attribute, [gcry_cv_broken_visibility_attribute=yes AC_LANG_CONFTEST([AC_LANG_SOURCE( [[int foo (int x); int bar (int x) __asm__ ("foo") __attribute__ ((visibility ("hidden"))); int bar (int x) { return x; } ]])]) if ${CC-cc} -Werror -S conftest.c -o conftest.s \ 1>&AS_MESSAGE_LOG_FD 2>&AS_MESSAGE_LOG_FD ; then if grep '\.hidden@<:@ _@:>@foo' conftest.s >/dev/null 2>&1; then gcry_cv_broken_visibility_attribute=no fi fi ]) fi if test "$gcry_cv_visibility_attribute" = "yes"; then AC_CACHE_CHECK(for broken alias attribute, gcry_cv_broken_alias_attribute, [gcry_cv_broken_alias_attribute=yes AC_LANG_CONFTEST([AC_LANG_SOURCE( [[extern int foo (int x) __asm ("xyzzy"); int bar (int x) { return x; } extern __typeof (bar) foo __attribute ((weak, alias ("bar"))); extern int dfoo; extern __typeof (dfoo) dfoo __asm ("abccb"); int dfoo = 1; ]])]) if ${CC-cc} -Werror -S conftest.c -o conftest.s \ 1>&AS_MESSAGE_LOG_FD 2>&AS_MESSAGE_LOG_FD ; then if grep 'xyzzy' conftest.s >/dev/null 2>&1 && \ grep 'abccb' conftest.s >/dev/null 2>&1; then gcry_cv_broken_alias_attribute=no fi fi ]) fi if test "$gcry_cv_visibility_attribute" = "yes"; then AC_CACHE_CHECK(if gcc supports -fvisibility=hidden, gcry_cv_gcc_has_f_visibility, [gcry_cv_gcc_has_f_visibility=no _gcc_cflags_save=$CFLAGS CFLAGS="-fvisibility=hidden" AC_COMPILE_IFELSE([AC_LANG_PROGRAM([],[])], gcry_cv_gcc_has_f_visibility=yes) CFLAGS=$_gcc_cflags_save; ]) fi if test "$gcry_cv_visibility_attribute" = "yes" \ && test "$gcry_cv_broken_visibility_attribute" != "yes" \ && test "$gcry_cv_broken_alias_attribute" != "yes" \ && test "$gcry_cv_gcc_has_f_visibility" = "yes" then AC_DEFINE(GCRY_USE_VISIBILITY, 1, [Define to use the GNU C visibility attribute.]) CFLAGS="$CFLAGS -fvisibility=hidden" fi # Following attribute tests depend on warnings to cause compile to fail, # so set -Werror temporarily. 
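When all of the visibility probes above succeed, GCRY_USE_VISIBILITY is defined and -fvisibility=hidden is appended to CFLAGS, so every symbol is hidden unless re-marked. Schematically (the MY_EXPORT macro is illustrative only; libgcrypt's actual machinery lives in src/visibility.h):

#ifdef GCRY_USE_VISIBILITY
# define MY_EXPORT __attribute__ ((visibility ("default")))
#else
# define MY_EXPORT
#endif

/* Re-exported entry point; everything else in the object stays hidden
 * under -fvisibility=hidden. */
MY_EXPORT int my_public_entry_point (void);

int my_public_entry_point (void)
{
  return 0;
}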
_gcc_cflags_save=$CFLAGS CFLAGS="$CFLAGS -Werror" # # Check whether the compiler supports the GCC style aligned attribute # AC_CACHE_CHECK([whether the GCC style aligned attribute is supported], [gcry_cv_gcc_attribute_aligned], [gcry_cv_gcc_attribute_aligned=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[struct { int a; } foo __attribute__ ((aligned (16)));]])], [gcry_cv_gcc_attribute_aligned=yes])]) if test "$gcry_cv_gcc_attribute_aligned" = "yes" ; then AC_DEFINE(HAVE_GCC_ATTRIBUTE_ALIGNED,1, [Defined if a GCC style "__attribute__ ((aligned (n))" is supported]) fi # # Check whether the compiler supports the GCC style packed attribute # AC_CACHE_CHECK([whether the GCC style packed attribute is supported], [gcry_cv_gcc_attribute_packed], [gcry_cv_gcc_attribute_packed=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[struct foolong_s { long b; } __attribute__ ((packed)); struct foo_s { char a; struct foolong_s b; } __attribute__ ((packed)); enum bar { FOO = 1 / (sizeof(struct foo_s) == (sizeof(char) + sizeof(long))), };]])], [gcry_cv_gcc_attribute_packed=yes])]) if test "$gcry_cv_gcc_attribute_packed" = "yes" ; then AC_DEFINE(HAVE_GCC_ATTRIBUTE_PACKED,1, [Defined if a GCC style "__attribute__ ((packed))" is supported]) fi # # Check whether the compiler supports the GCC style may_alias attribute # AC_CACHE_CHECK([whether the GCC style may_alias attribute is supported], [gcry_cv_gcc_attribute_may_alias], [gcry_cv_gcc_attribute_may_alias=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[typedef struct foo_s { int a; } __attribute__ ((may_alias)) foo_t;]])], [gcry_cv_gcc_attribute_may_alias=yes])]) if test "$gcry_cv_gcc_attribute_may_alias" = "yes" ; then AC_DEFINE(HAVE_GCC_ATTRIBUTE_MAY_ALIAS,1, [Defined if a GCC style "__attribute__ ((may_alias))" is supported]) fi # Restore flags. CFLAGS=$_gcc_cflags_save; # # Check whether the compiler supports 'asm' or '__asm__' keyword for # assembler blocks. # AC_CACHE_CHECK([whether 'asm' assembler keyword is supported], [gcry_cv_have_asm], [gcry_cv_have_asm=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[void a(void) { asm("":::"memory"); }]])], [gcry_cv_have_asm=yes])]) AC_CACHE_CHECK([whether '__asm__' assembler keyword is supported], [gcry_cv_have___asm__], [gcry_cv_have___asm__=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[void a(void) { __asm__("":::"memory"); }]])], [gcry_cv_have___asm__=yes])]) if test "$gcry_cv_have_asm" = "no" ; then if test "$gcry_cv_have___asm__" = "yes" ; then AC_DEFINE(asm,__asm__, [Define to supported assembler block keyword, if plain 'asm' was not supported]) fi fi # # Check whether the compiler supports inline assembly memory barrier. # if test "$gcry_cv_have_asm" = "no" ; then if test "$gcry_cv_have___asm__" = "yes" ; then AC_CACHE_CHECK([whether inline assembly memory barrier is supported], [gcry_cv_have_asm_volatile_memory], [gcry_cv_have_asm_volatile_memory=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[void a(void) { __asm__ volatile("":::"memory"); }]])], [gcry_cv_have_asm_volatile_memory=yes])]) fi else AC_CACHE_CHECK([whether inline assembly memory barrier is supported], [gcry_cv_have_asm_volatile_memory], [gcry_cv_have_asm_volatile_memory=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[void a(void) { asm volatile("":::"memory"); }]])], [gcry_cv_have_asm_volatile_memory=yes])]) fi if test "$gcry_cv_have_asm_volatile_memory" = "yes" ; then AC_DEFINE(HAVE_GCC_ASM_VOLATILE_MEMORY,1, [Define if inline asm memory barrier is supported]) fi # # Check whether GCC assembler supports features needed for our ARM # implementations. 
This needs to be done before setting up the # assembler stuff. # AC_CACHE_CHECK([whether GCC assembler is compatible for ARM assembly implementations], [gcry_cv_gcc_arm_platform_as_ok], [gcry_cv_gcc_arm_platform_as_ok=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[__asm__( /* Test if assembler supports UAL syntax. */ ".syntax unified\n\t" ".arm\n\t" /* our assembly code is in ARM mode */ /* Following causes error if assembler ignored '.syntax unified'. */ "asmfunc:\n\t" "add %r0, %r0, %r4, ror #12;\n\t" /* Test if '.type' and '.size' are supported. */ ".size asmfunc,.-asmfunc;\n\t" ".type asmfunc,%function;\n\t" );]])], [gcry_cv_gcc_arm_platform_as_ok=yes])]) if test "$gcry_cv_gcc_arm_platform_as_ok" = "yes" ; then AC_DEFINE(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS,1, [Defined if underlying assembler is compatible with ARM assembly implementations]) fi # # Check whether GCC assembler supports features needed for our ARMv8/Aarch64 # implementations. This needs to be done before setting up the # assembler stuff. # AC_CACHE_CHECK([whether GCC assembler is compatible for ARMv8/Aarch64 assembly implementations], [gcry_cv_gcc_aarch64_platform_as_ok], [gcry_cv_gcc_aarch64_platform_as_ok=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[__asm__( "asmfunc:\n\t" "eor x0, x0, x30, ror #12;\n\t" "add x0, x0, x30, asr #12;\n\t" "eor v0.16b, v0.16b, v31.16b;\n\t" /* Test if '.type' and '.size' are supported. */ ".size asmfunc,.-asmfunc;\n\t" ".type asmfunc,@function;\n\t" );]])], [gcry_cv_gcc_aarch64_platform_as_ok=yes])]) if test "$gcry_cv_gcc_aarch64_platform_as_ok" = "yes" ; then AC_DEFINE(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS,1, [Defined if underlying assembler is compatible with ARMv8/Aarch64 assembly implementations]) fi # # Check whether underscores in symbols are required. This needs to be # done before setting up the assembler stuff. # GNUPG_SYS_SYMBOL_UNDERSCORE() ################################# #### #### #### Setup assembler stuff. #### #### Define mpi_cpu_arch. #### #### #### ################################# AC_ARG_ENABLE(mpi-path, AC_HELP_STRING([--enable-mpi-path=EXTRA_PATH], [prepend EXTRA_PATH to list of CPU specific optimizations]), mpi_extra_path="$enableval",mpi_extra_path="") AC_MSG_CHECKING(architecture and mpi assembler functions) if test -f $srcdir/mpi/config.links ; then . 
$srcdir/mpi/config.links AC_CONFIG_LINKS("$mpi_ln_list") ac_cv_mpi_sflags="$mpi_sflags" AC_MSG_RESULT($mpi_cpu_arch) else AC_MSG_RESULT(failed) AC_MSG_ERROR([mpi/config.links missing!]) fi MPI_SFLAGS="$ac_cv_mpi_sflags" AC_SUBST(MPI_SFLAGS) AM_CONDITIONAL(MPI_MOD_ASM_MPIH_ADD1, test "$mpi_mod_asm_mpih_add1" = yes) AM_CONDITIONAL(MPI_MOD_ASM_MPIH_SUB1, test "$mpi_mod_asm_mpih_sub1" = yes) AM_CONDITIONAL(MPI_MOD_ASM_MPIH_MUL1, test "$mpi_mod_asm_mpih_mul1" = yes) AM_CONDITIONAL(MPI_MOD_ASM_MPIH_MUL2, test "$mpi_mod_asm_mpih_mul2" = yes) AM_CONDITIONAL(MPI_MOD_ASM_MPIH_MUL3, test "$mpi_mod_asm_mpih_mul3" = yes) AM_CONDITIONAL(MPI_MOD_ASM_MPIH_LSHIFT, test "$mpi_mod_asm_mpih_lshift" = yes) AM_CONDITIONAL(MPI_MOD_ASM_MPIH_RSHIFT, test "$mpi_mod_asm_mpih_rshift" = yes) AM_CONDITIONAL(MPI_MOD_ASM_UDIV, test "$mpi_mod_asm_udiv" = yes) AM_CONDITIONAL(MPI_MOD_ASM_UDIV_QRNND, test "$mpi_mod_asm_udiv_qrnnd" = yes) AM_CONDITIONAL(MPI_MOD_C_MPIH_ADD1, test "$mpi_mod_c_mpih_add1" = yes) AM_CONDITIONAL(MPI_MOD_C_MPIH_SUB1, test "$mpi_mod_c_mpih_sub1" = yes) AM_CONDITIONAL(MPI_MOD_C_MPIH_MUL1, test "$mpi_mod_c_mpih_mul1" = yes) AM_CONDITIONAL(MPI_MOD_C_MPIH_MUL2, test "$mpi_mod_c_mpih_mul2" = yes) AM_CONDITIONAL(MPI_MOD_C_MPIH_MUL3, test "$mpi_mod_c_mpih_mul3" = yes) AM_CONDITIONAL(MPI_MOD_C_MPIH_LSHIFT, test "$mpi_mod_c_mpih_lshift" = yes) AM_CONDITIONAL(MPI_MOD_C_MPIH_RSHIFT, test "$mpi_mod_c_mpih_rshift" = yes) AM_CONDITIONAL(MPI_MOD_C_UDIV, test "$mpi_mod_c_udiv" = yes) AM_CONDITIONAL(MPI_MOD_C_UDIV_QRNND, test "$mpi_mod_c_udiv_qrnnd" = yes) # Reset non applicable feature flags. if test "$mpi_cpu_arch" != "x86" ; then aesnisupport="n/a" pclmulsupport="n/a" sse41support="n/a" avxsupport="n/a" avx2support="n/a" padlocksupport="n/a" jentsupport="n/a" drngsupport="n/a" fi if test "$mpi_cpu_arch" != "arm" ; then if test "$mpi_cpu_arch" != "aarch64" ; then neonsupport="n/a" armcryptosupport="n/a" fi fi ############################################# #### #### #### Platform specific compiler checks. #### #### #### ############################################# # Following tests depend on warnings to cause compile to fail, so set -Werror # temporarily. _gcc_cflags_save=$CFLAGS CFLAGS="$CFLAGS -Werror" # # Check whether compiler supports 'ms_abi' function attribute. # AC_CACHE_CHECK([whether compiler supports 'ms_abi' function attribute], [gcry_cv_gcc_attribute_ms_abi], [gcry_cv_gcc_attribute_ms_abi=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[int __attribute__ ((ms_abi)) proto(int);]])], [gcry_cv_gcc_attribute_ms_abi=yes])]) if test "$gcry_cv_gcc_attribute_ms_abi" = "yes" ; then AC_DEFINE(HAVE_GCC_ATTRIBUTE_MS_ABI,1, [Defined if compiler supports "__attribute__ ((ms_abi))" function attribute]) fi # # Check whether compiler supports 'sysv_abi' function attribute. # AC_CACHE_CHECK([whether compiler supports 'sysv_abi' function attribute], [gcry_cv_gcc_attribute_sysv_abi], [gcry_cv_gcc_attribute_sysv_abi=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[int __attribute__ ((sysv_abi)) proto(int);]])], [gcry_cv_gcc_attribute_sysv_abi=yes])]) if test "$gcry_cv_gcc_attribute_sysv_abi" = "yes" ; then AC_DEFINE(HAVE_GCC_ATTRIBUTE_SYSV_ABI,1, [Defined if compiler supports "__attribute__ ((sysv_abi))" function attribute]) fi # # Check whether default calling convention is 'ms_abi'. 
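These ms_abi/sysv_abi probes are what ultimately let the ASM_FUNC_ABI definition at the top of cipher/chacha20.c (see the first hunk above) work: on WIN64 the default calling convention is ms_abi while the amd64 assembly is written for the SysV convention, so the assembly prototypes get tagged explicitly. Restated with standard types in place of u32/byte:

#include <stddef.h>
#include <stdint.h>

#if defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)
# define ASM_FUNC_ABI __attribute__ ((sysv_abi))   /* force SysV on WIN64 */
#else
# define ASM_FUNC_ABI
#endif

unsigned int _gcry_chacha20_amd64_avx2_blocks8 (uint32_t *state,
                                                unsigned char *dst,
                                                const unsigned char *src,
                                                size_t nblks) ASM_FUNC_ABI;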
# if test "$gcry_cv_gcc_attribute_ms_abi" = "yes" ; then AC_CACHE_CHECK([whether default calling convention is 'ms_abi'], [gcry_cv_gcc_default_abi_is_ms_abi], [gcry_cv_gcc_default_abi_is_ms_abi=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[void *test(void) { void *(*def_func)(void) = test; void *__attribute__((ms_abi))(*msabi_func)(void); /* warning on SysV abi targets, passes on Windows based targets */ msabi_func = def_func; return msabi_func; }]])], [gcry_cv_gcc_default_abi_is_ms_abi=yes])]) if test "$gcry_cv_gcc_default_abi_is_ms_abi" = "yes" ; then AC_DEFINE(HAVE_GCC_DEFAULT_ABI_IS_MS_ABI,1, [Defined if default calling convention is 'ms_abi']) fi fi # # Check whether default calling convention is 'sysv_abi'. # if test "$gcry_cv_gcc_attribute_sysv_abi" = "yes" ; then AC_CACHE_CHECK([whether default calling convention is 'sysv_abi'], [gcry_cv_gcc_default_abi_is_sysv_abi], [gcry_cv_gcc_default_abi_is_sysv_abi=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[void *test(void) { void *(*def_func)(void) = test; void *__attribute__((sysv_abi))(*sysvabi_func)(void); /* warning on MS ABI targets, passes on SysV ABI targets */ sysvabi_func = def_func; return sysvabi_func; }]])], [gcry_cv_gcc_default_abi_is_sysv_abi=yes])]) if test "$gcry_cv_gcc_default_abi_is_sysv_abi" = "yes" ; then AC_DEFINE(HAVE_GCC_DEFAULT_ABI_IS_SYSV_ABI,1, [Defined if default calling convention is 'sysv_abi']) fi fi # Restore flags. CFLAGS=$_gcc_cflags_save; # # Check whether GCC inline assembler supports SSSE3 instructions # This is required for the AES-NI instructions. # AC_CACHE_CHECK([whether GCC inline assembler supports SSSE3 instructions], [gcry_cv_gcc_inline_asm_ssse3], [if test "$mpi_cpu_arch" != "x86" ; then gcry_cv_gcc_inline_asm_ssse3="n/a" else gcry_cv_gcc_inline_asm_ssse3=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[static unsigned char be_mask[16] __attribute__ ((aligned (16))) = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; void a(void) { __asm__("pshufb %[mask], %%xmm2\n\t"::[mask]"m"(*be_mask):); }]])], [gcry_cv_gcc_inline_asm_ssse3=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_ssse3" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_SSSE3,1, [Defined if inline assembler supports SSSE3 instructions]) fi # # Check whether GCC inline assembler supports PCLMUL instructions. # AC_CACHE_CHECK([whether GCC inline assembler supports PCLMUL instructions], [gcry_cv_gcc_inline_asm_pclmul], [if test "$mpi_cpu_arch" != "x86" ; then gcry_cv_gcc_inline_asm_pclmul="n/a" else gcry_cv_gcc_inline_asm_pclmul=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[void a(void) { __asm__("pclmulqdq \$0, %%xmm1, %%xmm3\n\t":::"cc"); }]])], [gcry_cv_gcc_inline_asm_pclmul=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_pclmul" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_PCLMUL,1, [Defined if inline assembler supports PCLMUL instructions]) fi # # Check whether GCC inline assembler supports SSE4.1 instructions. 
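Each of these inline-asm probes compiles a one-instruction function that only an extension-aware assembler accepts; pshufb, for instance, exists only from SSSE3 on. The SSSE3 probe above, lifted out as a standalone file one can feed to the compiler by hand:

/* Build test only: successful compilation means the toolchain can emit
 * SSSE3 instructions. */
static unsigned char be_mask[16] __attribute__ ((aligned (16))) =
  { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };

void a (void)
{
  /* pshufb with this mask byte-reverses xmm2. */
  __asm__ ("pshufb %[mask], %%xmm2\n\t" : : [mask] "m" (*be_mask));
}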

#
# Check whether GCC inline assembler supports SSE4.1 instructions.
#
AC_CACHE_CHECK([whether GCC inline assembler supports SSE4.1 instructions],
       [gcry_cv_gcc_inline_asm_sse41],
       [if test "$mpi_cpu_arch" != "x86" ; then
          gcry_cv_gcc_inline_asm_sse41="n/a"
        else
          gcry_cv_gcc_inline_asm_sse41=no
          AC_COMPILE_IFELSE([AC_LANG_SOURCE(
          [[void a(void) {
              int i;
              __asm__("pextrd \$2, %%xmm0, %[out]\n\t" : [out] "=m" (i));
            }]])],
          [gcry_cv_gcc_inline_asm_sse41=yes])
        fi])
if test "$gcry_cv_gcc_inline_asm_sse41" = "yes" ; then
   AC_DEFINE(HAVE_GCC_INLINE_ASM_SSE41,1,
     [Defined if inline assembler supports SSE4.1 instructions])
fi


#
# Check whether GCC inline assembler supports AVX instructions
#
AC_CACHE_CHECK([whether GCC inline assembler supports AVX instructions],
       [gcry_cv_gcc_inline_asm_avx],
       [if test "$mpi_cpu_arch" != "x86" ; then
          gcry_cv_gcc_inline_asm_avx="n/a"
        else
          gcry_cv_gcc_inline_asm_avx=no
          AC_COMPILE_IFELSE([AC_LANG_SOURCE(
          [[void a(void) {
              __asm__("xgetbv; vaesdeclast (%[mem]),%%xmm0,%%xmm7\n\t"::[mem]"r"(0):);
            }]])],
          [gcry_cv_gcc_inline_asm_avx=yes])
        fi])
if test "$gcry_cv_gcc_inline_asm_avx" = "yes" ; then
   AC_DEFINE(HAVE_GCC_INLINE_ASM_AVX,1,
     [Defined if inline assembler supports AVX instructions])
fi


#
# Check whether GCC inline assembler supports AVX2 instructions
#
AC_CACHE_CHECK([whether GCC inline assembler supports AVX2 instructions],
       [gcry_cv_gcc_inline_asm_avx2],
       [if test "$mpi_cpu_arch" != "x86" ; then
          gcry_cv_gcc_inline_asm_avx2="n/a"
        else
          gcry_cv_gcc_inline_asm_avx2=no
          AC_COMPILE_IFELSE([AC_LANG_SOURCE(
          [[void a(void) {
              __asm__("xgetbv; vpbroadcastb %%xmm7,%%ymm1\n\t":::"cc");
            }]])],
          [gcry_cv_gcc_inline_asm_avx2=yes])
        fi])
if test "$gcry_cv_gcc_inline_asm_avx2" = "yes" ; then
   AC_DEFINE(HAVE_GCC_INLINE_ASM_AVX2,1,
     [Defined if inline assembler supports AVX2 instructions])
fi


#
# Check whether GCC inline assembler supports BMI2 instructions
#
AC_CACHE_CHECK([whether GCC inline assembler supports BMI2 instructions],
       [gcry_cv_gcc_inline_asm_bmi2],
       [if test "$mpi_cpu_arch" != "x86" ; then
          gcry_cv_gcc_inline_asm_bmi2="n/a"
        else
          gcry_cv_gcc_inline_asm_bmi2=no
          AC_COMPILE_IFELSE([AC_LANG_SOURCE(
          [[void a(void) {
              __asm__("rorxl \$23, %%eax, %%edx\\n\\t":::"memory");
            }]])],
          [gcry_cv_gcc_inline_asm_bmi2=yes])
        fi])
if test "$gcry_cv_gcc_inline_asm_bmi2" = "yes" ; then
   AC_DEFINE(HAVE_GCC_INLINE_ASM_BMI2,1,
     [Defined if inline assembler supports BMI2 instructions])
fi


#
# Check whether GCC assembler needs "-Wa,--divide" to correctly handle
# constant division
#
if test $amd64_as_feature_detection = yes; then
  AC_CACHE_CHECK([whether GCC assembler handles division correctly],
       [gcry_cv_gcc_as_const_division_ok],
       [gcry_cv_gcc_as_const_division_ok=no
        AC_COMPILE_IFELSE([AC_LANG_SOURCE(
          [[__asm__("xorl \$(123456789/12345678), %ebp;\n\t");]])],
          [gcry_cv_gcc_as_const_division_ok=yes])])
  if test "$gcry_cv_gcc_as_const_division_ok" = "no" ; then
    #
    # Add '-Wa,--divide' to CPPFLAGS and try check again.
    #
    _gcc_cppflags_save="$CPPFLAGS"
    CPPFLAGS="$CPPFLAGS -Wa,--divide"
    AC_CACHE_CHECK([whether GCC assembler handles division correctly with "-Wa,--divide"],
         [gcry_cv_gcc_as_const_division_with_wadivide_ok],
         [gcry_cv_gcc_as_const_division_with_wadivide_ok=no
          AC_COMPILE_IFELSE([AC_LANG_SOURCE(
            [[__asm__("xorl \$(123456789/12345678), %ebp;\n\t");]])],
            [gcry_cv_gcc_as_const_division_with_wadivide_ok=yes])])
    if test "$gcry_cv_gcc_as_const_division_with_wadivide_ok" = "no" ; then
      # '-Wa,--divide' did not work, restore old flags.
      CPPFLAGS="$_gcc_cppflags_save"
    fi
  fi
fi
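
# The probe/retry pattern above distils to the following sketch: keep the
# extra assembler flag only when it actually fixes the first failure.
#
#   if ! first_probe_ok; then
#     saved=$CPPFLAGS
#     CPPFLAGS="$CPPFLAGS -Wa,--divide"    # tell gcc to pass --divide to as
#     second_probe_ok || CPPFLAGS=$saved   # did not help; drop it again
#   fi
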

#
# Check whether GCC assembler supports features needed for our amd64
# implementations
#
if test $amd64_as_feature_detection = yes; then
  AC_CACHE_CHECK([whether GCC assembler is compatible for amd64 assembly implementations],
       [gcry_cv_gcc_amd64_platform_as_ok],
       [if test "$mpi_cpu_arch" != "x86" ; then
          gcry_cv_gcc_amd64_platform_as_ok="n/a"
        else
          gcry_cv_gcc_amd64_platform_as_ok=no
          AC_COMPILE_IFELSE([AC_LANG_SOURCE(
          [[__asm__(
                /* Test if '.type' and '.size' are supported.  */
                /* These work only on ELF targets. */
                "asmfunc:\n\t"
                ".size asmfunc,.-asmfunc;\n\t"
                ".type asmfunc,@function;\n\t"
                /* Test if assembler allows use of '/' for constant division
                 * (Solaris/x86 issue). If previous constant division check
                 * and "-Wa,--divide" workaround failed, this causes assembly
                 * to be disabled on this machine. */
                "xorl \$(123456789/12345678), %ebp;\n\t"
            );]])],
          [gcry_cv_gcc_amd64_platform_as_ok=yes])
        fi])
  if test "$gcry_cv_gcc_amd64_platform_as_ok" = "yes" ; then
     AC_DEFINE(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS,1,
       [Defined if underlying assembler is compatible with amd64 assembly implementations])
  fi
  if test "$gcry_cv_gcc_amd64_platform_as_ok" = "no" &&
     test "$gcry_cv_gcc_attribute_sysv_abi" = "yes" &&
     test "$gcry_cv_gcc_default_abi_is_ms_abi" = "yes"; then
    AC_CACHE_CHECK([whether GCC assembler is compatible for WIN64 assembly implementations],
      [gcry_cv_gcc_win64_platform_as_ok],
      [gcry_cv_gcc_win64_platform_as_ok=no
       AC_COMPILE_IFELSE([AC_LANG_SOURCE(
         [[__asm__(
              ".globl asmfunc\n\t"
              "asmfunc:\n\t"
              "xorq \$(1234), %rbp;\n\t"
          );]])],
         [gcry_cv_gcc_win64_platform_as_ok=yes])])
    if test "$gcry_cv_gcc_win64_platform_as_ok" = "yes" ; then
      AC_DEFINE(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS,1,
        [Defined if underlying assembler is compatible with WIN64 assembly implementations])
    fi
  fi
fi


#
# Check whether GCC assembler supports features needed for assembly
# implementations that use Intel syntax
#
AC_CACHE_CHECK([whether GCC assembler is compatible for Intel syntax assembly implementations],
       [gcry_cv_gcc_platform_as_ok_for_intel_syntax],
       [if test "$mpi_cpu_arch" != "x86" ; then
          gcry_cv_gcc_platform_as_ok_for_intel_syntax="n/a"
        else
          gcry_cv_gcc_platform_as_ok_for_intel_syntax=no
          AC_COMPILE_IFELSE([AC_LANG_SOURCE(
          [[__asm__(
                ".intel_syntax noprefix\n\t"
                "pxor xmm1, xmm7;\n\t"
                /* Intel syntax implementations also use GAS macros, so check
                 * for them here. */
                "VAL_A = xmm4\n\t"
                "VAL_B = xmm2\n\t"
                ".macro SET_VAL_A p1\n\t"
                " VAL_A = \\\\p1 \n\t"
                ".endm\n\t"
                ".macro SET_VAL_B p1\n\t"
                " VAL_B = \\\\p1 \n\t"
                ".endm\n\t"
                "vmovdqa VAL_A, VAL_B;\n\t"
                "SET_VAL_A eax\n\t"
                "SET_VAL_B ebp\n\t"
                "add VAL_A, VAL_B;\n\t"
                "add VAL_B, 0b10101;\n\t"
            );]])],
          [gcry_cv_gcc_platform_as_ok_for_intel_syntax=yes])
        fi])
if test "$gcry_cv_gcc_platform_as_ok_for_intel_syntax" = "yes" ; then
  AC_DEFINE(HAVE_INTEL_SYNTAX_PLATFORM_AS,1,
    [Defined if underlying assembler is compatible with Intel syntax assembly implementations])
fi
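
# Modules written in Intel syntax then guard themselves on this define, so
# the object compiles to nothing when the probe failed.  A typical guard at
# the top of such a .S file looks roughly like this (illustrative):
#
#   #if defined(__x86_64__) && defined(HAVE_INTEL_SYNTAX_PLATFORM_AS)
#   .intel_syntax noprefix
#   /* ... implementation ... */
#   #endif
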
*/ "VAL_A = xmm4\n\t" "VAL_B = xmm2\n\t" ".macro SET_VAL_A p1\n\t" " VAL_A = \\\\p1 \n\t" ".endm\n\t" ".macro SET_VAL_B p1\n\t" " VAL_B = \\\\p1 \n\t" ".endm\n\t" "vmovdqa VAL_A, VAL_B;\n\t" "SET_VAL_A eax\n\t" "SET_VAL_B ebp\n\t" "add VAL_A, VAL_B;\n\t" "add VAL_B, 0b10101;\n\t" );]])], [gcry_cv_gcc_platform_as_ok_for_intel_syntax=yes]) fi]) if test "$gcry_cv_gcc_platform_as_ok_for_intel_syntax" = "yes" ; then AC_DEFINE(HAVE_INTEL_SYNTAX_PLATFORM_AS,1, [Defined if underlying assembler is compatible with Intel syntax assembly implementations]) fi # # Check whether compiler is configured for ARMv6 or newer architecture # AC_CACHE_CHECK([whether compiler is configured for ARMv6 or newer architecture], [gcry_cv_cc_arm_arch_is_v6], [if test "$mpi_cpu_arch" != "arm" ; then gcry_cv_cc_arm_arch_is_v6="n/a" else gcry_cv_cc_arm_arch_is_v6=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[ #if defined(__arm__) && \ ((defined(__ARM_ARCH) && __ARM_ARCH >= 6) \ || defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) \ || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) \ || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6T2__) \ || defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) \ || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) \ || defined(__ARM_ARCH_7EM__)) /* empty */ #else /* fail compile if not ARMv6. */ not_armv6 not_armv6 = (not_armv6)not_armv6; #endif ]])], [gcry_cv_cc_arm_arch_is_v6=yes]) fi]) if test "$gcry_cv_cc_arm_arch_is_v6" = "yes" ; then AC_DEFINE(HAVE_ARM_ARCH_V6,1, [Defined if ARM architecture is v6 or newer]) fi # # Check whether GCC inline assembler supports NEON instructions # AC_CACHE_CHECK([whether GCC inline assembler supports NEON instructions], [gcry_cv_gcc_inline_asm_neon], [if test "$mpi_cpu_arch" != "arm" ; then gcry_cv_gcc_inline_asm_neon="n/a" else gcry_cv_gcc_inline_asm_neon=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[__asm__( ".syntax unified\n\t" ".arm\n\t" ".fpu neon\n\t" "vld1.64 {%q0-%q1}, [%r0]!;\n\t" "vrev64.8 %q0, %q3;\n\t" "vadd.u64 %q0, %q1;\n\t" "vadd.s64 %d3, %d2, %d3;\n\t" ); ]])], [gcry_cv_gcc_inline_asm_neon=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_neon" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_NEON,1, [Defined if inline assembler supports NEON instructions]) fi # # Check whether GCC inline assembler supports AArch32 Crypto Extension instructions # AC_CACHE_CHECK([whether GCC inline assembler supports AArch32 Crypto Extension instructions], [gcry_cv_gcc_inline_asm_aarch32_crypto], [if test "$mpi_cpu_arch" != "arm" ; then gcry_cv_gcc_inline_asm_aarch32_crypto="n/a" else gcry_cv_gcc_inline_asm_aarch32_crypto=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[__asm__( ".syntax unified\n\t" ".arch armv8-a\n\t" ".arm\n\t" ".fpu crypto-neon-fp-armv8\n\t" "sha1h.32 q0, q0;\n\t" "sha1c.32 q0, q0, q0;\n\t" "sha1p.32 q0, q0, q0;\n\t" "sha1su0.32 q0, q0, q0;\n\t" "sha1su1.32 q0, q0;\n\t" "sha256h.32 q0, q0, q0;\n\t" "sha256h2.32 q0, q0, q0;\n\t" "sha1p.32 q0, q0, q0;\n\t" "sha256su0.32 q0, q0;\n\t" "sha256su1.32 q0, q0, q15;\n\t" "aese.8 q0, q0;\n\t" "aesd.8 q0, q0;\n\t" "aesmc.8 q0, q0;\n\t" "aesimc.8 q0, q0;\n\t" "vmull.p64 q0, d0, d0;\n\t" ); ]])], [gcry_cv_gcc_inline_asm_aarch32_crypto=yes]) fi]) if test "$gcry_cv_gcc_inline_asm_aarch32_crypto" = "yes" ; then AC_DEFINE(HAVE_GCC_INLINE_ASM_AARCH32_CRYPTO,1, [Defined if inline assembler supports AArch32 Crypto Extension instructions]) fi # # Check whether GCC inline assembler supports AArch64 NEON instructions # AC_CACHE_CHECK([whether GCC inline assembler supports AArch64 NEON instructions], 

#
# Check whether GCC inline assembler supports AArch64 NEON instructions
#
AC_CACHE_CHECK([whether GCC inline assembler supports AArch64 NEON instructions],
       [gcry_cv_gcc_inline_asm_aarch64_neon],
       [if test "$mpi_cpu_arch" != "aarch64" ; then
          gcry_cv_gcc_inline_asm_aarch64_neon="n/a"
        else
          gcry_cv_gcc_inline_asm_aarch64_neon=no
          AC_COMPILE_IFELSE([AC_LANG_SOURCE(
          [[__asm__(
                ".cpu generic+simd\n\t"
                "mov w0, \#42;\n\t"
                "dup v0.8b, w0;\n\t"
                "ld4 {v0.8b,v1.8b,v2.8b,v3.8b},[x0],\#32;\n\t"
            );
          ]])],
          [gcry_cv_gcc_inline_asm_aarch64_neon=yes])
        fi])
if test "$gcry_cv_gcc_inline_asm_aarch64_neon" = "yes" ; then
   AC_DEFINE(HAVE_GCC_INLINE_ASM_AARCH64_NEON,1,
     [Defined if inline assembler supports AArch64 NEON instructions])
fi


#
# Check whether GCC inline assembler supports AArch64 Crypto Extension instructions
#
AC_CACHE_CHECK([whether GCC inline assembler supports AArch64 Crypto Extension instructions],
       [gcry_cv_gcc_inline_asm_aarch64_crypto],
       [if test "$mpi_cpu_arch" != "aarch64" ; then
          gcry_cv_gcc_inline_asm_aarch64_crypto="n/a"
        else
          gcry_cv_gcc_inline_asm_aarch64_crypto=no
          AC_COMPILE_IFELSE([AC_LANG_SOURCE(
          [[__asm__(
                ".cpu generic+simd+crypto\n\t"

                "mov w0, \#42;\n\t"
                "dup v0.8b, w0;\n\t"
                "ld4 {v0.8b,v1.8b,v2.8b,v3.8b},[x0],\#32;\n\t"

                "sha1h s0, s0;\n\t"
                "sha1c q0, s0, v0.4s;\n\t"
                "sha1p q0, s0, v0.4s;\n\t"
                "sha1su0 v0.4s, v0.4s, v0.4s;\n\t"
                "sha1su1 v0.4s, v0.4s;\n\t"

                "sha256h q0, q0, v0.4s;\n\t"
                "sha256h2 q0, q0, v0.4s;\n\t"
                "sha1p q0, s0, v0.4s;\n\t"
                "sha256su0 v0.4s, v0.4s;\n\t"
                "sha256su1 v0.4s, v0.4s, v31.4s;\n\t"

                "aese v0.16b, v0.16b;\n\t"
                "aesd v0.16b, v0.16b;\n\t"
                "aesmc v0.16b, v0.16b;\n\t"
                "aesimc v0.16b, v0.16b;\n\t"

                "pmull v0.1q, v0.1d, v31.1d;\n\t"
                "pmull2 v0.1q, v0.2d, v31.2d;\n\t"
            );
          ]])],
          [gcry_cv_gcc_inline_asm_aarch64_crypto=yes])
        fi])
if test "$gcry_cv_gcc_inline_asm_aarch64_crypto" = "yes" ; then
   AC_DEFINE(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO,1,
     [Defined if inline assembler supports AArch64 Crypto Extension instructions])
fi


#######################################
#### Checks for library functions. ####
#######################################

AC_FUNC_VPRINTF
# We have replacements for these in src/missing-string.c
AC_CHECK_FUNCS(stpcpy strcasecmp)
# We have replacements for these in src/g10lib.h
AC_CHECK_FUNCS(strtoul memmove stricmp atexit raise)
# Other checks
AC_CHECK_FUNCS(strerror rand mmap getpagesize sysconf waitpid wait4)
AC_CHECK_FUNCS(gettimeofday getrusage gethrtime clock_gettime syslog)
AC_CHECK_FUNCS(syscall fcntl ftruncate flockfile)

GNUPG_CHECK_MLOCK

#
# Replacement functions.
#
AC_REPLACE_FUNCS([getpid clock])


#
# Check whether it is necessary to link against libdl.
#
DL_LIBS=""
if test "$use_hmac_binary_check" = yes ; then
  _gcry_save_libs="$LIBS"
  LIBS=""
  AC_SEARCH_LIBS(dlopen, c dl,,,)
  DL_LIBS=$LIBS
  LIBS="$_gcry_save_libs"
  LIBGCRYPT_CONFIG_LIBS="${LIBGCRYPT_CONFIG_LIBS} ${DL_LIBS}"
fi
AC_SUBST(DL_LIBS)


#
# Check whether we can use Linux capabilities as requested.
#
if test "$use_capabilities" = "yes" ; then
  use_capabilities=no
  AC_CHECK_HEADERS(sys/capability.h)
  if test "$ac_cv_header_sys_capability_h" = "yes" ; then
    AC_CHECK_LIB(cap, cap_init, ac_need_libcap=1)
    if test "$ac_cv_lib_cap_cap_init" = "yes"; then
      AC_DEFINE(USE_CAPABILITIES,1,
                [define if capabilities should be used])
      LIBS="$LIBS -lcap"
      use_capabilities=yes
    fi
  fi
  if test "$use_capabilities" = "no" ; then
    AC_MSG_WARN([[
***
*** The use of capabilities on this system is not possible.
*** You need a recent Linux kernel and some patches:
***   fcaps-2.2.9-990610.patch      (kernel patch for 2.2.9)
***   fcap-module-990613.tar.gz     (kernel module)
***   libcap-1.92.tar.gz            (user mode library and utilities)
*** And you have to configure the kernel with CONFIG_VFS_CAP_PLUGIN
*** set (filesystems menu). Be warned: This code is *really* ALPHA.
***]])
  fi
fi
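
# The LIBS save/restore around AC_SEARCH_LIBS above is the standard way to
# capture only the extra dependency for one symbol: LIBS is emptied first,
# AC_SEARCH_LIBS prepends e.g. '-ldl' when needed, and whatever is left in
# LIBS afterwards is exactly the flag to remember.  In shell terms (sketch):
#
#   saved=$LIBS; LIBS=""
#   AC_SEARCH_LIBS(dlopen, c dl)
#   DL_LIBS=$LIBS              # now "" or e.g. "-ldl"
#   LIBS=$saved
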

# Check whether a random device is available.
if test "$try_dev_random" = yes ; then
  AC_CACHE_CHECK(for random device, ac_cv_have_dev_random,
    [if test -r "$NAME_OF_DEV_RANDOM" && test -r "$NAME_OF_DEV_URANDOM" ; then
       ac_cv_have_dev_random=yes; else ac_cv_have_dev_random=no; fi])
  if test "$ac_cv_have_dev_random" = yes; then
    AC_DEFINE(HAVE_DEV_RANDOM,1,
              [defined if the system supports a random device])
  fi
else
  AC_MSG_CHECKING(for random device)
  ac_cv_have_dev_random=no
  AC_MSG_RESULT(has been disabled)
fi

# Figure out the random modules for this configuration.
if test "$random" = "default"; then
  # Select default value.
  if test "$ac_cv_have_dev_random" = yes; then
    # Try Linuxish random device.
    random_modules="linux"
  else
    case "${host}" in
      *-*-mingw32ce*)
        # WindowsCE random device.
        random_modules="w32ce"
        ;;
      *-*-mingw32*|*-*-cygwin*)
        # Windows random device.
        random_modules="w32"
        ;;
      *)
        # Build everything, allow to select at runtime.
        random_modules="$auto_random_modules"
        ;;
    esac
  fi
else
  if test "$random" = "auto"; then
    # Build everything, allow to select at runtime.
    random_modules="$auto_random_modules"
  else
    random_modules="$random"
  fi
fi


#
# Other defines
#
if test mym4_isgit = "yes"; then
    AC_DEFINE(IS_DEVELOPMENT_VERSION,1,
              [Defined if this is not a regular release])
fi


AM_CONDITIONAL(CROSS_COMPILING, test x$cross_compiling = xyes)


# This is handy for debugging so the compiler doesn't rearrange
# things and eliminate variables.
AC_ARG_ENABLE(optimization,
       AC_HELP_STRING([--disable-optimization],
                      [disable compiler optimization]),
                      [if test $enableval = no ; then
                         CFLAGS=`echo $CFLAGS | sed 's/-O[[0-9]]//'`
                       fi])

# CFLAGS mangling when using gcc.
if test "$GCC" = yes; then
    CFLAGS="$CFLAGS -Wall"
    if test "$USE_MAINTAINER_MODE" = "yes"; then
        CFLAGS="$CFLAGS -Wcast-align -Wshadow -Wstrict-prototypes"
        CFLAGS="$CFLAGS -Wformat -Wno-format-y2k -Wformat-security"

        # If -Wno-missing-field-initializers is supported we can enable
        # a bunch of really useful warnings.
        AC_MSG_CHECKING([if gcc supports -Wno-missing-field-initializers])
        _gcc_cflags_save=$CFLAGS
        CFLAGS="-Wno-missing-field-initializers"
        AC_COMPILE_IFELSE([AC_LANG_PROGRAM([],[])],_gcc_wopt=yes,_gcc_wopt=no)
        AC_MSG_RESULT($_gcc_wopt)
        CFLAGS=$_gcc_cflags_save;
        if test x"$_gcc_wopt" = xyes ; then
          CFLAGS="$CFLAGS -W -Wextra -Wbad-function-cast"
          CFLAGS="$CFLAGS -Wwrite-strings"
          CFLAGS="$CFLAGS -Wdeclaration-after-statement"
          CFLAGS="$CFLAGS -Wno-missing-field-initializers"
          CFLAGS="$CFLAGS -Wno-sign-compare"
        fi

        AC_MSG_CHECKING([if gcc supports -Wpointer-arith])
        _gcc_cflags_save=$CFLAGS
        CFLAGS="-Wpointer-arith"
        AC_COMPILE_IFELSE([AC_LANG_PROGRAM([],[])],_gcc_wopt=yes,_gcc_wopt=no)
        AC_MSG_RESULT($_gcc_wopt)
        CFLAGS=$_gcc_cflags_save;
        if test x"$_gcc_wopt" = xyes ; then
          CFLAGS="$CFLAGS -Wpointer-arith"
        fi
    fi
fi
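
# The warning-flag test used twice above follows the usual probe-then-append
# idiom ('-Wsome-option' below is a placeholder):
#
#   _save=$CFLAGS
#   CFLAGS="-Wsome-option"     # compile an empty program with just this flag
#   AC_COMPILE_IFELSE([AC_LANG_PROGRAM([],[])],[ok=yes],[ok=no])
#   CFLAGS=$_save
#   test "$ok" = yes && CFLAGS="$CFLAGS -Wsome-option"
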
# Check whether as(1) supports a noexecstack feature.  This test
# includes an override option.
CL_AS_NOEXECSTACK


AC_SUBST(LIBGCRYPT_CONFIG_API_VERSION)
AC_SUBST(LIBGCRYPT_CONFIG_LIBS)
AC_SUBST(LIBGCRYPT_CONFIG_CFLAGS)
AC_SUBST(LIBGCRYPT_CONFIG_HOST)
AC_SUBST(LIBGCRYPT_THREAD_MODULES)

AC_CONFIG_COMMANDS([gcrypt-conf],[[
chmod +x src/libgcrypt-config
]],[[
prefix=$prefix
exec_prefix=$exec_prefix
libdir=$libdir
datadir=$datadir
DATADIRNAME=$DATADIRNAME
]])

#####################
#### Conclusion. ####
#####################

# Check that requested feature can actually be used and define
# ENABLE_foo_SUPPORT macros.

if test x"$aesnisupport" = xyes ; then
  if test "$gcry_cv_gcc_inline_asm_ssse3" != "yes" ; then
    aesnisupport="no (unsupported by compiler)"
  fi
fi
if test x"$pclmulsupport" = xyes ; then
  if test "$gcry_cv_gcc_inline_asm_pclmul" != "yes" ; then
    pclmulsupport="no (unsupported by compiler)"
  fi
fi
if test x"$sse41support" = xyes ; then
  if test "$gcry_cv_gcc_inline_asm_sse41" != "yes" ; then
    sse41support="no (unsupported by compiler)"
  fi
fi
if test x"$avxsupport" = xyes ; then
  if test "$gcry_cv_gcc_inline_asm_avx" != "yes" ; then
    avxsupport="no (unsupported by compiler)"
  fi
fi
if test x"$avx2support" = xyes ; then
  if test "$gcry_cv_gcc_inline_asm_avx2" != "yes" ; then
    avx2support="no (unsupported by compiler)"
  fi
fi
if test x"$neonsupport" = xyes ; then
  if test "$gcry_cv_gcc_inline_asm_neon" != "yes" ; then
    if test "$gcry_cv_gcc_inline_asm_aarch64_neon" != "yes" ; then
      neonsupport="no (unsupported by compiler)"
    fi
  fi
fi
if test x"$armcryptosupport" = xyes ; then
  if test "$gcry_cv_gcc_inline_asm_aarch32_crypto" != "yes" ; then
    if test "$gcry_cv_gcc_inline_asm_aarch64_crypto" != "yes" ; then
      armcryptosupport="no (unsupported by compiler)"
    fi
  fi
fi

if test x"$aesnisupport" = xyes ; then
  AC_DEFINE(ENABLE_AESNI_SUPPORT, 1,
            [Enable support for Intel AES-NI instructions.])
fi
if test x"$pclmulsupport" = xyes ; then
  AC_DEFINE(ENABLE_PCLMUL_SUPPORT, 1,
            [Enable support for Intel PCLMUL instructions.])
fi
if test x"$sse41support" = xyes ; then
  AC_DEFINE(ENABLE_SSE41_SUPPORT, 1,
            [Enable support for Intel SSE4.1 instructions.])
fi
if test x"$avxsupport" = xyes ; then
  AC_DEFINE(ENABLE_AVX_SUPPORT,1,
            [Enable support for Intel AVX instructions.])
fi
if test x"$avx2support" = xyes ; then
  AC_DEFINE(ENABLE_AVX2_SUPPORT,1,
            [Enable support for Intel AVX2 instructions.])
fi
if test x"$neonsupport" = xyes ; then
  AC_DEFINE(ENABLE_NEON_SUPPORT,1,
            [Enable support for ARM NEON instructions.])
fi
if test x"$armcryptosupport" = xyes ; then
  AC_DEFINE(ENABLE_ARM_CRYPTO_SUPPORT,1,
            [Enable support for ARMv8 Crypto Extension instructions.])
fi
if test x"$jentsupport" = xyes ; then
  AC_DEFINE(ENABLE_JENT_SUPPORT, 1,
            [Enable support for the jitter entropy collector.])
fi
if test x"$padlocksupport" = xyes ; then
  AC_DEFINE(ENABLE_PADLOCK_SUPPORT, 1,
            [Enable support for the PadLock engine.])
fi
if test x"$drngsupport" = xyes ; then
  AC_DEFINE(ENABLE_DRNG_SUPPORT, 1,
            [Enable support for Intel DRNG (RDRAND instruction).])
fi
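
# A cipher module usually requires both gates before it compiles a fast
# path: the user/arch-level ENABLE_*_SUPPORT define from here and the
# compiler-level HAVE_* probe result.  Roughly (illustrative):
#
#   #if defined(ENABLE_AVX2_SUPPORT) && defined(HAVE_GCC_INLINE_ASM_AVX2)
#   # define USE_AVX2 1
#   #endif
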

# Define conditional sources and config.h symbols depending on the
# selected ciphers, pubkey-ciphers, digests, kdfs, and random modules.

LIST_MEMBER(arcfour, $enabled_ciphers)
if test "$found" = "1"; then
   GCRYPT_CIPHERS="$GCRYPT_CIPHERS arcfour.lo"
   AC_DEFINE(USE_ARCFOUR, 1, [Defined if this module should be included])

   case "${host}" in
      x86_64-*-*)
         # Build with the assembly implementation
         GCRYPT_CIPHERS="$GCRYPT_CIPHERS arcfour-amd64.lo"
      ;;
   esac
fi

LIST_MEMBER(blowfish, $enabled_ciphers)
if test "$found" = "1" ; then
   GCRYPT_CIPHERS="$GCRYPT_CIPHERS blowfish.lo"
   AC_DEFINE(USE_BLOWFISH, 1, [Defined if this module should be included])

   case "${host}" in
      x86_64-*-*)
         # Build with the assembly implementation
         GCRYPT_CIPHERS="$GCRYPT_CIPHERS blowfish-amd64.lo"
      ;;
      arm*-*-*)
         # Build with the assembly implementation
         GCRYPT_CIPHERS="$GCRYPT_CIPHERS blowfish-arm.lo"
      ;;
   esac
fi

LIST_MEMBER(cast5, $enabled_ciphers)
if test "$found" = "1" ; then
   GCRYPT_CIPHERS="$GCRYPT_CIPHERS cast5.lo"
   AC_DEFINE(USE_CAST5, 1, [Defined if this module should be included])

   case "${host}" in
      x86_64-*-*)
         # Build with the assembly implementation
         GCRYPT_CIPHERS="$GCRYPT_CIPHERS cast5-amd64.lo"
      ;;
      arm*-*-*)
         # Build with the assembly implementation
         GCRYPT_CIPHERS="$GCRYPT_CIPHERS cast5-arm.lo"
      ;;
   esac
fi

LIST_MEMBER(des, $enabled_ciphers)
if test "$found" = "1" ; then
   GCRYPT_CIPHERS="$GCRYPT_CIPHERS des.lo"
   AC_DEFINE(USE_DES, 1, [Defined if this module should be included])

   case "${host}" in
      x86_64-*-*)
         # Build with the assembly implementation
         GCRYPT_CIPHERS="$GCRYPT_CIPHERS des-amd64.lo"
      ;;
   esac
fi

LIST_MEMBER(aes, $enabled_ciphers)
if test "$found" = "1" ; then
   GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael.lo"
   AC_DEFINE(USE_AES, 1, [Defined if this module should be included])

   case "${host}" in
      x86_64-*-*)
         # Build with the assembly implementation
         GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-amd64.lo"

         # Build with the SSSE3 implementation
         GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-ssse3-amd64.lo"
         GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-ssse3-amd64-asm.lo"
      ;;
      arm*-*-*)
         # Build with the assembly implementation
         GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-arm.lo"

         # Build with the ARMv8/AArch32 CE implementation
         GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-armv8-ce.lo"
         GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-armv8-aarch32-ce.lo"
      ;;
      aarch64-*-*)
         # Build with the assembly implementation
         GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-aarch64.lo"

         # Build with the ARMv8/AArch64 CE implementation
         GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-armv8-ce.lo"
         GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-armv8-aarch64-ce.lo"
      ;;
   esac

   case "$mpi_cpu_arch" in
      x86)
         # Build with the AES-NI implementation
         GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-aesni.lo"

         # Build with the Padlock implementation
         GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-padlock.lo"
      ;;
   esac
fi

LIST_MEMBER(twofish, $enabled_ciphers)
if test "$found" = "1" ; then
   GCRYPT_CIPHERS="$GCRYPT_CIPHERS twofish.lo"
   AC_DEFINE(USE_TWOFISH, 1, [Defined if this module should be included])

   case "${host}" in
      x86_64-*-*)
         # Build with the assembly implementation
         GCRYPT_CIPHERS="$GCRYPT_CIPHERS twofish-amd64.lo"

         if test x"$avx2support" = xyes ; then
            # Build with the AVX2 implementation
            GCRYPT_CIPHERS="$GCRYPT_CIPHERS twofish-avx2-amd64.lo"
         fi
      ;;
      arm*-*-*)
         # Build with the assembly implementation
         GCRYPT_CIPHERS="$GCRYPT_CIPHERS twofish-arm.lo"
      ;;
      aarch64-*-*)
         # Build with the assembly implementation
         GCRYPT_CIPHERS="$GCRYPT_CIPHERS twofish-aarch64.lo"
      ;;
   esac
fi
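
# LIST_MEMBER(name, $list) is a helper macro (defined elsewhere in this
# build system) that sets the shell variable 'found' to 1 when 'name'
# occurs in the given whitespace-separated list, so each of these blocks
# reads as "if this algorithm was enabled".  Shell equivalent (sketch):
#
#   found=0
#   for item in $enabled_ciphers; do
#     test "$item" = arcfour && found=1
#   done
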
LIST_MEMBER(serpent, $enabled_ciphers)
if test "$found" = "1" ; then
   GCRYPT_CIPHERS="$GCRYPT_CIPHERS serpent.lo"
   AC_DEFINE(USE_SERPENT, 1, [Defined if this module should be included])

   case "${host}" in
      x86_64-*-*)
         # Build with the SSE2 implementation
         GCRYPT_CIPHERS="$GCRYPT_CIPHERS serpent-sse2-amd64.lo"
      ;;
   esac

   if test x"$avx2support" = xyes ; then
      # Build with the AVX2 implementation
      GCRYPT_CIPHERS="$GCRYPT_CIPHERS serpent-avx2-amd64.lo"
   fi

   if test x"$neonsupport" = xyes ; then
      # Build with the NEON implementation
      GCRYPT_CIPHERS="$GCRYPT_CIPHERS serpent-armv7-neon.lo"
   fi
fi

LIST_MEMBER(rfc2268, $enabled_ciphers)
if test "$found" = "1" ; then
   GCRYPT_CIPHERS="$GCRYPT_CIPHERS rfc2268.lo"
   AC_DEFINE(USE_RFC2268, 1, [Defined if this module should be included])
fi

LIST_MEMBER(seed, $enabled_ciphers)
if test "$found" = "1" ; then
   GCRYPT_CIPHERS="$GCRYPT_CIPHERS seed.lo"
   AC_DEFINE(USE_SEED, 1, [Defined if this module should be included])
fi

LIST_MEMBER(camellia, $enabled_ciphers)
if test "$found" = "1" ; then
   GCRYPT_CIPHERS="$GCRYPT_CIPHERS camellia.lo camellia-glue.lo"
   AC_DEFINE(USE_CAMELLIA, 1, [Defined if this module should be included])

   case "${host}" in
      arm*-*-*)
         # Build with the assembly implementation
         GCRYPT_CIPHERS="$GCRYPT_CIPHERS camellia-arm.lo"
      ;;
      aarch64-*-*)
         # Build with the assembly implementation
         GCRYPT_CIPHERS="$GCRYPT_CIPHERS camellia-aarch64.lo"
      ;;
   esac

   if test x"$avxsupport" = xyes ; then
      if test x"$aesnisupport" = xyes ; then
        # Build with the AES-NI/AVX implementation
        GCRYPT_CIPHERS="$GCRYPT_CIPHERS camellia-aesni-avx-amd64.lo"
      fi
   fi

   if test x"$avx2support" = xyes ; then
      if test x"$aesnisupport" = xyes ; then
        # Build with the AES-NI/AVX2 implementation
        GCRYPT_CIPHERS="$GCRYPT_CIPHERS camellia-aesni-avx2-amd64.lo"
      fi
   fi
fi

LIST_MEMBER(idea, $enabled_ciphers)
if test "$found" = "1" ; then
   GCRYPT_CIPHERS="$GCRYPT_CIPHERS idea.lo"
   AC_DEFINE(USE_IDEA, 1, [Defined if this module should be included])
fi

LIST_MEMBER(salsa20, $enabled_ciphers)
if test "$found" = "1" ; then
   GCRYPT_CIPHERS="$GCRYPT_CIPHERS salsa20.lo"
   AC_DEFINE(USE_SALSA20, 1, [Defined if this module should be included])

   case "${host}" in
      x86_64-*-*)
         # Build with the assembly implementation
         GCRYPT_CIPHERS="$GCRYPT_CIPHERS salsa20-amd64.lo"
      ;;
   esac

   if test x"$neonsupport" = xyes ; then
     # Build with the NEON implementation
     GCRYPT_CIPHERS="$GCRYPT_CIPHERS salsa20-armv7-neon.lo"
   fi
fi

LIST_MEMBER(gost28147, $enabled_ciphers)
if test "$found" = "1" ; then
   GCRYPT_CIPHERS="$GCRYPT_CIPHERS gost28147.lo"
   AC_DEFINE(USE_GOST28147, 1, [Defined if this module should be included])
fi

LIST_MEMBER(chacha20, $enabled_ciphers)
if test "$found" = "1" ; then
   GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20.lo"
   AC_DEFINE(USE_CHACHA20, 1, [Defined if this module should be included])

   case "${host}" in
      x86_64-*-*)
         # Build with the assembly implementation
-         GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-sse2-amd64.lo"
-         GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-ssse3-amd64.lo"
-         GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-avx2-amd64.lo"
+         GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-amd64-ssse3.lo"
+         GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-amd64-avx2.lo"
      ;;
   esac

   if test x"$neonsupport" = xyes ; then
     # Build with the NEON implementation
     GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-armv7-neon.lo"
   fi
fi

LIST_MEMBER(dsa, $enabled_pubkey_ciphers)
if test "$found" = "1" ; then
   GCRYPT_PUBKEY_CIPHERS="$GCRYPT_PUBKEY_CIPHERS dsa.lo"
   AC_DEFINE(USE_DSA, 1, [Defined if this module should be included])
fi

LIST_MEMBER(rsa, $enabled_pubkey_ciphers)
if test "$found" = "1" ; then
   GCRYPT_PUBKEY_CIPHERS="$GCRYPT_PUBKEY_CIPHERS rsa.lo"
   AC_DEFINE(USE_RSA, 1, [Defined if this module should be included])
fi
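
# All amd64 ChaCha20 objects are added unconditionally on x86_64; choosing
# between the SSSE3 and AVX2 kernels happens at run time in chacha20.c from
# the detected hardware features.  A sketch of the idea:
#
#   ctx->use_ssse3 = (features & HWF_INTEL_SSSE3) != 0;
#   ctx->use_avx2  = (features & HWF_INTEL_AVX2) != 0;
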
LIST_MEMBER(elgamal, $enabled_pubkey_ciphers)
if test "$found" = "1" ; then
   GCRYPT_PUBKEY_CIPHERS="$GCRYPT_PUBKEY_CIPHERS elgamal.lo"
   AC_DEFINE(USE_ELGAMAL, 1, [Defined if this module should be included])
fi

LIST_MEMBER(ecc, $enabled_pubkey_ciphers)
if test "$found" = "1" ; then
   GCRYPT_PUBKEY_CIPHERS="$GCRYPT_PUBKEY_CIPHERS \
                          ecc.lo ecc-curves.lo ecc-misc.lo \
                          ecc-ecdsa.lo ecc-eddsa.lo ecc-gost.lo"
   AC_DEFINE(USE_ECC, 1, [Defined if this module should be included])
fi

LIST_MEMBER(crc, $enabled_digests)
if test "$found" = "1" ; then
   GCRYPT_DIGESTS="$GCRYPT_DIGESTS crc.lo"
   AC_DEFINE(USE_CRC, 1, [Defined if this module should be included])

   case "${host}" in
      i?86-*-* | x86_64-*-*)
         # Build with the assembly implementation
         GCRYPT_DIGESTS="$GCRYPT_DIGESTS crc-intel-pclmul.lo"
      ;;
   esac
fi

LIST_MEMBER(gostr3411-94, $enabled_digests)
if test "$found" = "1" ; then
   # GOST R 34.11-94 internally uses GOST 28147-89
   LIST_MEMBER(gost28147, $enabled_ciphers)
   if test "$found" = "1" ; then
      GCRYPT_DIGESTS="$GCRYPT_DIGESTS gostr3411-94.lo"
      AC_DEFINE(USE_GOST_R_3411_94, 1, [Defined if this module should be included])
   fi
fi

LIST_MEMBER(stribog, $enabled_digests)
if test "$found" = "1" ; then
   GCRYPT_DIGESTS="$GCRYPT_DIGESTS stribog.lo"
   AC_DEFINE(USE_GOST_R_3411_12, 1, [Defined if this module should be included])
fi

LIST_MEMBER(md2, $enabled_digests)
if test "$found" = "1" ; then
   GCRYPT_DIGESTS="$GCRYPT_DIGESTS md2.lo"
   AC_DEFINE(USE_MD2, 1, [Defined if this module should be included])
fi

LIST_MEMBER(md4, $enabled_digests)
if test "$found" = "1" ; then
   GCRYPT_DIGESTS="$GCRYPT_DIGESTS md4.lo"
   AC_DEFINE(USE_MD4, 1, [Defined if this module should be included])
fi

LIST_MEMBER(md5, $enabled_digests)
if test "$found" = "1" ; then
   GCRYPT_DIGESTS="$GCRYPT_DIGESTS md5.lo"
   AC_DEFINE(USE_MD5, 1, [Defined if this module should be included])
fi

LIST_MEMBER(rmd160, $enabled_digests)
if test "$found" = "1" ; then
   GCRYPT_DIGESTS="$GCRYPT_DIGESTS rmd160.lo"
   AC_DEFINE(USE_RMD160, 1, [Defined if this module should be included])
fi

LIST_MEMBER(sha256, $enabled_digests)
if test "$found" = "1" ; then
   GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha256.lo"
   AC_DEFINE(USE_SHA256, 1, [Defined if this module should be included])

   case "${host}" in
      x86_64-*-*)
         # Build with the assembly implementation
         GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha256-ssse3-amd64.lo"
         GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha256-avx-amd64.lo"
         GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha256-avx2-bmi2-amd64.lo"
      ;;
      arm*-*-*)
         # Build with the assembly implementation
         GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha256-armv8-aarch32-ce.lo"
      ;;
      aarch64-*-*)
         # Build with the assembly implementation
         GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha256-armv8-aarch64-ce.lo"
      ;;
   esac
fi

LIST_MEMBER(sha512, $enabled_digests)
if test "$found" = "1" ; then
   GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha512.lo"
   AC_DEFINE(USE_SHA512, 1, [Defined if this module should be included])

   case "${host}" in
      x86_64-*-*)
         # Build with the assembly implementation
         GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha512-ssse3-amd64.lo"
         GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha512-avx-amd64.lo"
         GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha512-avx2-bmi2-amd64.lo"
      ;;
      arm*-*-*)
         # Build with the assembly implementation
         GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha512-arm.lo"
      ;;
   esac

   if test x"$neonsupport" = xyes ; then
     # Build with the NEON implementation
     GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha512-armv7-neon.lo"
   fi
fi
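
# As with the ciphers, the digest asm objects are selected purely by
# ${host}; each .S file additionally guards its body with the HAVE_*
# results from the probes earlier in this file, so a failed probe merely
# yields an empty object.  Illustrative guard (not a literal quote):
#
#   #ifdef __x86_64__
#   #include <config.h>
#   #ifdef HAVE_GCC_INLINE_ASM_SSSE3
#   /* ... implementation ... */
#   #endif
#   #endif
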
LIST_MEMBER(sha3, $enabled_digests)
if test "$found" = "1" ; then
   GCRYPT_DIGESTS="$GCRYPT_DIGESTS keccak.lo"
   AC_DEFINE(USE_SHA3, 1, [Defined if this module should be included])

   case "${host}" in
      x86_64-*-*)
         # Build with the assembly implementation
         :
      ;;
   esac

   if test x"$neonsupport" = xyes ; then
     # Build with the NEON implementation
     GCRYPT_DIGESTS="$GCRYPT_DIGESTS keccak-armv7-neon.lo"
   fi
fi

LIST_MEMBER(tiger, $enabled_digests)
if test "$found" = "1" ; then
   GCRYPT_DIGESTS="$GCRYPT_DIGESTS tiger.lo"
   AC_DEFINE(USE_TIGER, 1, [Defined if this module should be included])
fi

LIST_MEMBER(whirlpool, $enabled_digests)
if test "$found" = "1" ; then
   GCRYPT_DIGESTS="$GCRYPT_DIGESTS whirlpool.lo"
   AC_DEFINE(USE_WHIRLPOOL, 1, [Defined if this module should be included])

   case "${host}" in
      x86_64-*-*)
         # Build with the assembly implementation
         GCRYPT_DIGESTS="$GCRYPT_DIGESTS whirlpool-sse2-amd64.lo"
      ;;
   esac
fi

LIST_MEMBER(blake2, $enabled_digests)
if test "$found" = "1" ; then
   GCRYPT_DIGESTS="$GCRYPT_DIGESTS blake2.lo"
   AC_DEFINE(USE_BLAKE2, 1, [Defined if this module should be included])
fi

# SHA-1 always needs to be included, for example because it is used by
# random-csprng.c.
GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha1.lo"
AC_DEFINE(USE_SHA1, 1, [Defined if this module should be included])

case "${host}" in
   x86_64-*-*)
      # Build with the assembly implementation
      GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha1-ssse3-amd64.lo"
      GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha1-avx-amd64.lo"
      GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha1-avx-bmi2-amd64.lo"
   ;;
   arm*-*-*)
      # Build with the assembly implementation
      GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha1-armv7-neon.lo"
      GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha1-armv8-aarch32-ce.lo"
   ;;
   aarch64-*-*)
      # Build with the assembly implementation
      GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha1-armv8-aarch64-ce.lo"
   ;;
esac

LIST_MEMBER(sm3, $enabled_digests)
if test "$found" = "1" ; then
   GCRYPT_DIGESTS="$GCRYPT_DIGESTS sm3.lo"
   AC_DEFINE(USE_SM3, 1, [Defined if this module should be included])
fi

LIST_MEMBER(scrypt, $enabled_kdfs)
if test "$found" = "1" ; then
   GCRYPT_KDFS="$GCRYPT_KDFS scrypt.lo"
   AC_DEFINE(USE_SCRYPT, 1, [Defined if this module should be included])
fi

LIST_MEMBER(linux, $random_modules)
if test "$found" = "1" ; then
   GCRYPT_RANDOM="$GCRYPT_RANDOM rndlinux.lo"
   AC_DEFINE(USE_RNDLINUX, 1, [Defined if the /dev/random RNG should be used.])
fi

LIST_MEMBER(unix, $random_modules)
if test "$found" = "1" ; then
   GCRYPT_RANDOM="$GCRYPT_RANDOM rndunix.lo"
   AC_DEFINE(USE_RNDUNIX, 1, [Defined if the default Unix RNG should be used.])
fi

LIST_MEMBER(egd, $random_modules)
if test "$found" = "1" ; then
   GCRYPT_RANDOM="$GCRYPT_RANDOM rndegd.lo"
   AC_DEFINE(USE_RNDEGD, 1, [Defined if the EGD based RNG should be used.])
fi

LIST_MEMBER(w32, $random_modules)
if test "$found" = "1" ; then
   GCRYPT_RANDOM="$GCRYPT_RANDOM rndw32.lo"
   AC_DEFINE(USE_RNDW32, 1,
             [Defined if the Windows specific RNG should be used.])
fi

LIST_MEMBER(w32ce, $random_modules)
if test "$found" = "1" ; then
   GCRYPT_RANDOM="$GCRYPT_RANDOM rndw32ce.lo"
   AC_DEFINE(USE_RNDW32CE, 1,
             [Defined if the WindowsCE specific RNG should be used.])
fi

AC_SUBST([GCRYPT_CIPHERS])
AC_SUBST([GCRYPT_PUBKEY_CIPHERS])
AC_SUBST([GCRYPT_DIGESTS])
AC_SUBST([GCRYPT_KDFS])
AC_SUBST([GCRYPT_RANDOM])

AC_SUBST(LIBGCRYPT_CIPHERS, $enabled_ciphers)
AC_SUBST(LIBGCRYPT_PUBKEY_CIPHERS, $enabled_pubkey_ciphers)
AC_SUBST(LIBGCRYPT_DIGESTS, $enabled_digests)
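
# The GCRYPT_* lists collected above surface in cipher/Makefile.am through
# AC_SUBST, where the @GCRYPT_CIPHERS@, @GCRYPT_DIGESTS@ etc. substitutions
# name the objects linked into libcipher.la; this is why a new assembler
# module has to be listed both in the Makefile.am sources and in the
# matching GCRYPT_* assignment here.
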

# For printing the configuration we need a colon separated list of
# algorithm names.
tmp=`echo "$enabled_ciphers" | tr ' ' : `
AC_DEFINE_UNQUOTED(LIBGCRYPT_CIPHERS, "$tmp",
                   [List of available cipher algorithms])
tmp=`echo "$enabled_pubkey_ciphers" | tr ' ' : `
AC_DEFINE_UNQUOTED(LIBGCRYPT_PUBKEY_CIPHERS, "$tmp",
                   [List of available public key cipher algorithms])
tmp=`echo "$enabled_digests" | tr ' ' : `
AC_DEFINE_UNQUOTED(LIBGCRYPT_DIGESTS, "$tmp",
                   [List of available digest algorithms])
tmp=`echo "$enabled_kdfs" | tr ' ' : `
AC_DEFINE_UNQUOTED(LIBGCRYPT_KDFS, "$tmp",
                   [List of available KDF algorithms])


#
# Define conditional sources depending on the used hardware platform.
# Note that all possible modules must also be listed in
# src/Makefile.am (EXTRA_libgcrypt_la_SOURCES).
#
GCRYPT_HWF_MODULES=
case "$mpi_cpu_arch" in
     x86)
        AC_DEFINE(HAVE_CPU_ARCH_X86, 1, [Defined for the x86 platforms])
        GCRYPT_HWF_MODULES="hwf-x86.lo"
        ;;
     alpha)
        AC_DEFINE(HAVE_CPU_ARCH_ALPHA, 1, [Defined for Alpha platforms])
        ;;
     sparc)
        AC_DEFINE(HAVE_CPU_ARCH_SPARC, 1, [Defined for SPARC platforms])
        ;;
     mips)
        AC_DEFINE(HAVE_CPU_ARCH_MIPS, 1, [Defined for MIPS platforms])
        ;;
     m68k)
        AC_DEFINE(HAVE_CPU_ARCH_M68K, 1, [Defined for M68k platforms])
        ;;
     ppc)
        AC_DEFINE(HAVE_CPU_ARCH_PPC, 1, [Defined for PPC platforms])
        ;;
     arm)
        AC_DEFINE(HAVE_CPU_ARCH_ARM, 1, [Defined for ARM platforms])
        GCRYPT_HWF_MODULES="hwf-arm.lo"
        ;;
     aarch64)
        AC_DEFINE(HAVE_CPU_ARCH_ARM, 1, [Defined for ARM AArch64 platforms])
        GCRYPT_HWF_MODULES="hwf-arm.lo"
        ;;
esac
AC_SUBST([GCRYPT_HWF_MODULES])


#
# Option to disable building of doc file
#
build_doc=yes
AC_ARG_ENABLE([doc], AC_HELP_STRING([--disable-doc],
                                    [do not build the documentation]),
                     build_doc=$enableval, build_doc=yes)
AM_CONDITIONAL([BUILD_DOC], [test "x$build_doc" != xno])


#
# Provide information about the build.
#
BUILD_REVISION="mym4_revision"
AC_SUBST(BUILD_REVISION)
AC_DEFINE_UNQUOTED(BUILD_REVISION, "$BUILD_REVISION",
                   [GIT commit id revision used to build this package])

changequote(,)dnl
BUILD_FILEVERSION=`echo "$VERSION" | sed 's/\([0-9.]*\).*/\1./;s/\./,/g'`
changequote([,])dnl
BUILD_FILEVERSION="${BUILD_FILEVERSION}mym4_revision_dec"
AC_SUBST(BUILD_FILEVERSION)

AC_ARG_ENABLE([build-timestamp],
  AC_HELP_STRING([--enable-build-timestamp],
                 [set an explicit build timestamp for reproducibility.
                  (default is the current time in ISO-8601 format)]),
     [if test "$enableval" = "yes"; then
        BUILD_TIMESTAMP=`date -u +%Y-%m-%dT%H:%M+0000 2>/dev/null || date`
      else
        BUILD_TIMESTAMP="$enableval"
      fi],
     [BUILD_TIMESTAMP=""])
AC_SUBST(BUILD_TIMESTAMP)
AC_DEFINE_UNQUOTED(BUILD_TIMESTAMP, "$BUILD_TIMESTAMP",
                   [The time this package was configured for a build])
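
# For reference, the sed expression above maps a dotted version to the
# comma-separated form a Windows resource file expects, e.g.:
#
#   $ echo "1.8.1" | sed 's/\([0-9.]*\).*/\1./;s/\./,/g'
#   1,8,1,
#
# after which the decimal revision is appended, giving something like
# "1,8,1,4242" (the revision number here is made up for illustration).
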

# And create the files.
AC_CONFIG_FILES([
Makefile
m4/Makefile
compat/Makefile
mpi/Makefile
cipher/Makefile
random/Makefile
doc/Makefile
src/Makefile
src/gcrypt.h
src/libgcrypt-config
src/versioninfo.rc
tests/Makefile
])
AC_CONFIG_FILES([tests/hashtest-256g], [chmod +x tests/hashtest-256g])
AC_CONFIG_FILES([tests/basic-disable-all-hwf], [chmod +x tests/basic-disable-all-hwf])
AC_OUTPUT


detection_module="${GCRYPT_HWF_MODULES%.lo}"
test -n "$detection_module" || detection_module="none"

# Give some feedback
GCRY_MSG_SHOW([],[])
GCRY_MSG_SHOW([Libgcrypt],[v${VERSION} has been configured as follows:])
GCRY_MSG_SHOW([],[])
GCRY_MSG_SHOW([Platform:                 ],[$PRINTABLE_OS_NAME ($host)])
GCRY_MSG_SHOW([Hardware detection module:],[$detection_module])
GCRY_MSG_WRAP([Enabled cipher algorithms:],[$enabled_ciphers])
GCRY_MSG_WRAP([Enabled digest algorithms:],[$enabled_digests])
GCRY_MSG_WRAP([Enabled kdf algorithms:   ],[$enabled_kdfs])
GCRY_MSG_WRAP([Enabled pubkey algorithms:],[$enabled_pubkey_ciphers])
GCRY_MSG_SHOW([Random number generator:  ],[$random])
GCRY_MSG_SHOW([Try using jitter entropy: ],[$jentsupport])
GCRY_MSG_SHOW([Using linux capabilities: ],[$use_capabilities])
GCRY_MSG_SHOW([Try using Padlock crypto: ],[$padlocksupport])
GCRY_MSG_SHOW([Try using AES-NI crypto:  ],[$aesnisupport])
GCRY_MSG_SHOW([Try using Intel PCLMUL:   ],[$pclmulsupport])
GCRY_MSG_SHOW([Try using Intel SSE4.1:   ],[$sse41support])
GCRY_MSG_SHOW([Try using DRNG (RDRAND):  ],[$drngsupport])
GCRY_MSG_SHOW([Try using Intel AVX:      ],[$avxsupport])
GCRY_MSG_SHOW([Try using Intel AVX2:     ],[$avx2support])
GCRY_MSG_SHOW([Try using ARM NEON:       ],[$neonsupport])
GCRY_MSG_SHOW([Try using ARMv8 crypto:   ],[$armcryptosupport])
GCRY_MSG_SHOW([],[])

if test "x${gpg_config_script_warn}" != x; then
cat <