diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index 98320ca5..16066bfc 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -1,147 +1,148 @@
 # Makefile for cipher modules
 # Copyright (C) 1998, 1999, 2000, 2001, 2002,
 #               2003, 2009 Free Software Foundation, Inc.
 #
 # This file is part of Libgcrypt.
 #
 # Libgcrypt is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as
 # published by the Free Software Foundation; either version 2.1 of
 # the License, or (at your option) any later version.
 #
 # Libgcrypt is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 # GNU Lesser General Public License for more details.
 #
 # You should have received a copy of the GNU Lesser General Public
 # License along with this program; if not, see <http://www.gnu.org/licenses/>.
 
 # Process this file with automake to produce Makefile.in
 
 # Need to include ../src in addition to top_srcdir because gcrypt.h is
 # a built header.
 AM_CPPFLAGS = -I../src -I$(top_srcdir)/src -I../mpi -I$(top_srcdir)/mpi
 AM_CFLAGS = $(GPG_ERROR_CFLAGS)
 
 AM_CCASFLAGS = $(NOEXECSTACK_FLAGS)
 
 EXTRA_DIST = gost-s-box.c
 
 CLEANFILES = gost-s-box
 DISTCLEANFILES = gost-sb.h
 
 noinst_LTLIBRARIES = libcipher.la
 
 GCRYPT_MODULES = @GCRYPT_CIPHERS@ @GCRYPT_PUBKEY_CIPHERS@ \
                  @GCRYPT_DIGESTS@ @GCRYPT_KDFS@
 
 libcipher_la_DEPENDENCIES = $(GCRYPT_MODULES)
 libcipher_la_LIBADD = $(GCRYPT_MODULES)
 
 libcipher_la_SOURCES = \
 	cipher.c cipher-internal.h \
 	cipher-cbc.c \
 	cipher-cfb.c \
 	cipher-ofb.c \
 	cipher-ctr.c \
 	cipher-aeswrap.c \
 	cipher-ccm.c \
 	cipher-cmac.c \
 	cipher-gcm.c cipher-gcm-intel-pclmul.c \
 	cipher-gcm-armv8-aarch32-ce.S cipher-gcm-armv8-aarch64-ce.S \
 	cipher-poly1305.c \
 	cipher-ocb.c \
 	cipher-xts.c \
 	cipher-eax.c \
 	cipher-selftest.c cipher-selftest.h \
 	pubkey.c pubkey-internal.h pubkey-util.c \
 	md.c \
 	mac.c mac-internal.h \
 	mac-hmac.c mac-cmac.c mac-gmac.c mac-poly1305.c \
 	poly1305.c poly1305-internal.h \
 	kdf.c kdf-internal.h \
 	hmac-tests.c \
 	bithelp.h  \
 	bufhelp.h  \
 	primegen.c  \
 	hash-common.c hash-common.h \
 	dsa-common.c rsa-common.c \
 	sha1.h
 
 EXTRA_libcipher_la_SOURCES = \
 	asm-common-amd64.h \
 	asm-common-aarch64.h \
+	asm-poly1305-amd64.h \
 	arcfour.c arcfour-amd64.S \
 	blowfish.c blowfish-amd64.S blowfish-arm.S \
 	cast5.c cast5-amd64.S cast5-arm.S \
 	chacha20.c chacha20-amd64-ssse3.S chacha20-amd64-avx2.S \
 	chacha20-armv7-neon.S chacha20-aarch64.S \
 	crc.c crc-intel-pclmul.c \
 	des.c des-amd64.S \
 	dsa.c \
 	elgamal.c \
 	ecc.c ecc-curves.c ecc-misc.c ecc-common.h \
 	ecc-ecdsa.c ecc-eddsa.c ecc-gost.c \
 	idea.c \
 	gost28147.c gost.h \
 	gostr3411-94.c \
 	md4.c \
 	md5.c \
 	rijndael.c rijndael-internal.h rijndael-tables.h   \
 	rijndael-aesni.c rijndael-padlock.c                \
 	rijndael-amd64.S rijndael-arm.S                    \
 	rijndael-ssse3-amd64.c rijndael-ssse3-amd64-asm.S  \
 	rijndael-armv8-ce.c rijndael-armv8-aarch32-ce.S    \
 	rijndael-armv8-aarch64-ce.S rijndael-aarch64.S     \
 	rmd160.c \
 	rsa.c \
 	salsa20.c salsa20-amd64.S salsa20-armv7-neon.S \
 	scrypt.c \
 	seed.c \
 	serpent.c serpent-sse2-amd64.S \
 	serpent-avx2-amd64.S serpent-armv7-neon.S \
 	sha1.c sha1-ssse3-amd64.S sha1-avx-amd64.S sha1-avx-bmi2-amd64.S \
 	sha1-armv7-neon.S sha1-armv8-aarch32-ce.S sha1-armv8-aarch64-ce.S \
 	sha1-intel-shaext.c \
 	sha256.c sha256-ssse3-amd64.S sha256-avx-amd64.S \
 	sha256-avx2-bmi2-amd64.S \
 	sha256-armv8-aarch32-ce.S sha256-armv8-aarch64-ce.S \
 	sha256-intel-shaext.c \
 	sha512.c sha512-ssse3-amd64.S sha512-avx-amd64.S \
 	sha512-avx2-bmi2-amd64.S \
 	sha512-armv7-neon.S sha512-arm.S \
 	sm3.c \
 	keccak.c keccak_permute_32.h keccak_permute_64.h keccak-armv7-neon.S \
 	stribog.c \
 	tiger.c \
 	whirlpool.c whirlpool-sse2-amd64.S \
 	twofish.c twofish-amd64.S twofish-arm.S twofish-aarch64.S \
 	twofish-avx2-amd64.S \
 	rfc2268.c \
 	camellia.c camellia.h camellia-glue.c camellia-aesni-avx-amd64.S \
 	camellia-aesni-avx2-amd64.S camellia-arm.S camellia-aarch64.S \
 	blake2.c \
 	blake2b-amd64-avx2.S blake2s-amd64-avx.S
 
 gost28147.lo: gost-sb.h
 gost-sb.h: gost-s-box
 	./gost-s-box $@
 
 gost-s-box: gost-s-box.c
 	$(CC_FOR_BUILD) -o $@ $(srcdir)/gost-s-box.c
 
 
 if ENABLE_O_FLAG_MUNGING
 o_flag_munging = sed -e 's/-O\([2-9s][2-9s]*\)/-O1/' -e 's/-Ofast/-O1/g'
 else
 o_flag_munging = cat
 endif
 
 
 # We need to lower the optimization for this module.
 tiger.o: $(srcdir)/tiger.c
 	`echo $(COMPILE) -c $(srcdir)/tiger.c | $(o_flag_munging) `
 
 tiger.lo: $(srcdir)/tiger.c
 	`echo $(LTCOMPILE) -c $(srcdir)/tiger.c | $(o_flag_munging) `
diff --git a/cipher/asm-poly1305-amd64.h b/cipher/asm-poly1305-amd64.h
new file mode 100644
index 00000000..3f99ea3e
--- /dev/null
+++ b/cipher/asm-poly1305-amd64.h
@@ -0,0 +1,171 @@
+/* asm-poly1305-amd64.h  -  Poly1305 macros for AMD64 assembly
+ *
+ * Copyright (C) 2019 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef GCRY_ASM_POLY1305_AMD64_H
+#define GCRY_ASM_POLY1305_AMD64_H
+
+#include "asm-common-amd64.h"
+
+/**********************************************************************
+  poly1305 for stitched chacha20-poly1305 AMD64 implementations
+ **********************************************************************/
+
+#define POLY_RSTATE    %r8 /* base register of the POLY_S_* state operands */
+#define POLY_RSRC      %r9 /* base register for the message loads in PART1 */
+/* Working registers used by the POLY1305_* macros below. */
+#define POLY_R_H0      %rbx /* accumulator h, bits 0..63 */
+#define POLY_R_H1      %rcx /* accumulator h, bits 64..127; same register as NBLKS in the chacha20 sources */
+#define POLY_R_H2      %r10 /* accumulator h, bits 128 and up */
+#define POLY_R_H2d     %r10d /* 32-bit view of POLY_R_H2 */
+#define POLY_R_R0      %r11 /* key limb r0 */
+#define POLY_R_R1_MUL5 %r12 /* r1 + (r1 >> 2), computed by POLY1305_LOAD_STATE */
+#define POLY_R_X0_HI   %r13 /* X0: 128-bit partial sum, high half */
+#define POLY_R_X0_LO   %r14 /* X0: low half */
+#define POLY_R_X1_HI   %r15 /* X1: 128-bit partial sum, high half */
+#define POLY_R_X1_LO   %rsi /* X1: low half; same register as DST in the chacha20 sources */
+/* State fields addressed off POLY_RSTATE; layout presumably mirrors the C-side state -- TODO confirm. */
+#define POLY_S_R0      (4 * 4 + 0 * 8)(POLY_RSTATE) /* r0, byte offset 16 */
+#define POLY_S_R1      (4 * 4 + 1 * 8)(POLY_RSTATE) /* r1, byte offset 24 */
+#define POLY_S_H0      (4 * 4 + 2 * 8 + 0 * 8)(POLY_RSTATE) /* h0, byte offset 32 */
+#define POLY_S_H1      (4 * 4 + 2 * 8 + 1 * 8)(POLY_RSTATE) /* h1, byte offset 40 */
+#define POLY_S_H2d     (4 * 4 + 2 * 8 + 2 * 8)(POLY_RSTATE) /* h2 (32-bit access), byte offset 48 */
+
+#define POLY1305_LOAD_STATE() /* load h and r0 from memory, derive R1_MUL5 */ \
+	movq POLY_S_H0, POLY_R_H0; \
+	movq POLY_S_H1, POLY_R_H1; \
+	movl POLY_S_H2d, POLY_R_H2d; /* 32-bit load, upper half of POLY_R_H2 becomes zero */ \
+	movq POLY_S_R0, POLY_R_R0; \
+	movq POLY_S_R1, POLY_R_R1_MUL5; \
+	shrq $2, POLY_R_R1_MUL5; \
+	addq POLY_S_R1, POLY_R_R1_MUL5; /* r1 + (r1 >> 2); equals 5 * r1 / 4 when r1 is a multiple of 4 (assumed, TODO confirm) */
+/* Write the accumulator back; r0/r1 in memory are only read by these macros. */
+#define POLY1305_STORE_STATE() \
+	movq POLY_R_H0, POLY_S_H0; \
+	movq POLY_R_H1, POLY_S_H1; \
+	movl POLY_R_H2d, POLY_S_H2d;
+
+/* One block step in five parts (callers may interleave other code). PART1: a = h + m + 2^128 */
+#define POLY1305_BLOCK_PART1(src_offset) \
+	addq ((src_offset) + 0 * 8)(POLY_RSRC), POLY_R_H0; \
+	adcq ((src_offset) + 1 * 8)(POLY_RSRC), POLY_R_H1; \
+	adcl $1, POLY_R_H2d; /* carry plus 1, i.e. the 2^128 term */ \
+	\
+	/* h = a * r (partial mod 2^130-5): */ \
+	\
+	/* X1 = h0 * r1 (128-bit product) */ \
+	movq POLY_R_H0, %rax; \
+	mulq POLY_S_R1; \
+	movq %rax, POLY_R_X1_LO; \
+	movq %rdx, POLY_R_X1_HI;
+
+#define POLY1305_BLOCK_PART2() \
+	\
+	/* X0 = h0 * r0 (128-bit product) */ \
+	movq POLY_R_H0, %rax; \
+	mulq POLY_R_R0; \
+	movq %rax, POLY_R_X0_LO; \
+	movq %rdx, POLY_R_X0_HI;
+
+#define POLY1305_BLOCK_PART3() \
+	\
+	/* X1 += h1 * r0 */ \
+	movq POLY_R_H1, %rax; \
+	mulq POLY_R_R0; \
+	addq %rax, POLY_R_X1_LO; \
+	adcq %rdx, POLY_R_X1_HI; \
+	\
+	/* h1 * r1 mod 2^130-5; product is left in %rdx:%rax for PART4 */ \
+	movq POLY_R_R1_MUL5, %rax; \
+	mulq POLY_R_H1;
+
+#define POLY1305_BLOCK_PART4() \
+	movq POLY_R_H2, POLY_R_H1; /* h1 was consumed by PART3; reuse its register */ \
+	imulq POLY_R_R1_MUL5, POLY_R_H1; /* h2 * r1 mod 2^130-5 */ \
+	addq %rax, POLY_R_X0_LO; /* X0 += %rdx:%rax from PART3 */ \
+	adcq %rdx, POLY_R_X0_HI; \
+	imulq POLY_R_R0, POLY_R_H2;      /* h2 * r0 */ \
+	addq POLY_R_X1_LO, POLY_R_H1; /* h1:h2 += X1 */ \
+	adcq POLY_R_X1_HI, POLY_R_H2;
+
+#define POLY1305_BLOCK_PART5() \
+	\
+	/* carry propagation: h0 = 5 * (h2 >> 2), h2 &= 3, then h += X0 */ \
+	movq POLY_R_H2, POLY_R_H0; \
+	andl $3, POLY_R_H2d; \
+	shrq $2, POLY_R_H0; \
+	leaq (POLY_R_H0, POLY_R_H0, 4), POLY_R_H0; \
+	addq POLY_R_X0_LO, POLY_R_H0; \
+	adcq POLY_R_X0_HI, POLY_R_H1; \
+	adcl $0, POLY_R_H2d;
+
+#ifdef TESTING_POLY1305_ASM
+/* for testing only, mixed C/asm poly1305.c is marginally faster (~2%). */
+.align 8
+.globl _gcry_poly1305_amd64_ssse3_blocks1
+ELF(.type _gcry_poly1305_amd64_ssse3_blocks1,@function;)
+
+_gcry_poly1305_amd64_ssse3_blocks1:
+	/* input:
+	 *	%rdi: poly1305-state
+	 *	%rsi: src
+	 *	%rdx: nblks (must be non-zero: the loop tests after decrementing)
+	 */
+	pushq %rbp;
+	movq %rsp, %rbp;
+
+	subq $(10 * 8), %rsp;
+	movq %rbx, (1 * 8)(%rsp);
+	movq %r12, (2 * 8)(%rsp);
+	movq %r13, (3 * 8)(%rsp);
+	movq %r14, (4 * 8)(%rsp);
+	movq %r15, (5 * 8)(%rsp);
+	/* mulq in the block macros overwrites %rdx and X1_LO lives in %rsi. */
+	movq %rdx, (8 * 8)(%rsp); # NBLKS
+	movq %rdi, POLY_RSTATE;
+	movq %rsi, POLY_RSRC;
+
+	POLY1305_LOAD_STATE();
+
+.L_poly1:
+	POLY1305_BLOCK_PART1(0 * 16);
+	POLY1305_BLOCK_PART2();
+	POLY1305_BLOCK_PART3();
+	POLY1305_BLOCK_PART4();
+	POLY1305_BLOCK_PART5();
+
+	subq $1, (8 * 8)(%rsp); # NBLKS
+	leaq (16)(POLY_RSRC), POLY_RSRC;
+	jnz .L_poly1; /* flags still from subq: leaq does not modify them */
+
+	POLY1305_STORE_STATE();
+
+	movq (1 * 8)(%rsp), %rbx;
+	movq (2 * 8)(%rsp), %r12;
+	movq (3 * 8)(%rsp), %r13;
+	movq (4 * 8)(%rsp), %r14;
+	movq (5 * 8)(%rsp), %r15;
+
+	xorl %eax, %eax;
+	leave;
+	ret;
+ELF(.size _gcry_poly1305_amd64_ssse3_blocks1,.-_gcry_poly1305_amd64_ssse3_blocks1;)
+#endif
+
+#endif /* GCRY_ASM_POLY1305_AMD64_H */
diff --git a/cipher/chacha20-amd64-avx2.S b/cipher/chacha20-amd64-avx2.S
index dad9e3e9..ef02c173 100644
--- a/cipher/chacha20-amd64-avx2.S
+++ b/cipher/chacha20-amd64-avx2.S
@@ -1,323 +1,752 @@
 /* chacha20-amd64-avx2.S  -  AVX2 implementation of ChaCha20 cipher
  *
-
- * Copyright (C) 2017,2018 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ * Copyright (C) 2017-2019 Jussi Kivilinna <jussi.kivilinna@iki.fi>
  *
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
  * Libgcrypt is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
  * License along with this program; if not, see <http://www.gnu.org/licenses/>.
  */
 
 /*
  * Based on D. J. Bernstein reference implementation at
  * http://cr.yp.to/chacha.html:
  *
  * chacha-regs.c version 20080118
  * D. J. Bernstein
  * Public domain.
  */
 
 #ifdef __x86_64
 #include <config.h>
 #if defined(HAVE_GCC_INLINE_ASM_AVX2) && \
    (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
 
 .text
 
-#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS
-# define ELF(...) __VA_ARGS__
-#else
-# define ELF(...) /*_*/
-#endif
-
-#ifdef __PIC__
-#  define RIP (%rip)
-#else
-#  define RIP
-#endif
+#include "asm-common-amd64.h"
+#include "asm-poly1305-amd64.h"
 
 /* register macros */
 #define INPUT %rdi
 #define DST   %rsi
 #define SRC   %rdx
 #define NBLKS %rcx
 #define ROUND %eax
 
 /* stack structure */
 #define STACK_VEC_X12 (32)
 #define STACK_VEC_X13 (32 + STACK_VEC_X12)
 #define STACK_TMP     (32 + STACK_VEC_X13)
 #define STACK_TMP1    (32 + STACK_TMP)
 #define STACK_TMP2    (32 + STACK_TMP1)
 
 #define STACK_MAX     (32 + STACK_TMP2)
 
 /* vector registers */
 #define X0 %ymm0
 #define X1 %ymm1
 #define X2 %ymm2
 #define X3 %ymm3
 #define X4 %ymm4
 #define X5 %ymm5
 #define X6 %ymm6
 #define X7 %ymm7
 #define X8 %ymm8
 #define X9 %ymm9
 #define X10 %ymm10
 #define X11 %ymm11
 #define X12 %ymm12
 #define X13 %ymm13
 #define X14 %ymm14
 #define X15 %ymm15
 
 #define X0h %xmm0
 #define X1h %xmm1
 #define X2h %xmm2
 #define X3h %xmm3
 #define X4h %xmm4
 #define X5h %xmm5
 #define X6h %xmm6
 #define X7h %xmm7
 #define X8h %xmm8
 #define X9h %xmm9
 #define X10h %xmm10
 #define X11h %xmm11
 #define X12h %xmm12
 #define X13h %xmm13
 #define X14h %xmm14
 #define X15h %xmm15
 
 /**********************************************************************
   helper macros
  **********************************************************************/
 
 /* 4x4 32-bit integer matrix transpose */
 #define transpose_4x4(x0,x1,x2,x3,t1,t2) \
 	vpunpckhdq x1, x0, t2; \
 	vpunpckldq x1, x0, x0; \
 	\
 	vpunpckldq x3, x2, t1; \
 	vpunpckhdq x3, x2, x2; \
 	\
 	vpunpckhqdq t1,	x0, x1; \
 	vpunpcklqdq t1,	x0, x0; \
 	\
 	vpunpckhqdq x2, t2, x3; \
 	vpunpcklqdq x2,	t2, x2;
 
 /**********************************************************************
   8-way chacha20
  **********************************************************************/
 
 #define ROTATE2(v1,v2,c,tmp)	\
 	vpsrld $(32 - (c)), v1, tmp;	\
 	vpslld $(c), v1, v1;		\
 	vpaddb tmp, v1, v1;		\
 	vpsrld $(32 - (c)), v2, tmp;	\
 	vpslld $(c), v2, v2;		\
 	vpaddb tmp, v2, v2;
 
 #define ROTATE_SHUF_2(v1,v2,shuf)	\
 	vpshufb shuf, v1, v1;		\
 	vpshufb shuf, v2, v2;
 
 #define XOR(ds,s) \
 	vpxor s, ds, ds;
 
 #define PLUS(ds,s) \
 	vpaddd s, ds, ds;
 
-#define QUARTERROUND2(a1,b1,c1,d1,a2,b2,c2,d2,ign,tmp1)		\
-	vbroadcasti128 .Lshuf_rol16 RIP, tmp1;			\
+#define QUARTERROUND2(a1,b1,c1,d1,a2,b2,c2,d2,ign,tmp1,\
+		      interleave_op1,interleave_op2,\
+		      interleave_op3,interleave_op4)		\
+	vbroadcasti128 .Lshuf_rol16 rRIP, tmp1;			\
+		interleave_op1;					\
 	PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2);	\
 	    ROTATE_SHUF_2(d1, d2, tmp1);			\
+		interleave_op2;					\
 	PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2);	\
 	    ROTATE2(b1, b2, 12, tmp1);				\
-	vbroadcasti128 .Lshuf_rol8 RIP, tmp1;			\
+	vbroadcasti128 .Lshuf_rol8 rRIP, tmp1;			\
+		interleave_op3;					\
 	PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2);	\
 	    ROTATE_SHUF_2(d1, d2, tmp1);			\
+		interleave_op4;					\
 	PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2);	\
 	    ROTATE2(b1, b2,  7, tmp1);
 
 #define BUF_XOR_256_TO_128(dst, src, offset_lo, offset_hi, yreg, tmp1)	\
 	vextracti128 $1, yreg, tmp1##h;					\
 	vpxor offset_lo(src), yreg##h, yreg##h;				\
 	vpxor offset_hi(src), tmp1##h, tmp1##h;				\
 	vmovdqu yreg##h, offset_lo(dst);				\
 	vmovdqu tmp1##h, offset_hi(dst);
 
 .align 32
 chacha20_data:
 .Lshuf_rol16:
 	.byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13
 .Lshuf_rol8:
 	.byte 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14
 .Linc_counter:
 	.byte 0,1,2,3,4,5,6,7
 .Lunsigned_cmp:
 	.long 0x80000000
 
 .align 8
 .globl _gcry_chacha20_amd64_avx2_blocks8
 ELF(.type _gcry_chacha20_amd64_avx2_blocks8,@function;)
 
 _gcry_chacha20_amd64_avx2_blocks8:
 	/* input:
 	 *	%rdi: input
 	 *	%rsi: dst
 	 *	%rdx: src
 	 *	%rcx: nblks (multiple of 8)
 	 */
 
 	vzeroupper;
 
 	pushq %rbp;
 	movq %rsp, %rbp;
 
 	subq $STACK_MAX, %rsp;
 	andq $~31, %rsp;
 
-.Loop4:
+.Loop8:
 	mov $20, ROUND;
 
 	/* Construct counter vectors X12 and X13 */
-	vpmovzxbd .Linc_counter RIP, X0;
-	vpbroadcastd .Lunsigned_cmp RIP, X2;
+	vpmovzxbd .Linc_counter rRIP, X0;
+	vpbroadcastd .Lunsigned_cmp rRIP, X2;
 	vpbroadcastd (12 * 4)(INPUT), X12;
 	vpbroadcastd (13 * 4)(INPUT), X13;
 	vpaddd X0, X12, X12;
 	vpxor X2, X0, X0;
 	vpxor X2, X12, X1;
 	vpcmpgtd X1, X0, X0;
 	vpsubd X0, X13, X13;
 	vmovdqa X12, (STACK_VEC_X12)(%rsp);
 	vmovdqa X13, (STACK_VEC_X13)(%rsp);
 
 	/* Load vectors */
 	vpbroadcastd (0 * 4)(INPUT), X0;
 	vpbroadcastd (1 * 4)(INPUT), X1;
 	vpbroadcastd (2 * 4)(INPUT), X2;
 	vpbroadcastd (3 * 4)(INPUT), X3;
 	vpbroadcastd (4 * 4)(INPUT), X4;
 	vpbroadcastd (5 * 4)(INPUT), X5;
 	vpbroadcastd (6 * 4)(INPUT), X6;
 	vpbroadcastd (7 * 4)(INPUT), X7;
 	vpbroadcastd (8 * 4)(INPUT), X8;
 	vpbroadcastd (9 * 4)(INPUT), X9;
 	vpbroadcastd (10 * 4)(INPUT), X10;
 	vpbroadcastd (11 * 4)(INPUT), X11;
 	vpbroadcastd (14 * 4)(INPUT), X14;
 	vpbroadcastd (15 * 4)(INPUT), X15;
 	vmovdqa X15, (STACK_TMP)(%rsp);
 
 .Lround2:
-	QUARTERROUND2(X0, X4,  X8, X12,   X1, X5,  X9, X13, tmp:=,X15)
+	QUARTERROUND2(X0, X4,  X8, X12,   X1, X5,  X9, X13, tmp:=,X15,,,,)
 	vmovdqa (STACK_TMP)(%rsp), X15;
 	vmovdqa X8, (STACK_TMP)(%rsp);
-	QUARTERROUND2(X2, X6, X10, X14,   X3, X7, X11, X15, tmp:=,X8)
-	QUARTERROUND2(X0, X5, X10, X15,   X1, X6, X11, X12, tmp:=,X8)
+	QUARTERROUND2(X2, X6, X10, X14,   X3, X7, X11, X15, tmp:=,X8,,,,)
+	QUARTERROUND2(X0, X5, X10, X15,   X1, X6, X11, X12, tmp:=,X8,,,,)
 	vmovdqa (STACK_TMP)(%rsp), X8;
 	vmovdqa X15, (STACK_TMP)(%rsp);
-	QUARTERROUND2(X2, X7,  X8, X13,   X3, X4,  X9, X14, tmp:=,X15)
+	QUARTERROUND2(X2, X7,  X8, X13,   X3, X4,  X9, X14, tmp:=,X15,,,,)
 	sub $2, ROUND;
 	jnz .Lround2;
 
 	/* tmp := X15 */
 	vpbroadcastd (0 * 4)(INPUT), X15;
 	PLUS(X0, X15);
 	vpbroadcastd (1 * 4)(INPUT), X15;
 	PLUS(X1, X15);
 	vpbroadcastd (2 * 4)(INPUT), X15;
 	PLUS(X2, X15);
 	vpbroadcastd (3 * 4)(INPUT), X15;
 	PLUS(X3, X15);
 	vpbroadcastd (4 * 4)(INPUT), X15;
 	PLUS(X4, X15);
 	vpbroadcastd (5 * 4)(INPUT), X15;
 	PLUS(X5, X15);
 	vpbroadcastd (6 * 4)(INPUT), X15;
 	PLUS(X6, X15);
 	vpbroadcastd (7 * 4)(INPUT), X15;
 	PLUS(X7, X15);
 	vpbroadcastd (8 * 4)(INPUT), X15;
 	PLUS(X8, X15);
 	vpbroadcastd (9 * 4)(INPUT), X15;
 	PLUS(X9, X15);
 	vpbroadcastd (10 * 4)(INPUT), X15;
 	PLUS(X10, X15);
 	vpbroadcastd (11 * 4)(INPUT), X15;
 	PLUS(X11, X15);
 	vmovdqa (STACK_VEC_X12)(%rsp), X15;
 	PLUS(X12, X15);
 	vmovdqa (STACK_VEC_X13)(%rsp), X15;
 	PLUS(X13, X15);
 	vmovdqa (STACK_TMP)(%rsp), X15;
 	vmovdqa X13, (STACK_TMP)(%rsp);
 	vpbroadcastd (14 * 4)(INPUT), X13;
 	PLUS(X14, X13);
 	vmovdqa X14, (STACK_TMP1)(%rsp);
 	vpbroadcastd (15 * 4)(INPUT), X13;
 	PLUS(X15, X13);
 	vmovdqa X15, (STACK_TMP2)(%rsp);
 
 	/* Update counter */
 	addq $8, (12 * 4)(INPUT);
 
 	transpose_4x4(X0, X1, X2, X3, X13, X14);
 	transpose_4x4(X4, X5, X6, X7, X13, X14);
 	BUF_XOR_256_TO_128(DST, SRC, (64 * 0 + 16 * 0), (64 * 4 + 16 * 0), X0, X15);
 	BUF_XOR_256_TO_128(DST, SRC, (64 * 1 + 16 * 0), (64 * 5 + 16 * 0), X1, X15);
 	BUF_XOR_256_TO_128(DST, SRC, (64 * 2 + 16 * 0), (64 * 6 + 16 * 0), X2, X15);
 	BUF_XOR_256_TO_128(DST, SRC, (64 * 3 + 16 * 0), (64 * 7 + 16 * 0), X3, X15);
 	vmovdqa (STACK_TMP)(%rsp), X13;
 	vmovdqa (STACK_TMP1)(%rsp), X14;
 	vmovdqa (STACK_TMP2)(%rsp), X15;
 	transpose_4x4(X8, X9, X10, X11, X0, X1);
 	transpose_4x4(X12, X13, X14, X15, X0, X1);
 	BUF_XOR_256_TO_128(DST, SRC, (64 * 0 + 16 * 1), (64 * 4 + 16 * 1), X4, X0);
 	BUF_XOR_256_TO_128(DST, SRC, (64 * 1 + 16 * 1), (64 * 5 + 16 * 1), X5, X0);
 	BUF_XOR_256_TO_128(DST, SRC, (64 * 2 + 16 * 1), (64 * 6 + 16 * 1), X6, X0);
 	BUF_XOR_256_TO_128(DST, SRC, (64 * 3 + 16 * 1), (64 * 7 + 16 * 1), X7, X0);
 	BUF_XOR_256_TO_128(DST, SRC, (64 * 0 + 16 * 2), (64 * 4 + 16 * 2), X8, X0);
 	BUF_XOR_256_TO_128(DST, SRC, (64 * 1 + 16 * 2), (64 * 5 + 16 * 2), X9, X0);
 	BUF_XOR_256_TO_128(DST, SRC, (64 * 2 + 16 * 2), (64 * 6 + 16 * 2), X10, X0);
 	BUF_XOR_256_TO_128(DST, SRC, (64 * 3 + 16 * 2), (64 * 7 + 16 * 2), X11, X0);
 	BUF_XOR_256_TO_128(DST, SRC, (64 * 0 + 16 * 3), (64 * 4 + 16 * 3), X12, X0);
 	BUF_XOR_256_TO_128(DST, SRC, (64 * 1 + 16 * 3), (64 * 5 + 16 * 3), X13, X0);
 	BUF_XOR_256_TO_128(DST, SRC, (64 * 2 + 16 * 3), (64 * 6 + 16 * 3), X14, X0);
 	BUF_XOR_256_TO_128(DST, SRC, (64 * 3 + 16 * 3), (64 * 7 + 16 * 3), X15, X0);
 
 	sub $8, NBLKS;
 	lea (8 * 64)(DST), DST;
 	lea (8 * 64)(SRC), SRC;
-	jnz .Loop4;
+	jnz .Loop8;
 
 	/* clear the used vector registers and stack */
 	vpxor X0, X0, X0;
 	vmovdqa X0, (STACK_VEC_X12)(%rsp);
 	vmovdqa X0, (STACK_VEC_X13)(%rsp);
 	vmovdqa X0, (STACK_TMP)(%rsp);
 	vmovdqa X0, (STACK_TMP1)(%rsp);
 	vmovdqa X0, (STACK_TMP2)(%rsp);
 	vzeroall;
 
 	/* eax zeroed by round loop. */
 	leave;
 	ret;
 ELF(.size _gcry_chacha20_amd64_avx2_blocks8,
 	  .-_gcry_chacha20_amd64_avx2_blocks8;)
 
+/**********************************************************************
+  8-way stitched chacha20-poly1305
+ **********************************************************************/
+
+.align 8
+.globl _gcry_chacha20_poly1305_amd64_avx2_blocks8
+ELF(.type _gcry_chacha20_poly1305_amd64_avx2_blocks8,@function;)
+
+_gcry_chacha20_poly1305_amd64_avx2_blocks8:
+	/* input:
+	 *	%rdi: input
+	 *	%rsi: dst
+	 *	%rdx: src
+	 *	%rcx: nblks (multiple of 8)
+	 *	%r8: poly1305-state
+	 *	%r9: poly1305-src
+	 */
+
+	pushq %rbp;
+	movq %rsp, %rbp;
+
+	vzeroupper;
+
+	subq $(8 * 8) + STACK_MAX + 32, %rsp;
+	andq $~31, %rsp;
+
+	movq %rbx, (STACK_MAX + 0 * 8)(%rsp);
+	movq %r12, (STACK_MAX + 1 * 8)(%rsp);
+	movq %r13, (STACK_MAX + 2 * 8)(%rsp);
+	movq %r14, (STACK_MAX + 3 * 8)(%rsp);
+	movq %r15, (STACK_MAX + 4 * 8)(%rsp);
+
+	movq %rdx, (STACK_MAX + 5 * 8)(%rsp); # SRC
+	movq %rsi, (STACK_MAX + 6 * 8)(%rsp); # DST
+	movq %rcx, (STACK_MAX + 7 * 8)(%rsp); # NBLKS
+
+	/* Load state */
+	POLY1305_LOAD_STATE();
+
+.Loop_poly8:
+
+	/* Construct counter vectors X12 and X13 */
+	vpmovzxbd .Linc_counter rRIP, X0;
+	vpbroadcastd .Lunsigned_cmp rRIP, X2;
+	vpbroadcastd (12 * 4)(INPUT), X12;
+	vpbroadcastd (13 * 4)(INPUT), X13;
+	vpaddd X0, X12, X12;
+	vpxor X2, X0, X0;
+	vpxor X2, X12, X1;
+	vpcmpgtd X1, X0, X0;
+	vpsubd X0, X13, X13;
+	vmovdqa X12, (STACK_VEC_X12)(%rsp);
+	vmovdqa X13, (STACK_VEC_X13)(%rsp);
+
+	/* Load vectors */
+	vpbroadcastd (0 * 4)(INPUT), X0;
+	vpbroadcastd (1 * 4)(INPUT), X1;
+	vpbroadcastd (2 * 4)(INPUT), X2;
+	vpbroadcastd (3 * 4)(INPUT), X3;
+	vpbroadcastd (4 * 4)(INPUT), X4;
+	vpbroadcastd (5 * 4)(INPUT), X5;
+	vpbroadcastd (6 * 4)(INPUT), X6;
+	vpbroadcastd (7 * 4)(INPUT), X7;
+	vpbroadcastd (8 * 4)(INPUT), X8;
+	vpbroadcastd (9 * 4)(INPUT), X9;
+	vpbroadcastd (10 * 4)(INPUT), X10;
+	vpbroadcastd (11 * 4)(INPUT), X11;
+	vpbroadcastd (14 * 4)(INPUT), X14;
+	vpbroadcastd (15 * 4)(INPUT), X15;
+	vmovdqa X15, (STACK_TMP)(%rsp);
+
+	# rounds 0,1
+	QUARTERROUND2(X0, X4,  X8, X12,   X1, X5,  X9, X13, tmp:=,X15,
+		      POLY1305_BLOCK_PART1(0 * 16),
+		      POLY1305_BLOCK_PART2(),
+		      POLY1305_BLOCK_PART3(),
+		      POLY1305_BLOCK_PART4())
+	vmovdqa (STACK_TMP)(%rsp), X15;
+	vmovdqa X8, (STACK_TMP)(%rsp);
+	QUARTERROUND2(X2, X6, X10, X14,   X3, X7, X11, X15, tmp:=,X8,
+		      POLY1305_BLOCK_PART5(),
+		      POLY1305_BLOCK_PART1(1 * 16),
+		      POLY1305_BLOCK_PART2(),
+		      POLY1305_BLOCK_PART3())
+	QUARTERROUND2(X0, X5, X10, X15,   X1, X6, X11, X12, tmp:=,X8,
+		      POLY1305_BLOCK_PART4(),
+		      POLY1305_BLOCK_PART5(),
+		      POLY1305_BLOCK_PART1(2 * 16),
+		      POLY1305_BLOCK_PART2())
+	vmovdqa (STACK_TMP)(%rsp), X8;
+	vmovdqa X15, (STACK_TMP)(%rsp);
+	QUARTERROUND2(X2, X7,  X8, X13,   X3, X4,  X9, X14, tmp:=,X15,
+		      POLY1305_BLOCK_PART3(),
+		      POLY1305_BLOCK_PART4(),
+		      POLY1305_BLOCK_PART5(),
+		      POLY1305_BLOCK_PART1(3 * 16))
+
+	# rounds 2,3
+	QUARTERROUND2(X0, X4,  X8, X12,   X1, X5,  X9, X13, tmp:=,X15,
+		      POLY1305_BLOCK_PART2(),
+		      POLY1305_BLOCK_PART3(),
+		      POLY1305_BLOCK_PART4(),
+		      POLY1305_BLOCK_PART5())
+	vmovdqa (STACK_TMP)(%rsp), X15;
+	vmovdqa X8, (STACK_TMP)(%rsp);
+	QUARTERROUND2(X2, X6, X10, X14,   X3, X7, X11, X15, tmp:=,X8,
+		      POLY1305_BLOCK_PART1(4 * 16),
+		      POLY1305_BLOCK_PART2(),
+		      POLY1305_BLOCK_PART3(),
+		      POLY1305_BLOCK_PART4())
+	QUARTERROUND2(X0, X5, X10, X15,   X1, X6, X11, X12, tmp:=,X8,
+		      POLY1305_BLOCK_PART5(),
+		      POLY1305_BLOCK_PART1(5 * 16),
+		      POLY1305_BLOCK_PART2(),
+		      POLY1305_BLOCK_PART3())
+	vmovdqa (STACK_TMP)(%rsp), X8;
+	vmovdqa X15, (STACK_TMP)(%rsp);
+	QUARTERROUND2(X2, X7,  X8, X13,   X3, X4,  X9, X14, tmp:=,X15,
+		      POLY1305_BLOCK_PART4(),
+		      POLY1305_BLOCK_PART5(),
+		      POLY1305_BLOCK_PART1(6 * 16),
+		      POLY1305_BLOCK_PART2())
+
+	# rounds 4,5
+	QUARTERROUND2(X0, X4,  X8, X12,   X1, X5,  X9, X13, tmp:=,X15,
+		      POLY1305_BLOCK_PART3(),
+		      POLY1305_BLOCK_PART4(),
+		      POLY1305_BLOCK_PART5(),
+		      POLY1305_BLOCK_PART1(7 * 16))
+	vmovdqa (STACK_TMP)(%rsp), X15;
+	vmovdqa X8, (STACK_TMP)(%rsp);
+	QUARTERROUND2(X2, X6, X10, X14,   X3, X7, X11, X15, tmp:=,X8,
+		      POLY1305_BLOCK_PART2(),
+		      POLY1305_BLOCK_PART3(),
+		      POLY1305_BLOCK_PART4(),
+		      POLY1305_BLOCK_PART5())
+	QUARTERROUND2(X0, X5, X10, X15,   X1, X6, X11, X12, tmp:=,X8,
+		      POLY1305_BLOCK_PART1(8 * 16),
+		      POLY1305_BLOCK_PART2(),
+		      POLY1305_BLOCK_PART3(),
+		      POLY1305_BLOCK_PART4())
+	vmovdqa (STACK_TMP)(%rsp), X8;
+	vmovdqa X15, (STACK_TMP)(%rsp);
+	QUARTERROUND2(X2, X7,  X8, X13,   X3, X4,  X9, X14, tmp:=,X15,
+		      POLY1305_BLOCK_PART5(),
+		      POLY1305_BLOCK_PART1(9 * 16),
+		      POLY1305_BLOCK_PART2(),
+		      POLY1305_BLOCK_PART3())
+
+	# rounds 6,7
+	QUARTERROUND2(X0, X4,  X8, X12,   X1, X5,  X9, X13, tmp:=,X15,
+		      POLY1305_BLOCK_PART4(),
+		      POLY1305_BLOCK_PART5(),
+		      POLY1305_BLOCK_PART1(10 * 16),
+		      POLY1305_BLOCK_PART2())
+	vmovdqa (STACK_TMP)(%rsp), X15;
+	vmovdqa X8, (STACK_TMP)(%rsp);
+	QUARTERROUND2(X2, X6, X10, X14,   X3, X7, X11, X15, tmp:=,X8,
+		      POLY1305_BLOCK_PART3(),
+		      POLY1305_BLOCK_PART4(),
+		      POLY1305_BLOCK_PART5(),
+		      POLY1305_BLOCK_PART1(11 * 16))
+	QUARTERROUND2(X0, X5, X10, X15,   X1, X6, X11, X12, tmp:=,X8,
+		      POLY1305_BLOCK_PART2(),
+		      POLY1305_BLOCK_PART3(),
+		      POLY1305_BLOCK_PART4(),
+		      POLY1305_BLOCK_PART5())
+	vmovdqa (STACK_TMP)(%rsp), X8;
+	vmovdqa X15, (STACK_TMP)(%rsp);
+	QUARTERROUND2(X2, X7,  X8, X13,   X3, X4,  X9, X14, tmp:=,X15,
+		      POLY1305_BLOCK_PART1(12 * 16),
+		      POLY1305_BLOCK_PART2(),
+		      POLY1305_BLOCK_PART3(),
+		      POLY1305_BLOCK_PART4())
+
+	# rounds 8,9
+	QUARTERROUND2(X0, X4,  X8, X12,   X1, X5,  X9, X13, tmp:=,X15,
+		      POLY1305_BLOCK_PART5(),
+		      POLY1305_BLOCK_PART1(13 * 16),
+		      POLY1305_BLOCK_PART2(),
+		      POLY1305_BLOCK_PART3())
+	vmovdqa (STACK_TMP)(%rsp), X15;
+	vmovdqa X8, (STACK_TMP)(%rsp);
+	QUARTERROUND2(X2, X6, X10, X14,   X3, X7, X11, X15, tmp:=,X8,
+		      POLY1305_BLOCK_PART4(),
+		      POLY1305_BLOCK_PART5(),
+		      POLY1305_BLOCK_PART1(14 * 16),
+		      POLY1305_BLOCK_PART2())
+	QUARTERROUND2(X0, X5, X10, X15,   X1, X6, X11, X12, tmp:=,X8,
+		      POLY1305_BLOCK_PART3(),
+		      POLY1305_BLOCK_PART4(),
+		      POLY1305_BLOCK_PART5(),
+		      POLY1305_BLOCK_PART1(15 * 16))
+	vmovdqa (STACK_TMP)(%rsp), X8;
+	vmovdqa X15, (STACK_TMP)(%rsp);
+	QUARTERROUND2(X2, X7,  X8, X13,   X3, X4,  X9, X14, tmp:=,X15,
+		      POLY1305_BLOCK_PART2(),
+		      POLY1305_BLOCK_PART3(),
+		      POLY1305_BLOCK_PART4(),
+		      POLY1305_BLOCK_PART5())
+
+	# rounds 10,11
+	QUARTERROUND2(X0, X4,  X8, X12,   X1, X5,  X9, X13, tmp:=,X15,
+		      POLY1305_BLOCK_PART1(16 * 16),
+		      POLY1305_BLOCK_PART2(),
+		      POLY1305_BLOCK_PART3(),
+		      POLY1305_BLOCK_PART4())
+	vmovdqa (STACK_TMP)(%rsp), X15;
+	vmovdqa X8, (STACK_TMP)(%rsp);
+	QUARTERROUND2(X2, X6, X10, X14,   X3, X7, X11, X15, tmp:=,X8,
+		      POLY1305_BLOCK_PART5(),
+		      POLY1305_BLOCK_PART1(17 * 16),
+		      POLY1305_BLOCK_PART2(),
+		      POLY1305_BLOCK_PART3())
+	QUARTERROUND2(X0, X5, X10, X15,   X1, X6, X11, X12, tmp:=,X8,
+		      POLY1305_BLOCK_PART4(),
+		      POLY1305_BLOCK_PART5(),
+		      POLY1305_BLOCK_PART1(18 * 16),
+		      POLY1305_BLOCK_PART2())
+	vmovdqa (STACK_TMP)(%rsp), X8;
+	vmovdqa X15, (STACK_TMP)(%rsp);
+	QUARTERROUND2(X2, X7,  X8, X13,   X3, X4,  X9, X14, tmp:=,X15,
+		      POLY1305_BLOCK_PART3(),
+		      POLY1305_BLOCK_PART4(),
+		      POLY1305_BLOCK_PART5(),
+		      POLY1305_BLOCK_PART1(19 * 16))
+
+	# rounds 12,13
+	QUARTERROUND2(X0, X4,  X8, X12,   X1, X5,  X9, X13, tmp:=,X15,
+		      POLY1305_BLOCK_PART2(),
+		      POLY1305_BLOCK_PART3(),
+		      POLY1305_BLOCK_PART4(),
+		      POLY1305_BLOCK_PART5())
+	vmovdqa (STACK_TMP)(%rsp), X15;
+	vmovdqa X8, (STACK_TMP)(%rsp);
+	QUARTERROUND2(X2, X6, X10, X14,   X3, X7, X11, X15, tmp:=,X8,
+		      POLY1305_BLOCK_PART1(20 * 16),
+		      POLY1305_BLOCK_PART2(),
+		      POLY1305_BLOCK_PART3(),
+		      POLY1305_BLOCK_PART4())
+	QUARTERROUND2(X0, X5, X10, X15,   X1, X6, X11, X12, tmp:=,X8,
+		      POLY1305_BLOCK_PART5(),
+		      POLY1305_BLOCK_PART1(21 * 16),
+		      POLY1305_BLOCK_PART2(),
+		      POLY1305_BLOCK_PART3())
+	vmovdqa (STACK_TMP)(%rsp), X8;
+	vmovdqa X15, (STACK_TMP)(%rsp);
+	QUARTERROUND2(X2, X7,  X8, X13,   X3, X4,  X9, X14, tmp:=,X15,
+		      POLY1305_BLOCK_PART4(),
+		      POLY1305_BLOCK_PART5(),
+		      POLY1305_BLOCK_PART1(22 * 16),
+		      POLY1305_BLOCK_PART2())
+
+	# rounds 14,15
+	QUARTERROUND2(X0, X4,  X8, X12,   X1, X5,  X9, X13, tmp:=,X15,
+		      POLY1305_BLOCK_PART3(),
+		      POLY1305_BLOCK_PART4(),
+		      POLY1305_BLOCK_PART5(),
+		      POLY1305_BLOCK_PART1(23 * 16))
+	vmovdqa (STACK_TMP)(%rsp), X15;
+	vmovdqa X8, (STACK_TMP)(%rsp);
+	QUARTERROUND2(X2, X6, X10, X14,   X3, X7, X11, X15, tmp:=,X8,
+		      POLY1305_BLOCK_PART2(),
+		      POLY1305_BLOCK_PART3(),
+		      POLY1305_BLOCK_PART4(),
+		      POLY1305_BLOCK_PART5())
+	QUARTERROUND2(X0, X5, X10, X15,   X1, X6, X11, X12, tmp:=,X8,
+		      POLY1305_BLOCK_PART1(24 * 16),
+		      POLY1305_BLOCK_PART2(),
+		      POLY1305_BLOCK_PART3(),
+		      POLY1305_BLOCK_PART4())
+	vmovdqa (STACK_TMP)(%rsp), X8;
+	vmovdqa X15, (STACK_TMP)(%rsp);
+	QUARTERROUND2(X2, X7,  X8, X13,   X3, X4,  X9, X14, tmp:=,X15,
+		      POLY1305_BLOCK_PART5(),
+		      POLY1305_BLOCK_PART1(25 * 16),
+		      POLY1305_BLOCK_PART2(),
+		      POLY1305_BLOCK_PART3())
+
+	# rounds 16,17
+	QUARTERROUND2(X0, X4,  X8, X12,   X1, X5,  X9, X13, tmp:=,X15,
+		      POLY1305_BLOCK_PART4(),
+		      POLY1305_BLOCK_PART5(),
+		      POLY1305_BLOCK_PART1(26 * 16),
+		      POLY1305_BLOCK_PART2())
+	vmovdqa (STACK_TMP)(%rsp), X15;
+	vmovdqa X8, (STACK_TMP)(%rsp);
+	QUARTERROUND2(X2, X6, X10, X14,   X3, X7, X11, X15, tmp:=,X8,
+		      POLY1305_BLOCK_PART3(),
+		      POLY1305_BLOCK_PART4(),
+		      POLY1305_BLOCK_PART5(),
+		      POLY1305_BLOCK_PART1(27 * 16))
+	QUARTERROUND2(X0, X5, X10, X15,   X1, X6, X11, X12, tmp:=,X8,
+		      POLY1305_BLOCK_PART2(),
+		      POLY1305_BLOCK_PART3(),
+		      POLY1305_BLOCK_PART4(),
+		      POLY1305_BLOCK_PART5())
+	vmovdqa (STACK_TMP)(%rsp), X8;
+	vmovdqa X15, (STACK_TMP)(%rsp);
+	QUARTERROUND2(X2, X7,  X8, X13,   X3, X4,  X9, X14, tmp:=,X15,
+		      POLY1305_BLOCK_PART1(28 * 16),
+		      POLY1305_BLOCK_PART2(),
+		      POLY1305_BLOCK_PART3(),
+		      POLY1305_BLOCK_PART4())
+
+	# rounds 18,19
+	QUARTERROUND2(X0, X4,  X8, X12,   X1, X5,  X9, X13, tmp:=,X15,
+		      POLY1305_BLOCK_PART5(),
+		      POLY1305_BLOCK_PART1(29 * 16),
+		      POLY1305_BLOCK_PART2(),
+		      POLY1305_BLOCK_PART3())
+	vmovdqa (STACK_TMP)(%rsp), X15;
+	vmovdqa X8, (STACK_TMP)(%rsp);
+	QUARTERROUND2(X2, X6, X10, X14,   X3, X7, X11, X15, tmp:=,X8,
+		      POLY1305_BLOCK_PART4(),
+		      POLY1305_BLOCK_PART5(),
+		      POLY1305_BLOCK_PART1(30 * 16),
+		      POLY1305_BLOCK_PART2())
+	QUARTERROUND2(X0, X5, X10, X15,   X1, X6, X11, X12, tmp:=,X8,
+		      POLY1305_BLOCK_PART3(),
+		      POLY1305_BLOCK_PART4(),
+		      POLY1305_BLOCK_PART5(),
+		      POLY1305_BLOCK_PART1(31 * 16))
+	vmovdqa (STACK_TMP)(%rsp), X8;
+	vmovdqa X15, (STACK_TMP)(%rsp);
+	QUARTERROUND2(X2, X7,  X8, X13,   X3, X4,  X9, X14, tmp:=,X15,
+		      POLY1305_BLOCK_PART2(),
+		      POLY1305_BLOCK_PART3(),
+		      POLY1305_BLOCK_PART4(),
+		      POLY1305_BLOCK_PART5())
+
+	/* tmp := X15 */
+	vpbroadcastd (0 * 4)(INPUT), X15;
+	PLUS(X0, X15);
+	vpbroadcastd (1 * 4)(INPUT), X15;
+	PLUS(X1, X15);
+	vpbroadcastd (2 * 4)(INPUT), X15;
+	PLUS(X2, X15);
+	vpbroadcastd (3 * 4)(INPUT), X15;
+	PLUS(X3, X15);
+	vpbroadcastd (4 * 4)(INPUT), X15;
+	PLUS(X4, X15);
+	vpbroadcastd (5 * 4)(INPUT), X15;
+	PLUS(X5, X15);
+	vpbroadcastd (6 * 4)(INPUT), X15;
+	PLUS(X6, X15);
+	vpbroadcastd (7 * 4)(INPUT), X15;
+	PLUS(X7, X15);
+	vpbroadcastd (8 * 4)(INPUT), X15;
+	PLUS(X8, X15);
+	vpbroadcastd (9 * 4)(INPUT), X15;
+	PLUS(X9, X15);
+	vpbroadcastd (10 * 4)(INPUT), X15;
+	PLUS(X10, X15);
+	vpbroadcastd (11 * 4)(INPUT), X15;
+	PLUS(X11, X15);
+	vmovdqa (STACK_VEC_X12)(%rsp), X15;
+	PLUS(X12, X15);
+	vmovdqa (STACK_VEC_X13)(%rsp), X15;
+	PLUS(X13, X15);
+	vmovdqa (STACK_TMP)(%rsp), X15;
+	vmovdqa X13, (STACK_TMP)(%rsp);
+	vpbroadcastd (14 * 4)(INPUT), X13;
+	PLUS(X14, X13);
+	vmovdqa X14, (STACK_TMP1)(%rsp);
+	vpbroadcastd (15 * 4)(INPUT), X13;
+	PLUS(X15, X13);
+	vmovdqa X15, (STACK_TMP2)(%rsp);
+
+	/* Update counter */
+	addq $8, (12 * 4)(INPUT);
+
+	movq (STACK_MAX + 5 * 8)(%rsp), SRC;
+	movq (STACK_MAX + 6 * 8)(%rsp), DST;
+
+	transpose_4x4(X0, X1, X2, X3, X13, X14);
+	transpose_4x4(X4, X5, X6, X7, X13, X14);
+	BUF_XOR_256_TO_128(DST, SRC, (64 * 0 + 16 * 0), (64 * 4 + 16 * 0), X0, X15);
+	BUF_XOR_256_TO_128(DST, SRC, (64 * 1 + 16 * 0), (64 * 5 + 16 * 0), X1, X15);
+	BUF_XOR_256_TO_128(DST, SRC, (64 * 2 + 16 * 0), (64 * 6 + 16 * 0), X2, X15);
+	BUF_XOR_256_TO_128(DST, SRC, (64 * 3 + 16 * 0), (64 * 7 + 16 * 0), X3, X15);
+	vmovdqa (STACK_TMP)(%rsp), X13;
+	vmovdqa (STACK_TMP1)(%rsp), X14;
+	vmovdqa (STACK_TMP2)(%rsp), X15;
+	transpose_4x4(X8, X9, X10, X11, X0, X1);
+	transpose_4x4(X12, X13, X14, X15, X0, X1);
+	BUF_XOR_256_TO_128(DST, SRC, (64 * 0 + 16 * 1), (64 * 4 + 16 * 1), X4, X0);
+	BUF_XOR_256_TO_128(DST, SRC, (64 * 1 + 16 * 1), (64 * 5 + 16 * 1), X5, X0);
+	BUF_XOR_256_TO_128(DST, SRC, (64 * 2 + 16 * 1), (64 * 6 + 16 * 1), X6, X0);
+	BUF_XOR_256_TO_128(DST, SRC, (64 * 3 + 16 * 1), (64 * 7 + 16 * 1), X7, X0);
+	BUF_XOR_256_TO_128(DST, SRC, (64 * 0 + 16 * 2), (64 * 4 + 16 * 2), X8, X0);
+	BUF_XOR_256_TO_128(DST, SRC, (64 * 1 + 16 * 2), (64 * 5 + 16 * 2), X9, X0);
+	BUF_XOR_256_TO_128(DST, SRC, (64 * 2 + 16 * 2), (64 * 6 + 16 * 2), X10, X0);
+	BUF_XOR_256_TO_128(DST, SRC, (64 * 3 + 16 * 2), (64 * 7 + 16 * 2), X11, X0);
+	BUF_XOR_256_TO_128(DST, SRC, (64 * 0 + 16 * 3), (64 * 4 + 16 * 3), X12, X0);
+	BUF_XOR_256_TO_128(DST, SRC, (64 * 1 + 16 * 3), (64 * 5 + 16 * 3), X13, X0);
+	BUF_XOR_256_TO_128(DST, SRC, (64 * 2 + 16 * 3), (64 * 6 + 16 * 3), X14, X0);
+	BUF_XOR_256_TO_128(DST, SRC, (64 * 3 + 16 * 3), (64 * 7 + 16 * 3), X15, X0);
+
+	subq $8, (STACK_MAX + 7 * 8)(%rsp); # NBLKS
+
+	lea (32 * 16)(POLY_RSRC), POLY_RSRC;
+	lea (8 * 64)(DST), DST;
+	lea (8 * 64)(SRC), SRC;
+	movq SRC, (STACK_MAX + 5 * 8)(%rsp);
+	movq DST, (STACK_MAX + 6 * 8)(%rsp);
+
+	jnz .Loop_poly8;
+
+	/* Store state */
+	POLY1305_STORE_STATE();
+
+	/* clear the used vector registers and stack */
+	vpxor X0, X0, X0;
+	vmovdqa X0, (STACK_VEC_X12)(%rsp);
+	vmovdqa X0, (STACK_VEC_X13)(%rsp);
+	vmovdqa X0, (STACK_TMP)(%rsp);
+	vmovdqa X0, (STACK_TMP1)(%rsp);
+	vmovdqa X0, (STACK_TMP2)(%rsp);
+	vzeroall;
+
+	movq (STACK_MAX + 0 * 8)(%rsp), %rbx;
+	movq (STACK_MAX + 1 * 8)(%rsp), %r12;
+	movq (STACK_MAX + 2 * 8)(%rsp), %r13;
+	movq (STACK_MAX + 3 * 8)(%rsp), %r14;
+	movq (STACK_MAX + 4 * 8)(%rsp), %r15;
+
+	xorl %eax, %eax;
+	leave;
+	ret;
+ELF(.size _gcry_chacha20_poly1305_amd64_avx2_blocks8,
+	  .-_gcry_chacha20_poly1305_amd64_avx2_blocks8;)
+
 #endif /*defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)*/
 #endif /*__x86_64*/
diff --git a/cipher/chacha20-amd64-ssse3.S b/cipher/chacha20-amd64-ssse3.S
index 0e59ff98..d7faf644 100644
--- a/cipher/chacha20-amd64-ssse3.S
+++ b/cipher/chacha20-amd64-ssse3.S
@@ -1,449 +1,1008 @@
 /* chacha20-amd64-ssse3.S  -  SSSE3 implementation of ChaCha20 cipher
  *
- * Copyright (C) 2017,2018 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ * Copyright (C) 2017-2019 Jussi Kivilinna <jussi.kivilinna@iki.fi>
  *
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
  * Libgcrypt is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
  * License along with this program; if not, see <http://www.gnu.org/licenses/>.
  */
 
 /*
  * Based on D. J. Bernstein reference implementation at
  * http://cr.yp.to/chacha.html:
  *
  * chacha-regs.c version 20080118
  * D. J. Bernstein
  * Public domain.
  */
 
 #ifdef __x86_64
 #include <config.h>
 #if defined(HAVE_GCC_INLINE_ASM_SSSE3) && \
    (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
 
 .text
 
-#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS
-# define ELF(...) __VA_ARGS__
-#else
-# define ELF(...) /*_*/
-#endif
-
-#ifdef __PIC__
-#  define RIP (%rip)
-#else
-#  define RIP
-#endif
+#include "asm-common-amd64.h"
+#include "asm-poly1305-amd64.h"
 
 /* register macros */
 #define INPUT %rdi
 #define DST   %rsi
 #define SRC   %rdx
 #define NBLKS %rcx
 #define ROUND %eax
 
 /* stack structure */
 #define STACK_VEC_X12 (16)
 #define STACK_VEC_X13 (16 + STACK_VEC_X12)
 #define STACK_TMP     (16 + STACK_VEC_X13)
 #define STACK_TMP1    (16 + STACK_TMP)
 #define STACK_TMP2    (16 + STACK_TMP1)
 
 #define STACK_MAX     (16 + STACK_TMP2)
 
 /* vector registers */
 #define X0 %xmm0
 #define X1 %xmm1
 #define X2 %xmm2
 #define X3 %xmm3
 #define X4 %xmm4
 #define X5 %xmm5
 #define X6 %xmm6
 #define X7 %xmm7
 #define X8 %xmm8
 #define X9 %xmm9
 #define X10 %xmm10
 #define X11 %xmm11
 #define X12 %xmm12
 #define X13 %xmm13
 #define X14 %xmm14
 #define X15 %xmm15
 
 /**********************************************************************
   helper macros
  **********************************************************************/
 
 /* 4x4 32-bit integer matrix transpose */
 #define transpose_4x4(x0, x1, x2, x3, t1, t2, t3) \
 	movdqa    x0, t2; \
 	punpckhdq x1, t2; \
 	punpckldq x1, x0; \
 	\
 	movdqa    x2, t1; \
 	punpckldq x3, t1; \
 	punpckhdq x3, x2; \
 	\
 	movdqa     x0, x1; \
 	punpckhqdq t1, x1; \
 	punpcklqdq t1, x0; \
 	\
 	movdqa     t2, x3; \
 	punpckhqdq x2, x3; \
 	punpcklqdq x2, t2; \
 	movdqa     t2, x2;
 
 /* fill xmm register with 32-bit value from memory */
 #define pbroadcastd(mem32, xreg) \
 	movd mem32, xreg; \
 	pshufd $0, xreg, xreg;
 
 /* xor with unaligned memory operand */
 #define pxor_u(umem128, xreg, t) \
 	movdqu umem128, t; \
 	pxor t, xreg;
 
 /* xor register with unaligned src and save to unaligned dst */
 #define xor_src_dst(dst, src, offset, xreg, t) \
 	pxor_u(offset(src), xreg, t); \
 	movdqu xreg, offset(dst);
 
 #define clear(x) pxor x,x;
 
 /**********************************************************************
   4-way chacha20
  **********************************************************************/
 
 #define ROTATE2(v1,v2,c,tmp1,tmp2)	\
 	movdqa v1, tmp1; 		\
 	movdqa v2, tmp2; 		\
 	psrld $(32 - (c)), v1;		\
 	pslld $(c), tmp1;		\
 	paddb tmp1, v1;			\
 	psrld $(32 - (c)), v2;		\
 	pslld $(c), tmp2;		\
 	paddb tmp2, v2;
 
 #define ROTATE_SHUF_2(v1,v2,shuf)	\
 	pshufb shuf, v1;		\
 	pshufb shuf, v2;
 
 #define XOR(ds,s) \
 	pxor s, ds;
 
 #define PLUS(ds,s) \
 	paddd s, ds;
 
-#define QUARTERROUND2(a1,b1,c1,d1,a2,b2,c2,d2,ign,tmp1,tmp2)	\
-	movdqa .Lshuf_rol16 RIP, tmp1;				\
+#define QUARTERROUND2(a1,b1,c1,d1,a2,b2,c2,d2,ign,tmp1,tmp2,\
+		      interleave_op1,interleave_op2)		\
+	movdqa .Lshuf_rol16 rRIP, tmp1;				\
+		interleave_op1; /* caller-supplied code, may be empty; emitted right after the rol16 mask load */	\
 	PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2);	\
 	    ROTATE_SHUF_2(d1, d2, tmp1);			\
 	PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2);	\
 	    ROTATE2(b1, b2, 12, tmp1, tmp2);			\
-	movdqa .Lshuf_rol8 RIP, tmp1;				\
+	movdqa .Lshuf_rol8 rRIP, tmp1;				\
+		interleave_op2; /* second caller-supplied slot, may be empty; emitted right after the rol8 mask load */	\
 	PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2);	\
 	    ROTATE_SHUF_2(d1, d2, tmp1);			\
 	PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2);	\
 	    ROTATE2(b1, b2,  7, tmp1, tmp2);
 
 chacha20_data:
 .align 16
 .Lshuf_rol16:
 	.byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13
 .Lshuf_rol8:
 	.byte 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14
 .Lcounter1:
 	.long 1,0,0,0
 .Linc_counter:
 	.long 0,1,2,3
 .Lunsigned_cmp:
 	.long 0x80000000,0x80000000,0x80000000,0x80000000
 
 .align 8
 .globl _gcry_chacha20_amd64_ssse3_blocks4
 ELF(.type _gcry_chacha20_amd64_ssse3_blocks4,@function;)
 
 _gcry_chacha20_amd64_ssse3_blocks4:
 	/* input:
 	 *	%rdi: input
 	 *	%rsi: dst
 	 *	%rdx: src
 	 *	%rcx: nblks (multiple of 4)
 	 */
 
 	pushq %rbp;
 	movq %rsp, %rbp;
 
 	subq $STACK_MAX, %rsp;
 	andq $~15, %rsp;
 
 .Loop4:
 	mov $20, ROUND;
 
 	/* Construct counter vectors X12 and X13 */
-	movdqa .Linc_counter RIP, X0;
-	movdqa .Lunsigned_cmp RIP, X2;
+	movdqa .Linc_counter rRIP, X0;
+	movdqa .Lunsigned_cmp rRIP, X2;
 	pbroadcastd((12 * 4)(INPUT), X12);
 	pbroadcastd((13 * 4)(INPUT), X13);
 	paddd X0, X12;
 	movdqa X12, X1;
 	pxor X2, X0;
 	pxor X2, X1;
 	pcmpgtd X1, X0;
 	psubd X0, X13;
 	movdqa X12, (STACK_VEC_X12)(%rsp);
 	movdqa X13, (STACK_VEC_X13)(%rsp);
 
 	/* Load vectors */
 	pbroadcastd((0 * 4)(INPUT), X0);
 	pbroadcastd((1 * 4)(INPUT), X1);
 	pbroadcastd((2 * 4)(INPUT), X2);
 	pbroadcastd((3 * 4)(INPUT), X3);
 	pbroadcastd((4 * 4)(INPUT), X4);
 	pbroadcastd((5 * 4)(INPUT), X5);
 	pbroadcastd((6 * 4)(INPUT), X6);
 	pbroadcastd((7 * 4)(INPUT), X7);
 	pbroadcastd((8 * 4)(INPUT), X8);
 	pbroadcastd((9 * 4)(INPUT), X9);
 	pbroadcastd((10 * 4)(INPUT), X10);
 	pbroadcastd((11 * 4)(INPUT), X11);
 	pbroadcastd((14 * 4)(INPUT), X14);
 	pbroadcastd((15 * 4)(INPUT), X15);
 	movdqa X11, (STACK_TMP)(%rsp);
 	movdqa X15, (STACK_TMP1)(%rsp);
 
 .Lround2_4:
-	QUARTERROUND2(X0, X4,  X8, X12,   X1, X5,  X9, X13, tmp:=,X11,X15)
+	QUARTERROUND2(X0, X4,  X8, X12,   X1, X5,  X9, X13, tmp:=,X11,X15,,)
 	movdqa (STACK_TMP)(%rsp), X11;
 	movdqa (STACK_TMP1)(%rsp), X15;
 	movdqa X8, (STACK_TMP)(%rsp);
 	movdqa X9, (STACK_TMP1)(%rsp);
-	QUARTERROUND2(X2, X6, X10, X14,   X3, X7, X11, X15, tmp:=,X8,X9)
-	QUARTERROUND2(X0, X5, X10, X15,   X1, X6, X11, X12, tmp:=,X8,X9)
+	QUARTERROUND2(X2, X6, X10, X14,   X3, X7, X11, X15, tmp:=,X8,X9,,)
+	QUARTERROUND2(X0, X5, X10, X15,   X1, X6, X11, X12, tmp:=,X8,X9,,)
 	movdqa (STACK_TMP)(%rsp), X8;
 	movdqa (STACK_TMP1)(%rsp), X9;
 	movdqa X11, (STACK_TMP)(%rsp);
 	movdqa X15, (STACK_TMP1)(%rsp);
-	QUARTERROUND2(X2, X7,  X8, X13,   X3, X4,  X9, X14, tmp:=,X11,X15)
+	QUARTERROUND2(X2, X7,  X8, X13,   X3, X4,  X9, X14, tmp:=,X11,X15,,)
 	sub $2, ROUND;
 	jnz .Lround2_4;
 
 	/* tmp := X15 */
 	movdqa (STACK_TMP)(%rsp), X11;
 	pbroadcastd((0 * 4)(INPUT), X15);
 	PLUS(X0, X15);
 	pbroadcastd((1 * 4)(INPUT), X15);
 	PLUS(X1, X15);
 	pbroadcastd((2 * 4)(INPUT), X15);
 	PLUS(X2, X15);
 	pbroadcastd((3 * 4)(INPUT), X15);
 	PLUS(X3, X15);
 	pbroadcastd((4 * 4)(INPUT), X15);
 	PLUS(X4, X15);
 	pbroadcastd((5 * 4)(INPUT), X15);
 	PLUS(X5, X15);
 	pbroadcastd((6 * 4)(INPUT), X15);
 	PLUS(X6, X15);
 	pbroadcastd((7 * 4)(INPUT), X15);
 	PLUS(X7, X15);
 	pbroadcastd((8 * 4)(INPUT), X15);
 	PLUS(X8, X15);
 	pbroadcastd((9 * 4)(INPUT), X15);
 	PLUS(X9, X15);
 	pbroadcastd((10 * 4)(INPUT), X15);
 	PLUS(X10, X15);
 	pbroadcastd((11 * 4)(INPUT), X15);
 	PLUS(X11, X15);
 	movdqa (STACK_VEC_X12)(%rsp), X15;
 	PLUS(X12, X15);
 	movdqa (STACK_VEC_X13)(%rsp), X15;
 	PLUS(X13, X15);
 	movdqa X13, (STACK_TMP)(%rsp);
 	pbroadcastd((14 * 4)(INPUT), X15);
 	PLUS(X14, X15);
 	movdqa (STACK_TMP1)(%rsp), X15;
 	movdqa X14, (STACK_TMP1)(%rsp);
 	pbroadcastd((15 * 4)(INPUT), X13);
 	PLUS(X15, X13);
 	movdqa X15, (STACK_TMP2)(%rsp);
 
 	/* Update counter */
 	addq $4, (12 * 4)(INPUT);
 
 	transpose_4x4(X0, X1, X2, X3, X13, X14, X15);
 	xor_src_dst(DST, SRC, (64 * 0 + 16 * 0), X0, X15);
 	xor_src_dst(DST, SRC, (64 * 1 + 16 * 0), X1, X15);
 	xor_src_dst(DST, SRC, (64 * 2 + 16 * 0), X2, X15);
 	xor_src_dst(DST, SRC, (64 * 3 + 16 * 0), X3, X15);
 	transpose_4x4(X4, X5, X6, X7, X0, X1, X2);
 	movdqa (STACK_TMP)(%rsp), X13;
 	movdqa (STACK_TMP1)(%rsp), X14;
 	movdqa (STACK_TMP2)(%rsp), X15;
 	xor_src_dst(DST, SRC, (64 * 0 + 16 * 1), X4, X0);
 	xor_src_dst(DST, SRC, (64 * 1 + 16 * 1), X5, X0);
 	xor_src_dst(DST, SRC, (64 * 2 + 16 * 1), X6, X0);
 	xor_src_dst(DST, SRC, (64 * 3 + 16 * 1), X7, X0);
 	transpose_4x4(X8, X9, X10, X11, X0, X1, X2);
 	xor_src_dst(DST, SRC, (64 * 0 + 16 * 2), X8, X0);
 	xor_src_dst(DST, SRC, (64 * 1 + 16 * 2), X9, X0);
 	xor_src_dst(DST, SRC, (64 * 2 + 16 * 2), X10, X0);
 	xor_src_dst(DST, SRC, (64 * 3 + 16 * 2), X11, X0);
 	transpose_4x4(X12, X13, X14, X15, X0, X1, X2);
 	xor_src_dst(DST, SRC, (64 * 0 + 16 * 3), X12, X0);
 	xor_src_dst(DST, SRC, (64 * 1 + 16 * 3), X13, X0);
 	xor_src_dst(DST, SRC, (64 * 2 + 16 * 3), X14, X0);
 	xor_src_dst(DST, SRC, (64 * 3 + 16 * 3), X15, X0);
 
 	sub $4, NBLKS;
 	lea (4 * 64)(DST), DST;
 	lea (4 * 64)(SRC), SRC;
 	jnz .Loop4;
 
 	/* clear the used vector registers and stack */
 	clear(X0);
 	movdqa X0, (STACK_VEC_X12)(%rsp);
 	movdqa X0, (STACK_VEC_X13)(%rsp);
 	movdqa X0, (STACK_TMP)(%rsp);
 	movdqa X0, (STACK_TMP1)(%rsp);
 	movdqa X0, (STACK_TMP2)(%rsp);
 	clear(X1);
 	clear(X2);
 	clear(X3);
 	clear(X4);
 	clear(X5);
 	clear(X6);
 	clear(X7);
 	clear(X8);
 	clear(X9);
 	clear(X10);
 	clear(X11);
 	clear(X12);
 	clear(X13);
 	clear(X14);
 	clear(X15);
 
 	/* eax zeroed by round loop. */
 	leave;
 	ret;
 ELF(.size _gcry_chacha20_amd64_ssse3_blocks4,
 	  .-_gcry_chacha20_amd64_ssse3_blocks4;)
 
 /**********************************************************************
   1-way chacha20
  **********************************************************************/
 
 #define ROTATE_SHUF(v1,shuf)		\
 	pshufb shuf, v1;
 
 #define ROTATE(v1,c,tmp1)		\
 	movdqa v1, tmp1; 		\
 	psrld $(32 - (c)), v1;		\
 	pslld $(c), tmp1;		\
 	paddb tmp1, v1;
 
 #define WORD_SHUF(v1,shuf)		\
 	pshufd $shuf, v1, v1;
 
 #define QUARTERROUND4(x0,x1,x2,x3,shuf_rol8,shuf_rol16,tmp1,shuf_x1,\
 		      shuf_x2,shuf_x3) \
 	PLUS(x0, x1); XOR(x3, x0); ROTATE_SHUF(x3, shuf_rol16); \
 	PLUS(x2, x3); XOR(x1, x2); ROTATE(x1, 12, tmp1); \
 	PLUS(x0, x1); XOR(x3, x0); ROTATE_SHUF(x3, shuf_rol8); \
 	PLUS(x2, x3); \
 	  WORD_SHUF(x3, shuf_x3); \
 		      XOR(x1, x2); \
 	  WORD_SHUF(x2, shuf_x2); \
 				   ROTATE(x1, 7, tmp1); \
 	  WORD_SHUF(x1, shuf_x1);
 
 .align 8
 .globl _gcry_chacha20_amd64_ssse3_blocks1
 ELF(.type _gcry_chacha20_amd64_ssse3_blocks1,@function;)
 
 _gcry_chacha20_amd64_ssse3_blocks1:
 	/* input:
 	 *	%rdi: input
 	 *	%rsi: dst
 	 *	%rdx: src
 	 *	%rcx: nblks
 	 */
 
 	/* Load constants */
-	movdqa .Lcounter1 RIP, X4;
-	movdqa .Lshuf_rol8 RIP, X5;
-	movdqa .Lshuf_rol16 RIP, X6;
+	movdqa .Lcounter1 rRIP, X4;
+	movdqa .Lshuf_rol8 rRIP, X5;
+	movdqa .Lshuf_rol16 rRIP, X6;
 
 	/* Load state */
 	movdqu (0 * 4)(INPUT), X10;
 	movdqu (4 * 4)(INPUT), X11;
 	movdqu (8 * 4)(INPUT), X12;
 	movdqu (12 * 4)(INPUT), X13;
 
 .Loop1:
 	mov $20, ROUND;
 
 	movdqa X10, X0;
 	movdqa X11, X1;
 	movdqa X12, X2;
 	movdqa X13, X3;
 
 .Lround2_1:
 	QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93);
 	QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39);
 	sub $2, ROUND;
 	jnz .Lround2_1;
 
 	PLUS(X0, X10);
 	PLUS(X1, X11);
 	PLUS(X2, X12);
 	PLUS(X3, X13);
 
 	/* Update counter */
 	paddq X4, X13;
 
 	xor_src_dst(DST, SRC, 0 * 4, X0, X7);
 	xor_src_dst(DST, SRC, 4 * 4, X1, X7);
 	xor_src_dst(DST, SRC, 8 * 4, X2, X7);
 	xor_src_dst(DST, SRC, 12 * 4, X3, X7);
 
 	lea (64)(DST), DST;
 	lea (64)(SRC), SRC;
 
 	sub $1, NBLKS;
 	jnz .Loop1;
 
 	/* Store counter */
 	movdqu X13, (12 * 4)(INPUT);
 
 	/* clear the used vector registers */
 	clear(X0);
 	clear(X1);
 	clear(X2);
 	clear(X3);
 	clear(X4);
 	clear(X5);
 	clear(X6);
 	clear(X7);
 	clear(X10);
 	clear(X11);
 	clear(X12);
 	clear(X13);
 
 	/* eax zeroed by round loop. */
 	ret;
 ELF(.size _gcry_chacha20_amd64_ssse3_blocks1,
 	  .-_gcry_chacha20_amd64_ssse3_blocks1;)
 
+/**********************************************************************
+  4-way stitched chacha20-poly1305
+ **********************************************************************/
+
+.align 8
+.globl _gcry_chacha20_poly1305_amd64_ssse3_blocks4
+ELF(.type _gcry_chacha20_poly1305_amd64_ssse3_blocks4,@function;)
+
+_gcry_chacha20_poly1305_amd64_ssse3_blocks4:
+	/* input:
+	 *	%rdi: input (ChaCha20 state, sixteen 32-bit words; words 12-13 hold the block counter)
+	 *	%rsi: dst
+	 *	%rdx: src
+	 *	%rcx: nblks (multiple of 4)
+	 *	%r9: poly1305-state
+	 *	%r8: poly1305-src  (NOTE(review): confirm which of %r8/%r9 POLY_RSRC maps to; its definition is not in this file)
+	 */
+
+	pushq %rbp;
+	movq %rsp, %rbp;
+
+	subq $(8 * 8) + STACK_MAX + 16, %rsp; /* room for the STACK_* vector slots and the eight 8-byte slots stored below */
+	andq $~15, %rsp; /* 16-byte align %rsp: the STACK_* slots are accessed with movdqa */
+
+	movq %rbx, (STACK_MAX + 0 * 8)(%rsp); /* save %rbx and %r12-%r15; reloaded from these slots before returning */
+	movq %r12, (STACK_MAX + 1 * 8)(%rsp);
+	movq %r13, (STACK_MAX + 2 * 8)(%rsp);
+	movq %r14, (STACK_MAX + 3 * 8)(%rsp);
+	movq %r15, (STACK_MAX + 4 * 8)(%rsp);
+
+	movq %rdx, (STACK_MAX + 5 * 8)(%rsp); # SRC
+	movq %rsi, (STACK_MAX + 6 * 8)(%rsp); # DST
+	movq %rcx, (STACK_MAX + 7 * 8)(%rsp); # NBLKS
+
+	/* Load Poly1305 state */
+	POLY1305_LOAD_STATE();
+
+.Loop_poly4: /* each pass writes 4 ChaCha20 blocks (4 * 64 bytes of DST) and advances POLY_RSRC by 16 * 16 bytes */
+
+	/* Construct counter vectors X12 and X13 */
+	movdqa .Linc_counter rRIP, X0;
+	movdqa .Lunsigned_cmp rRIP, X2;
+	pbroadcastd((12 * 4)(INPUT), X12);
+	pbroadcastd((13 * 4)(INPUT), X13);
+	paddd X0, X12; /* X12 = counter low word + {0,1,2,3}, one lane per block */
+	movdqa X12, X1;
+	pxor X2, X0; /* flip the sign bits so the signed pcmpgtd below acts as an unsigned compare */
+	pxor X2, X1;
+	pcmpgtd X1, X0; /* all-ones in lanes where the low-word add wrapped around */
+	psubd X0, X13; /* subtracting -1 adds the carry into the high counter word */
+	movdqa X12, (STACK_VEC_X12)(%rsp);
+	movdqa X13, (STACK_VEC_X13)(%rsp);
+
+	/* Load vectors */
+	pbroadcastd((0 * 4)(INPUT), X0);
+	pbroadcastd((1 * 4)(INPUT), X1);
+	pbroadcastd((2 * 4)(INPUT), X2);
+	pbroadcastd((3 * 4)(INPUT), X3);
+	pbroadcastd((4 * 4)(INPUT), X4);
+	pbroadcastd((5 * 4)(INPUT), X5);
+	pbroadcastd((6 * 4)(INPUT), X6);
+	pbroadcastd((7 * 4)(INPUT), X7);
+	pbroadcastd((8 * 4)(INPUT), X8);
+	pbroadcastd((9 * 4)(INPUT), X9);
+	pbroadcastd((10 * 4)(INPUT), X10);
+	pbroadcastd((11 * 4)(INPUT), X11);
+	pbroadcastd((14 * 4)(INPUT), X14);
+	pbroadcastd((15 * 4)(INPUT), X15);
+	movdqa X11, (STACK_TMP)(%rsp); /* spill X11 and X15: the next QUARTERROUND2 uses them as tmp1/tmp2 */
+	movdqa X15, (STACK_TMP1)(%rsp);
+
+	/* rounds 0,1 (the last two QUARTERROUND2 arguments interleave Poly1305 macro parts) */
+	QUARTERROUND2(X0, X4,  X8, X12,   X1, X5,  X9, X13, tmp:=,X11,X15,
+		      POLY1305_BLOCK_PART1(0 * 16),
+		      POLY1305_BLOCK_PART2())
+	movdqa (STACK_TMP)(%rsp), X11;
+	movdqa (STACK_TMP1)(%rsp), X15;
+	movdqa X8, (STACK_TMP)(%rsp);
+	movdqa X9, (STACK_TMP1)(%rsp);
+	QUARTERROUND2(X2, X6, X10, X14,   X3, X7, X11, X15, tmp:=,X8,X9,
+		      POLY1305_BLOCK_PART3(),
+		      POLY1305_BLOCK_PART4())
+	QUARTERROUND2(X0, X5, X10, X15,   X1, X6, X11, X12, tmp:=,X8,X9,
+		      POLY1305_BLOCK_PART5(),
+		      POLY1305_BLOCK_PART1(1 * 16))
+	movdqa (STACK_TMP)(%rsp), X8;
+	movdqa (STACK_TMP1)(%rsp), X9;
+	movdqa X11, (STACK_TMP)(%rsp);
+	movdqa X15, (STACK_TMP1)(%rsp);
+	QUARTERROUND2(X2, X7,  X8, X13,   X3, X4,  X9, X14, tmp:=,X11,X15,
+		      POLY1305_BLOCK_PART2(),
+		      POLY1305_BLOCK_PART3())
+
+	/* rounds 2,3 */
+	QUARTERROUND2(X0, X4,  X8, X12,   X1, X5,  X9, X13, tmp:=,X11,X15,
+		      POLY1305_BLOCK_PART4(),
+		      POLY1305_BLOCK_PART5())
+	movdqa (STACK_TMP)(%rsp), X11;
+	movdqa (STACK_TMP1)(%rsp), X15;
+	movdqa X8, (STACK_TMP)(%rsp);
+	movdqa X9, (STACK_TMP1)(%rsp);
+	QUARTERROUND2(X2, X6, X10, X14,   X3, X7, X11, X15, tmp:=,X8,X9,
+		      POLY1305_BLOCK_PART1(2 * 16),
+		      POLY1305_BLOCK_PART2())
+	QUARTERROUND2(X0, X5, X10, X15,   X1, X6, X11, X12, tmp:=,X8,X9,
+		      POLY1305_BLOCK_PART3(),
+		      POLY1305_BLOCK_PART4())
+	movdqa (STACK_TMP)(%rsp), X8;
+	movdqa (STACK_TMP1)(%rsp), X9;
+	movdqa X11, (STACK_TMP)(%rsp);
+	movdqa X15, (STACK_TMP1)(%rsp);
+	QUARTERROUND2(X2, X7,  X8, X13,   X3, X4,  X9, X14, tmp:=,X11,X15,
+		      POLY1305_BLOCK_PART5(),
+		      POLY1305_BLOCK_PART1(3 * 16))
+
+	/* rounds 4,5 */
+	QUARTERROUND2(X0, X4,  X8, X12,   X1, X5,  X9, X13, tmp:=,X11,X15,
+		      POLY1305_BLOCK_PART2(),
+		      POLY1305_BLOCK_PART3())
+	movdqa (STACK_TMP)(%rsp), X11;
+	movdqa (STACK_TMP1)(%rsp), X15;
+	movdqa X8, (STACK_TMP)(%rsp);
+	movdqa X9, (STACK_TMP1)(%rsp);
+	QUARTERROUND2(X2, X6, X10, X14,   X3, X7, X11, X15, tmp:=,X8,X9,
+		      POLY1305_BLOCK_PART4(),
+		      POLY1305_BLOCK_PART5())
+	QUARTERROUND2(X0, X5, X10, X15,   X1, X6, X11, X12, tmp:=,X8,X9,
+		      POLY1305_BLOCK_PART1(4 * 16),
+		      POLY1305_BLOCK_PART2())
+	movdqa (STACK_TMP)(%rsp), X8;
+	movdqa (STACK_TMP1)(%rsp), X9;
+	movdqa X11, (STACK_TMP)(%rsp);
+	movdqa X15, (STACK_TMP1)(%rsp);
+	QUARTERROUND2(X2, X7,  X8, X13,   X3, X4,  X9, X14, tmp:=,X11,X15,
+		      POLY1305_BLOCK_PART3(),
+		      POLY1305_BLOCK_PART4())
+
+	/* rounds 6,7 */
+	QUARTERROUND2(X0, X4,  X8, X12,   X1, X5,  X9, X13, tmp:=,X11,X15,
+		      POLY1305_BLOCK_PART5(),
+		      POLY1305_BLOCK_PART1(5 * 16))
+	movdqa (STACK_TMP)(%rsp), X11;
+	movdqa (STACK_TMP1)(%rsp), X15;
+	movdqa X8, (STACK_TMP)(%rsp);
+	movdqa X9, (STACK_TMP1)(%rsp);
+	QUARTERROUND2(X2, X6, X10, X14,   X3, X7, X11, X15, tmp:=,X8,X9,
+		      POLY1305_BLOCK_PART2(),
+		      POLY1305_BLOCK_PART3())
+	QUARTERROUND2(X0, X5, X10, X15,   X1, X6, X11, X12, tmp:=,X8,X9,
+		      POLY1305_BLOCK_PART4(),
+		      POLY1305_BLOCK_PART5())
+	movdqa (STACK_TMP)(%rsp), X8;
+	movdqa (STACK_TMP1)(%rsp), X9;
+	movdqa X11, (STACK_TMP)(%rsp);
+	movdqa X15, (STACK_TMP1)(%rsp);
+	QUARTERROUND2(X2, X7,  X8, X13,   X3, X4,  X9, X14, tmp:=,X11,X15,
+		      POLY1305_BLOCK_PART1(6 * 16),
+		      POLY1305_BLOCK_PART2())
+
+	/* rounds 8,9 */
+	QUARTERROUND2(X0, X4,  X8, X12,   X1, X5,  X9, X13, tmp:=,X11,X15,
+		      POLY1305_BLOCK_PART3(),
+		      POLY1305_BLOCK_PART4())
+	movdqa (STACK_TMP)(%rsp), X11;
+	movdqa (STACK_TMP1)(%rsp), X15;
+	movdqa X8, (STACK_TMP)(%rsp);
+	movdqa X9, (STACK_TMP1)(%rsp);
+	QUARTERROUND2(X2, X6, X10, X14,   X3, X7, X11, X15, tmp:=,X8,X9,
+		      POLY1305_BLOCK_PART5(),
+		      POLY1305_BLOCK_PART1(7 * 16))
+	QUARTERROUND2(X0, X5, X10, X15,   X1, X6, X11, X12, tmp:=,X8,X9,
+		      POLY1305_BLOCK_PART2(),
+		      POLY1305_BLOCK_PART3())
+	movdqa (STACK_TMP)(%rsp), X8;
+	movdqa (STACK_TMP1)(%rsp), X9;
+	movdqa X11, (STACK_TMP)(%rsp);
+	movdqa X15, (STACK_TMP1)(%rsp);
+	QUARTERROUND2(X2, X7,  X8, X13,   X3, X4,  X9, X14, tmp:=,X11,X15,
+		      POLY1305_BLOCK_PART4(),
+		      POLY1305_BLOCK_PART5())
+
+	/* rounds 10,11 */
+	QUARTERROUND2(X0, X4,  X8, X12,   X1, X5,  X9, X13, tmp:=,X11,X15,
+		      POLY1305_BLOCK_PART1(8 * 16),
+		      POLY1305_BLOCK_PART2())
+	movdqa (STACK_TMP)(%rsp), X11;
+	movdqa (STACK_TMP1)(%rsp), X15;
+	movdqa X8, (STACK_TMP)(%rsp);
+	movdqa X9, (STACK_TMP1)(%rsp);
+	QUARTERROUND2(X2, X6, X10, X14,   X3, X7, X11, X15, tmp:=,X8,X9,
+		      POLY1305_BLOCK_PART3(),
+		      POLY1305_BLOCK_PART4())
+	QUARTERROUND2(X0, X5, X10, X15,   X1, X6, X11, X12, tmp:=,X8,X9,
+		      POLY1305_BLOCK_PART5(),
+		      POLY1305_BLOCK_PART1(9 * 16))
+	movdqa (STACK_TMP)(%rsp), X8;
+	movdqa (STACK_TMP1)(%rsp), X9;
+	movdqa X11, (STACK_TMP)(%rsp);
+	movdqa X15, (STACK_TMP1)(%rsp);
+	QUARTERROUND2(X2, X7,  X8, X13,   X3, X4,  X9, X14, tmp:=,X11,X15,
+		      POLY1305_BLOCK_PART2(),
+		      POLY1305_BLOCK_PART3())
+
+	/* rounds 12,13 */
+	QUARTERROUND2(X0, X4,  X8, X12,   X1, X5,  X9, X13, tmp:=,X11,X15,
+		      POLY1305_BLOCK_PART4(),
+		      POLY1305_BLOCK_PART5())
+	movdqa (STACK_TMP)(%rsp), X11;
+	movdqa (STACK_TMP1)(%rsp), X15;
+	movdqa X8, (STACK_TMP)(%rsp);
+	movdqa X9, (STACK_TMP1)(%rsp);
+	QUARTERROUND2(X2, X6, X10, X14,   X3, X7, X11, X15, tmp:=,X8,X9,
+		      POLY1305_BLOCK_PART1(10 * 16),
+		      POLY1305_BLOCK_PART2())
+	QUARTERROUND2(X0, X5, X10, X15,   X1, X6, X11, X12, tmp:=,X8,X9,
+		      POLY1305_BLOCK_PART3(),
+		      POLY1305_BLOCK_PART4())
+	movdqa (STACK_TMP)(%rsp), X8;
+	movdqa (STACK_TMP1)(%rsp), X9;
+	movdqa X11, (STACK_TMP)(%rsp);
+	movdqa X15, (STACK_TMP1)(%rsp);
+	QUARTERROUND2(X2, X7,  X8, X13,   X3, X4,  X9, X14, tmp:=,X11,X15,
+		      POLY1305_BLOCK_PART5(),
+		      POLY1305_BLOCK_PART1(11 * 16))
+
+	/* rounds 14,15 */
+	QUARTERROUND2(X0, X4,  X8, X12,   X1, X5,  X9, X13, tmp:=,X11,X15,
+		      POLY1305_BLOCK_PART2(),
+		      POLY1305_BLOCK_PART3())
+	movdqa (STACK_TMP)(%rsp), X11;
+	movdqa (STACK_TMP1)(%rsp), X15;
+	movdqa X8, (STACK_TMP)(%rsp);
+	movdqa X9, (STACK_TMP1)(%rsp);
+	QUARTERROUND2(X2, X6, X10, X14,   X3, X7, X11, X15, tmp:=,X8,X9,
+		      POLY1305_BLOCK_PART4(),
+		      POLY1305_BLOCK_PART5())
+	QUARTERROUND2(X0, X5, X10, X15,   X1, X6, X11, X12, tmp:=,X8,X9,
+		      POLY1305_BLOCK_PART1(12 * 16),
+		      POLY1305_BLOCK_PART2())
+	movdqa (STACK_TMP)(%rsp), X8;
+	movdqa (STACK_TMP1)(%rsp), X9;
+	movdqa X11, (STACK_TMP)(%rsp);
+	movdqa X15, (STACK_TMP1)(%rsp);
+	QUARTERROUND2(X2, X7,  X8, X13,   X3, X4,  X9, X14, tmp:=,X11,X15,
+		      POLY1305_BLOCK_PART3(),
+		      POLY1305_BLOCK_PART4())
+
+	/* rounds 16,17 */
+	QUARTERROUND2(X0, X4,  X8, X12,   X1, X5,  X9, X13, tmp:=,X11,X15,
+		      POLY1305_BLOCK_PART5(),
+		      POLY1305_BLOCK_PART1(13 * 16))
+	movdqa (STACK_TMP)(%rsp), X11;
+	movdqa (STACK_TMP1)(%rsp), X15;
+	movdqa X8, (STACK_TMP)(%rsp);
+	movdqa X9, (STACK_TMP1)(%rsp);
+	QUARTERROUND2(X2, X6, X10, X14,   X3, X7, X11, X15, tmp:=,X8,X9,
+		      POLY1305_BLOCK_PART2(),
+		      POLY1305_BLOCK_PART3())
+	QUARTERROUND2(X0, X5, X10, X15,   X1, X6, X11, X12, tmp:=,X8,X9,
+		      POLY1305_BLOCK_PART4(),
+		      POLY1305_BLOCK_PART5())
+	movdqa (STACK_TMP)(%rsp), X8;
+	movdqa (STACK_TMP1)(%rsp), X9;
+	movdqa X11, (STACK_TMP)(%rsp);
+	movdqa X15, (STACK_TMP1)(%rsp);
+	QUARTERROUND2(X2, X7,  X8, X13,   X3, X4,  X9, X14, tmp:=,X11,X15,
+		      POLY1305_BLOCK_PART1(14 * 16),
+		      POLY1305_BLOCK_PART2())
+
+	/* rounds 18,19 */
+	QUARTERROUND2(X0, X4,  X8, X12,   X1, X5,  X9, X13, tmp:=,X11,X15,
+		      POLY1305_BLOCK_PART3(),
+		      POLY1305_BLOCK_PART4())
+	movdqa (STACK_TMP)(%rsp), X11;
+	movdqa (STACK_TMP1)(%rsp), X15;
+	movdqa X8, (STACK_TMP)(%rsp);
+	movdqa X9, (STACK_TMP1)(%rsp);
+	QUARTERROUND2(X2, X6, X10, X14,   X3, X7, X11, X15, tmp:=,X8,X9,
+		      POLY1305_BLOCK_PART5(),
+		      POLY1305_BLOCK_PART1(15 * 16))
+	QUARTERROUND2(X0, X5, X10, X15,   X1, X6, X11, X12, tmp:=,X8,X9,
+		      POLY1305_BLOCK_PART2(),
+		      POLY1305_BLOCK_PART3())
+	movdqa (STACK_TMP)(%rsp), X8;
+	movdqa (STACK_TMP1)(%rsp), X9;
+	movdqa X11, (STACK_TMP)(%rsp);
+	movdqa X15, (STACK_TMP1)(%rsp);
+	QUARTERROUND2(X2, X7,  X8, X13,   X3, X4,  X9, X14, tmp:=,X11,X15,
+		      POLY1305_BLOCK_PART4(),
+		      POLY1305_BLOCK_PART5())
+
+	/* tmp := X15 */
+	movdqa (STACK_TMP)(%rsp), X11;
+	pbroadcastd((0 * 4)(INPUT), X15);
+	PLUS(X0, X15);
+	pbroadcastd((1 * 4)(INPUT), X15);
+	PLUS(X1, X15);
+	pbroadcastd((2 * 4)(INPUT), X15);
+	PLUS(X2, X15);
+	pbroadcastd((3 * 4)(INPUT), X15);
+	PLUS(X3, X15);
+	pbroadcastd((4 * 4)(INPUT), X15);
+	PLUS(X4, X15);
+	pbroadcastd((5 * 4)(INPUT), X15);
+	PLUS(X5, X15);
+	pbroadcastd((6 * 4)(INPUT), X15);
+	PLUS(X6, X15);
+	pbroadcastd((7 * 4)(INPUT), X15);
+	PLUS(X7, X15);
+	pbroadcastd((8 * 4)(INPUT), X15);
+	PLUS(X8, X15);
+	pbroadcastd((9 * 4)(INPUT), X15);
+	PLUS(X9, X15);
+	pbroadcastd((10 * 4)(INPUT), X15);
+	PLUS(X10, X15);
+	pbroadcastd((11 * 4)(INPUT), X15);
+	PLUS(X11, X15);
+	movdqa (STACK_VEC_X12)(%rsp), X15;
+	PLUS(X12, X15);
+	movdqa (STACK_VEC_X13)(%rsp), X15;
+	PLUS(X13, X15);
+	movdqa X13, (STACK_TMP)(%rsp);
+	pbroadcastd((14 * 4)(INPUT), X15);
+	PLUS(X14, X15);
+	movdqa (STACK_TMP1)(%rsp), X15;
+	movdqa X14, (STACK_TMP1)(%rsp);
+	pbroadcastd((15 * 4)(INPUT), X13);
+	PLUS(X15, X13);
+	movdqa X15, (STACK_TMP2)(%rsp);
+
+	/* Update counter */
+	addq $4, (12 * 4)(INPUT); /* 64-bit add: a carry out of word 12 propagates into word 13 */
+
+	movq (STACK_MAX + 5 * 8)(%rsp), SRC; /* reload SRC/DST from their stack slots (presumably the POLY1305_* macros clobber %rdx/%rsi -- TODO confirm) */
+	movq (STACK_MAX + 6 * 8)(%rsp), DST;
+
+	transpose_4x4(X0, X1, X2, X3, X13, X14, X15);
+	xor_src_dst(DST, SRC, (64 * 0 + 16 * 0), X0, X15);
+	xor_src_dst(DST, SRC, (64 * 1 + 16 * 0), X1, X15);
+	xor_src_dst(DST, SRC, (64 * 2 + 16 * 0), X2, X15);
+	xor_src_dst(DST, SRC, (64 * 3 + 16 * 0), X3, X15);
+	transpose_4x4(X4, X5, X6, X7, X0, X1, X2);
+	movdqa (STACK_TMP)(%rsp), X13;
+	movdqa (STACK_TMP1)(%rsp), X14;
+	movdqa (STACK_TMP2)(%rsp), X15;
+	xor_src_dst(DST, SRC, (64 * 0 + 16 * 1), X4, X0);
+	xor_src_dst(DST, SRC, (64 * 1 + 16 * 1), X5, X0);
+	xor_src_dst(DST, SRC, (64 * 2 + 16 * 1), X6, X0);
+	xor_src_dst(DST, SRC, (64 * 3 + 16 * 1), X7, X0);
+	transpose_4x4(X8, X9, X10, X11, X0, X1, X2);
+	xor_src_dst(DST, SRC, (64 * 0 + 16 * 2), X8, X0);
+	xor_src_dst(DST, SRC, (64 * 1 + 16 * 2), X9, X0);
+	xor_src_dst(DST, SRC, (64 * 2 + 16 * 2), X10, X0);
+	xor_src_dst(DST, SRC, (64 * 3 + 16 * 2), X11, X0);
+	transpose_4x4(X12, X13, X14, X15, X0, X1, X2);
+	xor_src_dst(DST, SRC, (64 * 0 + 16 * 3), X12, X0);
+	xor_src_dst(DST, SRC, (64 * 1 + 16 * 3), X13, X0);
+	xor_src_dst(DST, SRC, (64 * 2 + 16 * 3), X14, X0);
+	xor_src_dst(DST, SRC, (64 * 3 + 16 * 3), X15, X0);
+
+	subq $4, (STACK_MAX + 7 * 8)(%rsp); # NBLKS
+
+	lea (16 * 16)(POLY_RSRC), POLY_RSRC;
+	lea (4 * 64)(DST), DST;
+	lea (4 * 64)(SRC), SRC;
+	movq SRC, (STACK_MAX + 5 * 8)(%rsp);
+	movq DST, (STACK_MAX + 6 * 8)(%rsp);
+
+	jnz .Loop_poly4; /* tests ZF from the subq on NBLKS above; the lea and movq in between leave flags unchanged */
+
+	/* Store Poly1305 state */
+	POLY1305_STORE_STATE();
+
+	/* clear the used vector registers and stack */
+	clear(X0);
+	movdqa X0, (STACK_VEC_X12)(%rsp);
+	movdqa X0, (STACK_VEC_X13)(%rsp);
+	movdqa X0, (STACK_TMP)(%rsp);
+	movdqa X0, (STACK_TMP1)(%rsp);
+	movdqa X0, (STACK_TMP2)(%rsp);
+	clear(X1);
+	clear(X2);
+	clear(X3);
+	clear(X4);
+	clear(X5);
+	clear(X6);
+	clear(X7);
+	clear(X8);
+	clear(X9);
+	clear(X10);
+	clear(X11);
+	clear(X12);
+	clear(X13);
+	clear(X14);
+	clear(X15);
+
+	movq (STACK_MAX + 0 * 8)(%rsp), %rbx; /* reload the registers saved in the prologue */
+	movq (STACK_MAX + 1 * 8)(%rsp), %r12;
+	movq (STACK_MAX + 2 * 8)(%rsp), %r13;
+	movq (STACK_MAX + 3 * 8)(%rsp), %r14;
+	movq (STACK_MAX + 4 * 8)(%rsp), %r15;
+
+	xorl %eax, %eax; /* zero %eax before returning */
+	leave;
+	ret;
+ELF(.size _gcry_chacha20_poly1305_amd64_ssse3_blocks4,
+	  .-_gcry_chacha20_poly1305_amd64_ssse3_blocks4;)
+
+/**********************************************************************
+  1-way stitched chacha20-poly1305
+ **********************************************************************/
+
+.align 8
+.globl _gcry_chacha20_poly1305_amd64_ssse3_blocks1
+ELF(.type _gcry_chacha20_poly1305_amd64_ssse3_blocks1,@function;)
+
+_gcry_chacha20_poly1305_amd64_ssse3_blocks1:
+	/* input:
+	 *	%rdi: chacha20-state
+	 *	%rsi: dst
+	 *	%rdx: src
+	 *	%rcx: nblks
+	 *	%r9: poly1305-state
+	 *	%r8: poly1305-src  (NOTE(review): confirm which of %r8/%r9 POLY_RSRC maps to; its definition is not in this file)
+	 */
+	pushq %rbp;
+	movq %rsp, %rbp;
+
+	subq $(8 * 8), %rsp; /* eight 8-byte stack slots: %rbx, %r12-%r15, SRC, DST, NBLKS */
+	movq %rbx, (0 * 8)(%rsp);
+	movq %r12, (1 * 8)(%rsp);
+	movq %r13, (2 * 8)(%rsp);
+	movq %r14, (3 * 8)(%rsp);
+	movq %r15, (4 * 8)(%rsp);
+
+	movq %rdx, (5 * 8)(%rsp); # SRC
+	movq %rsi, (6 * 8)(%rsp); # DST
+	movq %rcx, (7 * 8)(%rsp); # NBLKS
+
+	/* Load constants */
+	movdqa .Lcounter1 rRIP, X4; /* X4 = {1,0,0,0}: per-block counter increment */
+	movdqa .Lshuf_rol8 rRIP, X5;
+	movdqa .Lshuf_rol16 rRIP, X6;
+
+	/* Load ChaCha20 state: X10..X13 hold the four 16-byte rows for the whole call */
+	movdqu (0 * 4)(INPUT), X10;
+	movdqu (4 * 4)(INPUT), X11;
+	movdqu (8 * 4)(INPUT), X12;
+	movdqu (12 * 4)(INPUT), X13;
+
+	POLY1305_LOAD_STATE();
+
+.Loop_poly1: /* each pass writes one 64-byte ChaCha20 block and advances POLY_RSRC by 64 bytes */
+	movdqa X10, X0;
+	movdqa X11, X1;
+	movdqa X12, X2;
+	movdqa X13, X3;
+
+	/* Process one ChaCha20 block and four Poly1305 blocks. */
+	POLY1305_BLOCK_PART1(0 * 16);
+	  QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93);
+	POLY1305_BLOCK_PART2();
+	  QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39);
+
+	POLY1305_BLOCK_PART3();
+	  QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93);
+	POLY1305_BLOCK_PART4();
+	  QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39);
+
+	POLY1305_BLOCK_PART5();
+	  QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93);
+	POLY1305_BLOCK_PART1(1 * 16);
+	  QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39);
+
+	POLY1305_BLOCK_PART2();
+	  QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93);
+	POLY1305_BLOCK_PART3();
+	  QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39);
+
+	POLY1305_BLOCK_PART4();
+	  QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93);
+	POLY1305_BLOCK_PART5();
+	  QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39);
+
+	POLY1305_BLOCK_PART1(2 * 16);
+	  QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93);
+	POLY1305_BLOCK_PART2();
+	  QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39);
+
+	POLY1305_BLOCK_PART3();
+	  QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93);
+	POLY1305_BLOCK_PART4();
+	  QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39);
+
+	POLY1305_BLOCK_PART5();
+	  QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93);
+	POLY1305_BLOCK_PART1(3 * 16);
+	  QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39);
+
+	POLY1305_BLOCK_PART2();
+	  QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93);
+	POLY1305_BLOCK_PART3();
+	  QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39);
+
+	POLY1305_BLOCK_PART4();
+	  QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93);
+	POLY1305_BLOCK_PART5();
+	  QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39);
+
+	movq (5 * 8)(%rsp), SRC; /* reload SRC/DST from their stack slots (presumably the POLY1305_* macros clobber %rdx/%rsi -- TODO confirm) */
+	movq (6 * 8)(%rsp), DST;
+
+	PLUS(X0, X10);
+	PLUS(X1, X11);
+	PLUS(X2, X12);
+	PLUS(X3, X13);
+
+	/* Update counter */
+	paddq X4, X13; /* 64-bit add of 1 to the low quadword of X13 (state words 12-13) */
+
+	xor_src_dst(DST, SRC, 0 * 4, X0, X7);
+	xor_src_dst(DST, SRC, 4 * 4, X1, X7);
+	xor_src_dst(DST, SRC, 8 * 4, X2, X7);
+	xor_src_dst(DST, SRC, 12 * 4, X3, X7);
+
+	subq $1, (7 * 8)(%rsp); # NBLKS
+	lea (64)(POLY_RSRC), POLY_RSRC;
+	lea (64)(SRC), SRC;
+	lea (64)(DST), DST;
+	movq SRC, (5 * 8)(%rsp);
+	movq DST, (6 * 8)(%rsp);
+
+	jnz .Loop_poly1; /* tests ZF from the subq on NBLKS above; the lea and movq in between leave flags unchanged */
+
+	/* Store Poly1305 state */
+	POLY1305_STORE_STATE();
+
+	movdqu X13, (12 * 4)(INPUT); /* write the advanced counter row (words 12..15) back to the ChaCha20 state */
+
+	/* clear the used vector registers */
+	clear(X0);
+	clear(X1);
+	clear(X2);
+	clear(X3);
+	clear(X4);
+	clear(X5);
+	clear(X6);
+	clear(X7);
+	clear(X10);
+	clear(X11);
+	clear(X12);
+	clear(X13);
+
+	movq (0 * 8)(%rsp), %rbx; /* reload the registers saved in the prologue */
+	movq (1 * 8)(%rsp), %r12;
+	movq (2 * 8)(%rsp), %r13;
+	movq (3 * 8)(%rsp), %r14;
+	movq (4 * 8)(%rsp), %r15;
+
+	xorl %eax, %eax; /* zero %eax before returning */
+	leave;
+	ret;
+ELF(.size _gcry_chacha20_poly1305_amd64_ssse3_blocks1,
+	  .-_gcry_chacha20_poly1305_amd64_ssse3_blocks1;)
+
 #endif /*defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)*/
 #endif /*__x86_64*/
diff --git a/cipher/chacha20.c b/cipher/chacha20.c
index f1afd18e..3e6327da 100644
--- a/cipher/chacha20.c
+++ b/cipher/chacha20.c
@@ -1,640 +1,969 @@
 /* chacha20.c  -  Bernstein's ChaCha20 cipher
- * Copyright (C) 2014,2017,2018 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ * Copyright (C) 2014,2017-2019 Jussi Kivilinna <jussi.kivilinna@iki.fi>
  *
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser general Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
  * Libgcrypt is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
  * License along with this program; if not, see <http://www.gnu.org/licenses/>.
  *
  * For a description of the algorithm, see:
  *   http://cr.yp.to/chacha.html
  */
 
 /*
  * Based on D. J. Bernstein reference implementation at
  * http://cr.yp.to/chacha.html:
  *
  * chacha-regs.c version 20080118
  * D. J. Bernstein
  * Public domain.
  */
 
 #include <config.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include "types.h"
 #include "g10lib.h"
 #include "cipher.h"
+#include "cipher-internal.h"
 #include "bufhelp.h"
 
 
 #define CHACHA20_MIN_KEY_SIZE 16        /* Bytes.  */
 #define CHACHA20_MAX_KEY_SIZE 32        /* Bytes.  */
 #define CHACHA20_BLOCK_SIZE   64        /* Bytes.  */
 #define CHACHA20_MIN_IV_SIZE   8        /* Bytes.  */
 #define CHACHA20_MAX_IV_SIZE  12        /* Bytes.  */
 #define CHACHA20_CTR_SIZE     16        /* Bytes.  */
 
 
 /* USE_SSSE3 indicates whether to compile with Intel SSSE3 code. */
 #undef USE_SSSE3
 #if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_SSSE3) && \
    (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
 # define USE_SSSE3 1
 #endif
 
 /* USE_AVX2 indicates whether to compile with Intel AVX2 code. */
 #undef USE_AVX2
 #if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_AVX2) && \
     (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
      defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
 # define USE_AVX2 1
 #endif
 
 /* USE_ARMV7_NEON indicates whether to enable ARMv7 NEON assembly code. */
 #undef USE_ARMV7_NEON
 #ifdef ENABLE_NEON_SUPPORT
 # if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) \
      && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) \
      && defined(HAVE_GCC_INLINE_ASM_NEON)
 #  define USE_ARMV7_NEON 1
 # endif
 #endif
 
 /* USE_AARCH64_SIMD indicates whether to enable ARMv8 SIMD assembly
  * code. */
 #undef USE_AARCH64_SIMD
 #ifdef ENABLE_NEON_SUPPORT
 # if defined(__AARCH64EL__) \
        && defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) \
        && defined(HAVE_GCC_INLINE_ASM_AARCH64_NEON)
 #  define USE_AARCH64_SIMD 1
 # endif
 #endif
 
 /* Assembly implementations use SystemV ABI, ABI conversion and additional
  * stack to store XMM6-XMM15 needed on Win64. */
 #undef ASM_FUNC_ABI
 #undef ASM_EXTRA_STACK
 #if defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)
 # define ASM_FUNC_ABI __attribute__((sysv_abi))
 #else
 # define ASM_FUNC_ABI
 #endif
 
 
 typedef struct CHACHA20_context_s
 {
   u32 input[16];
   unsigned char pad[CHACHA20_BLOCK_SIZE];
   unsigned int unused; /* bytes in the pad.  */
   int use_ssse3:1;
   int use_avx2:1;
   int use_neon:1;
 } CHACHA20_context_t;
 
 
 #ifdef USE_SSSE3
 
 unsigned int _gcry_chacha20_amd64_ssse3_blocks4(u32 *state, byte *dst,
 						const byte *src,
 						size_t nblks) ASM_FUNC_ABI;
 
 unsigned int _gcry_chacha20_amd64_ssse3_blocks1(u32 *state, byte *dst,
 						const byte *src,
 						size_t nblks) ASM_FUNC_ABI;
 
+unsigned int _gcry_chacha20_poly1305_amd64_ssse3_blocks4(
+		u32 *state, byte *dst, const byte *src, size_t nblks,
+		void *poly1305_state, const byte *poly1305_src) ASM_FUNC_ABI;
+
+unsigned int _gcry_chacha20_poly1305_amd64_ssse3_blocks1(
+		u32 *state, byte *dst, const byte *src, size_t nblks,
+		void *poly1305_state, const byte *poly1305_src) ASM_FUNC_ABI;
+
 #endif /* USE_SSSE3 */
 
 #ifdef USE_AVX2
 
 unsigned int _gcry_chacha20_amd64_avx2_blocks8(u32 *state, byte *dst,
 					       const byte *src,
 					       size_t nblks) ASM_FUNC_ABI;
 
+unsigned int _gcry_chacha20_poly1305_amd64_avx2_blocks8(
+		u32 *state, byte *dst, const byte *src, size_t nblks,
+		void *poly1305_state, const byte *poly1305_src) ASM_FUNC_ABI;
+
 #endif /* USE_AVX2 */
 
 #ifdef USE_ARMV7_NEON
 
 unsigned int _gcry_chacha20_armv7_neon_blocks4(u32 *state, byte *dst,
 					       const byte *src,
 					       size_t nblks);
 
 #endif /* USE_ARMV7_NEON */
 
 #ifdef USE_AARCH64_SIMD
 
 unsigned int _gcry_chacha20_aarch64_blocks4(u32 *state, byte *dst,
 					    const byte *src, size_t nblks);
 
 #endif /* USE_AARCH64_SIMD */
 
 
 static const char *selftest (void);
 
 
 #define ROTATE(v,c)	(rol(v,c))
 #define XOR(v,w)	((v) ^ (w))
 #define PLUS(v,w)	((u32)((v) + (w)))
 #define PLUSONE(v)	(PLUS((v),1))
 
 #define QUARTERROUND(a,b,c,d) \
   a = PLUS(a,b); d = ROTATE(XOR(d,a),16); \
   c = PLUS(c,d); b = ROTATE(XOR(b,c),12); \
   a = PLUS(a,b); d = ROTATE(XOR(d,a), 8); \
   c = PLUS(c,d); b = ROTATE(XOR(b,c), 7);
 
 #define BUF_XOR_LE32(dst, src, offset, x) \
   buf_put_le32((dst) + (offset), buf_get_le32((src) + (offset)) ^ (x))
 
 static unsigned int
 do_chacha20_blocks (u32 *input, byte *dst, const byte *src, size_t nblks)
 {
   u32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
   unsigned int i;
 
   while (nblks)
     {
       x0 = input[0];
       x1 = input[1];
       x2 = input[2];
       x3 = input[3];
       x4 = input[4];
       x5 = input[5];
       x6 = input[6];
       x7 = input[7];
       x8 = input[8];
       x9 = input[9];
       x10 = input[10];
       x11 = input[11];
       x12 = input[12];
       x13 = input[13];
       x14 = input[14];
       x15 = input[15];
 
       for (i = 20; i > 0; i -= 2)
 	{
 	  QUARTERROUND(x0, x4,  x8, x12)
 	  QUARTERROUND(x1, x5,  x9, x13)
 	  QUARTERROUND(x2, x6, x10, x14)
 	  QUARTERROUND(x3, x7, x11, x15)
 	  QUARTERROUND(x0, x5, x10, x15)
 	  QUARTERROUND(x1, x6, x11, x12)
 	  QUARTERROUND(x2, x7,  x8, x13)
 	  QUARTERROUND(x3, x4,  x9, x14)
 	}
 
       x0 = PLUS(x0, input[0]);
       x1 = PLUS(x1, input[1]);
       x2 = PLUS(x2, input[2]);
       x3 = PLUS(x3, input[3]);
       x4 = PLUS(x4, input[4]);
       x5 = PLUS(x5, input[5]);
       x6 = PLUS(x6, input[6]);
       x7 = PLUS(x7, input[7]);
       x8 = PLUS(x8, input[8]);
       x9 = PLUS(x9, input[9]);
       x10 = PLUS(x10, input[10]);
       x11 = PLUS(x11, input[11]);
       x12 = PLUS(x12, input[12]);
       x13 = PLUS(x13, input[13]);
       x14 = PLUS(x14, input[14]);
       x15 = PLUS(x15, input[15]);
 
       input[12] = PLUSONE(input[12]);
       input[13] = PLUS(input[13], !input[12]);
 
       BUF_XOR_LE32(dst, src, 0, x0);
       BUF_XOR_LE32(dst, src, 4, x1);
       BUF_XOR_LE32(dst, src, 8, x2);
       BUF_XOR_LE32(dst, src, 12, x3);
       BUF_XOR_LE32(dst, src, 16, x4);
       BUF_XOR_LE32(dst, src, 20, x5);
       BUF_XOR_LE32(dst, src, 24, x6);
       BUF_XOR_LE32(dst, src, 28, x7);
       BUF_XOR_LE32(dst, src, 32, x8);
       BUF_XOR_LE32(dst, src, 36, x9);
       BUF_XOR_LE32(dst, src, 40, x10);
       BUF_XOR_LE32(dst, src, 44, x11);
       BUF_XOR_LE32(dst, src, 48, x12);
       BUF_XOR_LE32(dst, src, 52, x13);
       BUF_XOR_LE32(dst, src, 56, x14);
       BUF_XOR_LE32(dst, src, 60, x15);
 
       src += CHACHA20_BLOCK_SIZE;
       dst += CHACHA20_BLOCK_SIZE;
       nblks--;
     }
 
   /* burn_stack */
   return (17 * sizeof(u32) + 6 * sizeof(void *));
 }
 
 
 static unsigned int
 chacha20_blocks (CHACHA20_context_t *ctx, byte *dst, const byte *src,
 		 size_t nblks)
 {
 #ifdef USE_SSSE3
   if (ctx->use_ssse3)
     {
       return _gcry_chacha20_amd64_ssse3_blocks1(ctx->input, dst, src, nblks);
     }
 #endif
 
   return do_chacha20_blocks (ctx->input, dst, src, nblks);
 }
 
 
 static void
 chacha20_keysetup (CHACHA20_context_t *ctx, const byte *key,
                    unsigned int keylen)
 {
   static const char sigma[16] = "expand 32-byte k";
   static const char tau[16] = "expand 16-byte k";
   const char *constants;
 
   ctx->input[4] = buf_get_le32(key + 0);
   ctx->input[5] = buf_get_le32(key + 4);
   ctx->input[6] = buf_get_le32(key + 8);
   ctx->input[7] = buf_get_le32(key + 12);
   if (keylen == CHACHA20_MAX_KEY_SIZE) /* 256 bits */
     {
       key += 16;
       constants = sigma;
     }
   else /* 128 bits */
     {
       constants = tau;
     }
   ctx->input[8] = buf_get_le32(key + 0);
   ctx->input[9] = buf_get_le32(key + 4);
   ctx->input[10] = buf_get_le32(key + 8);
   ctx->input[11] = buf_get_le32(key + 12);
   ctx->input[0] = buf_get_le32(constants + 0);
   ctx->input[1] = buf_get_le32(constants + 4);
   ctx->input[2] = buf_get_le32(constants + 8);
   ctx->input[3] = buf_get_le32(constants + 12);
 }
 
 
 static void
 chacha20_ivsetup (CHACHA20_context_t * ctx, const byte *iv, size_t ivlen)
 {
   if (ivlen == CHACHA20_CTR_SIZE)
     {
       ctx->input[12] = buf_get_le32 (iv + 0);
       ctx->input[13] = buf_get_le32 (iv + 4);
       ctx->input[14] = buf_get_le32 (iv + 8);
       ctx->input[15] = buf_get_le32 (iv + 12);
     }
   else if (ivlen == CHACHA20_MAX_IV_SIZE)
     {
       ctx->input[12] = 0;
       ctx->input[13] = buf_get_le32 (iv + 0);
       ctx->input[14] = buf_get_le32 (iv + 4);
       ctx->input[15] = buf_get_le32 (iv + 8);
     }
   else if (ivlen == CHACHA20_MIN_IV_SIZE)
     {
       ctx->input[12] = 0;
       ctx->input[13] = 0;
       ctx->input[14] = buf_get_le32 (iv + 0);
       ctx->input[15] = buf_get_le32 (iv + 4);
     }
   else
     {
       ctx->input[12] = 0;
       ctx->input[13] = 0;
       ctx->input[14] = 0;
       ctx->input[15] = 0;
     }
 }
 
 
 static void
 chacha20_setiv (void *context, const byte *iv, size_t ivlen)
 {
   CHACHA20_context_t *ctx = (CHACHA20_context_t *) context;
 
   /* draft-nir-cfrg-chacha20-poly1305-02 defines 96-bit and 64-bit nonce. */
   if (iv && ivlen != CHACHA20_MAX_IV_SIZE && ivlen != CHACHA20_MIN_IV_SIZE
       && ivlen != CHACHA20_CTR_SIZE)
     log_info ("WARNING: chacha20_setiv: bad ivlen=%u\n", (u32) ivlen);
 
   if (iv && (ivlen == CHACHA20_MAX_IV_SIZE || ivlen == CHACHA20_MIN_IV_SIZE
              || ivlen == CHACHA20_CTR_SIZE))
     chacha20_ivsetup (ctx, iv, ivlen);
   else
     chacha20_ivsetup (ctx, NULL, 0);
 
   /* Reset the unused pad bytes counter.  */
   ctx->unused = 0;
 }
 
 
 static gcry_err_code_t
 chacha20_do_setkey (CHACHA20_context_t *ctx,
                     const byte *key, unsigned int keylen)
 {
   static int initialized;
   static const char *selftest_failed;
   unsigned int features = _gcry_get_hw_features ();
 
   if (!initialized)
     {
       initialized = 1;
       selftest_failed = selftest ();
       if (selftest_failed)
         log_error ("CHACHA20 selftest failed (%s)\n", selftest_failed);
     }
   if (selftest_failed)
     return GPG_ERR_SELFTEST_FAILED;
 
   if (keylen != CHACHA20_MAX_KEY_SIZE && keylen != CHACHA20_MIN_KEY_SIZE)
     return GPG_ERR_INV_KEYLEN;
 
 #ifdef USE_SSSE3
   ctx->use_ssse3 = (features & HWF_INTEL_SSSE3) != 0;
 #endif
 #ifdef USE_AVX2
   ctx->use_avx2 = (features & HWF_INTEL_AVX2) != 0;
 #endif
 #ifdef USE_ARMV7_NEON
   ctx->use_neon = (features & HWF_ARM_NEON) != 0;
 #endif
 #ifdef USE_AARCH64_SIMD
   ctx->use_neon = (features & HWF_ARM_NEON) != 0;
 #endif
 
   (void)features;
 
   chacha20_keysetup (ctx, key, keylen);
 
   /* We default to a zero nonce.  */
   chacha20_setiv (ctx, NULL, 0);
 
   return 0;
 }
 
 
 static gcry_err_code_t
 chacha20_setkey (void *context, const byte *key, unsigned int keylen,
                  gcry_cipher_hd_t hd)
 {
   CHACHA20_context_t *ctx = (CHACHA20_context_t *) context;
   gcry_err_code_t rc = chacha20_do_setkey (ctx, key, keylen);
   (void)hd;
   _gcry_burn_stack (4 + sizeof (void *) + 4 * sizeof (void *));
   return rc;
 }
 
 
-static void
-chacha20_encrypt_stream (void *context, byte *outbuf, const byte *inbuf,
-                         size_t length)
+static unsigned int
+do_chacha20_encrypt_stream_tail (CHACHA20_context_t *ctx, byte *outbuf,
+				 const byte *inbuf, size_t length)
 {
   static const unsigned char zero_pad[CHACHA20_BLOCK_SIZE] = { 0, };
-  CHACHA20_context_t *ctx = (CHACHA20_context_t *) context;
   unsigned int nburn, burn = 0;
 
-  if (!length)
-    return;
-
-  if (ctx->unused)
-    {
-      unsigned char *p = ctx->pad;
-      size_t n;
-
-      gcry_assert (ctx->unused < CHACHA20_BLOCK_SIZE);
-
-      n = ctx->unused;
-      if (n > length)
-        n = length;
-
-      buf_xor (outbuf, inbuf, p + CHACHA20_BLOCK_SIZE - ctx->unused, n);
-      length -= n;
-      outbuf += n;
-      inbuf += n;
-      ctx->unused -= n;
-
-      if (!length)
-        return;
-      gcry_assert (!ctx->unused);
-    }
-
 #ifdef USE_AVX2
   if (ctx->use_avx2 && length >= CHACHA20_BLOCK_SIZE * 8)
     {
       size_t nblocks = length / CHACHA20_BLOCK_SIZE;
       nblocks -= nblocks % 8;
       nburn = _gcry_chacha20_amd64_avx2_blocks8(ctx->input, outbuf, inbuf,
 						nblocks);
       burn = nburn > burn ? nburn : burn;
       length -= nblocks * CHACHA20_BLOCK_SIZE;
       outbuf += nblocks * CHACHA20_BLOCK_SIZE;
       inbuf  += nblocks * CHACHA20_BLOCK_SIZE;
     }
 #endif
 
 #ifdef USE_SSSE3
   if (ctx->use_ssse3 && length >= CHACHA20_BLOCK_SIZE * 4)
     {
       size_t nblocks = length / CHACHA20_BLOCK_SIZE;
       nblocks -= nblocks % 4;
       nburn = _gcry_chacha20_amd64_ssse3_blocks4(ctx->input, outbuf, inbuf,
 						 nblocks);
       burn = nburn > burn ? nburn : burn;
       length -= nblocks * CHACHA20_BLOCK_SIZE;
       outbuf += nblocks * CHACHA20_BLOCK_SIZE;
       inbuf  += nblocks * CHACHA20_BLOCK_SIZE;
     }
 #endif
 
 #ifdef USE_ARMV7_NEON
   if (ctx->use_neon && length >= CHACHA20_BLOCK_SIZE * 4)
     {
       size_t nblocks = length / CHACHA20_BLOCK_SIZE;
       nblocks -= nblocks % 4;
       nburn = _gcry_chacha20_armv7_neon_blocks4(ctx->input, outbuf, inbuf,
 						nblocks);
       burn = nburn > burn ? nburn : burn;
       length -= nblocks * CHACHA20_BLOCK_SIZE;
       outbuf += nblocks * CHACHA20_BLOCK_SIZE;
       inbuf  += nblocks * CHACHA20_BLOCK_SIZE;
     }
 #endif
 
 #ifdef USE_AARCH64_SIMD
   if (ctx->use_neon && length >= CHACHA20_BLOCK_SIZE * 4)
     {
       size_t nblocks = length / CHACHA20_BLOCK_SIZE;
       nblocks -= nblocks % 4;
       nburn = _gcry_chacha20_aarch64_blocks4(ctx->input, outbuf, inbuf,
 					     nblocks);
       burn = nburn > burn ? nburn : burn;
       length -= nblocks * CHACHA20_BLOCK_SIZE;
       outbuf += nblocks * CHACHA20_BLOCK_SIZE;
       inbuf  += nblocks * CHACHA20_BLOCK_SIZE;
     }
 #endif
 
   if (length >= CHACHA20_BLOCK_SIZE)
     {
       size_t nblocks = length / CHACHA20_BLOCK_SIZE;
       nburn = chacha20_blocks(ctx, outbuf, inbuf, nblocks);
       burn = nburn > burn ? nburn : burn;
       length -= nblocks * CHACHA20_BLOCK_SIZE;
       outbuf += nblocks * CHACHA20_BLOCK_SIZE;
       inbuf  += nblocks * CHACHA20_BLOCK_SIZE;
     }
 
   if (length > 0)
     {
       nburn = chacha20_blocks(ctx, ctx->pad, zero_pad, 1);
       burn = nburn > burn ? nburn : burn;
 
       buf_xor (outbuf, inbuf, ctx->pad, length);
       ctx->unused = CHACHA20_BLOCK_SIZE - length;
     }
 
-  _gcry_burn_stack (burn);
+  if (burn)
+    burn += 5 * sizeof(void *);
+
+  return burn;
+}
+
+
+/* Stream-cipher entry point: XOR LENGTH bytes of INBUF with keystream.  */
+static void
+chacha20_encrypt_stream (void *context, byte *outbuf, const byte *inbuf,
+                         size_t length)
+{
+  CHACHA20_context_t *ctx = (CHACHA20_context_t *) context;
+  unsigned int burn;
+
+  if (length == 0)
+    return;
+
+  if (ctx->unused != 0)
+    {
+      /* Keystream bytes left over from the previous call sit at the end of
+       * the pad; consume those before generating any new blocks.  */
+      const unsigned char *keystream;
+      size_t take;
+
+      gcry_assert (ctx->unused < CHACHA20_BLOCK_SIZE);
+
+      keystream = ctx->pad + CHACHA20_BLOCK_SIZE - ctx->unused;
+      take = ctx->unused < length ? ctx->unused : length;
+
+      buf_xor (outbuf, inbuf, keystream, take);
+      ctx->unused -= take;
+      outbuf += take;
+      inbuf += take;
+      length -= take;
+
+      if (length == 0)
+        return;
+      gcry_assert (ctx->unused == 0);
+    }
+
+  burn = do_chacha20_encrypt_stream_tail (ctx, outbuf, inbuf, length);
+  if (burn != 0)
+    _gcry_burn_stack (burn);
+}
+
+
+/* Encrypt INBUF into OUTBUF and feed the resulting ciphertext to Poly1305.  */
+gcry_err_code_t
+_gcry_chacha20_poly1305_encrypt(gcry_cipher_hd_t c, byte *outbuf,
+				const byte *inbuf, size_t length)
+{
+  CHACHA20_context_t *ctx = (void *) &c->context.c;
+  unsigned int nburn, burn = 0;
+  byte *authptr = NULL; /* Ciphertext written but not yet authenticated.  */
+
+  if (!length)
+    return 0;
+
+  if (ctx->unused)
+    {
+      unsigned char *p = ctx->pad;
+      size_t n;
+
+      gcry_assert (ctx->unused < CHACHA20_BLOCK_SIZE);
+
+      n = ctx->unused;
+      if (n > length)
+        n = length;
+
+      buf_xor (outbuf, inbuf, p + CHACHA20_BLOCK_SIZE - ctx->unused, n);
+      nburn = _gcry_poly1305_update_burn (&c->u_mode.poly1305.ctx, outbuf, n);
+      burn = nburn > burn ? nburn : burn;
+      length -= n;
+      outbuf += n;
+      inbuf += n;
+      ctx->unused -= n;
+
+      if (!length)
+	{
+	  if (burn)
+	    _gcry_burn_stack (burn);
+	  return 0;
+	}
+      gcry_assert (!ctx->unused);
+    }
+
+  gcry_assert (c->u_mode.poly1305.ctx.leftover == 0);
+  /* Encrypt a head start of blocks; Poly1305 catches up on them below.  */
+  if (0)
+    { }
+#ifdef USE_AVX2
+  else if (ctx->use_avx2 && length >= CHACHA20_BLOCK_SIZE * 8)
+    {
+      nburn = _gcry_chacha20_amd64_avx2_blocks8(ctx->input, outbuf, inbuf, 8);
+      burn = nburn > burn ? nburn : burn;
+
+      authptr = outbuf;
+      length -= 8 * CHACHA20_BLOCK_SIZE;
+      outbuf += 8 * CHACHA20_BLOCK_SIZE;
+      inbuf  += 8 * CHACHA20_BLOCK_SIZE;
+    }
+#endif
+#ifdef USE_SSSE3
+  else if (ctx->use_ssse3 && length >= CHACHA20_BLOCK_SIZE * 4)
+    {
+      nburn = _gcry_chacha20_amd64_ssse3_blocks4(ctx->input, outbuf, inbuf, 4);
+      burn = nburn > burn ? nburn : burn;
+
+      authptr = outbuf;
+      length -= 4 * CHACHA20_BLOCK_SIZE;
+      outbuf += 4 * CHACHA20_BLOCK_SIZE;
+      inbuf  += 4 * CHACHA20_BLOCK_SIZE;
+    }
+  else if (ctx->use_ssse3 && length >= CHACHA20_BLOCK_SIZE)
+    {
+      nburn = _gcry_chacha20_amd64_ssse3_blocks1(ctx->input, outbuf, inbuf, 1);
+      burn = nburn > burn ? nburn : burn;
+
+      authptr = outbuf;
+      length -= 1 * CHACHA20_BLOCK_SIZE;
+      outbuf += 1 * CHACHA20_BLOCK_SIZE;
+      inbuf  += 1 * CHACHA20_BLOCK_SIZE;
+    }
+#endif
+
+  if (authptr)
+    {
+      size_t authoffset = outbuf - authptr;
+
+#ifdef USE_AVX2
+      if (ctx->use_avx2 &&
+	  length >= 8 * CHACHA20_BLOCK_SIZE &&
+	  authoffset >= 8 * CHACHA20_BLOCK_SIZE)
+	{
+	  size_t nblocks = length / CHACHA20_BLOCK_SIZE;
+	  nblocks -= nblocks % 8;
+
+	  nburn = _gcry_chacha20_poly1305_amd64_avx2_blocks8(
+		      ctx->input, outbuf, inbuf, nblocks,
+		      &c->u_mode.poly1305.ctx.state, authptr);
+	  burn = nburn > burn ? nburn : burn;
+
+	  length  -= nblocks * CHACHA20_BLOCK_SIZE;
+	  outbuf  += nblocks * CHACHA20_BLOCK_SIZE;
+	  inbuf   += nblocks * CHACHA20_BLOCK_SIZE;
+	  authptr += nblocks * CHACHA20_BLOCK_SIZE;
+	}
+#endif
+
+#ifdef USE_SSSE3
+      if (ctx->use_ssse3)
+	{
+	  if (length >= 4 * CHACHA20_BLOCK_SIZE &&
+	      authoffset >= 4 * CHACHA20_BLOCK_SIZE)
+	    {
+	      size_t nblocks = length / CHACHA20_BLOCK_SIZE;
+	      nblocks -= nblocks % 4;
+
+	      nburn = _gcry_chacha20_poly1305_amd64_ssse3_blocks4(
+			  ctx->input, outbuf, inbuf, nblocks,
+			  &c->u_mode.poly1305.ctx.state, authptr);
+	      burn = nburn > burn ? nburn : burn;
+
+	      length  -= nblocks * CHACHA20_BLOCK_SIZE;
+	      outbuf  += nblocks * CHACHA20_BLOCK_SIZE;
+	      inbuf   += nblocks * CHACHA20_BLOCK_SIZE;
+	      authptr += nblocks * CHACHA20_BLOCK_SIZE;
+	    }
+
+	  if (length >= CHACHA20_BLOCK_SIZE &&
+	      authoffset >= CHACHA20_BLOCK_SIZE)
+	    {
+	      size_t nblocks = length / CHACHA20_BLOCK_SIZE;
+
+	      nburn = _gcry_chacha20_poly1305_amd64_ssse3_blocks1(
+			  ctx->input, outbuf, inbuf, nblocks,
+			  &c->u_mode.poly1305.ctx.state, authptr);
+	      burn = nburn > burn ? nburn : burn;
+
+	      length  -= nblocks * CHACHA20_BLOCK_SIZE;
+	      outbuf  += nblocks * CHACHA20_BLOCK_SIZE;
+	      inbuf   += nblocks * CHACHA20_BLOCK_SIZE;
+	      authptr += nblocks * CHACHA20_BLOCK_SIZE;
+	    }
+	}
+#endif
+      /* Authenticate the ciphertext that Poly1305 is still lagging behind.  */
+      if (authoffset > 0)
+	{
+	  _gcry_poly1305_update (&c->u_mode.poly1305.ctx, authptr, authoffset);
+	  authptr += authoffset;
+	  authoffset = 0;
+	}
+
+      gcry_assert(authptr == outbuf);
+    }
+
+  while (length)
+    {
+      size_t currlen = length;
+
+      /* Checksumming follows encryption, so work in chunks of at most 24KiB
+       * (CURRLEN) to keep the data in L1 cache; never pass all of LENGTH. */
+      if (currlen > 24 * 1024)
+	currlen = 24 * 1024;
+
+      nburn = do_chacha20_encrypt_stream_tail (ctx, outbuf, inbuf, currlen);
+      burn = nburn > burn ? nburn : burn;
+
+      nburn = _gcry_poly1305_update_burn (&c->u_mode.poly1305.ctx, outbuf,
+					  currlen);
+      burn = nburn > burn ? nburn : burn;
+
+      outbuf += currlen;
+      inbuf += currlen;
+      length -= currlen;
+    }
+
+  if (burn)
+    _gcry_burn_stack (burn);
+
+  return 0;
+}
+
+
+/* Decrypt INBUF into OUTBUF, feeding the ciphertext (INBUF) to Poly1305.  */
+gcry_err_code_t
+_gcry_chacha20_poly1305_decrypt(gcry_cipher_hd_t c, byte *outbuf,
+				const byte *inbuf, size_t length)
+{
+  CHACHA20_context_t *ctx = (void *) &c->context.c;
+  unsigned int nburn, burn = 0;
+
+  if (!length)
+    return 0;
+
+  if (ctx->unused)
+    {
+      unsigned char *p = ctx->pad;
+      size_t n;
+
+      gcry_assert (ctx->unused < CHACHA20_BLOCK_SIZE);
+
+      n = ctx->unused;
+      if (n > length)
+        n = length;
+
+      nburn = _gcry_poly1305_update_burn (&c->u_mode.poly1305.ctx, inbuf, n);
+      burn = nburn > burn ? nburn : burn;
+      buf_xor (outbuf, inbuf, p + CHACHA20_BLOCK_SIZE - ctx->unused, n);
+      length -= n;
+      outbuf += n;
+      inbuf += n;
+      ctx->unused -= n;
+
+      if (!length)
+	{
+	  if (burn)
+	    _gcry_burn_stack (burn);
+	  return 0;
+	}
+      gcry_assert (!ctx->unused);
+    }
+
+  gcry_assert (c->u_mode.poly1305.ctx.leftover == 0);
+
+#ifdef USE_AVX2
+  if (ctx->use_avx2 && length >= 8 * CHACHA20_BLOCK_SIZE)
+    {
+      size_t nblocks = length / CHACHA20_BLOCK_SIZE;
+      nblocks -= nblocks % 8;
+
+      nburn = _gcry_chacha20_poly1305_amd64_avx2_blocks8(
+			ctx->input, outbuf, inbuf, nblocks,
+			&c->u_mode.poly1305.ctx.state, inbuf);
+      burn = nburn > burn ? nburn : burn;
+
+      length -= nblocks * CHACHA20_BLOCK_SIZE;
+      outbuf += nblocks * CHACHA20_BLOCK_SIZE;
+      inbuf  += nblocks * CHACHA20_BLOCK_SIZE;
+    }
+#endif
+
+#ifdef USE_SSSE3
+  if (ctx->use_ssse3)
+    {
+      if (length >= 4 * CHACHA20_BLOCK_SIZE)
+	{
+	  size_t nblocks = length / CHACHA20_BLOCK_SIZE;
+	  nblocks -= nblocks % 4;
+
+	  nburn = _gcry_chacha20_poly1305_amd64_ssse3_blocks4(
+			    ctx->input, outbuf, inbuf, nblocks,
+			    &c->u_mode.poly1305.ctx.state, inbuf);
+	  burn = nburn > burn ? nburn : burn;
+
+	  length -= nblocks * CHACHA20_BLOCK_SIZE;
+	  outbuf += nblocks * CHACHA20_BLOCK_SIZE;
+	  inbuf  += nblocks * CHACHA20_BLOCK_SIZE;
+	}
+
+      if (length >= CHACHA20_BLOCK_SIZE)
+	{
+	  size_t nblocks = length / CHACHA20_BLOCK_SIZE;
+
+	  nburn = _gcry_chacha20_poly1305_amd64_ssse3_blocks1(
+			    ctx->input, outbuf, inbuf, nblocks,
+			    &c->u_mode.poly1305.ctx.state, inbuf);
+	  burn = nburn > burn ? nburn : burn;
+
+	  length -= nblocks * CHACHA20_BLOCK_SIZE;
+	  outbuf += nblocks * CHACHA20_BLOCK_SIZE;
+	  inbuf  += nblocks * CHACHA20_BLOCK_SIZE;
+	}
+    }
+#endif
+
+  while (length)
+    {
+      size_t currlen = length;
+
+      /* Checksumming precedes decryption, so work in chunks of at most 24KiB
+       * (CURRLEN) to keep the data in L1 cache; never pass all of LENGTH. */
+      if (currlen > 24 * 1024)
+	currlen = 24 * 1024;
+
+      nburn = _gcry_poly1305_update_burn (&c->u_mode.poly1305.ctx, inbuf,
+					  currlen);
+      burn = nburn > burn ? nburn : burn;
+
+      nburn = do_chacha20_encrypt_stream_tail (ctx, outbuf, inbuf, currlen);
+      burn = nburn > burn ? nburn : burn;
+
+      outbuf += currlen;
+      inbuf += currlen;
+      length -= currlen;
+    }
+
+  if (burn)
+    _gcry_burn_stack (burn);
+
+  return 0;
 }
 
 
 static const char *
 selftest (void)
 {
   byte ctxbuf[sizeof(CHACHA20_context_t) + 15];
   CHACHA20_context_t *ctx;
   byte scratch[127 + 1];
   byte buf[512 + 64 + 4];
   int i;
 
   /* From draft-strombergson-chacha-test-vectors */
   static byte key_1[] = {
     0xc4, 0x6e, 0xc1, 0xb1, 0x8c, 0xe8, 0xa8, 0x78,
     0x72, 0x5a, 0x37, 0xe7, 0x80, 0xdf, 0xb7, 0x35,
     0x1f, 0x68, 0xed, 0x2e, 0x19, 0x4c, 0x79, 0xfb,
     0xc6, 0xae, 0xbe, 0xe1, 0xa6, 0x67, 0x97, 0x5d
   };
   static const byte nonce_1[] =
     { 0x1a, 0xda, 0x31, 0xd5, 0xcf, 0x68, 0x82, 0x21 };
   static const byte plaintext_1[127] = {
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   };
   static const byte ciphertext_1[127] = {
     0xf6, 0x3a, 0x89, 0xb7, 0x5c, 0x22, 0x71, 0xf9,
     0x36, 0x88, 0x16, 0x54, 0x2b, 0xa5, 0x2f, 0x06,
     0xed, 0x49, 0x24, 0x17, 0x92, 0x30, 0x2b, 0x00,
     0xb5, 0xe8, 0xf8, 0x0a, 0xe9, 0xa4, 0x73, 0xaf,
     0xc2, 0x5b, 0x21, 0x8f, 0x51, 0x9a, 0xf0, 0xfd,
     0xd4, 0x06, 0x36, 0x2e, 0x8d, 0x69, 0xde, 0x7f,
     0x54, 0xc6, 0x04, 0xa6, 0xe0, 0x0f, 0x35, 0x3f,
     0x11, 0x0f, 0x77, 0x1b, 0xdc, 0xa8, 0xab, 0x92,
     0xe5, 0xfb, 0xc3, 0x4e, 0x60, 0xa1, 0xd9, 0xa9,
     0xdb, 0x17, 0x34, 0x5b, 0x0a, 0x40, 0x27, 0x36,
     0x85, 0x3b, 0xf9, 0x10, 0xb0, 0x60, 0xbd, 0xf1,
     0xf8, 0x97, 0xb6, 0x29, 0x0f, 0x01, 0xd1, 0x38,
     0xae, 0x2c, 0x4c, 0x90, 0x22, 0x5b, 0xa9, 0xea,
     0x14, 0xd5, 0x18, 0xf5, 0x59, 0x29, 0xde, 0xa0,
     0x98, 0xca, 0x7a, 0x6c, 0xcf, 0xe6, 0x12, 0x27,
     0x05, 0x3c, 0x84, 0xe4, 0x9a, 0x4a, 0x33
   };
 
   /* 16-byte alignment required for amd64 implementation. */
   ctx = (CHACHA20_context_t *)((uintptr_t)(ctxbuf + 15) & ~(uintptr_t)15);
 
   chacha20_setkey (ctx, key_1, sizeof key_1, NULL);
   chacha20_setiv (ctx, nonce_1, sizeof nonce_1);
   scratch[sizeof (scratch) - 1] = 0;
   chacha20_encrypt_stream (ctx, scratch, plaintext_1, sizeof plaintext_1);
   if (memcmp (scratch, ciphertext_1, sizeof ciphertext_1))
     return "ChaCha20 encryption test 1 failed.";
   if (scratch[sizeof (scratch) - 1])
     return "ChaCha20 wrote too much.";
   chacha20_setkey (ctx, key_1, sizeof (key_1), NULL);
   chacha20_setiv (ctx, nonce_1, sizeof nonce_1);
   chacha20_encrypt_stream (ctx, scratch, scratch, sizeof plaintext_1);
   if (memcmp (scratch, plaintext_1, sizeof plaintext_1))
     return "ChaCha20 decryption test 1 failed.";
 
   for (i = 0; i < sizeof buf; i++)
     buf[i] = i;
   chacha20_setkey (ctx, key_1, sizeof key_1, NULL);
   chacha20_setiv (ctx, nonce_1, sizeof nonce_1);
   /*encrypt */
   chacha20_encrypt_stream (ctx, buf, buf, sizeof buf);
   /*decrypt */
   chacha20_setkey (ctx, key_1, sizeof key_1, NULL);
   chacha20_setiv (ctx, nonce_1, sizeof nonce_1);
   chacha20_encrypt_stream (ctx, buf, buf, 1);
   chacha20_encrypt_stream (ctx, buf + 1, buf + 1, (sizeof buf) - 1 - 1);
   chacha20_encrypt_stream (ctx, buf + (sizeof buf) - 1,
                            buf + (sizeof buf) - 1, 1);
   for (i = 0; i < sizeof buf; i++)
     if (buf[i] != (byte) i)
       return "ChaCha20 encryption test 2 failed.";
 
   chacha20_setkey (ctx, key_1, sizeof key_1, NULL);
   chacha20_setiv (ctx, nonce_1, sizeof nonce_1);
   /* encrypt */
   for (i = 0; i < sizeof buf; i++)
     chacha20_encrypt_stream (ctx, &buf[i], &buf[i], 1);
   /* decrypt */
   chacha20_setkey (ctx, key_1, sizeof key_1, NULL);
   chacha20_setiv (ctx, nonce_1, sizeof nonce_1);
   chacha20_encrypt_stream (ctx, buf, buf, sizeof buf);
   for (i = 0; i < sizeof buf; i++)
     if (buf[i] != (byte) i)
       return "ChaCha20 encryption test 3 failed.";
 
   return NULL;
 }
 
 
 gcry_cipher_spec_t _gcry_cipher_spec_chacha20 = {
   GCRY_CIPHER_CHACHA20,
   {0, 0},                       /* flags */
   "CHACHA20",                   /* name */
   NULL,                         /* aliases */
   NULL,                         /* oids */
   1,                            /* blocksize in bytes. */
   CHACHA20_MAX_KEY_SIZE * 8,    /* standard key length in bits. */
   sizeof (CHACHA20_context_t),
   chacha20_setkey,
   NULL,
   NULL,
   chacha20_encrypt_stream,
   chacha20_encrypt_stream,
   NULL,
   NULL,
   chacha20_setiv
 };
diff --git a/cipher/cipher-internal.h b/cipher/cipher-internal.h
index 89886962..78f05dbb 100644
--- a/cipher/cipher-internal.h
+++ b/cipher/cipher-internal.h
@@ -1,749 +1,758 @@
 /* cipher-internal.h  - Internal defs for cipher.c
  * Copyright (C) 2011 Free Software Foundation, Inc.
  *
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
  * Libgcrypt is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
  * License along with this program; if not, see <http://www.gnu.org/licenses/>.
  */
 
 #ifndef G10_CIPHER_INTERNAL_H
 #define G10_CIPHER_INTERNAL_H
 
 #include "./poly1305-internal.h"
 
 
 /* The maximum supported size of a block in bytes.  */
 #define MAX_BLOCKSIZE 16
 
 /* The length for an OCB block.  Although OCB supports any block
    length it does not make sense to use a 64 bit blocklen (and cipher)
    because this reduces the security margin to an unacceptable state.
    Thus we require a cipher with 128 bit blocklength.  */
 #define OCB_BLOCK_LEN  (128/8)
 
 /* The size of the pre-computed L table for OCB.  This takes the same
    size as the table used for GCM and thus we don't save anything by
    not using such a table.  */
 #define OCB_L_TABLE_SIZE 16
 
 
 /* Check the above constants: buffers in this header that are sized
    with MAX_BLOCKSIZE must be able to hold a full OCB block.  */
 #if OCB_BLOCK_LEN > MAX_BLOCKSIZE
 # error OCB_BLOCK_LEN > MAX_BLOCKSIZE
 #endif
 
 
 
 /* Magic values for the context structure.  NOTE(review): presumably
    stored in the `magic' member of struct gcry_cipher_handle to tell
    the two kinds of handle apart -- confirm in cipher.c.  */
 #define CTX_MAGIC_NORMAL 0x24091964
 #define CTX_MAGIC_SECURE 0x46919042
 
 /* Try to use 16 byte aligned cipher context for better performance.
    We use the aligned attribute, thus it is only possible to implement
    this with gcc.  When NEED_16BYTE_ALIGNED_CONTEXT is defined,
    cipher_context_alignment_t gains a 16 byte aligned member.  */
 #undef NEED_16BYTE_ALIGNED_CONTEXT
 #ifdef HAVE_GCC_ATTRIBUTE_ALIGNED
 # define NEED_16BYTE_ALIGNED_CONTEXT 1
 #endif
 
 /* Undef this symbol to trade GCM speed for 256 bytes of memory per context.
    The GCM_USE_INTEL_PCLMUL and GCM_USE_ARM_PMULL checks below also
    require this symbol to be defined.  */
 #define GCM_USE_TABLES 1
 
 
 /* GCM_USE_INTEL_PCLMUL indicates whether to compile GCM with Intel PCLMUL
    code.  It is defined only if PCLMUL support is enabled, GCM tables
    are in use, the target is i386 (with a 4 byte unsigned long) or
    x86_64, and the compiler reports __GNUC__ >= 4.  */
 #undef GCM_USE_INTEL_PCLMUL
 #if defined(ENABLE_PCLMUL_SUPPORT) && defined(GCM_USE_TABLES)
 # if ((defined(__i386__) && SIZEOF_UNSIGNED_LONG == 4) || defined(__x86_64__))
 #  if __GNUC__ >= 4
 #   define GCM_USE_INTEL_PCLMUL 1
 #  endif
 # endif
 #endif /* ENABLE_PCLMUL_SUPPORT && GCM_USE_TABLES */
 
 /* GCM_USE_ARM_PMULL indicates whether to compile GCM with ARMv8 PMULL code.
    It is defined only if ARM crypto support is enabled, GCM tables are
    in use, and the configure checks for either the little-endian ARM
    (AArch32) or the little-endian AArch64 assembler and inline crypto
    assembly support succeeded.  */
 #undef GCM_USE_ARM_PMULL
 #if defined(ENABLE_ARM_CRYPTO_SUPPORT) && defined(GCM_USE_TABLES)
 # if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) \
      && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) \
      && defined(HAVE_GCC_INLINE_ASM_AARCH32_CRYPTO)
 #  define GCM_USE_ARM_PMULL 1
 # elif defined(__AARCH64EL__) && \
     defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \
     defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO)
 #  define GCM_USE_ARM_PMULL 1
 # endif
 #endif /* ENABLE_ARM_CRYPTO_SUPPORT && GCM_USE_TABLES */
 
 
 /* Signature of a GHASH implementation; the one in use is kept in the
    ghash_fn member of the GCM mode storage of the cipher handle.
    NOTE(review): the meaning of the unsigned int return value is not
    visible in this header -- confirm in cipher-gcm.c.  */
 typedef unsigned int (*ghash_fn_t) (gcry_cipher_hd_t c, byte *result,
                                     const byte *buf, size_t nblocks);
 
 
 /* A VIA processor with the Padlock engine as well as the Intel AES_NI
    instructions require an alignment of most data on a 16 byte
    boundary.  Because we trick out the compiler while allocating the
    context, the align attribute as used in rijndael.c does not work on
    its own.  Thus we need to make sure that the entire context
    structure is aligned on that boundary.  We achieve this by
    defining a new type and use that instead of our usual alignment
    type.  */
 typedef union
 {
   PROPERLY_ALIGNED_TYPE foo;
 #ifdef NEED_16BYTE_ALIGNED_CONTEXT
   /* Only present when the compiler supports the aligned attribute.  */
   char bar[16] __attribute__ ((aligned (16)));
 #endif
   char c[1];
 } cipher_context_alignment_t;
 
 
 /* Storage structure for CMAC state.  It is used for the CMAC mode and
    twice (header and ciphertext) for the EAX mode in the mode specific
    storage of the cipher handle.  */
 typedef struct {
   /* The initialization vector. Also contains tag after finalization. */
   union {
     cipher_context_alignment_t iv_align;
     unsigned char iv[MAX_BLOCKSIZE];
   } u_iv;
 
   /* Subkeys for tag creation, not cleared by gcry_cipher_reset. */
   unsigned char subkeys[2][MAX_BLOCKSIZE];
 
   /* Space to save partial input lengths for MAC. */
   unsigned char macbuf[MAX_BLOCKSIZE];
 
   int mac_unused;  /* Number of unprocessed bytes in MACBUF. */
   unsigned int tag:1; /* Set to 1 if tag has been finalized.  */
 } gcry_cmac_context_t;
 
 
 /* The handle structure.  It holds the bookkeeping for one open
    cipher: the algorithm spec, per-mode and bulk function pointers,
    IV/counter buffers, a union with the storage of the individual
    modes and, as last member, the start of the cipher contexts.  */
 struct gcry_cipher_handle
 {
   /* NOTE(review): presumably one of the CTX_MAGIC_* values -- confirm
      in cipher.c.  */
   int magic;
   size_t actual_handle_size;     /* Allocated size of this handle. */
   size_t handle_offset;          /* Offset to the malloced block.  */
   gcry_cipher_spec_t *spec;
 
   /* The algorithm id.  This is a hack required because the module
      interface does not easily allow to retrieve this value. */
   int algo;
 
   /* A structure with function pointers for mode operations. */
   struct {
     gcry_err_code_t (*encrypt)(gcry_cipher_hd_t c,
                                unsigned char *outbuf, size_t outbuflen,
                                const unsigned char *inbuf, size_t inbuflen);
     gcry_err_code_t (*decrypt)(gcry_cipher_hd_t c,
                                unsigned char *outbuf, size_t outbuflen,
                                const unsigned char *inbuf, size_t inbuflen);
     gcry_err_code_t (*setiv)(gcry_cipher_hd_t c, const unsigned char *iv,
                              size_t ivlen);
 
     gcry_err_code_t (*authenticate)(gcry_cipher_hd_t c,
                                     const unsigned char *abuf, size_t abuflen);
     gcry_err_code_t (*get_tag)(gcry_cipher_hd_t c, unsigned char *outtag,
                                size_t taglen);
     gcry_err_code_t (*check_tag)(gcry_cipher_hd_t c, const unsigned char *intag,
                                  size_t taglen);
   } mode_ops;
 
   /* A structure with function pointers for bulk operations.  Due to
      limitations of the module system (we don't want to change the
      API) we need to keep these function pointers here.  The cipher
      open function initializes them and the actual encryption routines
      use them if they are not NULL.  */
   struct {
     void (*cfb_enc)(void *context, unsigned char *iv,
                     void *outbuf_arg, const void *inbuf_arg,
                     size_t nblocks);
     void (*cfb_dec)(void *context, unsigned char *iv,
                     void *outbuf_arg, const void *inbuf_arg,
                     size_t nblocks);
     void (*cbc_enc)(void *context, unsigned char *iv,
                     void *outbuf_arg, const void *inbuf_arg,
                     size_t nblocks, int cbc_mac);
     void (*cbc_dec)(void *context, unsigned char *iv,
                     void *outbuf_arg, const void *inbuf_arg,
                     size_t nblocks);
     void (*ctr_enc)(void *context, unsigned char *iv,
                     void *outbuf_arg, const void *inbuf_arg,
                     size_t nblocks);
     size_t (*ocb_crypt)(gcry_cipher_hd_t c, void *outbuf_arg,
 			const void *inbuf_arg, size_t nblocks, int encrypt);
     size_t (*ocb_auth)(gcry_cipher_hd_t c, const void *abuf_arg,
 		       size_t nblocks);
     void (*xts_crypt)(void *context, unsigned char *tweak,
 		      void *outbuf_arg, const void *inbuf_arg,
 		      size_t nblocks, int encrypt);
   } bulk;
 
 
   /* NOTE(review): presumably the mode and flags the handle was opened
      with -- confirm in cipher.c.  */
   int mode;
   unsigned int flags;
 
   struct {
     unsigned int key:1; /* Set to 1 if a key has been set.  */
     unsigned int iv:1;  /* Set to 1 if an IV has been set.  */
     unsigned int tag:1; /* Set to 1 if a tag is finalized. */
     unsigned int finalize:1; /* Next encrypt/decrypt has the final data.  */
   } marks;
 
   /* The initialization vector.  For best performance we make sure
      that it is properly aligned.  In particular some implementations
      of bulk operations expect a 16 byte aligned IV.  IV is also used
      to store CBC-MAC in CCM mode; counter IV is stored in U_CTR.  For
      OCB mode it is used for the offset value.  */
   union {
     cipher_context_alignment_t iv_align;
     unsigned char iv[MAX_BLOCKSIZE];
   } u_iv;
 
   /* The counter for CTR mode.  This field is also used by AESWRAP and
      thus we can't use the U_IV union.  For OCB mode it is used for
      the checksum.  */
   union {
     cipher_context_alignment_t iv_align;
     unsigned char ctr[MAX_BLOCKSIZE];
   } u_ctr;
 
   /* Space to save an IV or CTR for chaining operations.  */
   unsigned char lastiv[MAX_BLOCKSIZE];
   int unused;  /* Number of unused bytes in LASTIV. */
 
   /* Mode specific storage; the members of this union share memory.  */
   union {
     /* Mode specific storage for CCM mode. */
     struct {
       u64 encryptlen;
       u64 aadlen;
       unsigned int authlen;
 
       /* Space to save partial input lengths for MAC. */
       unsigned char macbuf[GCRY_CCM_BLOCK_LEN];
       int mac_unused;  /* Number of unprocessed bytes in MACBUF. */
 
       unsigned char s0[GCRY_CCM_BLOCK_LEN];
 
       unsigned int nonce:1; /* Set to 1 if nonce has been set.  */
       unsigned int lengths:1; /* Set to 1 if CCM length parameters have been
                                  processed.  */
     } ccm;
 
     /* Mode specific storage for Poly1305 mode. */
     struct {
       /* byte counter for AAD. */
       u32 aadcount[2];
 
       /* byte counter for data. */
       u32 datacount[2];
 
       unsigned int aad_finalized:1;
       unsigned int bytecount_over_limits:1;
 
       poly1305_context_t ctx;
     } poly1305;
 
     /* Mode specific storage for CMAC mode. */
     gcry_cmac_context_t cmac;
 
     /* Mode specific storage for EAX mode. */
     struct {
       /* CMAC for header (AAD). */
       gcry_cmac_context_t cmac_header;
 
       /* CMAC for ciphertext. */
       gcry_cmac_context_t cmac_ciphertext;
     } eax;
 
     /* Mode specific storage for GCM mode. */
     struct {
       /* The interim tag for GCM mode.  */
       union {
         cipher_context_alignment_t iv_align;
         unsigned char tag[MAX_BLOCKSIZE];
       } u_tag;
 
       /* Space to save partial input lengths for MAC.
          NOTE(review): sized with the CCM block length constant
          although this is GCM storage -- confirm this is intended.  */
       unsigned char macbuf[GCRY_CCM_BLOCK_LEN];
       int mac_unused;  /* Number of unprocessed bytes in MACBUF. */
 
       /* byte counters for GCM */
       u32 aadlen[2];
       u32 datalen[2];
 
       /* encrypted tag counter */
       unsigned char tagiv[MAX_BLOCKSIZE];
 
       unsigned int ghash_data_finalized:1;
       unsigned int ghash_aad_finalized:1;
 
       unsigned int datalen_over_limits:1;
       unsigned int disallow_encryption_because_of_setiv_in_fips_mode:1;
 
       /* --- Following members are not cleared in gcry_cipher_reset --- */
 
       /* GHASH multiplier from key.  */
       union {
         cipher_context_alignment_t iv_align;
         unsigned char key[MAX_BLOCKSIZE];
       } u_ghash_key;
 
       /* GHASH implementation in use. */
       ghash_fn_t ghash_fn;
 
       /* Pre-calculated table for GCM.  Both variants below occupy
          256 bytes (32 u64 or 64 u32 entries).  */
 #ifdef GCM_USE_TABLES
  #if (SIZEOF_UNSIGNED_LONG == 8 || defined(__x86_64__))
       #define GCM_TABLES_USE_U64 1
       u64 gcm_table[2 * 16];
  #else
       #undef GCM_TABLES_USE_U64
       u32 gcm_table[4 * 16];
  #endif
 #endif
     } gcm;
 
     /* Mode specific storage for OCB mode. */
     struct {
       /* Helper variables and pre-computed table of L values.  */
       unsigned char L_star[OCB_BLOCK_LEN];
       unsigned char L_dollar[OCB_BLOCK_LEN];
       unsigned char L0L1[OCB_BLOCK_LEN];
       unsigned char L0L1L0[OCB_BLOCK_LEN];
       unsigned char L[OCB_L_TABLE_SIZE][OCB_BLOCK_LEN];
 
       /* The tag is valid if marks.tag has been set.  */
       unsigned char tag[OCB_BLOCK_LEN];
 
       /* A buffer to hold the offset for the AAD processing.  */
       unsigned char aad_offset[OCB_BLOCK_LEN];
 
       /* A buffer to hold the current sum of AAD processing.  We can't
          use tag here because tag may already hold the preprocessed
          checksum of the data.  */
       unsigned char aad_sum[OCB_BLOCK_LEN];
 
       /* A buffer to store AAD data not yet processed.  */
       unsigned char aad_leftover[OCB_BLOCK_LEN];
 
       /* Number of data/aad blocks processed so far.  */
       u64 data_nblocks;
       u64 aad_nblocks;
 
       /* Number of valid bytes in AAD_LEFTOVER.  */
       unsigned char aad_nleftover;
 
       /* Length of the tag.  Fixed for now but may eventually be
          specified using a set of gcry_cipher_flags.  */
       unsigned char taglen;
 
       /* Flags indicating that the final data/aad block has been
          processed.  */
       unsigned int data_finalized:1;
       unsigned int aad_finalized:1;
     } ocb;
 
     /* Mode specific storage for XTS mode. */
     struct {
       /* Pointer to tweak cipher context, allocated after actual
        * cipher context. */
       char *tweak_context;
     } xts;
   } u_mode;
 
   /* What follows are two contexts of the cipher in use.  The first
      one needs to be aligned well enough for the cipher operation
      whereas the second one is a copy created by cipher_setkey and
      used by cipher_reset.  That second copy has no need for proper
      alignment because it is only accessed by memcpy.  */
   cipher_context_alignment_t context;
 };
 
 
 /* Prototypes of the mode implementations, grouped by source file.
    Most encrypt/decrypt, setiv/set_nonce, authenticate, get_tag and
    check_tag functions follow the parameter shape of the matching
    function pointer in the mode_ops member of struct
    gcry_cipher_handle.  */
 
 /*-- cipher-cbc.c --*/
 gcry_err_code_t _gcry_cipher_cbc_encrypt
 /*           */ (gcry_cipher_hd_t c,
                  unsigned char *outbuf, size_t outbuflen,
                  const unsigned char *inbuf, size_t inbuflen);
 gcry_err_code_t _gcry_cipher_cbc_decrypt
 /*           */ (gcry_cipher_hd_t c,
                  unsigned char *outbuf, size_t outbuflen,
                  const unsigned char *inbuf, size_t inbuflen);
 gcry_err_code_t _gcry_cipher_cbc_cts_encrypt
 /*           */ (gcry_cipher_hd_t c,
                  unsigned char *outbuf, size_t outbuflen,
                  const unsigned char *inbuf, size_t inbuflen);
 gcry_err_code_t _gcry_cipher_cbc_cts_decrypt
 /*           */ (gcry_cipher_hd_t c,
                  unsigned char *outbuf, size_t outbuflen,
                  const unsigned char *inbuf, size_t inbuflen);
 
 /*-- cipher-cfb.c --*/
 gcry_err_code_t _gcry_cipher_cfb_encrypt
 /*           */ (gcry_cipher_hd_t c,
                  unsigned char *outbuf, size_t outbuflen,
                  const unsigned char *inbuf, size_t inbuflen);
 gcry_err_code_t _gcry_cipher_cfb_decrypt
 /*           */ (gcry_cipher_hd_t c,
                  unsigned char *outbuf, size_t outbuflen,
                  const unsigned char *inbuf, size_t inbuflen);
 gcry_err_code_t _gcry_cipher_cfb8_encrypt
 /*           */ (gcry_cipher_hd_t c,
                  unsigned char *outbuf, size_t outbuflen,
                  const unsigned char *inbuf, size_t inbuflen);
 gcry_err_code_t _gcry_cipher_cfb8_decrypt
 /*           */ (gcry_cipher_hd_t c,
                  unsigned char *outbuf, size_t outbuflen,
                  const unsigned char *inbuf, size_t inbuflen);
 
 
 /*-- cipher-ofb.c --*/
 /* Only an encrypt entry point is declared for OFB.  */
 gcry_err_code_t _gcry_cipher_ofb_encrypt
 /*           */ (gcry_cipher_hd_t c,
                  unsigned char *outbuf, size_t outbuflen,
                  const unsigned char *inbuf, size_t inbuflen);
 
 /*-- cipher-ctr.c --*/
 /* Only an encrypt entry point is declared for CTR.  */
 gcry_err_code_t _gcry_cipher_ctr_encrypt
 /*           */ (gcry_cipher_hd_t c,
                  unsigned char *outbuf, size_t outbuflen,
                  const unsigned char *inbuf, size_t inbuflen);
 
 
 /*-- cipher-aeswrap.c --*/
 gcry_err_code_t _gcry_cipher_aeswrap_encrypt
 /*           */   (gcry_cipher_hd_t c,
                    byte *outbuf, size_t outbuflen,
                    const byte *inbuf, size_t inbuflen);
 gcry_err_code_t _gcry_cipher_aeswrap_decrypt
 /*           */   (gcry_cipher_hd_t c,
                    byte *outbuf, size_t outbuflen,
                    const byte *inbuf, size_t inbuflen);
 
 
 /*-- cipher-ccm.c --*/
 gcry_err_code_t _gcry_cipher_ccm_encrypt
 /*           */ (gcry_cipher_hd_t c,
                  unsigned char *outbuf, size_t outbuflen,
                  const unsigned char *inbuf, size_t inbuflen);
 gcry_err_code_t _gcry_cipher_ccm_decrypt
 /*           */ (gcry_cipher_hd_t c,
                  unsigned char *outbuf, size_t outbuflen,
                  const unsigned char *inbuf, size_t inbuflen);
 gcry_err_code_t _gcry_cipher_ccm_set_nonce
 /*           */ (gcry_cipher_hd_t c, const unsigned char *nonce,
                  size_t noncelen);
 gcry_err_code_t _gcry_cipher_ccm_authenticate
 /*           */ (gcry_cipher_hd_t c, const unsigned char *abuf, size_t abuflen);
 gcry_err_code_t _gcry_cipher_ccm_set_lengths
 /*           */ (gcry_cipher_hd_t c, u64 encryptedlen, u64 aadlen, u64 taglen);
 gcry_err_code_t _gcry_cipher_ccm_get_tag
 /*           */ (gcry_cipher_hd_t c,
                  unsigned char *outtag, size_t taglen);
 gcry_err_code_t _gcry_cipher_ccm_check_tag
 /*           */ (gcry_cipher_hd_t c,
                  const unsigned char *intag, size_t taglen);
 
 
 /*-- cipher-cmac.c --*/
 /* These take the CMAC state explicitly as a gcry_cmac_context_t
    pointer in addition to (or instead of) the cipher handle.  */
 gcry_err_code_t _gcry_cmac_generate_subkeys
 /*           */ (gcry_cipher_hd_t c, gcry_cmac_context_t *ctx);
 gcry_err_code_t _gcry_cmac_write
 /*           */ (gcry_cipher_hd_t c, gcry_cmac_context_t *ctx,
 		 const byte * inbuf, size_t inlen);
 gcry_err_code_t _gcry_cmac_final
 /*           */ (gcry_cipher_hd_t c, gcry_cmac_context_t *ctx);
 void _gcry_cmac_reset (gcry_cmac_context_t *ctx);
 
 
 /*-- cipher-eax.c --*/
 gcry_err_code_t _gcry_cipher_eax_encrypt
 /*           */   (gcry_cipher_hd_t c,
                    unsigned char *outbuf, size_t outbuflen,
                    const unsigned char *inbuf, size_t inbuflen);
 gcry_err_code_t _gcry_cipher_eax_decrypt
 /*           */   (gcry_cipher_hd_t c,
                    unsigned char *outbuf, size_t outbuflen,
                    const unsigned char *inbuf, size_t inbuflen);
 gcry_err_code_t _gcry_cipher_eax_set_nonce
 /*           */   (gcry_cipher_hd_t c,
                    const unsigned char *nonce, size_t noncelen);
 gcry_err_code_t _gcry_cipher_eax_authenticate
 /*           */   (gcry_cipher_hd_t c,
                    const unsigned char *aadbuf, size_t aadbuflen);
 gcry_err_code_t _gcry_cipher_eax_get_tag
 /*           */   (gcry_cipher_hd_t c,
                    unsigned char *outtag, size_t taglen);
 gcry_err_code_t _gcry_cipher_eax_check_tag
 /*           */   (gcry_cipher_hd_t c,
                    const unsigned char *intag, size_t taglen);
 gcry_err_code_t _gcry_cipher_eax_setkey
 /*           */   (gcry_cipher_hd_t c);
 
 
 /*-- cipher-gcm.c --*/
 gcry_err_code_t _gcry_cipher_gcm_encrypt
 /*           */   (gcry_cipher_hd_t c,
                    unsigned char *outbuf, size_t outbuflen,
                    const unsigned char *inbuf, size_t inbuflen);
 gcry_err_code_t _gcry_cipher_gcm_decrypt
 /*           */   (gcry_cipher_hd_t c,
                    unsigned char *outbuf, size_t outbuflen,
                    const unsigned char *inbuf, size_t inbuflen);
 gcry_err_code_t _gcry_cipher_gcm_setiv
 /*           */   (gcry_cipher_hd_t c,
                    const unsigned char *iv, size_t ivlen);
 gcry_err_code_t _gcry_cipher_gcm_authenticate
 /*           */   (gcry_cipher_hd_t c,
                    const unsigned char *aadbuf, size_t aadbuflen);
 gcry_err_code_t _gcry_cipher_gcm_get_tag
 /*           */   (gcry_cipher_hd_t c,
                    unsigned char *outtag, size_t taglen);
 gcry_err_code_t _gcry_cipher_gcm_check_tag
 /*           */   (gcry_cipher_hd_t c,
                    const unsigned char *intag, size_t taglen);
 /* Unlike _gcry_cipher_eax_setkey, this setkey hook returns void.  */
 void _gcry_cipher_gcm_setkey
 /*           */   (gcry_cipher_hd_t c);
 
 
 /*-- cipher-poly1305.c --*/
 gcry_err_code_t _gcry_cipher_poly1305_encrypt
 /*           */   (gcry_cipher_hd_t c,
                    unsigned char *outbuf, size_t outbuflen,
                    const unsigned char *inbuf, size_t inbuflen);
 gcry_err_code_t _gcry_cipher_poly1305_decrypt
 /*           */   (gcry_cipher_hd_t c,
                    unsigned char *outbuf, size_t outbuflen,
                    const unsigned char *inbuf, size_t inbuflen);
 gcry_err_code_t _gcry_cipher_poly1305_setiv
 /*           */   (gcry_cipher_hd_t c,
                    const unsigned char *iv, size_t ivlen);
 gcry_err_code_t _gcry_cipher_poly1305_authenticate
 /*           */   (gcry_cipher_hd_t c,
                    const unsigned char *aadbuf, size_t aadbuflen);
 gcry_err_code_t _gcry_cipher_poly1305_get_tag
 /*           */   (gcry_cipher_hd_t c,
                    unsigned char *outtag, size_t taglen);
 gcry_err_code_t _gcry_cipher_poly1305_check_tag
 /*           */   (gcry_cipher_hd_t c,
                    const unsigned char *intag, size_t taglen);
 /* Like the GCM setkey hook, this one returns void.  */
 void _gcry_cipher_poly1305_setkey
 /*           */   (gcry_cipher_hd_t c);
 
+/*-- chacha20.c --*/
+/* Combined ChaCha20-Poly1305 entry points.  Unlike the mode-level
+   encrypt/decrypt prototypes in this header they take one LENGTH for
+   both buffers and no separate output buffer size.  NOTE(review):
+   presumably called from cipher-poly1305.c when the handle's cipher
+   is ChaCha20 -- confirm at the call site.  */
+gcry_err_code_t _gcry_chacha20_poly1305_encrypt
+/*           */   (gcry_cipher_hd_t c, byte *outbuf, const byte *inbuf,
+		   size_t length);
+gcry_err_code_t _gcry_chacha20_poly1305_decrypt
+/*           */   (gcry_cipher_hd_t c, byte *outbuf, const byte *inbuf,
+		   size_t length);
+
+
 /*-- cipher-ocb.c --*/
 /* OCB mode entry points; the parameter lists have the shape of the
    mode_ops function pointers in struct gcry_cipher_handle.  */
 gcry_err_code_t _gcry_cipher_ocb_encrypt
 /*           */ (gcry_cipher_hd_t c,
                  unsigned char *outbuf, size_t outbuflen,
                  const unsigned char *inbuf, size_t inbuflen);
 gcry_err_code_t _gcry_cipher_ocb_decrypt
 /*           */ (gcry_cipher_hd_t c,
                  unsigned char *outbuf, size_t outbuflen,
                  const unsigned char *inbuf, size_t inbuflen);
 gcry_err_code_t _gcry_cipher_ocb_set_nonce
 /*           */ (gcry_cipher_hd_t c, const unsigned char *nonce,
                  size_t noncelen);
 gcry_err_code_t _gcry_cipher_ocb_authenticate
 /*           */ (gcry_cipher_hd_t c, const unsigned char *abuf, size_t abuflen);
 gcry_err_code_t _gcry_cipher_ocb_get_tag
 /*           */ (gcry_cipher_hd_t c,
                  unsigned char *outtag, size_t taglen);
 gcry_err_code_t _gcry_cipher_ocb_check_tag
 /*           */ (gcry_cipher_hd_t c,
                  const unsigned char *intag, size_t taglen);
 
 
 /*-- cipher-xts.c --*/
 /* XTS mode declares only encrypt and decrypt entry points here.  */
 gcry_err_code_t _gcry_cipher_xts_encrypt
 /*           */ (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen,
 		 const unsigned char *inbuf, size_t inbuflen);
 gcry_err_code_t _gcry_cipher_xts_decrypt
 /*           */ (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen,
 		 const unsigned char *inbuf, size_t inbuflen);
 
 
 /* Return the L-value for block N, i.e. the entry of the pre-computed
  * OCB L table indexed by the number of trailing zero bits of N.
  * Note: 'cipher-ocb.c' ensures that N will never be multiple of 65536
  * (1 << OCB_L_TABLE_SIZE), thus N can be directly passed to
  * _gcry_ctz() function and resulting index will never overflow the
  * table.  */
 static inline const unsigned char *
 ocb_get_l (gcry_cipher_hd_t c, u64 n)
 {
   unsigned long ntz;
 
 #if ((defined(__i386__) || defined(__x86_64__)) && __GNUC__ >= 4)
   /* Assumes that N != 0. */
   /* Bit scan on N converted to unsigned long; the %k operand modifier
      selects the 32-bit form of both registers, so only the low 32 bits
      of N are examined.  That is sufficient because one of the low 16
      bits is always set (see the function comment).  */
   asm ("rep;bsfl %k[low], %k[ntz]\n\t"
         : [ntz] "=r" (ntz)
         : [low] "r" ((unsigned long)n)
         : "cc");
 #else
   /* Portable fallback for other targets and compilers.  */
   ntz = _gcry_ctz (n);
 #endif
 
   return c->u_mode.ocb.L[ntz];
 }
 
 
 /* Map the block size of the cipher behind C to a shift count: 3 for
    8 byte blocks and 4 for anything else.  */
 static inline unsigned int _gcry_blocksize_shift(gcry_cipher_hd_t c)
 {
   /* Only blocksizes 8 and 16 are used.  Two constant returns keep the
    * possible results obvious to the compiler once this is inlined
    * into a caller.  */
   if (c->spec->blocksize == 8)
     return 3;
 
   return 4;
 }
 
 
 /* Copy one cipher block from _SRC to _DST using 64-bit loads and
    stores.  A BLOCKSIZE of 8 copies 8 bytes; any other value is
    handled as a 16 byte block.  */
 static inline void
 cipher_block_cpy(void *_dst, const void *_src, size_t blocksize)
 {
   const byte *in = _src;
   byte *out = _dst;
   u64 lo;
   u64 hi;
 
   if (blocksize != 8)
     {
       /* 16 byte block: load both halves before storing either.  */
       lo = buf_get_he64(in + 0);
       hi = buf_get_he64(in + 8);
       buf_put_he64(out + 0, lo);
       buf_put_he64(out + 8, hi);
       return;
     }
 
   buf_put_he64(out + 0, buf_get_he64(in + 0));
 }
 
 
 /* XOR two cipher blocks: _DST = _SRC1 ^ _SRC2.  A BLOCKSIZE of 8
    processes 8 bytes; any other value is handled as a 16 byte
    block.  */
 static inline void
 cipher_block_xor(void *_dst, const void *_src1, const void *_src2,
                  size_t blocksize)
 {
   const byte *a = _src1;
   const byte *b = _src2;
   byte *out = _dst;
   u64 a_lo, a_hi;
   u64 b_lo, b_hi;
 
   if (blocksize == 8)
     {
       buf_put_he64(out + 0, buf_get_he64(a + 0) ^ buf_get_he64(b + 0));
       return;
     }
 
   /* 16 byte block: finish all four loads before the first store.  */
   a_lo = buf_get_he64(a + 0);
   a_hi = buf_get_he64(a + 8);
   b_lo = buf_get_he64(b + 0);
   b_hi = buf_get_he64(b + 8);
   buf_put_he64(out + 0, a_lo ^ b_lo);
   buf_put_he64(out + 8, a_hi ^ b_hi);
 }
 
 
 /* Optimized function for in-place cipher block xoring:
    _DST = _DST ^ _SRC.  Implemented by calling cipher_block_xor with
    _DST as both destination and first source.  */
 static inline void
 cipher_block_xor_1(void *_dst, const void *_src, size_t blocksize)
 {
   cipher_block_xor (_dst, _dst, _src, blocksize);
 }
 
 
 /* XOR the block at _SRC into the block at _DST2 and store the result
    both back to _DST2 and to _DST1.  Used mainly by CFB mode
    encryption.  A BLOCKSIZE of 8 processes 8 bytes; any other value is
    handled as a 16 byte block.  */
 static inline void
 cipher_block_xor_2dst(void *_dst1, void *_dst2, const void *_src,
                       size_t blocksize)
 {
   byte *out = _dst1;
   byte *acc = _dst2;
   const byte *in = _src;
   u64 in_lo, in_hi;
   u64 mix_lo, mix_hi;
 
   if (blocksize == 8)
     {
       mix_lo = buf_get_he64(acc + 0) ^ buf_get_he64(in + 0);
       buf_put_he64(acc + 0, mix_lo);
       buf_put_he64(out + 0, mix_lo);
       return;
     }
 
   /* 16 byte block: all loads are done before the first store, and
      _DST2 is written before _DST1.  */
   in_lo = buf_get_he64(in + 0);
   in_hi = buf_get_he64(in + 8);
   mix_lo = buf_get_he64(acc + 0);
   mix_hi = buf_get_he64(acc + 8);
   mix_lo ^= in_lo;
   mix_hi ^= in_hi;
   buf_put_he64(acc + 0, mix_lo);
   buf_put_he64(acc + 8, mix_hi);
   buf_put_he64(out + 0, mix_lo);
   buf_put_he64(out + 8, mix_hi);
 }
 
 
 /* Optimized function for combined cipher block xoring and copying:
      _DST_XOR    = _SRCDST_CPY ^ _SRC_XOR
      _SRCDST_CPY = _SRC_CPY
    Used mainly by CBC mode decryption.  A BLOCKSIZE of 8 processes
    8 bytes; any other value is handled as a 16 byte block.  The order
    of loads and stores is deliberate, see the comments below.  */
 static inline void
 cipher_block_xor_n_copy_2(void *_dst_xor, const void *_src_xor,
                           void *_srcdst_cpy, const void *_src_cpy,
                           size_t blocksize)
 {
   byte *dst_xor = _dst_xor;
   byte *srcdst_cpy = _srcdst_cpy;
   const byte *src_xor = _src_xor;
   const byte *src_cpy = _src_cpy;
   u64 sc[2];
   u64 sx[2];
   u64 sdc[2];
 
   if (blocksize == 8)
     {
       /* Load the block to copy before anything is stored, so it is
          still intact if _DST_XOR refers to the same memory.  */
       sc[0] = buf_get_he64(src_cpy + 0);
       buf_put_he64(dst_xor + 0,
                    buf_get_he64(srcdst_cpy + 0) ^ buf_get_he64(src_xor + 0));
       buf_put_he64(srcdst_cpy + 0, sc[0]);
     }
   else /* blocksize == 16 */
     {
       /* All six loads are completed before the first store; callers
          such as cipher_block_xor_n_copy pass the same pointer for
          _SRC_XOR and _SRC_CPY.  */
       sc[0] = buf_get_he64(src_cpy + 0);
       sc[1] = buf_get_he64(src_cpy + 8);
       sx[0] = buf_get_he64(src_xor + 0);
       sx[1] = buf_get_he64(src_xor + 8);
       sdc[0] = buf_get_he64(srcdst_cpy + 0);
       sdc[1] = buf_get_he64(srcdst_cpy + 8);
       sx[0] ^= sdc[0];
       sx[1] ^= sdc[1];
       buf_put_he64(dst_xor + 0, sx[0]);
       buf_put_he64(dst_xor + 8, sx[1]);
       buf_put_he64(srcdst_cpy + 0, sc[0]);
       buf_put_he64(srcdst_cpy + 8, sc[1]);
     }
 }
 
 
 /* Optimized function for combined cipher block xoring and copying:
      _DST_XOR    = _SRCDST_CPY ^ _SRC
      _SRCDST_CPY = _SRC
    Used mainly by CFB mode decryption.  Implemented by calling
    cipher_block_xor_n_copy_2 with _SRC as both the xor source and the
    copy source.  */
 static inline void
 cipher_block_xor_n_copy(void *_dst_xor, void *_srcdst_cpy, const void *_src,
                         size_t blocksize)
 {
   cipher_block_xor_n_copy_2(_dst_xor, _src, _srcdst_cpy, _src, blocksize);
 }
 
 
 #endif /*G10_CIPHER_INTERNAL_H*/
diff --git a/cipher/cipher-poly1305.c b/cipher/cipher-poly1305.c
index 607586b5..bb475236 100644
--- a/cipher/cipher-poly1305.c
+++ b/cipher/cipher-poly1305.c
@@ -1,365 +1,375 @@
 /* cipher-poly1305.c  -  Poly1305 based AEAD cipher mode, RFC-8439
  * Copyright (C) 2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
  *
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
  * Libgcrypt is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
  * License along with this program; if not, see <http://www.gnu.org/licenses/>.
  */
 
 #include <config.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <errno.h>
 
 #include "g10lib.h"
 #include "cipher.h"
 #include "bufhelp.h"
 #include "./cipher-internal.h"
 #include "./poly1305-internal.h"
 
 
 /* Add ADD to the 64-bit byte counter kept as two 32-bit words
    (CTR[0] is the low word, CTR[1] the high word).  Returns non-zero when
    the counter wrapped past 64 bits, zero otherwise.  */
 static inline int
 poly1305_bytecounter_add (u32 ctr[2], size_t add)
 {
   u32 low_add = add & 0xffffffff;
   int overflow = 0;
 
   if (sizeof(add) > sizeof(u32))
     {
       /* The shift is split in two so that it stays well defined when
          size_t is only 32 bits wide.  */
       u32 high_add = ((add >> 31) >> 1) & 0xffffffff;
       ctr[1] += high_add;
       if (ctr[1] < high_add)
         overflow = 1;
     }
 
   /* Carry out of the low word must be detected against the low 32 bits of
      ADD only.  Comparing the 32-bit word against the full size_t value
      would report a carry for every ADD >= 2^32 and over-count CTR[1].  */
   ctr[0] += low_add;
   if (ctr[0] >= low_add)
     return overflow;
 
   ctr[1] += 1;
   return (ctr[1] < 1) || overflow;
 }
 
 
 /* Feed the 16 byte length block into Poly1305: the two AAD counter words
    followed by the two data counter words, each passed through
    le_bswap32.  The temporary buffer is wiped afterwards.  */
 static void
 poly1305_fill_bytecounts (gcry_cipher_hd_t c)
 {
   const u32 *aad_words = c->u_mode.poly1305.aadcount;
   const u32 *data_words = c->u_mode.poly1305.datacount;
   u32 lenbuf[4];
   unsigned int i;
 
   for (i = 0; i < 2; i++)
     {
       lenbuf[i] = le_bswap32(aad_words[i]);
       lenbuf[2 + i] = le_bswap32(data_words[i]);
     }
 
   _gcry_poly1305_update (&c->u_mode.poly1305.ctx, (byte*)lenbuf,
                          sizeof(lenbuf));
 
   wipememory(lenbuf, sizeof(lenbuf));
 }
 
 
 /* Feed zero bytes into Poly1305 so that the stream counted by CTR ends on
    a 16 byte boundary.  Only the low counter word CTR[0] is inspected.
    Nothing is fed when the count is already a multiple of 16.  */
 static void
 poly1305_do_padding (gcry_cipher_hd_t c, u32 ctr[2])
 {
   /* At most 15 padding bytes are needed.  The initializer is spelled out
      because an empty "{}" is not valid ISO C before C23.  */
   static const byte zero_padding_buf[15] = { 0, };
   u32 padding_count;
 
   /* Padding to 16 byte boundary. */
   if (ctr[0] % 16 > 0)
     {
       padding_count = 16 - ctr[0] % 16;
 
       _gcry_poly1305_update (&c->u_mode.poly1305.ctx, zero_padding_buf,
                              padding_count);
     }
 }
 
 
 /* Close the AAD stream: pad the authenticated data to a 16 byte boundary,
    mark the AAD phase as finished and restart the data byte counter.  */
 static void
 poly1305_aad_finish (gcry_cipher_hd_t c)
 {
   u32 *data_bytes = c->u_mode.poly1305.datacount;
 
   /* Zero padding brings the AAD to 16 byte alignment. */
   poly1305_do_padding (c, c->u_mode.poly1305.aadcount);
 
   /* From here on the AAD stream counts as ended. */
   c->u_mode.poly1305.aad_finalized = 1;
 
   data_bytes[0] = 0;
   data_bytes[1] = 0;
 }
 
 
 /* Install an all-zero 8 byte IV through the regular setiv path and
    return its result.  */
 static gcry_err_code_t
 poly1305_set_zeroiv (gcry_cipher_hd_t c)
 {
   byte zero_iv[8];
 
   memset (zero_iv, 0, sizeof(zero_iv));
 
   return _gcry_cipher_poly1305_setiv (c, zero_iv, sizeof(zero_iv));
 }
 
 
 /* Feed AADBUFLEN bytes of additional authenticated data into the MAC.
    Returns GPG_ERR_INV_LENGTH when a byte counter has overflowed (now or
    earlier), GPG_ERR_INV_STATE when the AAD phase is already closed or the
    tag has been produced, an error from the implicit zero-IV setup, or 0
    on success.  */
 gcry_err_code_t
 _gcry_cipher_poly1305_authenticate (gcry_cipher_hd_t c,
                                     const byte * aadbuf, size_t aadbuflen)
 {
   gcry_err_code_t err;
 
   if (c->u_mode.poly1305.bytecount_over_limits)
     return GPG_ERR_INV_LENGTH;
   if (c->u_mode.poly1305.aad_finalized)
     return GPG_ERR_INV_STATE;
   if (c->marks.tag)
     return GPG_ERR_INV_STATE;
 
   if (!c->marks.iv)
     {
       /* Propagate a failed setup instead of running Poly1305 on a context
          that was never keyed.  */
       err = poly1305_set_zeroiv(c);
       if (err)
         return err;
     }
 
   if (poly1305_bytecounter_add(c->u_mode.poly1305.aadcount, aadbuflen))
     {
       c->u_mode.poly1305.bytecount_over_limits = 1;
       return GPG_ERR_INV_LENGTH;
     }
 
   _gcry_poly1305_update (&c->u_mode.poly1305.ctx, aadbuf, aadbuflen);
 
   return 0;
 }
 
 
 /* Encrypt INBUFLEN bytes from INBUF into OUTBUF and feed the resulting
    ciphertext into the Poly1305 MAC.
    Returns GPG_ERR_BUFFER_TOO_SHORT when OUTBUFLEN < INBUFLEN,
    GPG_ERR_INV_STATE once the tag has been produced, GPG_ERR_INV_LENGTH
    when a byte counter has overflowed (now or earlier), an error from the
    implicit zero-IV setup, or 0 on success.  The first call also closes
    the AAD stream.  */
 gcry_err_code_t
 _gcry_cipher_poly1305_encrypt (gcry_cipher_hd_t c,
 			       byte *outbuf, size_t outbuflen,
 			       const byte *inbuf, size_t inbuflen)
 {
   gcry_err_code_t err;
 
   if (outbuflen < inbuflen)
     return GPG_ERR_BUFFER_TOO_SHORT;
   if (c->marks.tag)
     return GPG_ERR_INV_STATE;
   if (c->u_mode.poly1305.bytecount_over_limits)
     return GPG_ERR_INV_LENGTH;
 
   /* No IV set by the caller: fall back to the all-zero IV. */
   if (!c->marks.iv)
     {
       err = poly1305_set_zeroiv(c);
       if (err)
         return err;
     }
 
   if (!c->u_mode.poly1305.aad_finalized)
     poly1305_aad_finish(c);
 
   /* The overflow flag is sticky; the checks above reject all later calls. */
   if (poly1305_bytecounter_add(c->u_mode.poly1305.datacount, inbuflen))
     {
       c->u_mode.poly1305.bytecount_over_limits = 1;
       return GPG_ERR_INV_LENGTH;
     }
 
   /* Non-empty ChaCha20 input is handed entirely to the combined routine;
      the generic loop below then never runs. */
+  if (LIKELY(inbuflen > 0) && LIKELY(c->spec->algo == GCRY_CIPHER_CHACHA20))
+    {
+      return _gcry_chacha20_poly1305_encrypt (c, outbuf, inbuf, inbuflen);
+    }
+
   while (inbuflen)
     {
       size_t currlen = inbuflen;
 
       /* Since checksumming is done after encryption, process input in 24KiB
        * chunks to keep data loaded in L1 cache for checksumming. */
       if (currlen > 24 * 1024)
 	currlen = 24 * 1024;
 
       c->spec->stencrypt(&c->context.c, outbuf, (byte*)inbuf, currlen);
 
       /* The MAC is computed over the ciphertext just written. */
       _gcry_poly1305_update (&c->u_mode.poly1305.ctx, outbuf, currlen);
 
       outbuf += currlen;
       inbuf += currlen;
       outbuflen -= currlen;
       inbuflen -= currlen;
     }
 
   return 0;
 }
 
 
 /* Feed INBUFLEN bytes of ciphertext from INBUF into the Poly1305 MAC and
    decrypt them into OUTBUF.
    Returns GPG_ERR_BUFFER_TOO_SHORT when OUTBUFLEN < INBUFLEN,
    GPG_ERR_INV_STATE once the tag has been produced, GPG_ERR_INV_LENGTH
    when a byte counter has overflowed (now or earlier), an error from the
    implicit zero-IV setup, or 0 on success.  The first call also closes
    the AAD stream.  The tag is not verified here.  */
 gcry_err_code_t
 _gcry_cipher_poly1305_decrypt (gcry_cipher_hd_t c,
 			       byte *outbuf, size_t outbuflen,
 			       const byte *inbuf, size_t inbuflen)
 {
   gcry_err_code_t err;
 
   if (outbuflen < inbuflen)
     return GPG_ERR_BUFFER_TOO_SHORT;
   if (c->marks.tag)
     return GPG_ERR_INV_STATE;
   if (c->u_mode.poly1305.bytecount_over_limits)
     return GPG_ERR_INV_LENGTH;
 
   /* No IV set by the caller: fall back to the all-zero IV. */
   if (!c->marks.iv)
     {
       err = poly1305_set_zeroiv(c);
       if (err)
         return err;
     }
 
   if (!c->u_mode.poly1305.aad_finalized)
     poly1305_aad_finish(c);
 
   /* The overflow flag is sticky; the checks above reject all later calls. */
   if (poly1305_bytecounter_add(c->u_mode.poly1305.datacount, inbuflen))
     {
       c->u_mode.poly1305.bytecount_over_limits = 1;
       return GPG_ERR_INV_LENGTH;
     }
 
   /* Non-empty ChaCha20 input is handed entirely to the combined routine;
      the generic loop below then never runs. */
+  if (LIKELY(inbuflen > 0) && LIKELY(c->spec->algo == GCRY_CIPHER_CHACHA20))
+    {
+      return _gcry_chacha20_poly1305_decrypt (c, outbuf, inbuf, inbuflen);
+    }
+
   while (inbuflen)
     {
       size_t currlen = inbuflen;
 
       /* Since checksumming is done before decryption, process input in 24KiB
        * chunks to keep data loaded in L1 cache for decryption. */
       if (currlen > 24 * 1024)
 	currlen = 24 * 1024;
 
       /* The MAC covers the ciphertext, so it is updated before the chunk
          is decrypted. */
       _gcry_poly1305_update (&c->u_mode.poly1305.ctx, inbuf, currlen);
 
       c->spec->stdecrypt(&c->context.c, outbuf, (byte*)inbuf, currlen);
 
       outbuf += currlen;
       inbuf += currlen;
       outbuflen -= currlen;
       inbuflen -= currlen;
     }
 
   return 0;
 }
 
 
 /* Finalize the MAC on first use and then either copy the tag to OUTBUF
    (CHECK == 0) or compare it against the OUTBUFLEN byte tag supplied in
    OUTBUF (CHECK != 0).  The computed tag is kept in c->u_iv.iv, so
    repeated calls reuse it.  */
 static gcry_err_code_t
 _gcry_cipher_poly1305_tag (gcry_cipher_hd_t c,
                            byte * outbuf, size_t outbuflen, int check)
 {
   gcry_err_code_t rc;
 
   if (outbuflen < POLY1305_TAGLEN)
     return GPG_ERR_BUFFER_TOO_SHORT;
   if (c->u_mode.poly1305.bytecount_over_limits)
     return GPG_ERR_INV_LENGTH;
 
   if (!c->marks.iv)
     {
       rc = poly1305_set_zeroiv(c);
       if (rc)
         return rc;
     }
 
   if (!c->u_mode.poly1305.aad_finalized)
     poly1305_aad_finish(c);
 
   if (!c->marks.tag)
     {
       /* Align the data stream to 16 bytes, append the length block and
          produce the tag. */
       poly1305_do_padding (c, c->u_mode.poly1305.datacount);
       poly1305_fill_bytecounts(c);
       _gcry_poly1305_finish(&c->u_mode.poly1305.ctx, c->u_iv.iv);
 
       c->marks.tag = 1;
     }
 
   if (check)
     {
       /* OUTBUFLEN is the length of the caller's tag; it has to match
          exactly before the constant-time comparison is attempted. */
       if (outbuflen == POLY1305_TAGLEN
           && buf_eq_const (outbuf, c->u_iv.iv, POLY1305_TAGLEN))
         return 0;
 
       return GPG_ERR_CHECKSUM;
     }
 
   memcpy (outbuf, c->u_iv.iv, POLY1305_TAGLEN);
   return 0;
 }
 
 
 /* Write the authentication tag into OUTTAG, which holds TAGLEN bytes. */
 gcry_err_code_t
 _gcry_cipher_poly1305_get_tag (gcry_cipher_hd_t c, unsigned char *outtag,
                                size_t taglen)
 {
   const int compare_only = 0;
 
   return _gcry_cipher_poly1305_tag (c, outtag, taglen, compare_only);
 }
 
 /* Compare the TAGLEN byte tag in INTAG against the computed one. */
 gcry_err_code_t
 _gcry_cipher_poly1305_check_tag (gcry_cipher_hd_t c, const unsigned char *intag,
                                  size_t taglen)
 {
   const int compare_only = 1;
 
   /* The shared helper takes a non-const pointer; in compare mode it only
      reads from it. */
   return _gcry_cipher_poly1305_tag (c, (unsigned char *) intag, taglen,
                                     compare_only);
 }
 
 
 /* Reset the AEAD bookkeeping: both byte counters, the overflow and
    AAD-finished flags, and the tag/IV marks.  */
 void
 _gcry_cipher_poly1305_setkey (gcry_cipher_hd_t c)
 {
   unsigned int word;
 
   for (word = 0; word < 2; word++)
     {
       c->u_mode.poly1305.aadcount[word] = 0;
       c->u_mode.poly1305.datacount[word] = 0;
     }
 
   c->u_mode.poly1305.aad_finalized = 0;
   c->u_mode.poly1305.bytecount_over_limits = 0;
 
   c->marks.iv = 0;
   c->marks.tag = 0;
 }
 
 
 /* Start a new message: reset the AEAD state, hand IV to the stream
    cipher, and key Poly1305 with the first 32 bytes of the cipher's first
    keystream block.  Returns GPG_ERR_INV_ARG for a NULL IV whose length is
    not 12, any error from _gcry_poly1305_init, or 0 on success.  */
 gcry_err_code_t
 _gcry_cipher_poly1305_setiv (gcry_cipher_hd_t c, const byte *iv, size_t ivlen)
 {
   byte first_block[64]; /* size of ChaCha20 block */
   gcry_err_code_t rc;
 
   /* Only a missing IV with a length other than 96 bits is rejected here;
      a non-NULL IV of any length is passed on to the cipher's setiv. */
   if (iv == NULL && ivlen != (96 / 8))
     return GPG_ERR_INV_ARG;
 
   memset(&c->u_mode.poly1305.ctx, 0, sizeof(c->u_mode.poly1305.ctx));
 
   /* Forget everything about a previous message. */
   c->u_mode.poly1305.aadcount[0] = 0;
   c->u_mode.poly1305.aadcount[1] = 0;
   c->u_mode.poly1305.datacount[0] = 0;
   c->u_mode.poly1305.datacount[1] = 0;
   c->u_mode.poly1305.aad_finalized = 0;
   c->u_mode.poly1305.bytecount_over_limits = 0;
   c->marks.iv = 0;
   c->marks.tag = 0;
 
   /* Set up IV for stream cipher. */
   c->spec->setiv (&c->context.c, iv, ivlen);
 
   /* Encrypting zeros yields the raw first keystream block. */
   memset(first_block, 0, sizeof(first_block));
   c->spec->stencrypt(&c->context.c, first_block, first_block,
                      sizeof(first_block));
 
   /* Its first 32 bytes become the Poly1305 key. */
   rc = _gcry_poly1305_init (&c->u_mode.poly1305.ctx, first_block,
                             POLY1305_KEYLEN);
 
   wipememory(first_block, sizeof(first_block));
 
   if (!rc)
     c->marks.iv = 1;
 
   return rc;
 }
diff --git a/cipher/poly1305-internal.h b/cipher/poly1305-internal.h
index 2405a090..19cee5f6 100644
--- a/cipher/poly1305-internal.h
+++ b/cipher/poly1305-internal.h
@@ -1,62 +1,64 @@
 /* poly1305-internal.h  -  Poly1305 internals
  * Copyright (C) 2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
  *
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
  * Libgcrypt is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
  * License along with this program; if not, see <http://www.gnu.org/licenses/>.
  */
 
 #ifndef G10_POLY1305_INTERNAL_H
 #define G10_POLY1305_INTERNAL_H
 
 #include <config.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include "types.h"
 #include "g10lib.h"
 #include "cipher.h"
 #include "bufhelp.h"
 
 #define POLY1305_TAGLEN 16
 #define POLY1305_KEYLEN 32
 #define POLY1305_BLOCKSIZE 16
 
 
 /* Poly1305 arithmetic state, kept as 32-bit words. */
 typedef struct
 {
   u32 k[4];	/* key bytes 16..31 as little-endian words; added to the
 		   accumulator when the tag is produced */
   u32 r[4];	/* key bytes 0..15 as little-endian words, with the clamp
 		   masks applied at init */
   u32 h[5];	/* running accumulator, zeroed at init */
 } POLY1305_STATE;
 
 /* Streaming context: arithmetic state plus buffering of a partial block. */
 typedef struct poly1305_context_s
 {
   POLY1305_STATE state;
   byte buffer[POLY1305_BLOCKSIZE];	/* bytes of a not yet complete block */
   unsigned int leftover;		/* number of valid bytes in BUFFER */
 } poly1305_context_t;
 
 
 /* Key CTX with the KEYLEN byte KEY.  Returns GPG_ERR_INV_KEYLEN unless
    KEYLEN is POLY1305_KEYLEN, GPG_ERR_SELFTEST_FAILED when the built-in
    selftest failed, and 0 on success.  */
 gcry_err_code_t _gcry_poly1305_init (poly1305_context_t *ctx, const byte *key,
 				     size_t keylen);
 
 /* Process any buffered partial block and write the tag to MAC. */
 void _gcry_poly1305_finish (poly1305_context_t *ctx,
 			     byte mac[POLY1305_TAGLEN]);
 
 /* Absorb BUFLEN bytes from BUF, then burn the stack if blocks were
    processed.  */
 void _gcry_poly1305_update (poly1305_context_t *ctx, const byte *buf,
 			     size_t buflen);
 
 /* Same as _gcry_poly1305_update, but returns the stack burn depth to the
    caller instead of burning the stack itself.  */
+unsigned int _gcry_poly1305_update_burn (poly1305_context_t *ctx,
+					 const byte *m, size_t bytes);
 
 #endif /* G10_POLY1305_INTERNAL_H */
diff --git a/cipher/poly1305.c b/cipher/poly1305.c
index 571f8286..8de6cd5e 100644
--- a/cipher/poly1305.c
+++ b/cipher/poly1305.c
@@ -1,667 +1,679 @@
 /* poly1305.c  -  Poly1305 internals and generic implementation
  * Copyright (C) 2014,2017,2018 Jussi Kivilinna <jussi.kivilinna@iki.fi>
  *
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
  * Libgcrypt is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
  * License along with this program; if not, see <http://www.gnu.org/licenses/>.
  */
 
 #include <config.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 
 #include "types.h"
 #include "g10lib.h"
 #include "cipher.h"
 #include "bufhelp.h"
 #include "poly1305-internal.h"
 
 #include "mpi-internal.h"
 #include "longlong.h"
 
 
 static const char *selftest (void);
 
 
 /* Select the limb width of the implementation below from the MPI
    configuration: the 64-bit code needs BYTES_PER_MPI_LIMB == 8 and a u64
    typedef, the 32-bit code needs BYTES_PER_MPI_LIMB == 4.  Any other
    configuration is a build error.  */
 #undef USE_MPI_64BIT
 #undef USE_MPI_32BIT
 #if BYTES_PER_MPI_LIMB == 8 && defined(HAVE_U64_TYPEDEF)
 # define USE_MPI_64BIT 1
 #elif BYTES_PER_MPI_LIMB == 4
 # define USE_MPI_32BIT 1
 #else
 # error please implement for this limb size.
 #endif
 
 
 /* Load the 32 byte KEY into CTX: bytes 0..15 become the clamped
    multiplier r, bytes 16..31 become k, and the accumulator h as well as
    the partial-block counter are cleared.  */
 static void poly1305_init (poly1305_context_t *ctx,
                            const byte key[POLY1305_KEYLEN])
 {
   /* Clamp masks for the four little-endian words of r. */
   static const u32 r_mask[4] =
     { 0x0fffffff, 0x0ffffffc, 0x0ffffffc, 0x0ffffffc };
   POLY1305_STATE *st = &ctx->state;
   unsigned int i;
 
   ctx->leftover = 0;
 
   for (i = 0; i < 5; i++)
     st->h[i] = 0;
 
   for (i = 0; i < 4; i++)
     {
       st->r[i] = buf_get_le32(key + 4 * i) & r_mask[i];
       st->k[i] = buf_get_le32(key + 16 + 4 * i);
     }
 }
 
 
 #ifdef USE_MPI_64BIT
 
 /* ADD_1305_64(A2, A1, A0, B2, B1, B0): add the three-limb value B2:B1:B0
    into A2:A1:A0 (limb 0 is least significant), propagating the carry from
    limb 0 up to limb 2.  A platform specific variant is used where
    available; otherwise the generic add_ssaaaa based one further down.  */
 #if defined (__aarch64__) && __GNUC__ >= 4
 
 /* A += B (armv8/aarch64) */
 #define ADD_1305_64(A2, A1, A0, B2, B1, B0) \
       __asm__ ("adds %0, %3, %0\n" \
 	       "adcs %1, %4, %1\n" \
 	       "adc  %2, %5, %2\n" \
 	       : "+r" (A0), "+r" (A1), "+r" (A2) \
 	       : "r" (B0), "r" (B1), "r" (B2) \
 	       : "cc" )
 
 #endif /* __aarch64__ */
 
 #if defined (__x86_64__) && __GNUC__ >= 4
 
 /* A += B (x86-64) */
 #define ADD_1305_64(A2, A1, A0, B2, B1, B0) \
       __asm__ ("addq %3, %0\n" \
 	       "adcq %4, %1\n" \
 	       "adcq %5, %2\n" \
 	       : "+r" (A0), "+r" (A1), "+r" (A2) \
 	       : "g" (B0), "g" (B1), "g" (B2) \
 	       : "cc" )
 
 #endif /* __x86_64__ */
 
 #ifndef ADD_1305_64
 /* A += B (generic, mpi): the carry out of the low limb is computed first
    and then folded into the two-limb sum of the upper limbs.  */
 #  define ADD_1305_64(A2, A1, A0, B2, B1, B0) do { \
     u64 carry; \
     add_ssaaaa(carry, A0, 0, A0, 0, B0); \
     add_ssaaaa(A2, A1, A2, A1, B2, B1); \
     add_ssaaaa(A2, A1, A2, A1, 0, carry); \
   } while (0)
 #endif
 
 /* H = H * R mod 2¹³⁰-5
    H2:H1:H0 is the accumulator (updated in place), R1:R0 the multiplier
    and R1_MULT5 a precomputed value supplied by the caller (the callers in
    this file pass (R1 >> 2) + R1).  The result is only partially reduced:
    H2 is left with the low two bits of the top limb and the bits above
    them are folded back into the low limbs multiplied by 5.  */
 #define MUL_MOD_1305_64(H2, H1, H0, R1, R0, R1_MULT5) do { \
     u64 x0_lo, x0_hi, x1_lo, x1_hi; \
     u64 t0_lo, t0_hi, t1_lo, t1_hi; \
     \
     /* x = a * r (partial mod 2^130-5) */ \
     umul_ppmm(x0_hi, x0_lo, H0, R0);  /* h0 * r0 */ \
     umul_ppmm(x1_hi, x1_lo, H0, R1);  /* h0 * r1 */ \
     \
     umul_ppmm(t0_hi, t0_lo, H1, R1_MULT5); /* h1 * r1 mod 2^130-5 */ \
     add_ssaaaa(x0_hi, x0_lo, x0_hi, x0_lo, t0_hi, t0_lo); \
     umul_ppmm(t1_hi, t1_lo, H1, R0);       /* h1 * r0 */ \
     add_ssaaaa(x1_hi, x1_lo, x1_hi, x1_lo, t1_hi, t1_lo); \
     \
     t1_lo = H2 * R1_MULT5; /* h2 * r1 mod 2^130-5 */ \
     t1_hi = H2 * R0;       /* h2 * r0 */ \
     add_ssaaaa(H0, H1, x1_hi, x1_lo, t1_hi, t1_lo); \
     \
     /* carry propagation */ \
     H2 = H0 & 3; \
     H0 = (H0 >> 2) * 5; /* msb mod 2^130-5 */ \
     ADD_1305_64(H2, H1, H0, (u64)0, x0_hi, x0_lo); \
   } while (0)
 
 /* Absorb full 16 byte blocks from BUF into the accumulator (64-bit limb
    variant).  HIGH_PAD is added as the top limb of every block; the
    callers in this file pass 1 for message blocks and 0 for the final
    padded block.  The first block is loaded before LEN is tested, so LEN
    must be at least POLY1305_BLOCKSIZE.  Returns the stack depth that the
    callers hand to _gcry_burn_stack.  */
-unsigned int
+static unsigned int
 poly1305_blocks (poly1305_context_t *ctx, const byte *buf, size_t len,
 		 byte high_pad)
 {
   POLY1305_STATE *st = &ctx->state;
   u64 r0, r1, r1_mult5;
   u64 h0, h1, h2;
   u64 m0, m1, m2;
 
   m2 = high_pad;
 
   /* Repack the 32-bit state words into 64-bit limbs. */
   h0 = st->h[0] + ((u64)st->h[1] << 32);
   h1 = st->h[2] + ((u64)st->h[3] << 32);
   h2 = st->h[4];
 
   r0 = st->r[0] + ((u64)st->r[1] << 32);
   r1 = st->r[2] + ((u64)st->r[3] << 32);
 
   r1_mult5 = (r1 >> 2) + r1;
 
   /* Load the first block up front; inside the loop the next block is
      loaded between the add and the multiply of the current one. */
   m0 = buf_get_le64(buf + 0);
   m1 = buf_get_le64(buf + 8);
   buf += POLY1305_BLOCKSIZE;
   len -= POLY1305_BLOCKSIZE;
 
   while (len >= POLY1305_BLOCKSIZE)
     {
       /* a = h + m */
       ADD_1305_64(h2, h1, h0, m2, m1, m0);
 
       m0 = buf_get_le64(buf + 0);
       m1 = buf_get_le64(buf + 8);
 
       /* h = a * r (partial mod 2^130-5) */
       MUL_MOD_1305_64(h2, h1, h0, r1, r0, r1_mult5);
 
       buf += POLY1305_BLOCKSIZE;
       len -= POLY1305_BLOCKSIZE;
     }
 
   /* Last loaded block. */
   /* a = h + m */
   ADD_1305_64(h2, h1, h0, m2, m1, m0);
 
   /* h = a * r (partial mod 2^130-5) */
   MUL_MOD_1305_64(h2, h1, h0, r1, r0, r1_mult5);
 
   /* Store the accumulator back as 32-bit words. */
   st->h[0] = h0;
   st->h[1] = h0 >> 32;
   st->h[2] = h1;
   st->h[3] = h1 >> 32;
   st->h[4] = h2;
 
   return 6 * sizeof (void *) + 18 * sizeof (u64);
 }
 
 /* Produce the 16 byte tag (64-bit limb variant): absorb a buffered
    partial block, if any, terminated by a 1 byte and zero filled; reduce
    the accumulator modulo 2^130-5; add the k half of the key; store the
    low 128 bits little-endian in MAC.  Returns the stack depth to burn.  */
 static unsigned int poly1305_final (poly1305_context_t *ctx,
                                     byte mac[POLY1305_TAGLEN])
 {
   POLY1305_STATE *st = &ctx->state;
   unsigned int blocks_burn = 0;
   u64 acc_lo, acc_hi, acc_top;
   u64 pad_lo, pad_hi;
   u64 sel, cy;
 
   /* process the remaining block */
   if (ctx->leftover)
     {
       unsigned int pos = ctx->leftover;
 
       ctx->buffer[pos++] = 1;
       while (pos < POLY1305_BLOCKSIZE)
         ctx->buffer[pos++] = 0;
       ctx->leftover = pos;
 
       blocks_burn = poly1305_blocks (ctx, ctx->buffer, POLY1305_BLOCKSIZE, 0);
     }
 
   acc_lo = st->h[0] + ((u64)st->h[1] << 32);
   acc_hi = st->h[2] + ((u64)st->h[3] << 32);
   acc_top = st->h[4];
 
   pad_lo = st->k[0] + ((u64)st->k[1] << 32);
   pad_hi = st->k[2] + ((u64)st->k[3] << 32);
 
   /* Adding 5 and looking at the bits from 2^130 upwards tells whether
      the accumulator is at least 2^130-5. */
   add_ssaaaa(cy, sel, 0, acc_lo, 0, 5);
   add_ssaaaa(cy, sel, 0, cy, 0, acc_hi);
   sel = (cy + acc_top) >> 2; /* sel == 0 or 1 */
 
   /* Subtracting 2^130-5 equals adding 5 once bit 130 and above are
      dropped; the mask selects 5 or 0. */
   sel = (-sel) & 5;
   add_ssaaaa(acc_hi, acc_lo, acc_hi, acc_lo, 0, sel);
 
   /* tag = low 128 bits of (h + k) */
   add_ssaaaa(acc_hi, acc_lo, acc_hi, acc_lo, pad_hi, pad_lo);
   buf_put_le64(mac + 0, acc_lo);
   buf_put_le64(mac + 8, acc_hi);
 
   /* burn_stack */
   return 4 * sizeof (void *) + 7 * sizeof (u64) + blocks_burn;
 }
 
 #endif /* USE_MPI_64BIT */
 
 #ifdef USE_MPI_32BIT
 
 /* Helper macros for the 32-bit limb implementation.
    UMUL_ADD_32(HI, LO, A, B) adds the product A * B into the two-limb
    value HI:LO.
    ADD_1305_32(A4..A0, B4..B0) adds the five-limb value B into A (limb 0
    is least significant), propagating the carry upwards.
    Platform specific variants come first; generic add_ssaaaa/umul_ppmm
    based fallbacks follow for whatever is still undefined.  */
 #ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS
 
 /* HI:LO += A * B (arm) */
 #define UMUL_ADD_32(HI, LO, A, B) \
       __asm__ ("umlal %1, %0, %4, %5" \
 	       : "=r" (HI), "=r" (LO) \
 	       : "0" (HI), "1" (LO), "r" (A), "r" (B) )
 
 /* A += B (arm) */
 #define ADD_1305_32(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0) \
       __asm__ ("adds %0, %0, %5\n" \
 	       "adcs %1, %1, %6\n" \
 	       "adcs %2, %2, %7\n" \
 	       "adcs %3, %3, %8\n" \
 	       "adc %4, %4, %9\n" \
 	       : "+r" (A0), "+r" (A1), "+r" (A2), "+r" (A3), "+r" (A4) \
 	       : "r" (B0), "r" (B1), "r" (B2), "r" (B3), "r" (B4) \
 	       : "cc" )
 
 #endif /* HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS */
 
 #if defined (__i386__) && __GNUC__ >= 4
 
 /* A += B (i386) */
 #define ADD_1305_32(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0) \
       __asm__ ("addl %5, %0\n" \
 	       "adcl %6, %1\n" \
 	       "adcl %7, %2\n" \
 	       "adcl %8, %3\n" \
 	       "adcl %9, %4\n" \
 	       : "+r" (A0), "+r" (A1), "+r" (A2), "+r" (A3), "+r" (A4) \
 	       : "g" (B0), "g" (B1), "g" (B2), "g" (B3), "g" (B4) \
 	       : "cc" )
 
 #endif /* __i386__ */
 
 #ifndef UMUL_ADD_32
 /* HI:LO += A * B (generic, mpi) */
 #  define UMUL_ADD_32(HI, LO, A, B) do { \
     u32 t_lo, t_hi; \
     umul_ppmm(t_hi, t_lo, A, B); \
     add_ssaaaa(HI, LO, HI, LO, t_hi, t_lo); \
   } while (0)
 #endif
 
 #ifndef ADD_1305_32
 /* A += B (generic, mpi): limbs 0..2 are added one at a time, each
    producing a carry that is folded into the next limb; limbs 3 and 4 are
    then added as a pair together with the last carry.  */
 #  define ADD_1305_32(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0) do { \
     u32 carry0, carry1, carry2; \
     add_ssaaaa(carry0, A0, 0, A0, 0, B0); \
     add_ssaaaa(carry1, A1, 0, A1, 0, B1); \
     add_ssaaaa(carry1, A1, carry1, A1, 0, carry0); \
     add_ssaaaa(carry2, A2, 0, A2, 0, B2); \
     add_ssaaaa(carry2, A2, carry2, A2, 0, carry1); \
     add_ssaaaa(A4, A3, A4, A3, B4, B3); \
     add_ssaaaa(A4, A3, A4, A3, 0, carry2); \
   } while (0)
 #endif
 
 /* H = H * R mod 2¹³⁰-5
    H4..H0 is the accumulator (updated in place), R3..R0 the multiplier
    and R3_MULT5/R2_MULT5/R1_MULT5 precomputed values supplied by the
    caller (the caller in this file passes (Rn >> 2) + Rn).  The result is
    only partially reduced: H4 is left with the low two bits of the top
    limb and the bits above them are folded back into the low limbs
    multiplied by 5.  */
 #define MUL_MOD_1305_32(H4, H3, H2, H1, H0, R3, R2, R1, R0, \
                         R3_MULT5, R2_MULT5, R1_MULT5) do { \
     u32 x0_lo, x0_hi, x1_lo, x1_hi, x2_lo, x2_hi, x3_lo, x3_hi; \
     u32 t0_lo, t0_hi; \
     \
     /* x = a * r (partial mod 2^130-5) */ \
     umul_ppmm(x0_hi, x0_lo, H0, R0);  /* h0 * r0 */ \
     umul_ppmm(x1_hi, x1_lo, H0, R1);  /* h0 * r1 */ \
     umul_ppmm(x2_hi, x2_lo, H0, R2);  /* h0 * r2 */ \
     umul_ppmm(x3_hi, x3_lo, H0, R3);  /* h0 * r3 */ \
     \
     UMUL_ADD_32(x0_hi, x0_lo, H1, R3_MULT5); /* h1 * r3 mod 2^130-5 */ \
     UMUL_ADD_32(x1_hi, x1_lo, H1, R0);       /* h1 * r0 */ \
     UMUL_ADD_32(x2_hi, x2_lo, H1, R1);       /* h1 * r1 */ \
     UMUL_ADD_32(x3_hi, x3_lo, H1, R2);       /* h1 * r2 */ \
     \
     UMUL_ADD_32(x0_hi, x0_lo, H2, R2_MULT5); /* h2 * r2 mod 2^130-5 */ \
     UMUL_ADD_32(x1_hi, x1_lo, H2, R3_MULT5); /* h2 * r3 mod 2^130-5 */ \
     UMUL_ADD_32(x2_hi, x2_lo, H2, R0);       /* h2 * r0 */ \
     UMUL_ADD_32(x3_hi, x3_lo, H2, R1);       /* h2 * r1 */ \
     \
     UMUL_ADD_32(x0_hi, x0_lo, H3, R1_MULT5); /* h3 * r1 mod 2^130-5 */ \
     H1 = x0_hi; \
     UMUL_ADD_32(x1_hi, x1_lo, H3, R2_MULT5); /* h3 * r2 mod 2^130-5 */ \
     UMUL_ADD_32(x2_hi, x2_lo, H3, R3_MULT5); /* h3 * r3 mod 2^130-5 */ \
     UMUL_ADD_32(x3_hi, x3_lo, H3, R0);       /* h3 * r0 */ \
     \
     t0_lo = H4 * R1_MULT5; /* h4 * r1 mod 2^130-5 */ \
     t0_hi = H4 * R2_MULT5; /* h4 * r2 mod 2^130-5 */ \
     add_ssaaaa(H2, x1_lo, x1_hi, x1_lo, 0, t0_lo); \
     add_ssaaaa(H3, x2_lo, x2_hi, x2_lo, 0, t0_hi); \
     t0_lo = H4 * R3_MULT5; /* h4 * r3 mod 2^130-5 */ \
     t0_hi = H4 * R0;       /* h4 * r0 */ \
     add_ssaaaa(H4, x3_lo, x3_hi, x3_lo, t0_hi, t0_lo); \
     \
     /* carry propagation */ \
     H0 = (H4 >> 2) * 5; /* msb mod 2^130-5 */ \
     H4 = H4 & 3; \
     ADD_1305_32(H4, H3, H2, H1, H0, 0, x3_lo, x2_lo, x1_lo, x0_lo); \
   } while (0)
 
 /* Absorb full 16 byte blocks from BUF into the accumulator (32-bit limb
    variant).  HIGH_PAD is added as the top limb of every block; the
    callers in this file pass 1 for message blocks and 0 for the final
    padded block.  Trailing bytes that do not fill a block are ignored.
    Returns the stack depth that the callers hand to _gcry_burn_stack.  */
-unsigned int
+static unsigned int
 poly1305_blocks (poly1305_context_t *ctx, const byte *buf, size_t len,
 		 byte high_pad)
 {
   POLY1305_STATE *st = &ctx->state;
   u32 r1_mult5, r2_mult5, r3_mult5;
   u32 h0, h1, h2, h3, h4;
   u32 m0, m1, m2, m3, m4;
 
   m4 = high_pad;
 
   h0 = st->h[0];
   h1 = st->h[1];
   h2 = st->h[2];
   h3 = st->h[3];
   h4 = st->h[4];
 
   /* Precomputed operands for the reduction step of MUL_MOD_1305_32. */
   r1_mult5 = (st->r[1] >> 2) + st->r[1];
   r2_mult5 = (st->r[2] >> 2) + st->r[2];
   r3_mult5 = (st->r[3] >> 2) + st->r[3];
 
   while (len >= POLY1305_BLOCKSIZE)
     {
       m0 = buf_get_le32(buf + 0);
       m1 = buf_get_le32(buf + 4);
       m2 = buf_get_le32(buf + 8);
       m3 = buf_get_le32(buf + 12);
 
       /* a = h + m */
       ADD_1305_32(h4, h3, h2, h1, h0, m4, m3, m2, m1, m0);
 
       /* h = a * r (partial mod 2^130-5) */
       MUL_MOD_1305_32(h4, h3, h2, h1, h0,
 		      st->r[3], st->r[2], st->r[1], st->r[0],
 		      r3_mult5, r2_mult5, r1_mult5);
 
       buf += POLY1305_BLOCKSIZE;
       len -= POLY1305_BLOCKSIZE;
     }
 
   st->h[0] = h0;
   st->h[1] = h1;
   st->h[2] = h2;
   st->h[3] = h3;
   st->h[4] = h4;
 
   return 6 * sizeof (void *) + 28 * sizeof (u32);
 }
 
 /* Produce the 16 byte tag (32-bit limb variant): absorb a buffered
    partial block, if any, terminated by a 1 byte and zero filled; reduce
    the accumulator modulo 2^130-5; add the k half of the key; store the
    low 128 bits little-endian in MAC.  Returns the stack depth to burn.  */
 static unsigned int poly1305_final (poly1305_context_t *ctx,
                                     byte mac[POLY1305_TAGLEN])
 {
   POLY1305_STATE *st = &ctx->state;
   unsigned int blocks_burn = 0;
   u32 a0, a1, a2, a3, a4;
   u32 cy, c0, c1, c2, sel;
 
   /* process the remaining block */
   if (ctx->leftover)
     {
       unsigned int pos = ctx->leftover;
 
       ctx->buffer[pos++] = 1;
       while (pos < POLY1305_BLOCKSIZE)
         ctx->buffer[pos++] = 0;
       ctx->leftover = pos;
 
       blocks_burn = poly1305_blocks (ctx, ctx->buffer, POLY1305_BLOCKSIZE, 0);
     }
 
   a0 = st->h[0];
   a1 = st->h[1];
   a2 = st->h[2];
   a3 = st->h[3];
   a4 = st->h[4];
 
   /* Adding 5 and looking at the bits from 2^130 upwards tells whether
      the accumulator is at least 2^130-5.  Only the carry chain matters
      here; the low words of the sums are discarded. */
   add_ssaaaa(cy, c0, 0, a0, 0, 5);
   add_ssaaaa(cy, c0, 0, cy, 0, a1);
   add_ssaaaa(cy, c0, 0, cy, 0, a2);
   add_ssaaaa(cy, c0, 0, cy, 0, a3);
   sel = (cy + a4) >> 2; /* sel == 0 or 1 */
 
   /* Subtracting 2^130-5 equals adding 5 once bit 130 and above are
      dropped; the mask selects 5 or 0. */
   sel = (-sel) & 5;
   add_ssaaaa(cy, a0, 0, a0, 0, sel);
   add_ssaaaa(cy, a1, 0, a1, 0, cy);
   add_ssaaaa(cy, a2, 0, a2, 0, cy);
   add_ssaaaa(cy, a3, 0, a3, 0, cy);
 
   /* tag = low 128 bits of (h + k); c0..c2 carry between the words and
      the carry out of the top word is dropped. */
   add_ssaaaa(c0, a0, 0, a0, 0, st->k[0]);
   add_ssaaaa(c1, a1, 0, a1, 0, st->k[1]);
   add_ssaaaa(c1, a1, c1, a1, 0, c0);
   add_ssaaaa(c2, a2, 0, a2, 0, st->k[2]);
   add_ssaaaa(c2, a2, c2, a2, 0, c1);
   add_ssaaaa(cy, a3, 0, a3, 0, st->k[3]);
   a3 += c2;
 
   buf_put_le32(mac + 0, a0);
   buf_put_le32(mac + 4, a1);
   buf_put_le32(mac + 8, a2);
   buf_put_le32(mac + 12, a3);
 
   /* burn_stack */
   return 4 * sizeof (void *) + 10 * sizeof (u32) + blocks_burn;
 }
 
 #endif /* USE_MPI_32BIT */
 
 
 /* Absorb BYTES bytes from M into CTX.  A buffered partial block is
    completed first, then all full blocks are processed directly from M,
    and any remaining tail is buffered for a later call.  Returns the
    stack depth reported by poly1305_blocks, or 0 when no block was
    processed; the stack is not burned here.  */
-void
-_gcry_poly1305_update (poly1305_context_t *ctx, const byte *m, size_t bytes)
+unsigned int
+_gcry_poly1305_update_burn (poly1305_context_t *ctx, const byte *m,
+			    size_t bytes)
 {
   unsigned int burn = 0;
 
   /* handle leftover */
   if (ctx->leftover)
     {
       size_t want = (POLY1305_BLOCKSIZE - ctx->leftover);
       if (want > bytes)
 	want = bytes;
       buf_cpy (ctx->buffer + ctx->leftover, m, want);
       bytes -= want;
       m += want;
       ctx->leftover += want;
       /* Still no complete block: nothing was processed, so there is
          nothing to burn. */
       if (ctx->leftover < POLY1305_BLOCKSIZE)
-	return;
+	return 0;
       burn = poly1305_blocks (ctx, ctx->buffer, POLY1305_BLOCKSIZE, 1);
       ctx->leftover = 0;
     }
 
   /* process full blocks */
   if (bytes >= POLY1305_BLOCKSIZE)
     {
       size_t nblks = bytes / POLY1305_BLOCKSIZE;
       burn = poly1305_blocks (ctx, m, nblks * POLY1305_BLOCKSIZE, 1);
       m += nblks * POLY1305_BLOCKSIZE;
       bytes -= nblks * POLY1305_BLOCKSIZE;
     }
 
   /* store leftover */
   if (bytes)
     {
       buf_cpy (ctx->buffer + ctx->leftover, m, bytes);
       ctx->leftover += bytes;
     }
 
+  return burn;
+}
+
+
 /* Absorb BYTES bytes from M into CTX and burn the stack afterwards when
    the update reports a non-zero burn depth.  */
+void
+_gcry_poly1305_update (poly1305_context_t *ctx, const byte *m, size_t bytes)
+{
+  unsigned int burn;
+
+  burn = _gcry_poly1305_update_burn (ctx, m, bytes);
+
   /* A zero depth means no block was processed. */
   if (burn)
     _gcry_burn_stack (burn);
 }
 
 
 /* Write the final tag to MAC and burn the stack used while computing it. */
 void
 _gcry_poly1305_finish (poly1305_context_t *ctx, byte mac[POLY1305_TAGLEN])
 {
   const unsigned int stack_depth = poly1305_final (ctx, mac);
 
   _gcry_burn_stack (stack_depth);
 }
 
 
 /* Key CTX with the KEYLEN byte KEY.  The first call runs the selftest
    once; its result is remembered for all later calls.  Returns
    GPG_ERR_INV_KEYLEN unless KEYLEN is POLY1305_KEYLEN,
    GPG_ERR_SELFTEST_FAILED when the selftest failed, and 0 on success.  */
 gcry_err_code_t
 _gcry_poly1305_init (poly1305_context_t * ctx, const byte * key,
                      size_t keylen)
 {
   static int selftest_done;
   static const char *selftest_errtxt;
 
   if (!selftest_done)
     {
       /* The flag is set before the selftest runs because the selftest
          itself calls back into this function. */
       selftest_done = 1;
       selftest_errtxt = selftest ();
       if (selftest_errtxt)
         log_error ("Poly1305 selftest failed (%s)\n", selftest_errtxt);
     }
 
   if (keylen != POLY1305_KEYLEN)
     return GPG_ERR_INV_KEYLEN;
   if (selftest_errtxt)
     return GPG_ERR_SELFTEST_FAILED;
 
   poly1305_init (ctx, key);
   return 0;
 }
 
 
 /* One-shot helper: compute the Poly1305 tag of the BYTES bytes at M under
    the 32 byte KEY, store it in MAC and wipe the temporary context.  */
 static void
 poly1305_auth (byte mac[POLY1305_TAGLEN], const byte * m, size_t bytes,
                const byte * key)
 {
   poly1305_context_t scratch;
 
   memset (&scratch, 0, sizeof (scratch));
 
   _gcry_poly1305_init (&scratch, key, POLY1305_KEYLEN);
   _gcry_poly1305_update (&scratch, m, bytes);
   _gcry_poly1305_finish (&scratch, mac);
 
   wipememory (&scratch, sizeof (scratch));
 }
 
 
 /* Known-answer selftest.  Returns NULL on success or a static string
    naming the first failed test:
      1. NaCl example vector, one-shot.
      2. The same vector fed in pieces of varying size.
      3. A vector whose accumulator wraps around the modulus.
      4. MAC over the MACs of messages of length 0..255.  */
 static const char *
 selftest (void)
 {
   /* example from nacl */
   static const byte nacl_key[POLY1305_KEYLEN] = {
     0xee, 0xa6, 0xa7, 0x25, 0x1c, 0x1e, 0x72, 0x91,
     0x6d, 0x11, 0xc2, 0xcb, 0x21, 0x4d, 0x3c, 0x25,
     0x25, 0x39, 0x12, 0x1d, 0x8e, 0x23, 0x4e, 0x65,
     0x2d, 0x65, 0x1f, 0xa4, 0xc8, 0xcf, 0xf8, 0x80,
   };
 
   static const byte nacl_msg[131] = {
     0x8e, 0x99, 0x3b, 0x9f, 0x48, 0x68, 0x12, 0x73,
     0xc2, 0x96, 0x50, 0xba, 0x32, 0xfc, 0x76, 0xce,
     0x48, 0x33, 0x2e, 0xa7, 0x16, 0x4d, 0x96, 0xa4,
     0x47, 0x6f, 0xb8, 0xc5, 0x31, 0xa1, 0x18, 0x6a,
     0xc0, 0xdf, 0xc1, 0x7c, 0x98, 0xdc, 0xe8, 0x7b,
     0x4d, 0xa7, 0xf0, 0x11, 0xec, 0x48, 0xc9, 0x72,
     0x71, 0xd2, 0xc2, 0x0f, 0x9b, 0x92, 0x8f, 0xe2,
     0x27, 0x0d, 0x6f, 0xb8, 0x63, 0xd5, 0x17, 0x38,
     0xb4, 0x8e, 0xee, 0xe3, 0x14, 0xa7, 0xcc, 0x8a,
     0xb9, 0x32, 0x16, 0x45, 0x48, 0xe5, 0x26, 0xae,
     0x90, 0x22, 0x43, 0x68, 0x51, 0x7a, 0xcf, 0xea,
     0xbd, 0x6b, 0xb3, 0x73, 0x2b, 0xc0, 0xe9, 0xda,
     0x99, 0x83, 0x2b, 0x61, 0xca, 0x01, 0xb6, 0xde,
     0x56, 0x24, 0x4a, 0x9e, 0x88, 0xd5, 0xf9, 0xb3,
     0x79, 0x73, 0xf6, 0x22, 0xa4, 0x3d, 0x14, 0xa6,
     0x59, 0x9b, 0x1f, 0x65, 0x4c, 0xb4, 0x5a, 0x74,
     0xe3, 0x55, 0xa5
   };
 
   static const byte nacl_mac[16] = {
     0xf3, 0xff, 0xc7, 0x70, 0x3f, 0x94, 0x00, 0xe5,
     0x2a, 0x7d, 0xfb, 0x4b, 0x3d, 0x33, 0x05, 0xd9
   };
 
   /* generates a final value of (2^130 - 2) == 3 */
   static const byte wrap_key[POLY1305_KEYLEN] = {
     0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   };
 
   static const byte wrap_msg[16] = {
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
   };
 
   static const byte wrap_mac[16] = {
     0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   };
 
   /* mac of the macs of messages of length 0 to 256, where the key and messages
    * have all their values set to the length
    */
   static const byte total_key[POLY1305_KEYLEN] = {
     0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
     0xff, 0xfe, 0xfd, 0xfc, 0xfb, 0xfa, 0xf9,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
   };
 
   static const byte total_mac[16] = {
     0x64, 0xaf, 0xe2, 0xe8, 0xd6, 0xad, 0x7b, 0xbd,
     0xd2, 0x87, 0xf9, 0x7c, 0x44, 0x62, 0x3d, 0x39
   };
 
   poly1305_context_t ctx;
   poly1305_context_t total_ctx;
   byte all_key[POLY1305_KEYLEN];
   byte all_msg[256];
   byte mac[16];
   size_t i, j;
 
   memset (&ctx, 0, sizeof (ctx));
   memset (&total_ctx, 0, sizeof (total_ctx));
 
   memset (mac, 0, sizeof (mac));
   poly1305_auth (mac, nacl_msg, sizeof (nacl_msg), nacl_key);
   if (memcmp (nacl_mac, mac, sizeof (nacl_mac)) != 0)
     return "Poly1305 test 1 failed.";
 
   /* SSE2/AVX have a 32 byte block size, but also support 64 byte blocks, so
    * make sure everything still works varying between them */
   memset (mac, 0, sizeof (mac));
   _gcry_poly1305_init (&ctx, nacl_key, POLY1305_KEYLEN);
   _gcry_poly1305_update (&ctx, nacl_msg + 0, 32);
   _gcry_poly1305_update (&ctx, nacl_msg + 32, 64);
   _gcry_poly1305_update (&ctx, nacl_msg + 96, 16);
   _gcry_poly1305_update (&ctx, nacl_msg + 112, 8);
   _gcry_poly1305_update (&ctx, nacl_msg + 120, 4);
   _gcry_poly1305_update (&ctx, nacl_msg + 124, 2);
   _gcry_poly1305_update (&ctx, nacl_msg + 126, 1);
   _gcry_poly1305_update (&ctx, nacl_msg + 127, 1);
   _gcry_poly1305_update (&ctx, nacl_msg + 128, 1);
   _gcry_poly1305_update (&ctx, nacl_msg + 129, 1);
   _gcry_poly1305_update (&ctx, nacl_msg + 130, 1);
   _gcry_poly1305_finish (&ctx, mac);
   if (memcmp (nacl_mac, mac, sizeof (nacl_mac)) != 0)
     return "Poly1305 test 2 failed.";
 
   /* Compare against the size of the vector actually being checked. */
   memset (mac, 0, sizeof (mac));
   poly1305_auth (mac, wrap_msg, sizeof (wrap_msg), wrap_key);
   if (memcmp (wrap_mac, mac, sizeof (wrap_mac)) != 0)
     return "Poly1305 test 3 failed.";
 
   _gcry_poly1305_init (&total_ctx, total_key, POLY1305_KEYLEN);
   for (i = 0; i < 256; i++)
     {
       /* set key and message to 'i,i,i..' */
       for (j = 0; j < sizeof (all_key); j++)
         all_key[j] = i;
       for (j = 0; j < i; j++)
         all_msg[j] = i;
       poly1305_auth (mac, all_msg, i, all_key);
       _gcry_poly1305_update (&total_ctx, mac, 16);
     }
   _gcry_poly1305_finish (&total_ctx, mac);
   if (memcmp (total_mac, mac, sizeof (total_mac)) != 0)
     return "Poly1305 test 4 failed.";
 
   return NULL;
 }