diff --git a/cipher/chacha20-amd64-ssse3.S b/cipher/chacha20-amd64-ssse3.S
index f2372281..0e59ff98 100644
--- a/cipher/chacha20-amd64-ssse3.S
+++ b/cipher/chacha20-amd64-ssse3.S
@@ -1,341 +1,449 @@
/* chacha20-amd64-ssse3.S  -  SSSE3 implementation of ChaCha20 cipher
 *
 * Copyright (C) 2017,2018 Jussi Kivilinna
 *
 * This file is part of Libgcrypt.
 *
 * Libgcrypt is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * Libgcrypt is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, see <http://www.gnu.org/licenses/>.
 */

/*
 * Based on D. J. Bernstein reference implementation at
 * http://cr.yp.to/chacha.html:
 *
 * chacha-regs.c version 20080118
 * D. J. Bernstein
 * Public domain.
 */

#ifdef __x86_64
#include <config.h>
#if defined(HAVE_GCC_INLINE_ASM_SSSE3) && \
   (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
    defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))

.text

#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS
# define ELF(...) __VA_ARGS__
#else
# define ELF(...) /*_*/
#endif

#ifdef __PIC__
# define RIP (%rip)
#else
# define RIP
#endif

/* register macros */
#define INPUT %rdi
#define DST   %rsi
#define SRC   %rdx
#define NBLKS %rcx
#define ROUND %eax

/* stack structure */
#define STACK_VEC_X12 (16)
#define STACK_VEC_X13 (16 + STACK_VEC_X12)
#define STACK_TMP     (16 + STACK_VEC_X13)
#define STACK_TMP1    (16 + STACK_TMP)
#define STACK_TMP2    (16 + STACK_TMP1)
#define STACK_MAX     (16 + STACK_TMP2)

/* vector registers */
#define X0 %xmm0
#define X1 %xmm1
#define X2 %xmm2
#define X3 %xmm3
#define X4 %xmm4
#define X5 %xmm5
#define X6 %xmm6
#define X7 %xmm7
#define X8 %xmm8
#define X9 %xmm9
#define X10 %xmm10
#define X11 %xmm11
#define X12 %xmm12
#define X13 %xmm13
#define X14 %xmm14
#define X15 %xmm15

/**********************************************************************
  helper macros
 **********************************************************************/

/* 4x4 32-bit integer matrix transpose */
#define transpose_4x4(x0, x1, x2, x3, t1, t2, t3) \
        movdqa    x0, t2; \
        punpckhdq x1, t2; \
        punpckldq x1, x0; \
        \
        movdqa    x2, t1; \
        punpckldq x3, t1; \
        punpckhdq x3, x2; \
        \
        movdqa     x0, x1; \
        punpckhqdq t1, x1; \
        punpcklqdq t1, x0; \
        \
        movdqa     t2, x3; \
        punpckhqdq x2, x3; \
        punpcklqdq x2, t2; \
        movdqa     t2, x2;

/* fill xmm register with 32-bit value from memory */
#define pbroadcastd(mem32, xreg) \
        movd mem32, xreg; \
        pshufd $0, xreg, xreg;

/* xor with unaligned memory operand */
#define pxor_u(umem128, xreg, t) \
        movdqu umem128, t; \
        pxor t, xreg;

/* xor register with unaligned src and save to unaligned dst */
#define xor_src_dst(dst, src, offset, xreg, t) \
        pxor_u(offset(src), xreg, t); \
        movdqu xreg, offset(dst);

#define clear(x) pxor x,x;

/**********************************************************************
  4-way chacha20
 **********************************************************************/

#define ROTATE2(v1,v2,c,tmp1,tmp2) \
        movdqa v1, tmp1; \
        movdqa v2, tmp2; \
        psrld $(32 - (c)), v1; \
        pslld $(c), tmp1; \
        paddb tmp1, v1; \
        psrld $(32 - (c)), v2; \
        pslld $(c), tmp2; \
        paddb tmp2, v2;

#define ROTATE_SHUF_2(v1,v2,shuf) \
        pshufb shuf, v1; \
        pshufb shuf, v2;

#define XOR(ds,s) \
        pxor s, ds;

#define PLUS(ds,s) \
        paddd s, ds;

#define QUARTERROUND2(a1,b1,c1,d1,a2,b2,c2,d2,ign,tmp1,tmp2) \
        movdqa .Lshuf_rol16 RIP, tmp1; \
        PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2); \
            ROTATE_SHUF_2(d1, d2, tmp1); \
        PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \
            ROTATE2(b1, b2, 12, tmp1, tmp2); \
        movdqa .Lshuf_rol8 RIP, tmp1; \
        PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2); \
            ROTATE_SHUF_2(d1, d2, tmp1); \
        PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \
            ROTATE2(b1, b2, 7, tmp1, tmp2);
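Note (illustrative, not part of the patch): the macros above implement the standard ChaCha20 quarter round on two column groups at once. The 16- and 8-bit rotations go through pshufb byte shuffles (.Lshuf_rol16/.Lshuf_rol8), while the 12- and 7-bit rotations use a psrld/pslld pair whose halves are combined with paddb; the add acts as an OR here because the two partial results share no set bits, so no carries can occur. A scalar C sketch of what one lane computes:

  #include <stdint.h>

  /* Scalar equivalent of ROTATE2/ROTATE_SHUF_2 for one 32-bit lane.  */
  static uint32_t rotl32 (uint32_t v, unsigned int c)
  {
    return (v << c) | (v >> (32 - c));
  }

  /* One ChaCha20 quarter round, as QUARTERROUND2 performs per lane.  */
  static void quarterround (uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
  {
    *a += *b; *d ^= *a; *d = rotl32 (*d, 16);  /* pshufb .Lshuf_rol16 */
    *c += *d; *b ^= *c; *b = rotl32 (*b, 12);  /* shifts + paddb */
    *a += *b; *d ^= *a; *d = rotl32 (*d, 8);   /* pshufb .Lshuf_rol8 */
    *c += *d; *b ^= *c; *b = rotl32 (*b, 7);   /* shifts + paddb */
  }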
chacha20_data:
.align 16
.Lshuf_rol16:
        .byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13
.Lshuf_rol8:
        .byte 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14
+.Lcounter1:
+       .long 1,0,0,0
.Linc_counter:
        .long 0,1,2,3
.Lunsigned_cmp:
        .long 0x80000000,0x80000000,0x80000000,0x80000000

.align 8
.globl _gcry_chacha20_amd64_ssse3_blocks4
ELF(.type _gcry_chacha20_amd64_ssse3_blocks4,@function;)

_gcry_chacha20_amd64_ssse3_blocks4:
        /* input:
         *      %rdi: input
         *      %rsi: dst
         *      %rdx: src
         *      %rcx: nblks (multiple of 4)
         */

        pushq %rbp;
        movq %rsp, %rbp;

        subq $STACK_MAX, %rsp;
        andq $~15, %rsp;

.Loop4:
        mov $20, ROUND;

        /* Construct counter vectors X12 and X13 */
        movdqa .Linc_counter RIP, X0;
        movdqa .Lunsigned_cmp RIP, X2;
        pbroadcastd((12 * 4)(INPUT), X12);
        pbroadcastd((13 * 4)(INPUT), X13);
        paddd X0, X12;
        movdqa X12, X1;
        pxor X2, X0;
        pxor X2, X1;
        pcmpgtd X1, X0;
        psubd X0, X13;
        movdqa X12, (STACK_VEC_X12)(%rsp);
        movdqa X13, (STACK_VEC_X13)(%rsp);

        /* Load vectors */
        pbroadcastd((0 * 4)(INPUT), X0);
        pbroadcastd((1 * 4)(INPUT), X1);
        pbroadcastd((2 * 4)(INPUT), X2);
        pbroadcastd((3 * 4)(INPUT), X3);
        pbroadcastd((4 * 4)(INPUT), X4);
        pbroadcastd((5 * 4)(INPUT), X5);
        pbroadcastd((6 * 4)(INPUT), X6);
        pbroadcastd((7 * 4)(INPUT), X7);
        pbroadcastd((8 * 4)(INPUT), X8);
        pbroadcastd((9 * 4)(INPUT), X9);
        pbroadcastd((10 * 4)(INPUT), X10);
        pbroadcastd((11 * 4)(INPUT), X11);
        pbroadcastd((14 * 4)(INPUT), X14);
        pbroadcastd((15 * 4)(INPUT), X15);
        movdqa X11, (STACK_TMP)(%rsp);
        movdqa X15, (STACK_TMP1)(%rsp);

-.Lround2:
+.Lround2_4:
        QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15)
        movdqa (STACK_TMP)(%rsp), X11;
        movdqa (STACK_TMP1)(%rsp), X15;
        movdqa X8, (STACK_TMP)(%rsp);
        movdqa X9, (STACK_TMP1)(%rsp);
        QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9)
        QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9)
        movdqa (STACK_TMP)(%rsp), X8;
        movdqa (STACK_TMP1)(%rsp), X9;
        movdqa X11, (STACK_TMP)(%rsp);
        movdqa X15, (STACK_TMP1)(%rsp);
        QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15)
        sub $2, ROUND;
-       jnz .Lround2;
+       jnz .Lround2_4;

        /* tmp := X15 */
        movdqa (STACK_TMP)(%rsp), X11;
        pbroadcastd((0 * 4)(INPUT), X15);
        PLUS(X0, X15);
        pbroadcastd((1 * 4)(INPUT), X15);
        PLUS(X1, X15);
        pbroadcastd((2 * 4)(INPUT), X15);
        PLUS(X2, X15);
        pbroadcastd((3 * 4)(INPUT), X15);
        PLUS(X3, X15);
        pbroadcastd((4 * 4)(INPUT), X15);
        PLUS(X4, X15);
        pbroadcastd((5 * 4)(INPUT), X15);
        PLUS(X5, X15);
        pbroadcastd((6 * 4)(INPUT), X15);
        PLUS(X6, X15);
        pbroadcastd((7 * 4)(INPUT), X15);
        PLUS(X7, X15);
        pbroadcastd((8 * 4)(INPUT), X15);
        PLUS(X8, X15);
        pbroadcastd((9 * 4)(INPUT), X15);
        PLUS(X9, X15);
        pbroadcastd((10 * 4)(INPUT), X15);
        PLUS(X10, X15);
        pbroadcastd((11 * 4)(INPUT), X15);
        PLUS(X11, X15);
        movdqa (STACK_VEC_X12)(%rsp), X15;
        PLUS(X12, X15);
        movdqa (STACK_VEC_X13)(%rsp), X15;
        PLUS(X13, X15);
        movdqa X13, (STACK_TMP)(%rsp);
        pbroadcastd((14 * 4)(INPUT), X15);
        PLUS(X14, X15);
        movdqa (STACK_TMP1)(%rsp), X15;
        movdqa X14, (STACK_TMP1)(%rsp);
        pbroadcastd((15 * 4)(INPUT), X13);
        PLUS(X15, X13);
        movdqa X15, (STACK_TMP2)(%rsp);

        /* Update counter */
        addq $4, (12 * 4)(INPUT);
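Note (illustrative, not part of the patch): X12/X13 hold the low and high counter words for the four blocks processed in parallel. SSE has no unsigned dword compare, so both operands are biased by 0x80000000 (.Lunsigned_cmp) before the signed pcmpgtd; the resulting all-ones mask marks the lanes where the 32-bit low word wrapped, and psubd of that mask adds the carry into the high word. The addq just above then advances the stored counter by four blocks in a single 64-bit add, since state words 12 and 13 are adjacent little-endian words in memory. A C sketch of the per-lane counter setup:

  #include <stdint.h>

  /* Sketch of the X12/X13 construction: per-lane 64-bit counters, one per
     parallel block.  'input' is the 16-word ChaCha20 state; words 12 and 13
     hold the block counter.  */
  static void make_counter_lanes (const uint32_t input[16],
                                  uint32_t x12[4], uint32_t x13[4])
  {
    uint32_t lo = input[12], hi = input[13];
    int i;

    for (i = 0; i < 4; i++)
      {
        x12[i] = lo + (uint32_t)i;          /* paddd .Linc_counter */
        x13[i] = hi + (x12[i] < lo);        /* carry via pcmpgtd/psubd */
      }
  }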
        transpose_4x4(X0, X1, X2, X3, X13, X14, X15);
        xor_src_dst(DST, SRC, (64 * 0 + 16 * 0), X0, X15);
        xor_src_dst(DST, SRC, (64 * 1 + 16 * 0), X1, X15);
        xor_src_dst(DST, SRC, (64 * 2 + 16 * 0), X2, X15);
        xor_src_dst(DST, SRC, (64 * 3 + 16 * 0), X3, X15);
        transpose_4x4(X4, X5, X6, X7, X0, X1, X2);
        movdqa (STACK_TMP)(%rsp), X13;
        movdqa (STACK_TMP1)(%rsp), X14;
        movdqa (STACK_TMP2)(%rsp), X15;
        xor_src_dst(DST, SRC, (64 * 0 + 16 * 1), X4, X0);
        xor_src_dst(DST, SRC, (64 * 1 + 16 * 1), X5, X0);
        xor_src_dst(DST, SRC, (64 * 2 + 16 * 1), X6, X0);
        xor_src_dst(DST, SRC, (64 * 3 + 16 * 1), X7, X0);
        transpose_4x4(X8, X9, X10, X11, X0, X1, X2);
        xor_src_dst(DST, SRC, (64 * 0 + 16 * 2), X8, X0);
        xor_src_dst(DST, SRC, (64 * 1 + 16 * 2), X9, X0);
        xor_src_dst(DST, SRC, (64 * 2 + 16 * 2), X10, X0);
        xor_src_dst(DST, SRC, (64 * 3 + 16 * 2), X11, X0);
        transpose_4x4(X12, X13, X14, X15, X0, X1, X2);
        xor_src_dst(DST, SRC, (64 * 0 + 16 * 3), X12, X0);
        xor_src_dst(DST, SRC, (64 * 1 + 16 * 3), X13, X0);
        xor_src_dst(DST, SRC, (64 * 2 + 16 * 3), X14, X0);
        xor_src_dst(DST, SRC, (64 * 3 + 16 * 3), X15, X0);

        sub $4, NBLKS;
        lea (4 * 64)(DST), DST;
        lea (4 * 64)(SRC), SRC;
        jnz .Loop4;

        /* clear the used vector registers and stack */
        clear(X0);
        movdqa X0, (STACK_VEC_X12)(%rsp);
        movdqa X0, (STACK_VEC_X13)(%rsp);
        movdqa X0, (STACK_TMP)(%rsp);
        movdqa X0, (STACK_TMP1)(%rsp);
        movdqa X0, (STACK_TMP2)(%rsp);
        clear(X1);
        clear(X2);
        clear(X3);
        clear(X4);
        clear(X5);
        clear(X6);
        clear(X7);
        clear(X8);
        clear(X9);
        clear(X10);
        clear(X11);
        clear(X12);
        clear(X13);
        clear(X14);
        clear(X15);

        /* eax zeroed by round loop. */
        leave;
        ret;
ELF(.size _gcry_chacha20_amd64_ssse3_blocks4,
          .-_gcry_chacha20_amd64_ssse3_blocks4;)

+/**********************************************************************
+  1-way chacha20
+ **********************************************************************/
+
+#define ROTATE_SHUF(v1,shuf) \
+       pshufb shuf, v1;
+
+#define ROTATE(v1,c,tmp1) \
+       movdqa v1, tmp1; \
+       psrld $(32 - (c)), v1; \
+       pslld $(c), tmp1; \
+       paddb tmp1, v1;
+
+#define WORD_SHUF(v1,shuf) \
+       pshufd $shuf, v1, v1;
+
+#define QUARTERROUND4(x0,x1,x2,x3,shuf_rol8,shuf_rol16,tmp1,shuf_x1,\
+                     shuf_x2,shuf_x3) \
+       PLUS(x0, x1); XOR(x3, x0); ROTATE_SHUF(x3, shuf_rol16); \
+       PLUS(x2, x3); XOR(x1, x2); ROTATE(x1, 12, tmp1); \
+       PLUS(x0, x1); XOR(x3, x0); ROTATE_SHUF(x3, shuf_rol8); \
+       PLUS(x2, x3); \
+         WORD_SHUF(x3, shuf_x3); \
+                     XOR(x1, x2); \
+         WORD_SHUF(x2, shuf_x2); \
+                     ROTATE(x1, 7, tmp1); \
+         WORD_SHUF(x1, shuf_x1);
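Note (illustrative, not part of the patch): unlike the 4-way path, QUARTERROUND4 keeps each state row in one xmm register. WORD_SHUF with the immediates 0x39, 0x4e and 0x93 rotates a row's four 32-bit words by one, two and three positions, so after the first invocation the diagonals of the state have been lined up into columns; the second invocation therefore performs the diagonal quarter rounds, and its mirrored shuffle constants (0x93, 0x4e, 0x39) restore the original row order. A C sketch of the row rotation:

  #include <stdint.h>

  /* pshufd $0x39 / $0x4e / $0x93 correspond to n = 1, 2, 3: destination
     word i receives source word (i + n) & 3.  */
  static void rotate_row (uint32_t r[4], int n)
  {
    uint32_t t[4];
    int i;

    for (i = 0; i < 4; i++)
      t[i] = r[(i + n) & 3];
    for (i = 0; i < 4; i++)
      r[i] = t[i];
  }

  /* Round pair as done by the two QUARTERROUND4 invocations:
     column round, then rotate x1 by 1, x2 by 2, x3 by 3 (0x39, 0x4e, 0x93);
     the next "column" round now hits the diagonals, after which rotating
     x1 by 3, x2 by 2, x3 by 1 (0x93, 0x4e, 0x39) undoes the shuffle.  */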
+.align 8
+.globl _gcry_chacha20_amd64_ssse3_blocks1
+ELF(.type _gcry_chacha20_amd64_ssse3_blocks1,@function;)
+
+_gcry_chacha20_amd64_ssse3_blocks1:
+       /* input:
+        *      %rdi: input
+        *      %rsi: dst
+        *      %rdx: src
+        *      %rcx: nblks
+        */
+
+       /* Load constants */
+       movdqa .Lcounter1 RIP, X4;
+       movdqa .Lshuf_rol8 RIP, X5;
+       movdqa .Lshuf_rol16 RIP, X6;
+
+       /* Load state */
+       movdqu (0 * 4)(INPUT), X10;
+       movdqu (4 * 4)(INPUT), X11;
+       movdqu (8 * 4)(INPUT), X12;
+       movdqu (12 * 4)(INPUT), X13;
+
+.Loop1:
+       mov $20, ROUND;
+
+       movdqa X10, X0;
+       movdqa X11, X1;
+       movdqa X12, X2;
+       movdqa X13, X3;
+
+.Lround2_1:
+       QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93);
+       QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39);
+       sub $2, ROUND;
+       jnz .Lround2_1;
+
+       PLUS(X0, X10);
+       PLUS(X1, X11);
+       PLUS(X2, X12);
+       PLUS(X3, X13);
+
+       /* Update counter */
+       paddq X4, X13;
+
+       xor_src_dst(DST, SRC, 0 * 4, X0, X7);
+       xor_src_dst(DST, SRC, 4 * 4, X1, X7);
+       xor_src_dst(DST, SRC, 8 * 4, X2, X7);
+       xor_src_dst(DST, SRC, 12 * 4, X3, X7);
+
+       lea (64)(DST), DST;
+       lea (64)(SRC), SRC;
+
+       sub $1, NBLKS;
+       jnz .Loop1;
+
+       /* Store counter */
+       movdqu X13, (12 * 4)(INPUT);
+
+       /* clear the used vector registers */
+       clear(X0);
+       clear(X1);
+       clear(X2);
+       clear(X3);
+       clear(X4);
+       clear(X5);
+       clear(X6);
+       clear(X7);
+       clear(X10);
+       clear(X11);
+       clear(X12);
+       clear(X13);
+
+       /* eax zeroed by round loop. */
+       ret;
+ELF(.size _gcry_chacha20_amd64_ssse3_blocks1,
+         .-_gcry_chacha20_amd64_ssse3_blocks1;)
+
#endif /*defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)*/
#endif /*__x86_64*/
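Note (illustrative, not part of the patch): in the 1-way loop the saved input rows live in X10..X13 and the working copy in X0..X3. The block counter occupies the low 64 bits of row 3, so "paddq X4, X13" with .Lcounter1 = {1,0,0,0} increments it once per block with carry from word 12 into word 13, while the nonce words 14 and 15 sit in the untouched high lane; only that row is written back to the context at the end. In C terms:

  #include <stdint.h>

  /* Sketch of the per-block counter update on row 3 (state words 12..15). */
  static void bump_counter (uint32_t row3[4])
  {
    uint64_t ctr = (uint64_t)row3[0] | ((uint64_t)row3[1] << 32);

    ctr++;                         /* 64-bit add, carry from word 12 into 13 */
    row3[0] = (uint32_t)ctr;
    row3[1] = (uint32_t)(ctr >> 32);
    /* row3[2] and row3[3] (the nonce words) are left unchanged.  */
  }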
diff --git a/cipher/chacha20.c b/cipher/chacha20.c
index 84a9b2b8..f1afd18e 100644
--- a/cipher/chacha20.c
+++ b/cipher/chacha20.c
@@ -1,621 +1,640 @@
/* chacha20.c  -  Bernstein's ChaCha20 cipher
 * Copyright (C) 2014,2017,2018 Jussi Kivilinna
 *
 * This file is part of Libgcrypt.
 *
 * Libgcrypt is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * Libgcrypt is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, see <http://www.gnu.org/licenses/>.
 *
 * For a description of the algorithm, see:
 *   http://cr.yp.to/chacha.html
 */

/*
 * Based on D. J. Bernstein reference implementation at
 * http://cr.yp.to/chacha.html:
 *
 * chacha-regs.c version 20080118
 * D. J. Bernstein
 * Public domain.
 */

#include <config.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "types.h"
#include "g10lib.h"
#include "cipher.h"
#include "bufhelp.h"


#define CHACHA20_MIN_KEY_SIZE 16   /* Bytes.  */
#define CHACHA20_MAX_KEY_SIZE 32   /* Bytes.  */
#define CHACHA20_BLOCK_SIZE   64   /* Bytes.  */
#define CHACHA20_MIN_IV_SIZE   8   /* Bytes.  */
#define CHACHA20_MAX_IV_SIZE  12   /* Bytes.  */
#define CHACHA20_CTR_SIZE     16   /* Bytes.  */


/* USE_SSSE3 indicates whether to compile with Intel SSSE3 code. */
#undef USE_SSSE3
#if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_SSSE3) && \
    (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
# define USE_SSSE3 1
#endif

/* USE_AVX2 indicates whether to compile with Intel AVX2 code. */
#undef USE_AVX2
#if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_AVX2) && \
    (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
# define USE_AVX2 1
#endif

/* USE_ARMV7_NEON indicates whether to enable ARMv7 NEON assembly code. */
#undef USE_ARMV7_NEON
#ifdef ENABLE_NEON_SUPPORT
# if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) \
     && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) \
     && defined(HAVE_GCC_INLINE_ASM_NEON)
#  define USE_ARMV7_NEON 1
# endif
#endif

/* USE_AARCH64_SIMD indicates whether to enable ARMv8 SIMD assembly
 * code. */
#undef USE_AARCH64_SIMD
#ifdef ENABLE_NEON_SUPPORT
# if defined(__AARCH64EL__) \
     && defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) \
     && defined(HAVE_GCC_INLINE_ASM_AARCH64_NEON)
#  define USE_AARCH64_SIMD 1
# endif
#endif

/* Assembly implementations use SystemV ABI, ABI conversion and additional
 * stack to store XMM6-XMM15 needed on Win64. */
#undef ASM_FUNC_ABI
#undef ASM_EXTRA_STACK
#if defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)
# define ASM_FUNC_ABI __attribute__((sysv_abi))
#else
# define ASM_FUNC_ABI
#endif


typedef struct CHACHA20_context_s
{
  u32 input[16];
  unsigned char pad[CHACHA20_BLOCK_SIZE];
  unsigned int unused; /* bytes in the pad. */
  int use_ssse3:1;
  int use_avx2:1;
  int use_neon:1;
} CHACHA20_context_t;


#ifdef USE_SSSE3

unsigned int _gcry_chacha20_amd64_ssse3_blocks4(u32 *state, byte *dst,
                                                const byte *src,
                                                size_t nblks) ASM_FUNC_ABI;

+unsigned int _gcry_chacha20_amd64_ssse3_blocks1(u32 *state, byte *dst,
+                                                const byte *src,
+                                                size_t nblks) ASM_FUNC_ABI;
+
#endif /* USE_SSSE3 */

#ifdef USE_AVX2

unsigned int _gcry_chacha20_amd64_avx2_blocks8(u32 *state, byte *dst,
                                               const byte *src,
                                               size_t nblks) ASM_FUNC_ABI;

#endif /* USE_AVX2 */

#ifdef USE_ARMV7_NEON

unsigned int _gcry_chacha20_armv7_neon_blocks4(u32 *state, byte *dst,
                                               const byte *src,
                                               size_t nblks);

#endif /* USE_ARMV7_NEON */

#ifdef USE_AARCH64_SIMD

unsigned int _gcry_chacha20_aarch64_blocks4(u32 *state, byte *dst,
                                            const byte *src, size_t nblks);

#endif /* USE_AARCH64_SIMD */


static const char *selftest (void);


#define ROTATE(v,c)     (rol(v,c))
#define XOR(v,w)        ((v) ^ (w))
#define PLUS(v,w)       ((u32)((v) + (w)))
#define PLUSONE(v)      (PLUS((v),1))

#define QUARTERROUND(a,b,c,d) \
  a = PLUS(a,b); d = ROTATE(XOR(d,a),16); \
  c = PLUS(c,d); b = ROTATE(XOR(b,c),12); \
  a = PLUS(a,b); d = ROTATE(XOR(d,a), 8); \
  c = PLUS(c,d); b = ROTATE(XOR(b,c), 7);

#define BUF_XOR_LE32(dst, src, offset, x) \
  buf_put_le32((dst) + (offset), buf_get_le32((src) + (offset)) ^ (x))

static unsigned int
-chacha20_blocks (u32 *input, byte *dst, const byte *src, size_t nblks)
+do_chacha20_blocks (u32 *input, byte *dst, const byte *src, size_t nblks)
{
  u32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
  unsigned int i;

  while (nblks)
    {
      x0 = input[0];
      x1 = input[1];
      x2 = input[2];
      x3 = input[3];
      x4 = input[4];
      x5 = input[5];
      x6 = input[6];
      x7 = input[7];
      x8 = input[8];
      x9 = input[9];
      x10 = input[10];
      x11 = input[11];
      x12 = input[12];
      x13 = input[13];
      x14 = input[14];
      x15 = input[15];

      for (i = 20; i > 0; i -= 2)
        {
          QUARTERROUND(x0, x4,  x8, x12)
          QUARTERROUND(x1, x5,  x9, x13)
          QUARTERROUND(x2, x6, x10, x14)
          QUARTERROUND(x3, x7, x11, x15)
          QUARTERROUND(x0, x5, x10, x15)
          QUARTERROUND(x1, x6, x11, x12)
          QUARTERROUND(x2, x7,  x8, x13)
          QUARTERROUND(x3, x4,  x9, x14)
        }

      x0 = PLUS(x0, input[0]);
      x1 = PLUS(x1, input[1]);
      x2 = PLUS(x2, input[2]);
      x3 = PLUS(x3, input[3]);
      x4 = PLUS(x4, input[4]);
      x5 = PLUS(x5, input[5]);
      x6 = PLUS(x6, input[6]);
      x7 = PLUS(x7, input[7]);
      x8 = PLUS(x8, input[8]);
      x9 = PLUS(x9, input[9]);
      x10 = PLUS(x10, input[10]);
      x11 = PLUS(x11, input[11]);
      x12 = PLUS(x12, input[12]);
      x13 = PLUS(x13, input[13]);
      x14 = PLUS(x14, input[14]);
      x15 = PLUS(x15, input[15]);

      input[12] = PLUSONE(input[12]);
      input[13] = PLUS(input[13], !input[12]);

      BUF_XOR_LE32(dst, src, 0, x0);
      BUF_XOR_LE32(dst, src, 4, x1);
      BUF_XOR_LE32(dst, src, 8, x2);
      BUF_XOR_LE32(dst, src, 12, x3);
      BUF_XOR_LE32(dst, src, 16, x4);
      BUF_XOR_LE32(dst, src, 20, x5);
      BUF_XOR_LE32(dst, src, 24, x6);
      BUF_XOR_LE32(dst, src, 28, x7);
      BUF_XOR_LE32(dst, src, 32, x8);
      BUF_XOR_LE32(dst, src, 36, x9);
      BUF_XOR_LE32(dst, src, 40, x10);
      BUF_XOR_LE32(dst, src, 44, x11);
      BUF_XOR_LE32(dst, src, 48, x12);
      BUF_XOR_LE32(dst, src, 52, x13);
      BUF_XOR_LE32(dst, src, 56, x14);
      BUF_XOR_LE32(dst, src, 60, x15);

      src += CHACHA20_BLOCK_SIZE;
      dst += CHACHA20_BLOCK_SIZE;
      nblks--;
    }

  /* burn_stack */
  return (17 * sizeof(u32) + 6 * sizeof(void *));
}

+static unsigned int
+chacha20_blocks (CHACHA20_context_t *ctx, byte *dst, const byte *src,
+                 size_t nblks)
+{
+#ifdef USE_SSSE3
+  if (ctx->use_ssse3)
+    {
+      return _gcry_chacha20_amd64_ssse3_blocks1(ctx->input, dst, src, nblks);
+    }
+#endif
+
+  return do_chacha20_blocks (ctx->input, dst, src, nblks);
+}
+
+
static void
chacha20_keysetup (CHACHA20_context_t *ctx, const byte *key,
                   unsigned int keylen)
{
  static const char sigma[16] = "expand 32-byte k";
  static const char tau[16] = "expand 16-byte k";
  const char *constants;

  ctx->input[4] = buf_get_le32(key + 0);
  ctx->input[5] = buf_get_le32(key + 4);
  ctx->input[6] = buf_get_le32(key + 8);
  ctx->input[7] = buf_get_le32(key + 12);
  if (keylen == CHACHA20_MAX_KEY_SIZE) /* 256 bits */
    {
      key += 16;
      constants = sigma;
    }
  else /* 128 bits */
    {
      constants = tau;
    }
  ctx->input[8] = buf_get_le32(key + 0);
  ctx->input[9] = buf_get_le32(key + 4);
  ctx->input[10] = buf_get_le32(key + 8);
  ctx->input[11] = buf_get_le32(key + 12);
  ctx->input[0] = buf_get_le32(constants + 0);
  ctx->input[1] = buf_get_le32(constants + 4);
  ctx->input[2] = buf_get_le32(constants + 8);
  ctx->input[3] = buf_get_le32(constants + 12);
}


static void
chacha20_ivsetup (CHACHA20_context_t * ctx, const byte *iv, size_t ivlen)
{
  if (ivlen == CHACHA20_CTR_SIZE)
    {
      ctx->input[12] = buf_get_le32 (iv + 0);
      ctx->input[13] = buf_get_le32 (iv + 4);
      ctx->input[14] = buf_get_le32 (iv + 8);
      ctx->input[15] = buf_get_le32 (iv + 12);
    }
  else if (ivlen == CHACHA20_MAX_IV_SIZE)
    {
      ctx->input[12] = 0;
      ctx->input[13] = buf_get_le32 (iv + 0);
      ctx->input[14] = buf_get_le32 (iv + 4);
      ctx->input[15] = buf_get_le32 (iv + 8);
    }
  else if (ivlen == CHACHA20_MIN_IV_SIZE)
    {
      ctx->input[12] = 0;
      ctx->input[13] = 0;
      ctx->input[14] = buf_get_le32 (iv + 0);
      ctx->input[15] = buf_get_le32 (iv + 4);
    }
  else
    {
      ctx->input[12] = 0;
      ctx->input[13] = 0;
      ctx->input[14] = 0;
      ctx->input[15] = 0;
    }
}
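Note (summary, not part of the patch): after chacha20_keysetup and chacha20_ivsetup the 16-word state that both the C and the assembly back ends consume has the usual ChaCha20 layout; in comment form:

  /* input[0..3]    constants: sigma ("expand 32-byte k"), or tau for
   *                128-bit keys
   * input[4..11]   key words (a 128-bit key is used for both halves)
   * input[12..15]  counter and nonce, depending on the IV length:
   *                  16-byte IV: all four words taken from the IV
   *                  12-byte IV: word 12 = counter, words 13..15 = nonce
   *                   8-byte IV: words 12..13 = counter, words 14..15 = nonce
   */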
static void
chacha20_setiv (void *context, const byte *iv, size_t ivlen)
{
  CHACHA20_context_t *ctx = (CHACHA20_context_t *) context;

  /* draft-nir-cfrg-chacha20-poly1305-02 defines 96-bit and 64-bit nonce. */

  if (iv && ivlen != CHACHA20_MAX_IV_SIZE && ivlen != CHACHA20_MIN_IV_SIZE
      && ivlen != CHACHA20_CTR_SIZE)
    log_info ("WARNING: chacha20_setiv: bad ivlen=%u\n", (u32) ivlen);

  if (iv && (ivlen == CHACHA20_MAX_IV_SIZE || ivlen == CHACHA20_MIN_IV_SIZE
             || ivlen == CHACHA20_CTR_SIZE))
    chacha20_ivsetup (ctx, iv, ivlen);
  else
    chacha20_ivsetup (ctx, NULL, 0);

  /* Reset the unused pad bytes counter.  */
  ctx->unused = 0;
}


static gcry_err_code_t
chacha20_do_setkey (CHACHA20_context_t *ctx,
                    const byte *key, unsigned int keylen)
{
  static int initialized;
  static const char *selftest_failed;
  unsigned int features = _gcry_get_hw_features ();

  if (!initialized)
    {
      initialized = 1;
      selftest_failed = selftest ();
      if (selftest_failed)
        log_error ("CHACHA20 selftest failed (%s)\n", selftest_failed);
    }
  if (selftest_failed)
    return GPG_ERR_SELFTEST_FAILED;

  if (keylen != CHACHA20_MAX_KEY_SIZE && keylen != CHACHA20_MIN_KEY_SIZE)
    return GPG_ERR_INV_KEYLEN;

#ifdef USE_SSSE3
  ctx->use_ssse3 = (features & HWF_INTEL_SSSE3) != 0;
#endif
#ifdef USE_AVX2
  ctx->use_avx2 = (features & HWF_INTEL_AVX2) != 0;
#endif
#ifdef USE_ARMV7_NEON
  ctx->use_neon = (features & HWF_ARM_NEON) != 0;
#endif
#ifdef USE_AARCH64_SIMD
  ctx->use_neon = (features & HWF_ARM_NEON) != 0;
#endif

  (void)features;

  chacha20_keysetup (ctx, key, keylen);

  /* We default to a zero nonce.  */
  chacha20_setiv (ctx, NULL, 0);

  return 0;
}


static gcry_err_code_t
chacha20_setkey (void *context, const byte *key, unsigned int keylen,
                 gcry_cipher_hd_t hd)
{
  CHACHA20_context_t *ctx = (CHACHA20_context_t *) context;
  gcry_err_code_t rc = chacha20_do_setkey (ctx, key, keylen);
  (void)hd;
  _gcry_burn_stack (4 + sizeof (void *) + 4 * sizeof (void *));
  return rc;
}


static void
chacha20_encrypt_stream (void *context, byte *outbuf, const byte *inbuf,
                         size_t length)
{
  static const unsigned char zero_pad[CHACHA20_BLOCK_SIZE] = { 0, };
  CHACHA20_context_t *ctx = (CHACHA20_context_t *) context;
  unsigned int nburn, burn = 0;

  if (!length)
    return;

  if (ctx->unused)
    {
      unsigned char *p = ctx->pad;
      size_t n;

      gcry_assert (ctx->unused < CHACHA20_BLOCK_SIZE);

      n = ctx->unused;
      if (n > length)
        n = length;

      buf_xor (outbuf, inbuf, p + CHACHA20_BLOCK_SIZE - ctx->unused, n);
      length -= n;
      outbuf += n;
      inbuf += n;
      ctx->unused -= n;

      if (!length)
        return;
      gcry_assert (!ctx->unused);
    }

#ifdef USE_AVX2
  if (ctx->use_avx2 && length >= CHACHA20_BLOCK_SIZE * 8)
    {
      size_t nblocks = length / CHACHA20_BLOCK_SIZE;
      nblocks -= nblocks % 8;
      nburn = _gcry_chacha20_amd64_avx2_blocks8(ctx->input, outbuf, inbuf,
                                                nblocks);
      burn = nburn > burn ? nburn : burn;
      length -= nblocks * CHACHA20_BLOCK_SIZE;
      outbuf += nblocks * CHACHA20_BLOCK_SIZE;
      inbuf += nblocks * CHACHA20_BLOCK_SIZE;
    }
#endif

#ifdef USE_SSSE3
  if (ctx->use_ssse3 && length >= CHACHA20_BLOCK_SIZE * 4)
    {
      size_t nblocks = length / CHACHA20_BLOCK_SIZE;
      nblocks -= nblocks % 4;
      nburn = _gcry_chacha20_amd64_ssse3_blocks4(ctx->input, outbuf, inbuf,
                                                 nblocks);
      burn = nburn > burn ? nburn : burn;
      length -= nblocks * CHACHA20_BLOCK_SIZE;
      outbuf += nblocks * CHACHA20_BLOCK_SIZE;
      inbuf += nblocks * CHACHA20_BLOCK_SIZE;
    }
#endif

#ifdef USE_ARMV7_NEON
  if (ctx->use_neon && length >= CHACHA20_BLOCK_SIZE * 4)
    {
      size_t nblocks = length / CHACHA20_BLOCK_SIZE;
      nblocks -= nblocks % 4;
      nburn = _gcry_chacha20_armv7_neon_blocks4(ctx->input, outbuf, inbuf,
                                                nblocks);
      burn = nburn > burn ? nburn : burn;
      length -= nblocks * CHACHA20_BLOCK_SIZE;
      outbuf += nblocks * CHACHA20_BLOCK_SIZE;
      inbuf += nblocks * CHACHA20_BLOCK_SIZE;
    }
#endif

#ifdef USE_AARCH64_SIMD
  if (ctx->use_neon && length >= CHACHA20_BLOCK_SIZE * 4)
    {
      size_t nblocks = length / CHACHA20_BLOCK_SIZE;
      nblocks -= nblocks % 4;
      nburn = _gcry_chacha20_aarch64_blocks4(ctx->input, outbuf, inbuf,
                                             nblocks);
      burn = nburn > burn ? nburn : burn;
      length -= nblocks * CHACHA20_BLOCK_SIZE;
      outbuf += nblocks * CHACHA20_BLOCK_SIZE;
      inbuf += nblocks * CHACHA20_BLOCK_SIZE;
    }
#endif

  if (length >= CHACHA20_BLOCK_SIZE)
    {
      size_t nblocks = length / CHACHA20_BLOCK_SIZE;
-      nburn = chacha20_blocks(ctx->input, outbuf, inbuf, nblocks);
+      nburn = chacha20_blocks(ctx, outbuf, inbuf, nblocks);
      burn = nburn > burn ? nburn : burn;
      length -= nblocks * CHACHA20_BLOCK_SIZE;
      outbuf += nblocks * CHACHA20_BLOCK_SIZE;
      inbuf += nblocks * CHACHA20_BLOCK_SIZE;
    }

  if (length > 0)
    {
-      nburn = chacha20_blocks(ctx->input, ctx->pad, zero_pad, 1);
+      nburn = chacha20_blocks(ctx, ctx->pad, zero_pad, 1);
      burn = nburn > burn ? nburn : burn;

      buf_xor (outbuf, inbuf, ctx->pad, length);
      ctx->unused = CHACHA20_BLOCK_SIZE - length;
    }

  _gcry_burn_stack (burn);
}
static const char *
selftest (void)
{
  byte ctxbuf[sizeof(CHACHA20_context_t) + 15];
  CHACHA20_context_t *ctx;
  byte scratch[127 + 1];
  byte buf[512 + 64 + 4];
  int i;

  /* From draft-strombergson-chacha-test-vectors */
  static byte key_1[] = {
    0xc4, 0x6e, 0xc1, 0xb1, 0x8c, 0xe8, 0xa8, 0x78,
    0x72, 0x5a, 0x37, 0xe7, 0x80, 0xdf, 0xb7, 0x35,
    0x1f, 0x68, 0xed, 0x2e, 0x19, 0x4c, 0x79, 0xfb,
    0xc6, 0xae, 0xbe, 0xe1, 0xa6, 0x67, 0x97, 0x5d
  };
  static const byte nonce_1[] =
    { 0x1a, 0xda, 0x31, 0xd5, 0xcf, 0x68, 0x82, 0x21 };
  static const byte plaintext_1[127] = {
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  };
  static const byte ciphertext_1[127] = {
    0xf6, 0x3a, 0x89, 0xb7, 0x5c, 0x22, 0x71, 0xf9,
    0x36, 0x88, 0x16, 0x54, 0x2b, 0xa5, 0x2f, 0x06,
    0xed, 0x49, 0x24, 0x17, 0x92, 0x30, 0x2b, 0x00,
    0xb5, 0xe8, 0xf8, 0x0a, 0xe9, 0xa4, 0x73, 0xaf,
    0xc2, 0x5b, 0x21, 0x8f, 0x51, 0x9a, 0xf0, 0xfd,
    0xd4, 0x06, 0x36, 0x2e, 0x8d, 0x69, 0xde, 0x7f,
    0x54, 0xc6, 0x04, 0xa6, 0xe0, 0x0f, 0x35, 0x3f,
    0x11, 0x0f, 0x77, 0x1b, 0xdc, 0xa8, 0xab, 0x92,
    0xe5, 0xfb, 0xc3, 0x4e, 0x60, 0xa1, 0xd9, 0xa9,
    0xdb, 0x17, 0x34, 0x5b, 0x0a, 0x40, 0x27, 0x36,
    0x85, 0x3b, 0xf9, 0x10, 0xb0, 0x60, 0xbd, 0xf1,
    0xf8, 0x97, 0xb6, 0x29, 0x0f, 0x01, 0xd1, 0x38,
    0xae, 0x2c, 0x4c, 0x90, 0x22, 0x5b, 0xa9, 0xea,
    0x14, 0xd5, 0x18, 0xf5, 0x59, 0x29, 0xde, 0xa0,
    0x98, 0xca, 0x7a, 0x6c, 0xcf, 0xe6, 0x12, 0x27,
    0x05, 0x3c, 0x84, 0xe4, 0x9a, 0x4a, 0x33
  };

  /* 16-byte alignment required for amd64 implementation. */
  ctx = (CHACHA20_context_t *)((uintptr_t)(ctxbuf + 15) & ~(uintptr_t)15);

  chacha20_setkey (ctx, key_1, sizeof key_1, NULL);
  chacha20_setiv (ctx, nonce_1, sizeof nonce_1);
  scratch[sizeof (scratch) - 1] = 0;
  chacha20_encrypt_stream (ctx, scratch, plaintext_1, sizeof plaintext_1);
  if (memcmp (scratch, ciphertext_1, sizeof ciphertext_1))
    return "ChaCha20 encryption test 1 failed.";
  if (scratch[sizeof (scratch) - 1])
    return "ChaCha20 wrote too much.";
  chacha20_setkey (ctx, key_1, sizeof (key_1), NULL);
  chacha20_setiv (ctx, nonce_1, sizeof nonce_1);
  chacha20_encrypt_stream (ctx, scratch, scratch, sizeof plaintext_1);
  if (memcmp (scratch, plaintext_1, sizeof plaintext_1))
    return "ChaCha20 decryption test 1 failed.";

  for (i = 0; i < sizeof buf; i++)
    buf[i] = i;
  chacha20_setkey (ctx, key_1, sizeof key_1, NULL);
  chacha20_setiv (ctx, nonce_1, sizeof nonce_1);
  /*encrypt */
  chacha20_encrypt_stream (ctx, buf, buf, sizeof buf);
  /*decrypt */
  chacha20_setkey (ctx, key_1, sizeof key_1, NULL);
  chacha20_setiv (ctx, nonce_1, sizeof nonce_1);
  chacha20_encrypt_stream (ctx, buf, buf, 1);
  chacha20_encrypt_stream (ctx, buf + 1, buf + 1, (sizeof buf) - 1 - 1);
  chacha20_encrypt_stream (ctx, buf + (sizeof buf) - 1, buf + (sizeof buf) - 1,
                           1);
  for (i = 0; i < sizeof buf; i++)
    if (buf[i] != (byte) i)
      return "ChaCha20 encryption test 2 failed.";

  chacha20_setkey (ctx, key_1, sizeof key_1, NULL);
  chacha20_setiv (ctx, nonce_1, sizeof nonce_1);
  /* encrypt */
  for (i = 0; i < sizeof buf; i++)
    chacha20_encrypt_stream (ctx, &buf[i], &buf[i], 1);
  /* decrypt */
  chacha20_setkey (ctx, key_1, sizeof key_1, NULL);
  chacha20_setiv (ctx, nonce_1, sizeof nonce_1);
  chacha20_encrypt_stream (ctx, buf, buf, sizeof buf);
  for (i = 0; i < sizeof buf; i++)
    if (buf[i] != (byte) i)
      return "ChaCha20 encryption test 3 failed.";

  return NULL;
}


gcry_cipher_spec_t _gcry_cipher_spec_chacha20 = {
  GCRY_CIPHER_CHACHA20, {0, 0},   /* flags */
  "CHACHA20",                     /* name */
  NULL,                           /* aliases */
  NULL,                           /* oids */
  1,                              /* blocksize in bytes. */
  CHACHA20_MAX_KEY_SIZE * 8,      /* standard key length in bits. */
  sizeof (CHACHA20_context_t),
  chacha20_setkey, NULL, NULL, chacha20_encrypt_stream, chacha20_encrypt_stream,
  NULL, NULL, chacha20_setiv
};
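Note (illustrative, not part of the patch): these code paths are reached through the normal libgcrypt stream-cipher API; the SSSE3 1-way routine added here is picked automatically on capable CPUs for the sub-4-block tail of a message. A minimal caller, with error handling trimmed, might look like:

  #include <gcrypt.h>

  /* Encrypts 'buf' in place with ChaCha20 under a 256-bit key and a
     96-bit nonce; returns 0 on success, -1 on any error.  */
  int encrypt_with_chacha20 (const unsigned char key[32],
                             const unsigned char nonce[12],
                             unsigned char *buf, size_t len)
  {
    gcry_cipher_hd_t hd;
    gcry_error_t err;

    err = gcry_cipher_open (&hd, GCRY_CIPHER_CHACHA20,
                            GCRY_CIPHER_MODE_STREAM, 0);
    if (err)
      return -1;
    if (!err) err = gcry_cipher_setkey (hd, key, 32);
    if (!err) err = gcry_cipher_setiv (hd, nonce, 12);
    if (!err) err = gcry_cipher_encrypt (hd, buf, len, NULL, 0); /* in place */
    gcry_cipher_close (hd);
    return err ? -1 : 0;
  }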