diff --git a/cipher/chacha20.c b/cipher/chacha20.c index 497594a0..870cfa18 100644 --- a/cipher/chacha20.c +++ b/cipher/chacha20.c @@ -1,1306 +1,1310 @@ /* chacha20.c - Bernstein's ChaCha20 cipher * Copyright (C) 2014,2017-2019 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser general Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . * * For a description of the algorithm, see: * http://cr.yp.to/chacha.html */ /* * Based on D. J. Bernstein reference implementation at * http://cr.yp.to/chacha.html: * * chacha-regs.c version 20080118 * D. J. Bernstein * Public domain. */ #include #include #include #include #include "types.h" #include "g10lib.h" #include "cipher.h" #include "cipher-internal.h" #include "bufhelp.h" #define CHACHA20_MIN_KEY_SIZE 16 /* Bytes. */ #define CHACHA20_MAX_KEY_SIZE 32 /* Bytes. */ #define CHACHA20_BLOCK_SIZE 64 /* Bytes. */ #define CHACHA20_MIN_IV_SIZE 8 /* Bytes. */ #define CHACHA20_MAX_IV_SIZE 12 /* Bytes. */ #define CHACHA20_CTR_SIZE 16 /* Bytes. */ /* USE_SSSE3 indicates whether to compile with Intel SSSE3 code. */ #undef USE_SSSE3 #if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_SSSE3) && \ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) # define USE_SSSE3 1 #endif /* USE_AVX2 indicates whether to compile with Intel AVX2 code. */ #undef USE_AVX2 #if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_AVX2) && \ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) # define USE_AVX2 1 #endif /* USE_ARMV7_NEON indicates whether to enable ARMv7 NEON assembly code. */ #undef USE_ARMV7_NEON #ifdef ENABLE_NEON_SUPPORT # if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) \ && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) \ && defined(HAVE_GCC_INLINE_ASM_NEON) # define USE_ARMV7_NEON 1 # endif #endif /* USE_AARCH64_SIMD indicates whether to enable ARMv8 SIMD assembly * code. */ #undef USE_AARCH64_SIMD #ifdef ENABLE_NEON_SUPPORT # if defined(__AARCH64EL__) \ && defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) \ && defined(HAVE_GCC_INLINE_ASM_AARCH64_NEON) # define USE_AARCH64_SIMD 1 # endif #endif /* USE_PPC_VEC indicates whether to enable PowerPC vector * accelerated code. */ #undef USE_PPC_VEC #ifdef ENABLE_PPC_CRYPTO_SUPPORT # if defined(HAVE_COMPATIBLE_CC_PPC_ALTIVEC) && \ defined(HAVE_GCC_INLINE_ASM_PPC_ALTIVEC) # if __GNUC__ >= 4 # define USE_PPC_VEC 1 # endif # endif #endif /* USE_S390X_VX indicates whether to enable zSeries code. */ #undef USE_S390X_VX #if defined (__s390x__) && __GNUC__ >= 4 && __ARCH__ >= 9 # if defined(HAVE_GCC_INLINE_ASM_S390X_VX) # define USE_S390X_VX 1 # endif /* USE_S390X_VX */ #endif /* Assembly implementations use SystemV ABI, ABI conversion and additional * stack to store XMM6-XMM15 needed on Win64. 
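 * The SysV AMD64 ABI treats XMM6-XMM15 as volatile, while the Microsoft
 * x64 ABI makes them callee-saved; the sysv_abi attribute defined below
 * lets the compiler do the calling-convention conversion at the call
 * site, and the assembly reserves the extra stack needed to preserve
 * those registers.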
*/ #undef ASM_FUNC_ABI #undef ASM_EXTRA_STACK #if defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS) # define ASM_FUNC_ABI __attribute__((sysv_abi)) #else # define ASM_FUNC_ABI #endif typedef struct CHACHA20_context_s { u32 input[16]; unsigned char pad[CHACHA20_BLOCK_SIZE]; unsigned int unused; /* bytes in the pad. */ unsigned int use_ssse3:1; unsigned int use_avx2:1; unsigned int use_neon:1; unsigned int use_ppc:1; unsigned int use_s390x:1; } CHACHA20_context_t; #ifdef USE_SSSE3 unsigned int _gcry_chacha20_amd64_ssse3_blocks4(u32 *state, byte *dst, const byte *src, size_t nblks) ASM_FUNC_ABI; unsigned int _gcry_chacha20_amd64_ssse3_blocks1(u32 *state, byte *dst, const byte *src, size_t nblks) ASM_FUNC_ABI; unsigned int _gcry_chacha20_poly1305_amd64_ssse3_blocks4( u32 *state, byte *dst, const byte *src, size_t nblks, void *poly1305_state, const byte *poly1305_src) ASM_FUNC_ABI; unsigned int _gcry_chacha20_poly1305_amd64_ssse3_blocks1( u32 *state, byte *dst, const byte *src, size_t nblks, void *poly1305_state, const byte *poly1305_src) ASM_FUNC_ABI; #endif /* USE_SSSE3 */ #ifdef USE_AVX2 unsigned int _gcry_chacha20_amd64_avx2_blocks8(u32 *state, byte *dst, const byte *src, size_t nblks) ASM_FUNC_ABI; unsigned int _gcry_chacha20_poly1305_amd64_avx2_blocks8( u32 *state, byte *dst, const byte *src, size_t nblks, void *poly1305_state, const byte *poly1305_src) ASM_FUNC_ABI; #endif /* USE_AVX2 */ #ifdef USE_PPC_VEC unsigned int _gcry_chacha20_ppc8_blocks4(u32 *state, byte *dst, const byte *src, size_t nblks); unsigned int _gcry_chacha20_ppc8_blocks1(u32 *state, byte *dst, const byte *src, size_t nblks); #undef USE_PPC_VEC_POLY1305 #if SIZEOF_UNSIGNED_LONG == 8 #define USE_PPC_VEC_POLY1305 1 unsigned int _gcry_chacha20_poly1305_ppc8_blocks4( u32 *state, byte *dst, const byte *src, size_t nblks, POLY1305_STATE *st, const byte *poly1305_src); #endif /* SIZEOF_UNSIGNED_LONG == 8 */ #endif /* USE_PPC_VEC */ #ifdef USE_S390X_VX unsigned int _gcry_chacha20_s390x_vx_blocks8(u32 *state, byte *dst, const byte *src, size_t nblks); unsigned int _gcry_chacha20_s390x_vx_blocks4_2_1(u32 *state, byte *dst, const byte *src, size_t nblks); #undef USE_S390X_VX_POLY1305 #if SIZEOF_UNSIGNED_LONG == 8 #define USE_S390X_VX_POLY1305 1 unsigned int _gcry_chacha20_poly1305_s390x_vx_blocks8( u32 *state, byte *dst, const byte *src, size_t nblks, POLY1305_STATE *st, const byte *poly1305_src); unsigned int _gcry_chacha20_poly1305_s390x_vx_blocks4_2_1( u32 *state, byte *dst, const byte *src, size_t nblks, POLY1305_STATE *st, const byte *poly1305_src); #endif /* SIZEOF_UNSIGNED_LONG == 8 */ #endif /* USE_S390X_VX */ #ifdef USE_ARMV7_NEON unsigned int _gcry_chacha20_armv7_neon_blocks4(u32 *state, byte *dst, const byte *src, size_t nblks); #endif /* USE_ARMV7_NEON */ #ifdef USE_AARCH64_SIMD unsigned int _gcry_chacha20_aarch64_blocks4(u32 *state, byte *dst, const byte *src, size_t nblks); unsigned int _gcry_chacha20_poly1305_aarch64_blocks4( u32 *state, byte *dst, const byte *src, size_t nblks, void *poly1305_state, const byte *poly1305_src); #endif /* USE_AARCH64_SIMD */ static const char *selftest (void); #define ROTATE(v,c) (rol(v,c)) #define XOR(v,w) ((v) ^ (w)) #define PLUS(v,w) ((u32)((v) + (w))) #define PLUSONE(v) (PLUS((v),1)) #define QUARTERROUND(a,b,c,d) \ a = PLUS(a,b); d = ROTATE(XOR(d,a),16); \ c = PLUS(c,d); b = ROTATE(XOR(b,c),12); \ a = PLUS(a,b); d = ROTATE(XOR(d,a), 8); \ c = PLUS(c,d); b = ROTATE(XOR(b,c), 7); #define BUF_XOR_LE32(dst, src, offset, x) \ buf_put_le32((dst) + (offset), buf_get_le32((src) + (offset)) 
^ (x)) static unsigned int do_chacha20_blocks (u32 *input, byte *dst, const byte *src, size_t nblks) { u32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; unsigned int i; while (nblks) { x0 = input[0]; x1 = input[1]; x2 = input[2]; x3 = input[3]; x4 = input[4]; x5 = input[5]; x6 = input[6]; x7 = input[7]; x8 = input[8]; x9 = input[9]; x10 = input[10]; x11 = input[11]; x12 = input[12]; x13 = input[13]; x14 = input[14]; x15 = input[15]; for (i = 20; i > 0; i -= 2) { QUARTERROUND(x0, x4, x8, x12) QUARTERROUND(x1, x5, x9, x13) QUARTERROUND(x2, x6, x10, x14) QUARTERROUND(x3, x7, x11, x15) QUARTERROUND(x0, x5, x10, x15) QUARTERROUND(x1, x6, x11, x12) QUARTERROUND(x2, x7, x8, x13) QUARTERROUND(x3, x4, x9, x14) } x0 = PLUS(x0, input[0]); x1 = PLUS(x1, input[1]); x2 = PLUS(x2, input[2]); x3 = PLUS(x3, input[3]); x4 = PLUS(x4, input[4]); x5 = PLUS(x5, input[5]); x6 = PLUS(x6, input[6]); x7 = PLUS(x7, input[7]); x8 = PLUS(x8, input[8]); x9 = PLUS(x9, input[9]); x10 = PLUS(x10, input[10]); x11 = PLUS(x11, input[11]); x12 = PLUS(x12, input[12]); x13 = PLUS(x13, input[13]); x14 = PLUS(x14, input[14]); x15 = PLUS(x15, input[15]); input[12] = PLUSONE(input[12]); input[13] = PLUS(input[13], !input[12]); BUF_XOR_LE32(dst, src, 0, x0); BUF_XOR_LE32(dst, src, 4, x1); BUF_XOR_LE32(dst, src, 8, x2); BUF_XOR_LE32(dst, src, 12, x3); BUF_XOR_LE32(dst, src, 16, x4); BUF_XOR_LE32(dst, src, 20, x5); BUF_XOR_LE32(dst, src, 24, x6); BUF_XOR_LE32(dst, src, 28, x7); BUF_XOR_LE32(dst, src, 32, x8); BUF_XOR_LE32(dst, src, 36, x9); BUF_XOR_LE32(dst, src, 40, x10); BUF_XOR_LE32(dst, src, 44, x11); BUF_XOR_LE32(dst, src, 48, x12); BUF_XOR_LE32(dst, src, 52, x13); BUF_XOR_LE32(dst, src, 56, x14); BUF_XOR_LE32(dst, src, 60, x15); src += CHACHA20_BLOCK_SIZE; dst += CHACHA20_BLOCK_SIZE; nblks--; } /* burn_stack */ return (17 * sizeof(u32) + 6 * sizeof(void *)); } static unsigned int chacha20_blocks (CHACHA20_context_t *ctx, byte *dst, const byte *src, size_t nblks) { #ifdef USE_SSSE3 if (ctx->use_ssse3) { return _gcry_chacha20_amd64_ssse3_blocks1(ctx->input, dst, src, nblks); } #endif #ifdef USE_PPC_VEC if (ctx->use_ppc) { return _gcry_chacha20_ppc8_blocks1(ctx->input, dst, src, nblks); } #endif #ifdef USE_S390X_VX if (ctx->use_s390x) { return _gcry_chacha20_s390x_vx_blocks4_2_1(ctx->input, dst, src, nblks); } #endif return do_chacha20_blocks (ctx->input, dst, src, nblks); } static void chacha20_keysetup (CHACHA20_context_t *ctx, const byte *key, unsigned int keylen) { static const char sigma[16] = "expand 32-byte k"; static const char tau[16] = "expand 16-byte k"; const char *constants; ctx->input[4] = buf_get_le32(key + 0); ctx->input[5] = buf_get_le32(key + 4); ctx->input[6] = buf_get_le32(key + 8); ctx->input[7] = buf_get_le32(key + 12); if (keylen == CHACHA20_MAX_KEY_SIZE) /* 256 bits */ { key += 16; constants = sigma; } else /* 128 bits */ { constants = tau; } ctx->input[8] = buf_get_le32(key + 0); ctx->input[9] = buf_get_le32(key + 4); ctx->input[10] = buf_get_le32(key + 8); ctx->input[11] = buf_get_le32(key + 12); ctx->input[0] = buf_get_le32(constants + 0); ctx->input[1] = buf_get_le32(constants + 4); ctx->input[2] = buf_get_le32(constants + 8); ctx->input[3] = buf_get_le32(constants + 12); } static void chacha20_ivsetup (CHACHA20_context_t * ctx, const byte *iv, size_t ivlen) { if (ivlen == CHACHA20_CTR_SIZE) { ctx->input[12] = buf_get_le32 (iv + 0); ctx->input[13] = buf_get_le32 (iv + 4); ctx->input[14] = buf_get_le32 (iv + 8); ctx->input[15] = buf_get_le32 (iv + 12); } else if (ivlen == 
CHACHA20_MAX_IV_SIZE) { ctx->input[12] = 0; ctx->input[13] = buf_get_le32 (iv + 0); ctx->input[14] = buf_get_le32 (iv + 4); ctx->input[15] = buf_get_le32 (iv + 8); } else if (ivlen == CHACHA20_MIN_IV_SIZE) { ctx->input[12] = 0; ctx->input[13] = 0; ctx->input[14] = buf_get_le32 (iv + 0); ctx->input[15] = buf_get_le32 (iv + 4); } else { ctx->input[12] = 0; ctx->input[13] = 0; ctx->input[14] = 0; ctx->input[15] = 0; } } static void chacha20_setiv (void *context, const byte *iv, size_t ivlen) { CHACHA20_context_t *ctx = (CHACHA20_context_t *) context; /* draft-nir-cfrg-chacha20-poly1305-02 defines 96-bit and 64-bit nonce. */ if (iv && ivlen != CHACHA20_MAX_IV_SIZE && ivlen != CHACHA20_MIN_IV_SIZE && ivlen != CHACHA20_CTR_SIZE) log_info ("WARNING: chacha20_setiv: bad ivlen=%u\n", (u32) ivlen); if (iv && (ivlen == CHACHA20_MAX_IV_SIZE || ivlen == CHACHA20_MIN_IV_SIZE || ivlen == CHACHA20_CTR_SIZE)) chacha20_ivsetup (ctx, iv, ivlen); else chacha20_ivsetup (ctx, NULL, 0); /* Reset the unused pad bytes counter. */ ctx->unused = 0; } static gcry_err_code_t chacha20_do_setkey (CHACHA20_context_t *ctx, const byte *key, unsigned int keylen) { static int initialized; static const char *selftest_failed; unsigned int features = _gcry_get_hw_features (); if (!initialized) { initialized = 1; selftest_failed = selftest (); if (selftest_failed) log_error ("CHACHA20 selftest failed (%s)\n", selftest_failed); } if (selftest_failed) return GPG_ERR_SELFTEST_FAILED; if (keylen != CHACHA20_MAX_KEY_SIZE && keylen != CHACHA20_MIN_KEY_SIZE) return GPG_ERR_INV_KEYLEN; #ifdef USE_SSSE3 ctx->use_ssse3 = (features & HWF_INTEL_SSSE3) != 0; #endif #ifdef USE_AVX2 ctx->use_avx2 = (features & HWF_INTEL_AVX2) != 0; #endif #ifdef USE_ARMV7_NEON ctx->use_neon = (features & HWF_ARM_NEON) != 0; #endif #ifdef USE_AARCH64_SIMD ctx->use_neon = (features & HWF_ARM_NEON) != 0; #endif #ifdef USE_PPC_VEC ctx->use_ppc = (features & HWF_PPC_ARCH_2_07) != 0; #endif #ifdef USE_S390X_VX ctx->use_s390x = (features & HWF_S390X_VX) != 0; #endif (void)features; chacha20_keysetup (ctx, key, keylen); /* We default to a zero nonce. */ chacha20_setiv (ctx, NULL, 0); return 0; } static gcry_err_code_t chacha20_setkey (void *context, const byte *key, unsigned int keylen, cipher_bulk_ops_t *bulk_ops) { CHACHA20_context_t *ctx = (CHACHA20_context_t *) context; gcry_err_code_t rc = chacha20_do_setkey (ctx, key, keylen); (void)bulk_ops; _gcry_burn_stack (4 + sizeof (void *) + 4 * sizeof (void *)); return rc; } static unsigned int do_chacha20_encrypt_stream_tail (CHACHA20_context_t *ctx, byte *outbuf, const byte *inbuf, size_t length) { static const unsigned char zero_pad[CHACHA20_BLOCK_SIZE] = { 0, }; unsigned int nburn, burn = 0; #ifdef USE_AVX2 if (ctx->use_avx2 && length >= CHACHA20_BLOCK_SIZE * 8) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nblocks -= nblocks % 8; nburn = _gcry_chacha20_amd64_avx2_blocks8(ctx->input, outbuf, inbuf, nblocks); burn = nburn > burn ? nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; } #endif #ifdef USE_SSSE3 if (ctx->use_ssse3 && length >= CHACHA20_BLOCK_SIZE * 4) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nblocks -= nblocks % 4; nburn = _gcry_chacha20_amd64_ssse3_blocks4(ctx->input, outbuf, inbuf, nblocks); burn = nburn > burn ? 
nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; } #endif #ifdef USE_ARMV7_NEON if (ctx->use_neon && length >= CHACHA20_BLOCK_SIZE * 4) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nblocks -= nblocks % 4; nburn = _gcry_chacha20_armv7_neon_blocks4(ctx->input, outbuf, inbuf, nblocks); burn = nburn > burn ? nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; } #endif #ifdef USE_AARCH64_SIMD if (ctx->use_neon && length >= CHACHA20_BLOCK_SIZE * 4) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nblocks -= nblocks % 4; nburn = _gcry_chacha20_aarch64_blocks4(ctx->input, outbuf, inbuf, nblocks); burn = nburn > burn ? nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; } #endif #ifdef USE_PPC_VEC if (ctx->use_ppc && length >= CHACHA20_BLOCK_SIZE * 4) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nblocks -= nblocks % 4; nburn = _gcry_chacha20_ppc8_blocks4(ctx->input, outbuf, inbuf, nblocks); burn = nburn > burn ? nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; } #endif #ifdef USE_S390X_VX if (ctx->use_s390x && length >= CHACHA20_BLOCK_SIZE * 8) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nblocks -= nblocks % 8; nburn = _gcry_chacha20_s390x_vx_blocks8(ctx->input, outbuf, inbuf, nblocks); burn = nburn > burn ? nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; } #endif if (length >= CHACHA20_BLOCK_SIZE) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nburn = chacha20_blocks(ctx, outbuf, inbuf, nblocks); burn = nburn > burn ? nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; } if (length > 0) { nburn = chacha20_blocks(ctx, ctx->pad, zero_pad, 1); burn = nburn > burn ? nburn : burn; buf_xor (outbuf, inbuf, ctx->pad, length); ctx->unused = CHACHA20_BLOCK_SIZE - length; } if (burn) burn += 5 * sizeof(void *); return burn; } static void chacha20_encrypt_stream (void *context, byte *outbuf, const byte *inbuf, size_t length) { CHACHA20_context_t *ctx = (CHACHA20_context_t *) context; unsigned int nburn, burn = 0; if (!length) return; if (ctx->unused) { unsigned char *p = ctx->pad; size_t n; gcry_assert (ctx->unused < CHACHA20_BLOCK_SIZE); n = ctx->unused; if (n > length) n = length; buf_xor (outbuf, inbuf, p + CHACHA20_BLOCK_SIZE - ctx->unused, n); length -= n; outbuf += n; inbuf += n; ctx->unused -= n; if (!length) return; gcry_assert (!ctx->unused); } nburn = do_chacha20_encrypt_stream_tail (ctx, outbuf, inbuf, length); burn = nburn > burn ? nburn : burn; if (burn) _gcry_burn_stack (burn); } gcry_err_code_t _gcry_chacha20_poly1305_encrypt(gcry_cipher_hd_t c, byte *outbuf, const byte *inbuf, size_t length) { CHACHA20_context_t *ctx = (void *) &c->context.c; unsigned int nburn, burn = 0; byte *authptr = NULL; if (!length) return 0; if (ctx->unused) { unsigned char *p = ctx->pad; size_t n; gcry_assert (ctx->unused < CHACHA20_BLOCK_SIZE); n = ctx->unused; if (n > length) n = length; buf_xor (outbuf, inbuf, p + CHACHA20_BLOCK_SIZE - ctx->unused, n); nburn = _gcry_poly1305_update_burn (&c->u_mode.poly1305.ctx, outbuf, n); burn = nburn > burn ? 
nburn : burn; length -= n; outbuf += n; inbuf += n; ctx->unused -= n; if (!length) { if (burn) _gcry_burn_stack (burn); return 0; } gcry_assert (!ctx->unused); } gcry_assert (c->u_mode.poly1305.ctx.leftover == 0); if (0) { } #ifdef USE_AVX2 else if (ctx->use_avx2 && length >= CHACHA20_BLOCK_SIZE * 8) { nburn = _gcry_chacha20_amd64_avx2_blocks8(ctx->input, outbuf, inbuf, 8); burn = nburn > burn ? nburn : burn; authptr = outbuf; length -= 8 * CHACHA20_BLOCK_SIZE; outbuf += 8 * CHACHA20_BLOCK_SIZE; inbuf += 8 * CHACHA20_BLOCK_SIZE; } #endif #ifdef USE_SSSE3 else if (ctx->use_ssse3 && length >= CHACHA20_BLOCK_SIZE * 4) { nburn = _gcry_chacha20_amd64_ssse3_blocks4(ctx->input, outbuf, inbuf, 4); burn = nburn > burn ? nburn : burn; authptr = outbuf; length -= 4 * CHACHA20_BLOCK_SIZE; outbuf += 4 * CHACHA20_BLOCK_SIZE; inbuf += 4 * CHACHA20_BLOCK_SIZE; } else if (ctx->use_ssse3 && length >= CHACHA20_BLOCK_SIZE * 2) { nburn = _gcry_chacha20_amd64_ssse3_blocks1(ctx->input, outbuf, inbuf, 2); burn = nburn > burn ? nburn : burn; authptr = outbuf; length -= 2 * CHACHA20_BLOCK_SIZE; outbuf += 2 * CHACHA20_BLOCK_SIZE; inbuf += 2 * CHACHA20_BLOCK_SIZE; } else if (ctx->use_ssse3 && length >= CHACHA20_BLOCK_SIZE) { nburn = _gcry_chacha20_amd64_ssse3_blocks1(ctx->input, outbuf, inbuf, 1); burn = nburn > burn ? nburn : burn; authptr = outbuf; length -= 1 * CHACHA20_BLOCK_SIZE; outbuf += 1 * CHACHA20_BLOCK_SIZE; inbuf += 1 * CHACHA20_BLOCK_SIZE; } #endif #ifdef USE_AARCH64_SIMD else if (ctx->use_neon && length >= CHACHA20_BLOCK_SIZE * 4) { nburn = _gcry_chacha20_aarch64_blocks4(ctx->input, outbuf, inbuf, 4); burn = nburn > burn ? nburn : burn; authptr = outbuf; length -= 4 * CHACHA20_BLOCK_SIZE; outbuf += 4 * CHACHA20_BLOCK_SIZE; inbuf += 4 * CHACHA20_BLOCK_SIZE; } #endif #ifdef USE_PPC_VEC_POLY1305 else if (ctx->use_ppc && length >= CHACHA20_BLOCK_SIZE * 4) { nburn = _gcry_chacha20_ppc8_blocks4(ctx->input, outbuf, inbuf, 4); burn = nburn > burn ? nburn : burn; authptr = outbuf; length -= 4 * CHACHA20_BLOCK_SIZE; outbuf += 4 * CHACHA20_BLOCK_SIZE; inbuf += 4 * CHACHA20_BLOCK_SIZE; } #endif #ifdef USE_S390X_VX_POLY1305 else if (ctx->use_s390x && length >= 2 * CHACHA20_BLOCK_SIZE * 8) { nburn = _gcry_chacha20_s390x_vx_blocks8(ctx->input, outbuf, inbuf, 8); burn = nburn > burn ? nburn : burn; authptr = outbuf; length -= 8 * CHACHA20_BLOCK_SIZE; outbuf += 8 * CHACHA20_BLOCK_SIZE; inbuf += 8 * CHACHA20_BLOCK_SIZE; } else if (ctx->use_s390x && length >= CHACHA20_BLOCK_SIZE * 4) { nburn = _gcry_chacha20_s390x_vx_blocks4_2_1(ctx->input, outbuf, inbuf, 4); burn = nburn > burn ? nburn : burn; authptr = outbuf; length -= 4 * CHACHA20_BLOCK_SIZE; outbuf += 4 * CHACHA20_BLOCK_SIZE; inbuf += 4 * CHACHA20_BLOCK_SIZE; } else if (ctx->use_s390x && length >= CHACHA20_BLOCK_SIZE * 2) { nburn = _gcry_chacha20_s390x_vx_blocks4_2_1(ctx->input, outbuf, inbuf, 2); burn = nburn > burn ? nburn : burn; authptr = outbuf; length -= 2 * CHACHA20_BLOCK_SIZE; outbuf += 2 * CHACHA20_BLOCK_SIZE; inbuf += 2 * CHACHA20_BLOCK_SIZE; } else if (ctx->use_s390x && length >= CHACHA20_BLOCK_SIZE) { nburn = _gcry_chacha20_s390x_vx_blocks4_2_1(ctx->input, outbuf, inbuf, 1); burn = nburn > burn ? 
nburn : burn; authptr = outbuf; length -= 1 * CHACHA20_BLOCK_SIZE; outbuf += 1 * CHACHA20_BLOCK_SIZE; inbuf += 1 * CHACHA20_BLOCK_SIZE; } #endif if (authptr) { size_t authoffset = outbuf - authptr; #ifdef USE_AVX2 if (ctx->use_avx2 && length >= 8 * CHACHA20_BLOCK_SIZE && authoffset >= 8 * CHACHA20_BLOCK_SIZE) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nblocks -= nblocks % 8; nburn = _gcry_chacha20_poly1305_amd64_avx2_blocks8( ctx->input, outbuf, inbuf, nblocks, &c->u_mode.poly1305.ctx.state, authptr); burn = nburn > burn ? nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; authptr += nblocks * CHACHA20_BLOCK_SIZE; } #endif #ifdef USE_SSSE3 if (ctx->use_ssse3) { if (length >= 4 * CHACHA20_BLOCK_SIZE && authoffset >= 4 * CHACHA20_BLOCK_SIZE) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nblocks -= nblocks % 4; nburn = _gcry_chacha20_poly1305_amd64_ssse3_blocks4( ctx->input, outbuf, inbuf, nblocks, &c->u_mode.poly1305.ctx.state, authptr); burn = nburn > burn ? nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; authptr += nblocks * CHACHA20_BLOCK_SIZE; } if (length >= CHACHA20_BLOCK_SIZE && authoffset >= CHACHA20_BLOCK_SIZE) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nburn = _gcry_chacha20_poly1305_amd64_ssse3_blocks1( ctx->input, outbuf, inbuf, nblocks, &c->u_mode.poly1305.ctx.state, authptr); burn = nburn > burn ? nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; authptr += nblocks * CHACHA20_BLOCK_SIZE; } } #endif #ifdef USE_AARCH64_SIMD if (ctx->use_neon && length >= 4 * CHACHA20_BLOCK_SIZE && authoffset >= 4 * CHACHA20_BLOCK_SIZE) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nblocks -= nblocks % 4; nburn = _gcry_chacha20_poly1305_aarch64_blocks4( ctx->input, outbuf, inbuf, nblocks, &c->u_mode.poly1305.ctx.state, authptr); burn = nburn > burn ? nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; authptr += nblocks * CHACHA20_BLOCK_SIZE; } #endif #ifdef USE_PPC_VEC_POLY1305 if (ctx->use_ppc && length >= 4 * CHACHA20_BLOCK_SIZE && authoffset >= 4 * CHACHA20_BLOCK_SIZE) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nblocks -= nblocks % 4; nburn = _gcry_chacha20_poly1305_ppc8_blocks4( ctx->input, outbuf, inbuf, nblocks, &c->u_mode.poly1305.ctx.state, authptr); burn = nburn > burn ? nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; authptr += nblocks * CHACHA20_BLOCK_SIZE; } #endif #ifdef USE_S390X_VX_POLY1305 if (ctx->use_s390x) { if (length >= 8 * CHACHA20_BLOCK_SIZE && authoffset >= 8 * CHACHA20_BLOCK_SIZE) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nblocks -= nblocks % 8; burn = _gcry_chacha20_poly1305_s390x_vx_blocks8( ctx->input, outbuf, inbuf, nblocks, &c->u_mode.poly1305.ctx.state, authptr); burn = nburn > burn ? 
nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; authptr += nblocks * CHACHA20_BLOCK_SIZE; } if (length >= CHACHA20_BLOCK_SIZE && authoffset >= CHACHA20_BLOCK_SIZE) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; burn = _gcry_chacha20_poly1305_s390x_vx_blocks4_2_1( ctx->input, outbuf, inbuf, nblocks, &c->u_mode.poly1305.ctx.state, authptr); burn = nburn > burn ? nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; authptr += nblocks * CHACHA20_BLOCK_SIZE; } } #endif if (authoffset > 0) { _gcry_poly1305_update (&c->u_mode.poly1305.ctx, authptr, authoffset); authptr += authoffset; authoffset = 0; } gcry_assert(authptr == outbuf); } while (length) { size_t currlen = length; /* Since checksumming is done after encryption, process input in 24KiB - * chunks to keep data loaded in L1 cache for checksumming. */ - if (currlen > 24 * 1024) + * chunks to keep data loaded in L1 cache for checksumming. However + * only do splitting if input is large enough so that last chunks does + * not end up being short. */ + if (currlen > 32 * 1024) currlen = 24 * 1024; nburn = do_chacha20_encrypt_stream_tail (ctx, outbuf, inbuf, currlen); burn = nburn > burn ? nburn : burn; nburn = _gcry_poly1305_update_burn (&c->u_mode.poly1305.ctx, outbuf, currlen); burn = nburn > burn ? nburn : burn; outbuf += currlen; inbuf += currlen; length -= currlen; } if (burn) _gcry_burn_stack (burn); return 0; } gcry_err_code_t _gcry_chacha20_poly1305_decrypt(gcry_cipher_hd_t c, byte *outbuf, const byte *inbuf, size_t length) { CHACHA20_context_t *ctx = (void *) &c->context.c; unsigned int nburn, burn = 0; if (!length) return 0; if (ctx->unused) { unsigned char *p = ctx->pad; size_t n; gcry_assert (ctx->unused < CHACHA20_BLOCK_SIZE); n = ctx->unused; if (n > length) n = length; nburn = _gcry_poly1305_update_burn (&c->u_mode.poly1305.ctx, inbuf, n); burn = nburn > burn ? nburn : burn; buf_xor (outbuf, inbuf, p + CHACHA20_BLOCK_SIZE - ctx->unused, n); length -= n; outbuf += n; inbuf += n; ctx->unused -= n; if (!length) { if (burn) _gcry_burn_stack (burn); return 0; } gcry_assert (!ctx->unused); } gcry_assert (c->u_mode.poly1305.ctx.leftover == 0); #ifdef USE_AVX2 if (ctx->use_avx2 && length >= 8 * CHACHA20_BLOCK_SIZE) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nblocks -= nblocks % 8; nburn = _gcry_chacha20_poly1305_amd64_avx2_blocks8( ctx->input, outbuf, inbuf, nblocks, &c->u_mode.poly1305.ctx.state, inbuf); burn = nburn > burn ? nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; } #endif #ifdef USE_SSSE3 if (ctx->use_ssse3) { if (length >= 4 * CHACHA20_BLOCK_SIZE) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nblocks -= nblocks % 4; nburn = _gcry_chacha20_poly1305_amd64_ssse3_blocks4( ctx->input, outbuf, inbuf, nblocks, &c->u_mode.poly1305.ctx.state, inbuf); burn = nburn > burn ? nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; } if (length >= CHACHA20_BLOCK_SIZE) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nburn = _gcry_chacha20_poly1305_amd64_ssse3_blocks1( ctx->input, outbuf, inbuf, nblocks, &c->u_mode.poly1305.ctx.state, inbuf); burn = nburn > burn ? 
nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; } } #endif #ifdef USE_AARCH64_SIMD if (ctx->use_neon && length >= 4 * CHACHA20_BLOCK_SIZE) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nblocks -= nblocks % 4; nburn = _gcry_chacha20_poly1305_aarch64_blocks4( ctx->input, outbuf, inbuf, nblocks, &c->u_mode.poly1305.ctx.state, inbuf); burn = nburn > burn ? nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; } #endif #ifdef USE_PPC_VEC_POLY1305 if (ctx->use_ppc && length >= 4 * CHACHA20_BLOCK_SIZE) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nblocks -= nblocks % 4; nburn = _gcry_chacha20_poly1305_ppc8_blocks4( ctx->input, outbuf, inbuf, nblocks, &c->u_mode.poly1305.ctx.state, inbuf); burn = nburn > burn ? nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; } #endif #ifdef USE_S390X_VX_POLY1305 if (ctx->use_s390x) { if (length >= 8 * CHACHA20_BLOCK_SIZE) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nblocks -= nblocks % 8; nburn = _gcry_chacha20_poly1305_s390x_vx_blocks8( ctx->input, outbuf, inbuf, nblocks, &c->u_mode.poly1305.ctx.state, inbuf); burn = nburn > burn ? nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; } if (length >= CHACHA20_BLOCK_SIZE) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nburn = _gcry_chacha20_poly1305_s390x_vx_blocks4_2_1( ctx->input, outbuf, inbuf, nblocks, &c->u_mode.poly1305.ctx.state, inbuf); burn = nburn > burn ? nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; } } #endif while (length) { size_t currlen = length; /* Since checksumming is done before decryption, process input in 24KiB - * chunks to keep data loaded in L1 cache for decryption. */ - if (currlen > 24 * 1024) + * chunks to keep data loaded in L1 cache for decryption. However only + * do splitting if input is large enough so that last chunks does not + * end up being short. */ + if (currlen > 32 * 1024) currlen = 24 * 1024; nburn = _gcry_poly1305_update_burn (&c->u_mode.poly1305.ctx, inbuf, currlen); burn = nburn > burn ? nburn : burn; nburn = do_chacha20_encrypt_stream_tail (ctx, outbuf, inbuf, currlen); burn = nburn > burn ? 
nburn : burn; outbuf += currlen; inbuf += currlen; length -= currlen; } if (burn) _gcry_burn_stack (burn); return 0; } static const char * selftest (void) { byte ctxbuf[sizeof(CHACHA20_context_t) + 15]; CHACHA20_context_t *ctx; byte scratch[127 + 1]; byte buf[512 + 64 + 4]; int i; /* From draft-strombergson-chacha-test-vectors */ static byte key_1[] = { 0xc4, 0x6e, 0xc1, 0xb1, 0x8c, 0xe8, 0xa8, 0x78, 0x72, 0x5a, 0x37, 0xe7, 0x80, 0xdf, 0xb7, 0x35, 0x1f, 0x68, 0xed, 0x2e, 0x19, 0x4c, 0x79, 0xfb, 0xc6, 0xae, 0xbe, 0xe1, 0xa6, 0x67, 0x97, 0x5d }; static const byte nonce_1[] = { 0x1a, 0xda, 0x31, 0xd5, 0xcf, 0x68, 0x82, 0x21 }; static const byte plaintext_1[127] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, }; static const byte ciphertext_1[127] = { 0xf6, 0x3a, 0x89, 0xb7, 0x5c, 0x22, 0x71, 0xf9, 0x36, 0x88, 0x16, 0x54, 0x2b, 0xa5, 0x2f, 0x06, 0xed, 0x49, 0x24, 0x17, 0x92, 0x30, 0x2b, 0x00, 0xb5, 0xe8, 0xf8, 0x0a, 0xe9, 0xa4, 0x73, 0xaf, 0xc2, 0x5b, 0x21, 0x8f, 0x51, 0x9a, 0xf0, 0xfd, 0xd4, 0x06, 0x36, 0x2e, 0x8d, 0x69, 0xde, 0x7f, 0x54, 0xc6, 0x04, 0xa6, 0xe0, 0x0f, 0x35, 0x3f, 0x11, 0x0f, 0x77, 0x1b, 0xdc, 0xa8, 0xab, 0x92, 0xe5, 0xfb, 0xc3, 0x4e, 0x60, 0xa1, 0xd9, 0xa9, 0xdb, 0x17, 0x34, 0x5b, 0x0a, 0x40, 0x27, 0x36, 0x85, 0x3b, 0xf9, 0x10, 0xb0, 0x60, 0xbd, 0xf1, 0xf8, 0x97, 0xb6, 0x29, 0x0f, 0x01, 0xd1, 0x38, 0xae, 0x2c, 0x4c, 0x90, 0x22, 0x5b, 0xa9, 0xea, 0x14, 0xd5, 0x18, 0xf5, 0x59, 0x29, 0xde, 0xa0, 0x98, 0xca, 0x7a, 0x6c, 0xcf, 0xe6, 0x12, 0x27, 0x05, 0x3c, 0x84, 0xe4, 0x9a, 0x4a, 0x33 }; /* 16-byte alignment required for amd64 implementation. 
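 * The expression below rounds CTXBUF up to the next 16-byte boundary:
 * adding 15 and clearing the low four bits turns, for example, an
 * address ending in 0x09 into one ending in 0x10 and leaves an already
 * aligned address unchanged.  CTXBUF is declared 15 bytes larger than
 * the context so the aligned pointer still covers the whole structure.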
*/ ctx = (CHACHA20_context_t *)((uintptr_t)(ctxbuf + 15) & ~(uintptr_t)15); chacha20_setkey (ctx, key_1, sizeof key_1, NULL); chacha20_setiv (ctx, nonce_1, sizeof nonce_1); scratch[sizeof (scratch) - 1] = 0; chacha20_encrypt_stream (ctx, scratch, plaintext_1, sizeof plaintext_1); if (memcmp (scratch, ciphertext_1, sizeof ciphertext_1)) return "ChaCha20 encryption test 1 failed."; if (scratch[sizeof (scratch) - 1]) return "ChaCha20 wrote too much."; chacha20_setkey (ctx, key_1, sizeof (key_1), NULL); chacha20_setiv (ctx, nonce_1, sizeof nonce_1); chacha20_encrypt_stream (ctx, scratch, scratch, sizeof plaintext_1); if (memcmp (scratch, plaintext_1, sizeof plaintext_1)) return "ChaCha20 decryption test 1 failed."; for (i = 0; i < sizeof buf; i++) buf[i] = i; chacha20_setkey (ctx, key_1, sizeof key_1, NULL); chacha20_setiv (ctx, nonce_1, sizeof nonce_1); /*encrypt */ chacha20_encrypt_stream (ctx, buf, buf, sizeof buf); /*decrypt */ chacha20_setkey (ctx, key_1, sizeof key_1, NULL); chacha20_setiv (ctx, nonce_1, sizeof nonce_1); chacha20_encrypt_stream (ctx, buf, buf, 1); chacha20_encrypt_stream (ctx, buf + 1, buf + 1, (sizeof buf) - 1 - 1); chacha20_encrypt_stream (ctx, buf + (sizeof buf) - 1, buf + (sizeof buf) - 1, 1); for (i = 0; i < sizeof buf; i++) if (buf[i] != (byte) i) return "ChaCha20 encryption test 2 failed."; chacha20_setkey (ctx, key_1, sizeof key_1, NULL); chacha20_setiv (ctx, nonce_1, sizeof nonce_1); /* encrypt */ for (i = 0; i < sizeof buf; i++) chacha20_encrypt_stream (ctx, &buf[i], &buf[i], 1); /* decrypt */ chacha20_setkey (ctx, key_1, sizeof key_1, NULL); chacha20_setiv (ctx, nonce_1, sizeof nonce_1); chacha20_encrypt_stream (ctx, buf, buf, sizeof buf); for (i = 0; i < sizeof buf; i++) if (buf[i] != (byte) i) return "ChaCha20 encryption test 3 failed."; return NULL; } gcry_cipher_spec_t _gcry_cipher_spec_chacha20 = { GCRY_CIPHER_CHACHA20, {0, 0}, /* flags */ "CHACHA20", /* name */ NULL, /* aliases */ NULL, /* oids */ 1, /* blocksize in bytes. */ CHACHA20_MAX_KEY_SIZE * 8, /* standard key length in bits. */ sizeof (CHACHA20_context_t), chacha20_setkey, NULL, NULL, chacha20_encrypt_stream, chacha20_encrypt_stream, NULL, NULL, chacha20_setiv }; diff --git a/cipher/cipher-ccm.c b/cipher/cipher-ccm.c index dcb268d0..3e2a767a 100644 --- a/cipher/cipher-ccm.c +++ b/cipher/cipher-ccm.c @@ -1,415 +1,419 @@ /* cipher-ccm.c - CTR mode with CBC-MAC mode implementation * Copyright (C) 2013 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser general Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . */ #include #include #include #include #include #include "g10lib.h" #include "cipher.h" #include "bufhelp.h" #include "./cipher-internal.h" #define set_burn(burn, nburn) do { \ unsigned int __nburn = (nburn); \ (burn) = (burn) > __nburn ? 
(burn) : __nburn; } while (0) static unsigned int do_cbc_mac (gcry_cipher_hd_t c, const unsigned char *inbuf, size_t inlen, int do_padding) { const unsigned int blocksize = 16; gcry_cipher_encrypt_t enc_fn = c->spec->encrypt; unsigned char tmp[blocksize]; unsigned int burn = 0; unsigned int unused = c->u_mode.ccm.mac_unused; size_t nblocks; size_t n; if (inlen == 0 && (unused == 0 || !do_padding)) return 0; do { if (inlen + unused < blocksize || unused > 0) { n = (inlen > blocksize - unused) ? blocksize - unused : inlen; buf_cpy (&c->u_mode.ccm.macbuf[unused], inbuf, n); unused += n; inlen -= n; inbuf += n; } if (!inlen) { if (!do_padding) break; n = blocksize - unused; if (n > 0) { memset (&c->u_mode.ccm.macbuf[unused], 0, n); unused = blocksize; } } if (unused > 0) { /* Process one block from macbuf. */ cipher_block_xor(c->u_iv.iv, c->u_iv.iv, c->u_mode.ccm.macbuf, blocksize); set_burn (burn, enc_fn ( &c->context.c, c->u_iv.iv, c->u_iv.iv )); unused = 0; } if (c->bulk.cbc_enc) { nblocks = inlen / blocksize; c->bulk.cbc_enc (&c->context.c, c->u_iv.iv, tmp, inbuf, nblocks, 1); inbuf += nblocks * blocksize; inlen -= nblocks * blocksize; wipememory (tmp, sizeof(tmp)); } else { while (inlen >= blocksize) { cipher_block_xor(c->u_iv.iv, c->u_iv.iv, inbuf, blocksize); set_burn (burn, enc_fn ( &c->context.c, c->u_iv.iv, c->u_iv.iv )); inlen -= blocksize; inbuf += blocksize; } } } while (inlen > 0); c->u_mode.ccm.mac_unused = unused; if (burn) burn += 4 * sizeof(void *); return burn; } gcry_err_code_t _gcry_cipher_ccm_set_nonce (gcry_cipher_hd_t c, const unsigned char *nonce, size_t noncelen) { unsigned int marks_key; size_t L = 15 - noncelen; size_t L_; L_ = L - 1; if (!nonce) return GPG_ERR_INV_ARG; /* Length field must be 2, 3, ..., or 8. */ if (L < 2 || L > 8) return GPG_ERR_INV_LENGTH; /* Reset state */ marks_key = c->marks.key; memset (&c->u_mode, 0, sizeof(c->u_mode)); memset (&c->marks, 0, sizeof(c->marks)); memset (&c->u_iv, 0, sizeof(c->u_iv)); memset (&c->u_ctr, 0, sizeof(c->u_ctr)); memset (c->lastiv, 0, sizeof(c->lastiv)); c->unused = 0; c->marks.key = marks_key; /* Setup CTR */ c->u_ctr.ctr[0] = L_; memcpy (&c->u_ctr.ctr[1], nonce, noncelen); memset (&c->u_ctr.ctr[1 + noncelen], 0, L); /* Setup IV */ c->u_iv.iv[0] = L_; memcpy (&c->u_iv.iv[1], nonce, noncelen); /* Add (8 * M_ + 64 * flags) to iv[0] and set iv[noncelen + 1 ... 15] later in set_aad. */ memset (&c->u_iv.iv[1 + noncelen], 0, L); c->u_mode.ccm.nonce = 1; return GPG_ERR_NO_ERROR; } gcry_err_code_t _gcry_cipher_ccm_set_lengths (gcry_cipher_hd_t c, u64 encryptlen, u64 aadlen, u64 taglen) { unsigned int burn = 0; unsigned char b0[16]; size_t noncelen = 15 - (c->u_iv.iv[0] + 1); u64 M = taglen; u64 M_; int i; M_ = (M - 2) / 2; /* Authentication field must be 4, 6, 8, 10, 12, 14 or 16. */ if ((M_ * 2 + 2) != M || M < 4 || M > 16) return GPG_ERR_INV_LENGTH; if (!c->u_mode.ccm.nonce || c->marks.tag) return GPG_ERR_INV_STATE; if (c->u_mode.ccm.lengths) return GPG_ERR_INV_STATE; c->u_mode.ccm.authlen = taglen; c->u_mode.ccm.encryptlen = encryptlen; c->u_mode.ccm.aadlen = aadlen; /* Complete IV setup. */ c->u_iv.iv[0] += (aadlen > 0) * 64 + M_ * 8; for (i = 16 - 1; i >= 1 + noncelen; i--) { c->u_iv.iv[i] = encryptlen & 0xff; encryptlen >>= 8; } memcpy (b0, c->u_iv.iv, 16); memset (c->u_iv.iv, 0, 16); set_burn (burn, do_cbc_mac (c, b0, 16, 0)); if (aadlen == 0) { /* Do nothing. 
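 * For non-empty AAD the branches below prepend the encoded AAD length
 * to the CBC-MAC stream as defined for CCM: lengths up to 0xfeff use a
 * plain 2-byte big-endian encoding (e.g. an aadlen of 8 contributes the
 * bytes 0x00 0x08), lengths that fit in 32 bits use the prefix
 * 0xff 0xfe followed by four length bytes, and anything larger uses
 * 0xff 0xff followed by eight length bytes.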
*/ } else if (aadlen > 0 && aadlen <= (unsigned int)0xfeff) { b0[0] = (aadlen >> 8) & 0xff; b0[1] = aadlen & 0xff; set_burn (burn, do_cbc_mac (c, b0, 2, 0)); } else if (aadlen > 0xfeff && aadlen <= (unsigned int)0xffffffff) { b0[0] = 0xff; b0[1] = 0xfe; buf_put_be32(&b0[2], aadlen); set_burn (burn, do_cbc_mac (c, b0, 6, 0)); } else if (aadlen > (unsigned int)0xffffffff) { b0[0] = 0xff; b0[1] = 0xff; buf_put_be64(&b0[2], aadlen); set_burn (burn, do_cbc_mac (c, b0, 10, 0)); } /* Generate S_0 and increase counter. */ set_burn (burn, c->spec->encrypt ( &c->context.c, c->u_mode.ccm.s0, c->u_ctr.ctr )); c->u_ctr.ctr[15]++; if (burn) _gcry_burn_stack (burn + sizeof(void *) * 5); c->u_mode.ccm.lengths = 1; return GPG_ERR_NO_ERROR; } gcry_err_code_t _gcry_cipher_ccm_authenticate (gcry_cipher_hd_t c, const unsigned char *abuf, size_t abuflen) { unsigned int burn; if (abuflen > 0 && !abuf) return GPG_ERR_INV_ARG; if (!c->u_mode.ccm.nonce || !c->u_mode.ccm.lengths || c->marks.tag) return GPG_ERR_INV_STATE; if (abuflen > c->u_mode.ccm.aadlen) return GPG_ERR_INV_LENGTH; c->u_mode.ccm.aadlen -= abuflen; burn = do_cbc_mac (c, abuf, abuflen, c->u_mode.ccm.aadlen == 0); if (burn) _gcry_burn_stack (burn + sizeof(void *) * 5); return GPG_ERR_NO_ERROR; } gcry_err_code_t _gcry_cipher_ccm_tag (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen, int check) { unsigned int burn; if (!outbuf || outbuflen == 0) return GPG_ERR_INV_ARG; /* Tag length must be same as initial authlen. */ if (c->u_mode.ccm.authlen != outbuflen) return GPG_ERR_INV_LENGTH; if (!c->u_mode.ccm.nonce || !c->u_mode.ccm.lengths || c->u_mode.ccm.aadlen > 0) return GPG_ERR_INV_STATE; /* Initial encrypt length must match with length of actual data processed. */ if (c->u_mode.ccm.encryptlen > 0) return GPG_ERR_UNFINISHED; if (!c->marks.tag) { burn = do_cbc_mac (c, NULL, 0, 1); /* Perform final padding. */ /* Add S_0 */ cipher_block_xor (c->u_iv.iv, c->u_iv.iv, c->u_mode.ccm.s0, 16); wipememory (c->u_ctr.ctr, 16); wipememory (c->u_mode.ccm.s0, 16); wipememory (c->u_mode.ccm.macbuf, 16); if (burn) _gcry_burn_stack (burn + sizeof(void *) * 5); c->marks.tag = 1; } if (!check) { memcpy (outbuf, c->u_iv.iv, outbuflen); return GPG_ERR_NO_ERROR; } else { return buf_eq_const(outbuf, c->u_iv.iv, outbuflen) ? GPG_ERR_NO_ERROR : GPG_ERR_CHECKSUM; } } gcry_err_code_t _gcry_cipher_ccm_get_tag (gcry_cipher_hd_t c, unsigned char *outtag, size_t taglen) { return _gcry_cipher_ccm_tag (c, outtag, taglen, 0); } gcry_err_code_t _gcry_cipher_ccm_check_tag (gcry_cipher_hd_t c, const unsigned char *intag, size_t taglen) { return _gcry_cipher_ccm_tag (c, (unsigned char *)intag, taglen, 1); } gcry_err_code_t _gcry_cipher_ccm_encrypt (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen, const unsigned char *inbuf, size_t inbuflen) { gcry_err_code_t err = 0; unsigned int burn = 0; unsigned int nburn; if (outbuflen < inbuflen) return GPG_ERR_BUFFER_TOO_SHORT; if (!c->u_mode.ccm.nonce || c->marks.tag || !c->u_mode.ccm.lengths || c->u_mode.ccm.aadlen > 0) return GPG_ERR_INV_STATE; if (inbuflen > c->u_mode.ccm.encryptlen) return GPG_ERR_INV_LENGTH; while (inbuflen) { size_t currlen = inbuflen; /* Since checksumming is done before encryption, process input in 24KiB - * chunks to keep data loaded in L1 cache for encryption. */ - if (currlen > 24 * 1024) + * chunks to keep data loaded in L1 cache for encryption. However only + * do splitting if input is large enough so that last chunks does not + * end up being short. 
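+ * Illustrative numbers, not part of the change itself: with the old
+ * 24 KiB threshold a 25000-byte input was split into 24576 + 424 byte
+ * chunks, leaving a very short tail; with the 32 KiB threshold it is
+ * handled in one pass, while inputs above 32 KiB still get 24 KiB
+ * chunks, so whenever splitting happens the final chunk stays larger
+ * than 8 KiB.  The same reasoning applies to the other 24 KiB chunk
+ * loops touched by this patch.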
*/ + if (currlen > 32 * 1024) currlen = 24 * 1024; c->u_mode.ccm.encryptlen -= currlen; nburn = do_cbc_mac (c, inbuf, currlen, 0); burn = nburn > burn ? nburn : burn; err = _gcry_cipher_ctr_encrypt (c, outbuf, outbuflen, inbuf, currlen); if (err) break; outbuf += currlen; inbuf += currlen; outbuflen -= currlen; inbuflen -= currlen; } if (burn) _gcry_burn_stack (burn + sizeof(void *) * 5); return err; } gcry_err_code_t _gcry_cipher_ccm_decrypt (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen, const unsigned char *inbuf, size_t inbuflen) { gcry_err_code_t err = 0; unsigned int burn = 0; unsigned int nburn; if (outbuflen < inbuflen) return GPG_ERR_BUFFER_TOO_SHORT; if (!c->u_mode.ccm.nonce || c->marks.tag || !c->u_mode.ccm.lengths || c->u_mode.ccm.aadlen > 0) return GPG_ERR_INV_STATE; if (inbuflen > c->u_mode.ccm.encryptlen) return GPG_ERR_INV_LENGTH; while (inbuflen) { size_t currlen = inbuflen; /* Since checksumming is done after decryption, process input in 24KiB - * chunks to keep data loaded in L1 cache for checksumming. */ - if (currlen > 24 * 1024) + * chunks to keep data loaded in L1 cache for checksumming. However + * only do splitting if input is large enough so that last chunks + * does not end up being short. */ + if (currlen > 32 * 1024) currlen = 24 * 1024; err = _gcry_cipher_ctr_encrypt (c, outbuf, outbuflen, inbuf, currlen); if (err) break; c->u_mode.ccm.encryptlen -= currlen; nburn = do_cbc_mac (c, outbuf, currlen, 0); burn = nburn > burn ? nburn : burn; outbuf += currlen; inbuf += currlen; outbuflen -= currlen; inbuflen -= currlen; } if (burn) _gcry_burn_stack (burn + sizeof(void *) * 5); return err; } diff --git a/cipher/cipher-eax.c b/cipher/cipher-eax.c index 08f815a9..0c5cf84e 100644 --- a/cipher/cipher-eax.c +++ b/cipher/cipher-eax.c @@ -1,289 +1,293 @@ /* cipher-eax.c - EAX implementation * Copyright (C) 2018 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser general Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . */ #include #include #include #include #include #include "g10lib.h" #include "cipher.h" #include "bufhelp.h" #include "./cipher-internal.h" gcry_err_code_t _gcry_cipher_eax_encrypt (gcry_cipher_hd_t c, byte *outbuf, size_t outbuflen, const byte *inbuf, size_t inbuflen) { gcry_err_code_t err; if (outbuflen < inbuflen) return GPG_ERR_BUFFER_TOO_SHORT; if (c->marks.tag) return GPG_ERR_INV_STATE; if (!c->marks.iv) { err = _gcry_cipher_eax_set_nonce (c, NULL, 0); if (err != 0) return err; } while (inbuflen) { size_t currlen = inbuflen; /* Since checksumming is done after encryption, process input in 24KiB - * chunks to keep data loaded in L1 cache for checksumming. */ - if (currlen > 24 * 1024) + * chunks to keep data loaded in L1 cache for checksumming. 
However + * only do splitting if input is large enough so that last chunks does + * not end up being short.*/ + if (currlen > 32 * 1024) currlen = 24 * 1024; err = _gcry_cipher_ctr_encrypt (c, outbuf, outbuflen, inbuf, currlen); if (err != 0) return err; err = _gcry_cmac_write (c, &c->u_mode.eax.cmac_ciphertext, outbuf, currlen); if (err != 0) return err; outbuf += currlen; inbuf += currlen; outbuflen -= currlen; inbuflen -= currlen; } return 0; } gcry_err_code_t _gcry_cipher_eax_decrypt (gcry_cipher_hd_t c, byte *outbuf, size_t outbuflen, const byte *inbuf, size_t inbuflen) { gcry_err_code_t err; if (outbuflen < inbuflen) return GPG_ERR_BUFFER_TOO_SHORT; if (c->marks.tag) return GPG_ERR_INV_STATE; if (!c->marks.iv) { err = _gcry_cipher_eax_set_nonce (c, NULL, 0); if (err != 0) return err; } while (inbuflen) { size_t currlen = inbuflen; /* Since checksumming is done before decryption, process input in 24KiB - * chunks to keep data loaded in L1 cache for decryption. */ - if (currlen > 24 * 1024) + * chunks to keep data loaded in L1 cache for decryption. However only + * do splitting if input is large enough so that last chunks does not + * end up being short. */ + if (currlen > 32 * 1024) currlen = 24 * 1024; err = _gcry_cmac_write (c, &c->u_mode.eax.cmac_ciphertext, inbuf, currlen); if (err != 0) return err; err = _gcry_cipher_ctr_encrypt (c, outbuf, outbuflen, inbuf, currlen); if (err != 0) return err; outbuf += currlen; inbuf += currlen; outbuflen -= currlen; inbuflen -= currlen; } return 0; } gcry_err_code_t _gcry_cipher_eax_authenticate (gcry_cipher_hd_t c, const byte * aadbuf, size_t aadbuflen) { gcry_err_code_t err; if (c->marks.tag) return GPG_ERR_INV_STATE; if (!c->marks.iv) { err = _gcry_cipher_eax_set_nonce (c, NULL, 0); if (err != 0) return err; } return _gcry_cmac_write (c, &c->u_mode.eax.cmac_header, aadbuf, aadbuflen); } gcry_err_code_t _gcry_cipher_eax_setkey (gcry_cipher_hd_t c) { gcry_err_code_t err; err = _gcry_cmac_generate_subkeys (c, &c->u_mode.eax.cmac_header); if (err != 0) return err; buf_cpy (c->u_mode.eax.cmac_ciphertext.subkeys, c->u_mode.eax.cmac_header.subkeys, sizeof(c->u_mode.eax.cmac_header.subkeys)); return 0; } gcry_err_code_t _gcry_cipher_eax_set_nonce (gcry_cipher_hd_t c, const byte *nonce, size_t noncelen) { gcry_cmac_context_t nonce_cmac; unsigned char initbuf[MAX_BLOCKSIZE]; gcry_err_code_t err; c->marks.iv = 0; c->marks.tag = 0; _gcry_cmac_reset (&c->u_mode.eax.cmac_header); _gcry_cmac_reset (&c->u_mode.eax.cmac_ciphertext); /* Calculate nonce CMAC */ memset(&nonce_cmac, 0, sizeof(nonce_cmac)); memset(&initbuf, 0, sizeof(initbuf)); buf_cpy (&nonce_cmac.subkeys, c->u_mode.eax.cmac_header.subkeys, sizeof(c->u_mode.eax.cmac_header.subkeys)); err = _gcry_cmac_write (c, &nonce_cmac, initbuf, c->spec->blocksize); if (err != 0) return err; if (noncelen != 0) { err = _gcry_cmac_write (c, &nonce_cmac, nonce, noncelen); if (err != 0) return err; } err = _gcry_cmac_final (c, &nonce_cmac); if (err != 0) return err; cipher_block_cpy (c->u_iv.iv, nonce_cmac.u_iv.iv, MAX_BLOCKSIZE); cipher_block_cpy (c->u_ctr.ctr, nonce_cmac.u_iv.iv, MAX_BLOCKSIZE); wipememory (&nonce_cmac, sizeof(nonce_cmac)); /* Prepare header CMAC */ initbuf[c->spec->blocksize - 1] = 1; err = _gcry_cmac_write (c, &c->u_mode.eax.cmac_header, initbuf, c->spec->blocksize); if (err != 0) return err; /* Prepare ciphertext CMAC */ initbuf[c->spec->blocksize - 1] = 2; err = _gcry_cmac_write (c, &c->u_mode.eax.cmac_ciphertext, initbuf, c->spec->blocksize); if (err != 0) return err; c->marks.iv = 1; 
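  /* At this point the context holds N = OMAC^0_K(nonce) in u_iv/u_ctr,
   * and the header and ciphertext CMACs have been primed with tweak
   * blocks ending in 1 and 2, matching the EAX construction where the
   * final tag is N xor OMAC^1_K(header) xor OMAC^2_K(ciphertext)
   * truncated to the requested length.  */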
c->marks.tag = 0; return 0; } static gcry_err_code_t _gcry_cipher_eax_tag (gcry_cipher_hd_t c, byte *outbuf, size_t outbuflen, int check) { gcry_err_code_t err; if (!c->marks.tag) { err = _gcry_cmac_final (c, &c->u_mode.eax.cmac_header); if (err != 0) return err; err = _gcry_cmac_final (c, &c->u_mode.eax.cmac_ciphertext); if (err != 0) return err; cipher_block_xor_1 (c->u_iv.iv, c->u_mode.eax.cmac_header.u_iv.iv, MAX_BLOCKSIZE); cipher_block_xor_1 (c->u_iv.iv, c->u_mode.eax.cmac_ciphertext.u_iv.iv, MAX_BLOCKSIZE); _gcry_cmac_reset (&c->u_mode.eax.cmac_header); _gcry_cmac_reset (&c->u_mode.eax.cmac_ciphertext); c->marks.tag = 1; } if (!check) { if (outbuflen > c->spec->blocksize) outbuflen = c->spec->blocksize; /* NB: We already checked that OUTBUF is large enough to hold * the result or has valid truncated length. */ memcpy (outbuf, c->u_iv.iv, outbuflen); } else { /* OUTBUFLEN gives the length of the user supplied tag in OUTBUF * and thus we need to compare its length first. */ if (!(outbuflen <= c->spec->blocksize) || !buf_eq_const (outbuf, c->u_iv.iv, outbuflen)) return GPG_ERR_CHECKSUM; } return 0; } gcry_err_code_t _gcry_cipher_eax_get_tag (gcry_cipher_hd_t c, unsigned char *outtag, size_t taglen) { return _gcry_cipher_eax_tag (c, outtag, taglen, 0); } gcry_err_code_t _gcry_cipher_eax_check_tag (gcry_cipher_hd_t c, const unsigned char *intag, size_t taglen) { return _gcry_cipher_eax_tag (c, (unsigned char *) intag, taglen, 1); } diff --git a/cipher/cipher-gcm.c b/cipher/cipher-gcm.c index fc79986e..69ff0de6 100644 --- a/cipher/cipher-gcm.c +++ b/cipher/cipher-gcm.c @@ -1,1263 +1,1264 @@ /* cipher-gcm.c - Generic Galois Counter Mode implementation * Copyright (C) 2013 Dmitry Eremin-Solenikov * Copyright (C) 2013, 2018-2019 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser general Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . */ #include #include #include #include #include #include "g10lib.h" #include "cipher.h" #include "bufhelp.h" #include "./cipher-internal.h" /* Helper macro to force alignment to 16 or 64 bytes. 
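 * The 64-byte variant is used below to place the shared gcm_table
 * structure, with its counter_head / cacheline_align / counter_tail
 * members, on cache-line boundaries for the prefetch and copy-on-write
 * countermeasures described further down.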
*/ #ifdef HAVE_GCC_ATTRIBUTE_ALIGNED # define ATTR_ALIGNED_64 __attribute__ ((aligned (64))) #else # define ATTR_ALIGNED_64 #endif #ifdef GCM_USE_INTEL_PCLMUL extern void _gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c); extern unsigned int _gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf, size_t nblocks); extern unsigned int _gcry_polyval_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf, size_t nblocks); #endif #ifdef GCM_USE_ARM_PMULL extern void _gcry_ghash_setup_armv8_ce_pmull (void *gcm_key, void *gcm_table); extern unsigned int _gcry_ghash_armv8_ce_pmull (void *gcm_key, byte *result, const byte *buf, size_t nblocks, void *gcm_table); extern unsigned int _gcry_polyval_armv8_ce_pmull (void *gcm_key, byte *result, const byte *buf, size_t nblocks, void *gcm_table); static void ghash_setup_armv8_ce_pmull (gcry_cipher_hd_t c) { _gcry_ghash_setup_armv8_ce_pmull(c->u_mode.gcm.u_ghash_key.key, c->u_mode.gcm.gcm_table); } static unsigned int ghash_armv8_ce_pmull (gcry_cipher_hd_t c, byte *result, const byte *buf, size_t nblocks) { return _gcry_ghash_armv8_ce_pmull(c->u_mode.gcm.u_ghash_key.key, result, buf, nblocks, c->u_mode.gcm.gcm_table); } static unsigned int polyval_armv8_ce_pmull (gcry_cipher_hd_t c, byte *result, const byte *buf, size_t nblocks) { return _gcry_polyval_armv8_ce_pmull(c->u_mode.gcm.u_ghash_key.key, result, buf, nblocks, c->u_mode.gcm.gcm_table); } #endif /* GCM_USE_ARM_PMULL */ #ifdef GCM_USE_ARM_NEON extern void _gcry_ghash_setup_armv7_neon (void *gcm_key); extern unsigned int _gcry_ghash_armv7_neon (void *gcm_key, byte *result, const byte *buf, size_t nblocks); static void ghash_setup_armv7_neon (gcry_cipher_hd_t c) { _gcry_ghash_setup_armv7_neon(c->u_mode.gcm.u_ghash_key.key); } static unsigned int ghash_armv7_neon (gcry_cipher_hd_t c, byte *result, const byte *buf, size_t nblocks) { return _gcry_ghash_armv7_neon(c->u_mode.gcm.u_ghash_key.key, result, buf, nblocks); } #endif /* GCM_USE_ARM_NEON */ #ifdef GCM_USE_S390X_CRYPTO #include "asm-inline-s390x.h" static unsigned int ghash_s390x_kimd (gcry_cipher_hd_t c, byte *result, const byte *buf, size_t nblocks) { u128_t params[2]; memcpy (¶ms[0], result, 16); memcpy (¶ms[1], c->u_mode.gcm.u_ghash_key.key, 16); kimd_execute (KMID_FUNCTION_GHASH, ¶ms, buf, nblocks * 16); memcpy (result, ¶ms[0], 16); wipememory (params, sizeof(params)); return 0; } #endif /* GCM_USE_S390X_CRYPTO*/ #ifdef GCM_USE_PPC_VPMSUM extern void _gcry_ghash_setup_ppc_vpmsum (void *gcm_table, void *gcm_key); /* result is 128-bits */ extern unsigned int _gcry_ghash_ppc_vpmsum (byte *result, void *gcm_table, const byte *buf, size_t nblocks); static void ghash_setup_ppc_vpmsum (gcry_cipher_hd_t c) { _gcry_ghash_setup_ppc_vpmsum(c->u_mode.gcm.gcm_table, c->u_mode.gcm.u_ghash_key.key); } static unsigned int ghash_ppc_vpmsum (gcry_cipher_hd_t c, byte *result, const byte *buf, size_t nblocks) { return _gcry_ghash_ppc_vpmsum(result, c->u_mode.gcm.gcm_table, buf, nblocks); } #endif /* GCM_USE_PPC_VPMSUM */ #ifdef GCM_USE_TABLES static struct { volatile u32 counter_head; u32 cacheline_align[64 / 4 - 1]; u16 R[256]; volatile u32 counter_tail; } gcm_table ATTR_ALIGNED_64 = { 0, { 0, }, { 0x0000, 0x01c2, 0x0384, 0x0246, 0x0708, 0x06ca, 0x048c, 0x054e, 0x0e10, 0x0fd2, 0x0d94, 0x0c56, 0x0918, 0x08da, 0x0a9c, 0x0b5e, 0x1c20, 0x1de2, 0x1fa4, 0x1e66, 0x1b28, 0x1aea, 0x18ac, 0x196e, 0x1230, 0x13f2, 0x11b4, 0x1076, 0x1538, 0x14fa, 0x16bc, 0x177e, 0x3840, 0x3982, 0x3bc4, 0x3a06, 0x3f48, 0x3e8a, 0x3ccc, 0x3d0e, 0x3650, 0x3792, 
0x35d4, 0x3416, 0x3158, 0x309a, 0x32dc, 0x331e, 0x2460, 0x25a2, 0x27e4, 0x2626, 0x2368, 0x22aa, 0x20ec, 0x212e, 0x2a70, 0x2bb2, 0x29f4, 0x2836, 0x2d78, 0x2cba, 0x2efc, 0x2f3e, 0x7080, 0x7142, 0x7304, 0x72c6, 0x7788, 0x764a, 0x740c, 0x75ce, 0x7e90, 0x7f52, 0x7d14, 0x7cd6, 0x7998, 0x785a, 0x7a1c, 0x7bde, 0x6ca0, 0x6d62, 0x6f24, 0x6ee6, 0x6ba8, 0x6a6a, 0x682c, 0x69ee, 0x62b0, 0x6372, 0x6134, 0x60f6, 0x65b8, 0x647a, 0x663c, 0x67fe, 0x48c0, 0x4902, 0x4b44, 0x4a86, 0x4fc8, 0x4e0a, 0x4c4c, 0x4d8e, 0x46d0, 0x4712, 0x4554, 0x4496, 0x41d8, 0x401a, 0x425c, 0x439e, 0x54e0, 0x5522, 0x5764, 0x56a6, 0x53e8, 0x522a, 0x506c, 0x51ae, 0x5af0, 0x5b32, 0x5974, 0x58b6, 0x5df8, 0x5c3a, 0x5e7c, 0x5fbe, 0xe100, 0xe0c2, 0xe284, 0xe346, 0xe608, 0xe7ca, 0xe58c, 0xe44e, 0xef10, 0xeed2, 0xec94, 0xed56, 0xe818, 0xe9da, 0xeb9c, 0xea5e, 0xfd20, 0xfce2, 0xfea4, 0xff66, 0xfa28, 0xfbea, 0xf9ac, 0xf86e, 0xf330, 0xf2f2, 0xf0b4, 0xf176, 0xf438, 0xf5fa, 0xf7bc, 0xf67e, 0xd940, 0xd882, 0xdac4, 0xdb06, 0xde48, 0xdf8a, 0xddcc, 0xdc0e, 0xd750, 0xd692, 0xd4d4, 0xd516, 0xd058, 0xd19a, 0xd3dc, 0xd21e, 0xc560, 0xc4a2, 0xc6e4, 0xc726, 0xc268, 0xc3aa, 0xc1ec, 0xc02e, 0xcb70, 0xcab2, 0xc8f4, 0xc936, 0xcc78, 0xcdba, 0xcffc, 0xce3e, 0x9180, 0x9042, 0x9204, 0x93c6, 0x9688, 0x974a, 0x950c, 0x94ce, 0x9f90, 0x9e52, 0x9c14, 0x9dd6, 0x9898, 0x995a, 0x9b1c, 0x9ade, 0x8da0, 0x8c62, 0x8e24, 0x8fe6, 0x8aa8, 0x8b6a, 0x892c, 0x88ee, 0x83b0, 0x8272, 0x8034, 0x81f6, 0x84b8, 0x857a, 0x873c, 0x86fe, 0xa9c0, 0xa802, 0xaa44, 0xab86, 0xaec8, 0xaf0a, 0xad4c, 0xac8e, 0xa7d0, 0xa612, 0xa454, 0xa596, 0xa0d8, 0xa11a, 0xa35c, 0xa29e, 0xb5e0, 0xb422, 0xb664, 0xb7a6, 0xb2e8, 0xb32a, 0xb16c, 0xb0ae, 0xbbf0, 0xba32, 0xb874, 0xb9b6, 0xbcf8, 0xbd3a, 0xbf7c, 0xbebe, }, 0 }; #define gcmR gcm_table.R static inline void prefetch_table(const void *tab, size_t len) { const volatile byte *vtab = tab; size_t i; for (i = 0; len - i >= 8 * 32; i += 8 * 32) { (void)vtab[i + 0 * 32]; (void)vtab[i + 1 * 32]; (void)vtab[i + 2 * 32]; (void)vtab[i + 3 * 32]; (void)vtab[i + 4 * 32]; (void)vtab[i + 5 * 32]; (void)vtab[i + 6 * 32]; (void)vtab[i + 7 * 32]; } for (; i < len; i += 32) { (void)vtab[i]; } (void)vtab[len - 1]; } static inline void do_prefetch_tables (const void *gcmM, size_t gcmM_size) { /* Modify counters to trigger copy-on-write and unsharing if physical pages * of look-up table are shared between processes. Modifying counters also * causes checksums for pages to change and hint same-page merging algorithm * that these pages are frequently changing. */ gcm_table.counter_head++; gcm_table.counter_tail++; /* Prefetch look-up tables to cache. 
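 * Touching every cache line of the tables up front means the
 * data-dependent loads done by the table-driven GHASH below hit lines
 * that are already cached, which both speeds up the loop and limits
 * what a cache-timing observer can learn from which entries are
 * accessed.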
*/ prefetch_table(gcmM, gcmM_size); prefetch_table(&gcm_table, sizeof(gcm_table)); } #ifdef GCM_TABLES_USE_U64 static void bshift (u64 * b0, u64 * b1) { u64 t[2], mask; t[0] = *b0; t[1] = *b1; mask = -(t[1] & 1) & 0xe1; mask <<= 56; *b1 = (t[1] >> 1) ^ (t[0] << 63); *b0 = (t[0] >> 1) ^ mask; } static void do_fillM (unsigned char *h, u64 *M) { int i, j; M[0 + 0] = 0; M[0 + 16] = 0; M[8 + 0] = buf_get_be64 (h + 0); M[8 + 16] = buf_get_be64 (h + 8); for (i = 4; i > 0; i /= 2) { M[i + 0] = M[2 * i + 0]; M[i + 16] = M[2 * i + 16]; bshift (&M[i], &M[i + 16]); } for (i = 2; i < 16; i *= 2) for (j = 1; j < i; j++) { M[(i + j) + 0] = M[i + 0] ^ M[j + 0]; M[(i + j) + 16] = M[i + 16] ^ M[j + 16]; } for (i = 0; i < 16; i++) { M[i + 32] = (M[i + 0] >> 4) ^ ((u64) gcmR[(M[i + 16] & 0xf) << 4] << 48); M[i + 48] = (M[i + 16] >> 4) ^ (M[i + 0] << 60); } } static inline unsigned int do_ghash (unsigned char *result, const unsigned char *buf, const u64 *gcmM) { u64 V[2]; u64 tmp[2]; const u64 *M; u64 T; u32 A; int i; cipher_block_xor (V, result, buf, 16); V[0] = be_bswap64 (V[0]); V[1] = be_bswap64 (V[1]); /* First round can be manually tweaked based on fact that 'tmp' is zero. */ M = &gcmM[(V[1] & 0xf) + 32]; V[1] >>= 4; tmp[0] = M[0]; tmp[1] = M[16]; tmp[0] ^= gcmM[(V[1] & 0xf) + 0]; tmp[1] ^= gcmM[(V[1] & 0xf) + 16]; V[1] >>= 4; i = 6; while (1) { M = &gcmM[(V[1] & 0xf) + 32]; V[1] >>= 4; A = tmp[1] & 0xff; T = tmp[0]; tmp[0] = (T >> 8) ^ ((u64) gcmR[A] << 48) ^ gcmM[(V[1] & 0xf) + 0]; tmp[1] = (T << 56) ^ (tmp[1] >> 8) ^ gcmM[(V[1] & 0xf) + 16]; tmp[0] ^= M[0]; tmp[1] ^= M[16]; if (i == 0) break; V[1] >>= 4; --i; } i = 7; while (1) { M = &gcmM[(V[0] & 0xf) + 32]; V[0] >>= 4; A = tmp[1] & 0xff; T = tmp[0]; tmp[0] = (T >> 8) ^ ((u64) gcmR[A] << 48) ^ gcmM[(V[0] & 0xf) + 0]; tmp[1] = (T << 56) ^ (tmp[1] >> 8) ^ gcmM[(V[0] & 0xf) + 16]; tmp[0] ^= M[0]; tmp[1] ^= M[16]; if (i == 0) break; V[0] >>= 4; --i; } buf_put_be64 (result + 0, tmp[0]); buf_put_be64 (result + 8, tmp[1]); return (sizeof(V) + sizeof(T) + sizeof(tmp) + sizeof(int)*2 + sizeof(void*)*5); } #else /*!GCM_TABLES_USE_U64*/ static void bshift (u32 * M, int i) { u32 t[4], mask; t[0] = M[i * 4 + 0]; t[1] = M[i * 4 + 1]; t[2] = M[i * 4 + 2]; t[3] = M[i * 4 + 3]; mask = -(t[3] & 1) & 0xe1; M[i * 4 + 3] = (t[3] >> 1) ^ (t[2] << 31); M[i * 4 + 2] = (t[2] >> 1) ^ (t[1] << 31); M[i * 4 + 1] = (t[1] >> 1) ^ (t[0] << 31); M[i * 4 + 0] = (t[0] >> 1) ^ (mask << 24); } static void do_fillM (unsigned char *h, u32 *M) { int i, j; M[0 * 4 + 0] = 0; M[0 * 4 + 1] = 0; M[0 * 4 + 2] = 0; M[0 * 4 + 3] = 0; M[8 * 4 + 0] = buf_get_be32 (h + 0); M[8 * 4 + 1] = buf_get_be32 (h + 4); M[8 * 4 + 2] = buf_get_be32 (h + 8); M[8 * 4 + 3] = buf_get_be32 (h + 12); for (i = 4; i > 0; i /= 2) { M[i * 4 + 0] = M[2 * i * 4 + 0]; M[i * 4 + 1] = M[2 * i * 4 + 1]; M[i * 4 + 2] = M[2 * i * 4 + 2]; M[i * 4 + 3] = M[2 * i * 4 + 3]; bshift (M, i); } for (i = 2; i < 16; i *= 2) for (j = 1; j < i; j++) { M[(i + j) * 4 + 0] = M[i * 4 + 0] ^ M[j * 4 + 0]; M[(i + j) * 4 + 1] = M[i * 4 + 1] ^ M[j * 4 + 1]; M[(i + j) * 4 + 2] = M[i * 4 + 2] ^ M[j * 4 + 2]; M[(i + j) * 4 + 3] = M[i * 4 + 3] ^ M[j * 4 + 3]; } for (i = 0; i < 4 * 16; i += 4) { M[i + 0 + 64] = (M[i + 0] >> 4) ^ ((u64) gcmR[(M[i + 3] << 4) & 0xf0] << 16); M[i + 1 + 64] = (M[i + 1] >> 4) ^ (M[i + 0] << 28); M[i + 2 + 64] = (M[i + 2] >> 4) ^ (M[i + 1] << 28); M[i + 3 + 64] = (M[i + 3] >> 4) ^ (M[i + 2] << 28); } } static inline unsigned int do_ghash (unsigned char *result, const unsigned char *buf, const u32 *gcmM) { byte V[16]; u32 
tmp[4]; u32 v; const u32 *M, *m; u32 T[3]; int i; cipher_block_xor (V, result, buf, 16); /* V is big-endian */ /* First round can be manually tweaked based on fact that 'tmp' is zero. */ i = 15; v = V[i]; M = &gcmM[(v & 0xf) * 4 + 64]; v = (v & 0xf0) >> 4; m = &gcmM[v * 4]; v = V[--i]; tmp[0] = M[0] ^ m[0]; tmp[1] = M[1] ^ m[1]; tmp[2] = M[2] ^ m[2]; tmp[3] = M[3] ^ m[3]; while (1) { M = &gcmM[(v & 0xf) * 4 + 64]; v = (v & 0xf0) >> 4; m = &gcmM[v * 4]; T[0] = tmp[0]; T[1] = tmp[1]; T[2] = tmp[2]; tmp[0] = (T[0] >> 8) ^ ((u32) gcmR[tmp[3] & 0xff] << 16) ^ m[0]; tmp[1] = (T[0] << 24) ^ (tmp[1] >> 8) ^ m[1]; tmp[2] = (T[1] << 24) ^ (tmp[2] >> 8) ^ m[2]; tmp[3] = (T[2] << 24) ^ (tmp[3] >> 8) ^ m[3]; tmp[0] ^= M[0]; tmp[1] ^= M[1]; tmp[2] ^= M[2]; tmp[3] ^= M[3]; if (i == 0) break; v = V[--i]; } buf_put_be32 (result + 0, tmp[0]); buf_put_be32 (result + 4, tmp[1]); buf_put_be32 (result + 8, tmp[2]); buf_put_be32 (result + 12, tmp[3]); return (sizeof(V) + sizeof(T) + sizeof(tmp) + sizeof(int)*2 + sizeof(void*)*6); } #endif /*!GCM_TABLES_USE_U64*/ #define fillM(c) \ do_fillM (c->u_mode.gcm.u_ghash_key.key, c->u_mode.gcm.gcm_table) #define GHASH(c, result, buf) do_ghash (result, buf, c->u_mode.gcm.gcm_table) #define prefetch_tables(c) \ do_prefetch_tables(c->u_mode.gcm.gcm_table, sizeof(c->u_mode.gcm.gcm_table)) #else static unsigned long bshift (unsigned long *b) { unsigned long c; int i; c = b[3] & 1; for (i = 3; i > 0; i--) { b[i] = (b[i] >> 1) | (b[i - 1] << 31); } b[i] >>= 1; return c; } static unsigned int do_ghash (unsigned char *hsub, unsigned char *result, const unsigned char *buf) { unsigned long V[4]; int i, j; byte *p; #ifdef WORDS_BIGENDIAN p = result; #else unsigned long T[4]; cipher_block_xor (V, result, buf, 16); for (i = 0; i < 4; i++) { V[i] = (V[i] & 0x00ff00ff) << 8 | (V[i] & 0xff00ff00) >> 8; V[i] = (V[i] & 0x0000ffff) << 16 | (V[i] & 0xffff0000) >> 16; } p = (byte *) T; #endif memset (p, 0, 16); for (i = 0; i < 16; i++) { for (j = 0x80; j; j >>= 1) { if (hsub[i] & j) cipher_block_xor (p, p, V, 16); if (bshift (V)) V[0] ^= 0xe1000000; } } #ifndef WORDS_BIGENDIAN for (i = 0, p = (byte *) T; i < 16; i += 4, p += 4) { result[i + 0] = p[3]; result[i + 1] = p[2]; result[i + 2] = p[1]; result[i + 3] = p[0]; } #endif return (sizeof(V) + sizeof(T) + sizeof(int)*2 + sizeof(void*)*5); } #define fillM(c) do { } while (0) #define GHASH(c, result, buf) do_ghash (c->u_mode.gcm.u_ghash_key.key, result, buf) #define prefetch_tables(c) do {} while (0) #endif /* !GCM_USE_TABLES */ static unsigned int ghash_internal (gcry_cipher_hd_t c, byte *result, const byte *buf, size_t nblocks) { const unsigned int blocksize = GCRY_GCM_BLOCK_LEN; unsigned int burn = 0; prefetch_tables (c); while (nblocks) { burn = GHASH (c, result, buf); buf += blocksize; nblocks--; } return burn + (burn ? 
5*sizeof(void*) : 0); } static void setupM (gcry_cipher_hd_t c) { unsigned int features = _gcry_get_hw_features (); c->u_mode.gcm.ghash_fn = NULL; c->u_mode.gcm.polyval_fn = NULL; if (0) { (void)features; } #ifdef GCM_USE_INTEL_PCLMUL else if (features & HWF_INTEL_PCLMUL) { c->u_mode.gcm.ghash_fn = _gcry_ghash_intel_pclmul; c->u_mode.gcm.polyval_fn = _gcry_polyval_intel_pclmul; _gcry_ghash_setup_intel_pclmul (c); } #endif #ifdef GCM_USE_ARM_PMULL else if (features & HWF_ARM_PMULL) { c->u_mode.gcm.ghash_fn = ghash_armv8_ce_pmull; c->u_mode.gcm.polyval_fn = polyval_armv8_ce_pmull; ghash_setup_armv8_ce_pmull (c); } #endif #ifdef GCM_USE_ARM_NEON else if (features & HWF_ARM_NEON) { c->u_mode.gcm.ghash_fn = ghash_armv7_neon; ghash_setup_armv7_neon (c); } #endif #ifdef GCM_USE_PPC_VPMSUM else if (features & HWF_PPC_VCRYPTO) { c->u_mode.gcm.ghash_fn = ghash_ppc_vpmsum; ghash_setup_ppc_vpmsum (c); } #endif #ifdef GCM_USE_S390X_CRYPTO else if (features & HWF_S390X_MSA) { if (kimd_query () & km_function_to_mask (KMID_FUNCTION_GHASH)) { c->u_mode.gcm.ghash_fn = ghash_s390x_kimd; } } #endif if (c->u_mode.gcm.ghash_fn == NULL) { c->u_mode.gcm.ghash_fn = ghash_internal; fillM (c); } } static inline void gcm_bytecounter_add (u32 ctr[2], size_t add) { if (sizeof(add) > sizeof(u32)) { u32 high_add = ((add >> 31) >> 1) & 0xffffffff; ctr[1] += high_add; } ctr[0] += add; if (ctr[0] >= add) return; ++ctr[1]; } static inline u32 gcm_add32_be128 (byte *ctr, unsigned int add) { /* 'ctr' must be aligned to four bytes. */ const unsigned int blocksize = GCRY_GCM_BLOCK_LEN; u32 *pval = (u32 *)(void *)(ctr + blocksize - sizeof(u32)); u32 val; val = be_bswap32(*pval) + add; *pval = be_bswap32(val); return val; /* return result as host-endian value */ } static inline int gcm_check_datalen (u32 ctr[2]) { /* len(plaintext) <= 2^39-256 bits == 2^36-32 bytes == 2^32-2 blocks */ if (ctr[1] > 0xfU) return 0; if (ctr[1] < 0xfU) return 1; if (ctr[0] <= 0xffffffe0U) return 1; return 0; } static inline int gcm_check_aadlen_or_ivlen (u32 ctr[2]) { /* len(aad/iv) <= 2^64-1 bits ~= 2^61-1 bytes */ if (ctr[1] > 0x1fffffffU) return 0; if (ctr[1] < 0x1fffffffU) return 1; if (ctr[0] <= 0xffffffffU) return 1; return 0; } static void do_ghash_buf(gcry_cipher_hd_t c, byte *hash, const byte *buf, size_t buflen, int do_padding) { unsigned int blocksize = GCRY_GCM_BLOCK_LEN; unsigned int unused = c->u_mode.gcm.mac_unused; ghash_fn_t ghash_fn = c->u_mode.gcm.ghash_fn; size_t nblocks, n; unsigned int burn = 0; if (buflen == 0 && (unused == 0 || !do_padding)) return; do { if (buflen > 0 && (buflen + unused < blocksize || unused > 0)) { n = blocksize - unused; n = n < buflen ? n : buflen; buf_cpy (&c->u_mode.gcm.macbuf[unused], buf, n); unused += n; buf += n; buflen -= n; } if (!buflen) { if (!do_padding && unused < blocksize) { break; } n = blocksize - unused; if (n > 0) { memset (&c->u_mode.gcm.macbuf[unused], 0, n); unused = blocksize; } } if (unused > 0) { gcry_assert (unused == blocksize); /* Process one block from macbuf. 
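 * At this point the buffer holds exactly one complete block, either because
 * it was topped up from BUF above or because it was zero padded for the
 * final call, so it is flushed through the GHASH function before any
 * remaining full blocks are hashed directly from BUF.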
*/ burn = ghash_fn (c, hash, c->u_mode.gcm.macbuf, 1); unused = 0; } nblocks = buflen / blocksize; if (nblocks) { burn = ghash_fn (c, hash, buf, nblocks); buf += blocksize * nblocks; buflen -= blocksize * nblocks; } } while (buflen > 0); c->u_mode.gcm.mac_unused = unused; if (burn) _gcry_burn_stack (burn); } static gcry_err_code_t gcm_ctr_encrypt (gcry_cipher_hd_t c, byte *outbuf, size_t outbuflen, const byte *inbuf, size_t inbuflen) { gcry_err_code_t err = 0; while (inbuflen) { u32 nblocks_to_overflow; u32 num_ctr_increments; u32 curr_ctr_low; size_t currlen = inbuflen; byte ctr_copy[GCRY_GCM_BLOCK_LEN]; int fix_ctr = 0; /* GCM CTR increments only least significant 32-bits, without carry * to upper 96-bits of counter. Using generic CTR implementation * directly would carry 32-bit overflow to upper 96-bit. Detect * if input length is long enough to cause overflow, and limit * input length so that CTR overflow happen but updated CTR value is * not used to encrypt further input. After overflow, upper 96 bits * of CTR are restored to cancel out modification done by generic CTR * encryption. */ if (inbuflen > c->unused) { curr_ctr_low = gcm_add32_be128 (c->u_ctr.ctr, 0); /* Number of CTR increments this inbuflen would cause. */ num_ctr_increments = (inbuflen - c->unused) / GCRY_GCM_BLOCK_LEN + !!((inbuflen - c->unused) % GCRY_GCM_BLOCK_LEN); if ((u32)(num_ctr_increments + curr_ctr_low) < curr_ctr_low) { nblocks_to_overflow = 0xffffffffU - curr_ctr_low + 1; currlen = nblocks_to_overflow * GCRY_GCM_BLOCK_LEN + c->unused; if (currlen > inbuflen) { currlen = inbuflen; } fix_ctr = 1; cipher_block_cpy(ctr_copy, c->u_ctr.ctr, GCRY_GCM_BLOCK_LEN); } } err = _gcry_cipher_ctr_encrypt(c, outbuf, outbuflen, inbuf, currlen); if (err != 0) return err; if (fix_ctr) { /* Lower 32-bits of CTR should now be zero. */ gcry_assert(gcm_add32_be128 (c->u_ctr.ctr, 0) == 0); /* Restore upper part of CTR. */ buf_cpy(c->u_ctr.ctr, ctr_copy, GCRY_GCM_BLOCK_LEN - sizeof(u32)); wipememory(ctr_copy, sizeof(ctr_copy)); } inbuflen -= currlen; inbuf += currlen; outbuflen -= currlen; outbuf += currlen; } return err; } static gcry_err_code_t gcm_crypt_inner (gcry_cipher_hd_t c, byte *outbuf, size_t outbuflen, const byte *inbuf, size_t inbuflen, int encrypt) { gcry_err_code_t err; while (inbuflen) { size_t currlen = inbuflen; /* Use a bulk method if available. */ if (c->bulk.gcm_crypt) { /* Bulk method requires that there is no cached data. */ if (inbuflen >= GCRY_GCM_BLOCK_LEN && c->u_mode.gcm.mac_unused == 0) { size_t nblks = inbuflen / GCRY_GCM_BLOCK_LEN; size_t nleft; size_t ndone; nleft = c->bulk.gcm_crypt (c, outbuf, inbuf, nblks, encrypt); ndone = nblks - nleft; inbuf += ndone * GCRY_GCM_BLOCK_LEN; outbuf += ndone * GCRY_GCM_BLOCK_LEN; inbuflen -= ndone * GCRY_GCM_BLOCK_LEN; outbuflen -= ndone * GCRY_GCM_BLOCK_LEN; if (inbuflen == 0) break; currlen = inbuflen; } else if (c->u_mode.gcm.mac_unused > 0 && inbuflen >= GCRY_GCM_BLOCK_LEN + (16 - c->u_mode.gcm.mac_unused)) { /* Handle just enough data so that cache is depleted, and on * next loop iteration use bulk method. */ currlen = 16 - c->u_mode.gcm.mac_unused; gcry_assert(currlen); } } /* Since checksumming is done after/before encryption/decryption, * process input in 24KiB chunks to keep data loaded in L1 cache for - * checksumming/decryption. */ - if (currlen > 24 * 1024) + * checksumming/decryption. However only do splitting if input is + * large enough so that last chunks does not end up being short. 
*/ + if (currlen > 32 * 1024) currlen = 24 * 1024; if (!encrypt) do_ghash_buf(c, c->u_mode.gcm.u_tag.tag, inbuf, currlen, 0); err = gcm_ctr_encrypt(c, outbuf, outbuflen, inbuf, currlen); if (err != 0) return err; if (encrypt) do_ghash_buf(c, c->u_mode.gcm.u_tag.tag, outbuf, currlen, 0); outbuf += currlen; inbuf += currlen; outbuflen -= currlen; inbuflen -= currlen; } return 0; } gcry_err_code_t _gcry_cipher_gcm_encrypt (gcry_cipher_hd_t c, byte *outbuf, size_t outbuflen, const byte *inbuf, size_t inbuflen) { static const unsigned char zerobuf[MAX_BLOCKSIZE]; if (c->spec->blocksize != GCRY_GCM_BLOCK_LEN) return GPG_ERR_CIPHER_ALGO; if (outbuflen < inbuflen) return GPG_ERR_BUFFER_TOO_SHORT; if (c->u_mode.gcm.datalen_over_limits) return GPG_ERR_INV_LENGTH; if (c->marks.tag || c->u_mode.gcm.ghash_data_finalized || !c->u_mode.gcm.ghash_fn) return GPG_ERR_INV_STATE; if (!c->marks.iv) _gcry_cipher_gcm_setiv (c, zerobuf, GCRY_GCM_BLOCK_LEN); if (c->u_mode.gcm.disallow_encryption_because_of_setiv_in_fips_mode) return GPG_ERR_INV_STATE; if (!c->u_mode.gcm.ghash_aad_finalized) { /* Start of encryption marks end of AAD stream. */ do_ghash_buf(c, c->u_mode.gcm.u_tag.tag, NULL, 0, 1); c->u_mode.gcm.ghash_aad_finalized = 1; } gcm_bytecounter_add(c->u_mode.gcm.datalen, inbuflen); if (!gcm_check_datalen(c->u_mode.gcm.datalen)) { c->u_mode.gcm.datalen_over_limits = 1; return GPG_ERR_INV_LENGTH; } return gcm_crypt_inner (c, outbuf, outbuflen, inbuf, inbuflen, 1); } gcry_err_code_t _gcry_cipher_gcm_decrypt (gcry_cipher_hd_t c, byte *outbuf, size_t outbuflen, const byte *inbuf, size_t inbuflen) { static const unsigned char zerobuf[MAX_BLOCKSIZE]; if (c->spec->blocksize != GCRY_GCM_BLOCK_LEN) return GPG_ERR_CIPHER_ALGO; if (outbuflen < inbuflen) return GPG_ERR_BUFFER_TOO_SHORT; if (c->u_mode.gcm.datalen_over_limits) return GPG_ERR_INV_LENGTH; if (c->marks.tag || c->u_mode.gcm.ghash_data_finalized || !c->u_mode.gcm.ghash_fn) return GPG_ERR_INV_STATE; if (!c->marks.iv) _gcry_cipher_gcm_setiv (c, zerobuf, GCRY_GCM_BLOCK_LEN); if (!c->u_mode.gcm.ghash_aad_finalized) { /* Start of decryption marks end of AAD stream. 
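 * Any buffered partial AAD block is zero padded and folded into the GHASH
 * state now, so that the following ciphertext blocks are hashed from a
 * block aligned state as GCM requires.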
*/ do_ghash_buf(c, c->u_mode.gcm.u_tag.tag, NULL, 0, 1); c->u_mode.gcm.ghash_aad_finalized = 1; } gcm_bytecounter_add(c->u_mode.gcm.datalen, inbuflen); if (!gcm_check_datalen(c->u_mode.gcm.datalen)) { c->u_mode.gcm.datalen_over_limits = 1; return GPG_ERR_INV_LENGTH; } return gcm_crypt_inner (c, outbuf, outbuflen, inbuf, inbuflen, 0); } gcry_err_code_t _gcry_cipher_gcm_authenticate (gcry_cipher_hd_t c, const byte * aadbuf, size_t aadbuflen) { static const unsigned char zerobuf[MAX_BLOCKSIZE]; if (c->spec->blocksize != GCRY_GCM_BLOCK_LEN) return GPG_ERR_CIPHER_ALGO; if (c->u_mode.gcm.datalen_over_limits) return GPG_ERR_INV_LENGTH; if (c->marks.tag || c->u_mode.gcm.ghash_aad_finalized || c->u_mode.gcm.ghash_data_finalized || !c->u_mode.gcm.ghash_fn) return GPG_ERR_INV_STATE; if (!c->marks.iv) _gcry_cipher_gcm_setiv (c, zerobuf, GCRY_GCM_BLOCK_LEN); gcm_bytecounter_add(c->u_mode.gcm.aadlen, aadbuflen); if (!gcm_check_aadlen_or_ivlen(c->u_mode.gcm.aadlen)) { c->u_mode.gcm.datalen_over_limits = 1; return GPG_ERR_INV_LENGTH; } do_ghash_buf(c, c->u_mode.gcm.u_tag.tag, aadbuf, aadbuflen, 0); return 0; } void _gcry_cipher_gcm_setupM (gcry_cipher_hd_t c) { setupM (c); } void _gcry_cipher_gcm_setkey (gcry_cipher_hd_t c) { memset (c->u_mode.gcm.u_ghash_key.key, 0, GCRY_GCM_BLOCK_LEN); c->spec->encrypt (&c->context.c, c->u_mode.gcm.u_ghash_key.key, c->u_mode.gcm.u_ghash_key.key); setupM (c); } static gcry_err_code_t _gcry_cipher_gcm_initiv (gcry_cipher_hd_t c, const byte *iv, size_t ivlen) { memset (c->u_mode.gcm.aadlen, 0, sizeof(c->u_mode.gcm.aadlen)); memset (c->u_mode.gcm.datalen, 0, sizeof(c->u_mode.gcm.datalen)); memset (c->u_mode.gcm.u_tag.tag, 0, GCRY_GCM_BLOCK_LEN); c->u_mode.gcm.datalen_over_limits = 0; c->u_mode.gcm.ghash_data_finalized = 0; c->u_mode.gcm.ghash_aad_finalized = 0; if (ivlen == 0) return GPG_ERR_INV_LENGTH; if (ivlen != GCRY_GCM_BLOCK_LEN - 4) { u32 iv_bytes[2] = {0, 0}; u32 bitlengths[2][2]; if (!c->u_mode.gcm.ghash_fn) return GPG_ERR_INV_STATE; memset(c->u_ctr.ctr, 0, GCRY_GCM_BLOCK_LEN); gcm_bytecounter_add(iv_bytes, ivlen); if (!gcm_check_aadlen_or_ivlen(iv_bytes)) { c->u_mode.gcm.datalen_over_limits = 1; return GPG_ERR_INV_LENGTH; } do_ghash_buf(c, c->u_ctr.ctr, iv, ivlen, 1); /* iv length, 64-bit */ bitlengths[1][1] = be_bswap32(iv_bytes[0] << 3); bitlengths[1][0] = be_bswap32((iv_bytes[0] >> 29) | (iv_bytes[1] << 3)); /* zeros, 64-bit */ bitlengths[0][1] = 0; bitlengths[0][0] = 0; do_ghash_buf(c, c->u_ctr.ctr, (byte*)bitlengths, GCRY_GCM_BLOCK_LEN, 1); wipememory (iv_bytes, sizeof iv_bytes); wipememory (bitlengths, sizeof bitlengths); } else { /* 96-bit IV is handled differently. */ memcpy (c->u_ctr.ctr, iv, ivlen); c->u_ctr.ctr[12] = c->u_ctr.ctr[13] = c->u_ctr.ctr[14] = 0; c->u_ctr.ctr[15] = 1; } c->spec->encrypt (&c->context.c, c->u_mode.gcm.tagiv, c->u_ctr.ctr); gcm_add32_be128 (c->u_ctr.ctr, 1); c->unused = 0; c->marks.iv = 1; c->marks.tag = 0; return 0; } gcry_err_code_t _gcry_cipher_gcm_setiv (gcry_cipher_hd_t c, const byte *iv, size_t ivlen) { c->marks.iv = 0; c->marks.tag = 0; c->u_mode.gcm.disallow_encryption_because_of_setiv_in_fips_mode = 0; if (fips_mode ()) { /* Direct invocation of GCM setiv in FIPS mode disables encryption. 
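 * The IV for encryption is expected to come from an internal, approved
 * generation method in that case; decryption and tag verification with a
 * caller supplied IV keep working.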
*/ c->u_mode.gcm.disallow_encryption_because_of_setiv_in_fips_mode = 1; } return _gcry_cipher_gcm_initiv (c, iv, ivlen); } #if 0 && TODO void _gcry_cipher_gcm_geniv (gcry_cipher_hd_t c, byte *ivout, size_t ivoutlen, const byte *nonce, size_t noncelen) { /* nonce: user provided part (might be null) */ /* noncelen: check if proper length (if nonce not null) */ /* ivout: iv used to initialize gcm, output to user */ /* ivoutlen: check correct size */ byte iv[IVLEN]; if (!ivout) return GPG_ERR_INV_ARG; if (ivoutlen != IVLEN) return GPG_ERR_INV_LENGTH; if (nonce != NULL && !is_nonce_ok_len(noncelen)) return GPG_ERR_INV_ARG; gcm_generate_iv(iv, nonce, noncelen); c->marks.iv = 0; c->marks.tag = 0; c->u_mode.gcm.disallow_encryption_because_of_setiv_in_fips_mode = 0; _gcry_cipher_gcm_initiv (c, iv, IVLEN); buf_cpy(ivout, iv, IVLEN); wipememory(iv, sizeof(iv)); } #endif static int is_tag_length_valid(size_t taglen) { switch (taglen) { /* Allowed tag lengths from NIST SP 800-38D. */ case 128 / 8: /* GCRY_GCM_BLOCK_LEN */ case 120 / 8: case 112 / 8: case 104 / 8: case 96 / 8: case 64 / 8: case 32 / 8: return 1; default: return 0; } } static gcry_err_code_t _gcry_cipher_gcm_tag (gcry_cipher_hd_t c, byte * outbuf, size_t outbuflen, int check) { if (!(is_tag_length_valid (outbuflen) || outbuflen >= GCRY_GCM_BLOCK_LEN)) return GPG_ERR_INV_LENGTH; if (c->u_mode.gcm.datalen_over_limits) return GPG_ERR_INV_LENGTH; if (!c->marks.tag) { u32 bitlengths[2][2]; if (!c->u_mode.gcm.ghash_fn) return GPG_ERR_INV_STATE; /* aad length */ bitlengths[0][1] = be_bswap32(c->u_mode.gcm.aadlen[0] << 3); bitlengths[0][0] = be_bswap32((c->u_mode.gcm.aadlen[0] >> 29) | (c->u_mode.gcm.aadlen[1] << 3)); /* data length */ bitlengths[1][1] = be_bswap32(c->u_mode.gcm.datalen[0] << 3); bitlengths[1][0] = be_bswap32((c->u_mode.gcm.datalen[0] >> 29) | (c->u_mode.gcm.datalen[1] << 3)); /* Finalize data-stream. */ do_ghash_buf(c, c->u_mode.gcm.u_tag.tag, NULL, 0, 1); c->u_mode.gcm.ghash_aad_finalized = 1; c->u_mode.gcm.ghash_data_finalized = 1; /* Add bitlengths to tag. */ do_ghash_buf(c, c->u_mode.gcm.u_tag.tag, (byte*)bitlengths, GCRY_GCM_BLOCK_LEN, 1); cipher_block_xor (c->u_mode.gcm.u_tag.tag, c->u_mode.gcm.tagiv, c->u_mode.gcm.u_tag.tag, GCRY_GCM_BLOCK_LEN); c->marks.tag = 1; wipememory (bitlengths, sizeof (bitlengths)); wipememory (c->u_mode.gcm.macbuf, GCRY_GCM_BLOCK_LEN); wipememory (c->u_mode.gcm.tagiv, GCRY_GCM_BLOCK_LEN); wipememory (c->u_mode.gcm.aadlen, sizeof (c->u_mode.gcm.aadlen)); wipememory (c->u_mode.gcm.datalen, sizeof (c->u_mode.gcm.datalen)); } if (!check) { if (outbuflen > GCRY_GCM_BLOCK_LEN) outbuflen = GCRY_GCM_BLOCK_LEN; /* NB: We already checked that OUTBUF is large enough to hold * the result or has valid truncated length. */ memcpy (outbuf, c->u_mode.gcm.u_tag.tag, outbuflen); } else { /* OUTBUFLEN gives the length of the user supplied tag in OUTBUF * and thus we need to compare its length first. */ if (!is_tag_length_valid (outbuflen) || !buf_eq_const (outbuf, c->u_mode.gcm.u_tag.tag, outbuflen)) return GPG_ERR_CHECKSUM; } return 0; } gcry_err_code_t _gcry_cipher_gcm_get_tag (gcry_cipher_hd_t c, unsigned char *outtag, size_t taglen) { /* Outputting authentication tag is part of encryption. 
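 * It is therefore refused below when encryption has been disabled by a
 * direct setiv in FIPS mode, while _gcry_cipher_gcm_check_tag remains
 * usable.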
*/ if (c->u_mode.gcm.disallow_encryption_because_of_setiv_in_fips_mode) return GPG_ERR_INV_STATE; return _gcry_cipher_gcm_tag (c, outtag, taglen, 0); } gcry_err_code_t _gcry_cipher_gcm_check_tag (gcry_cipher_hd_t c, const unsigned char *intag, size_t taglen) { return _gcry_cipher_gcm_tag (c, (unsigned char *) intag, taglen, 1); } diff --git a/cipher/cipher-ocb.c b/cipher/cipher-ocb.c index bfafa4c8..7a4cfbe1 100644 --- a/cipher/cipher-ocb.c +++ b/cipher/cipher-ocb.c @@ -1,762 +1,763 @@ /* cipher-ocb.c - OCB cipher mode * Copyright (C) 2015, 2016 g10 Code GmbH * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser general Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . * * * OCB is covered by several patents but may be used freely by most * software. See http://web.cs.ucdavis.edu/~rogaway/ocb/license.htm . * In particular license 1 is suitable for Libgcrypt: See * http://web.cs.ucdavis.edu/~rogaway/ocb/license1.pdf for the full * license document; it basically says: * * License 1 — License for Open-Source Software Implementations of OCB * (Jan 9, 2013) * * Under this license, you are authorized to make, use, and * distribute open-source software implementations of OCB. This * license terminates for you if you sue someone over their * open-source software implementation of OCB claiming that you have * a patent covering their implementation. */ #include #include #include #include #include #include "g10lib.h" #include "cipher.h" #include "bufhelp.h" #include "./cipher-internal.h" /* Double the OCB_BLOCK_LEN sized block B in-place. */ static inline void double_block (u64 b[2]) { u64 l_0, l, r; l = b[1]; r = b[0]; l_0 = -(l >> 63); l = (l + l) ^ (r >> 63); r = (r + r) ^ (l_0 & 135); b[1] = l; b[0] = r; } /* Copy OCB_BLOCK_LEN from buffer S starting at bit offset BITOFF to * buffer D. */ static void bit_copy (unsigned char d[16], const unsigned char s[24], unsigned int bitoff) { u64 s0l, s1l, s1r, s2r; unsigned int shift; unsigned int byteoff; byteoff = bitoff / 8; shift = bitoff % 8; s0l = buf_get_be64 (s + byteoff + 0); s1l = buf_get_be64 (s + byteoff + 8); s1r = shift ? s1l : 0; s2r = shift ? buf_get_be64 (s + 16) << (8 * byteoff) : 0; buf_put_be64 (d + 0, (s0l << shift) | (s1r >> ((64 - shift) & 63))); buf_put_be64 (d + 8, (s1l << shift) | (s2r >> ((64 - shift) & 63))); } /* Get L_big value for block N, where N is multiple of 65536. */ static void ocb_get_L_big (gcry_cipher_hd_t c, u64 n, unsigned char *l_buf) { int ntz = _gcry_ctz64 (n); u64 L[2]; gcry_assert(ntz >= OCB_L_TABLE_SIZE); L[1] = buf_get_be64 (c->u_mode.ocb.L[OCB_L_TABLE_SIZE - 1]); L[0] = buf_get_be64 (c->u_mode.ocb.L[OCB_L_TABLE_SIZE - 1] + 8); for (ntz -= OCB_L_TABLE_SIZE - 1; ntz; ntz--) double_block (L); buf_put_be64 (l_buf + 0, L[1]); buf_put_be64 (l_buf + 8, L[0]); } /* Called after key has been set. Sets up L table. 
*/ void _gcry_cipher_ocb_setkey (gcry_cipher_hd_t c) { unsigned char ktop[OCB_BLOCK_LEN]; unsigned int burn = 0; unsigned int nburn; u64 L[2]; int i; /* L_star = E(zero_128) */ memset (ktop, 0, OCB_BLOCK_LEN); nburn = c->spec->encrypt (&c->context.c, c->u_mode.ocb.L_star, ktop); burn = nburn > burn ? nburn : burn; /* L_dollar = double(L_star) */ L[1] = buf_get_be64 (c->u_mode.ocb.L_star); L[0] = buf_get_be64 (c->u_mode.ocb.L_star + 8); double_block (L); buf_put_be64 (c->u_mode.ocb.L_dollar + 0, L[1]); buf_put_be64 (c->u_mode.ocb.L_dollar + 8, L[0]); /* L_0 = double(L_dollar), ... */ double_block (L); buf_put_be64 (c->u_mode.ocb.L[0] + 0, L[1]); buf_put_be64 (c->u_mode.ocb.L[0] + 8, L[0]); for (i = 1; i < OCB_L_TABLE_SIZE; i++) { double_block (L); buf_put_be64 (c->u_mode.ocb.L[i] + 0, L[1]); buf_put_be64 (c->u_mode.ocb.L[i] + 8, L[0]); } /* Precalculated offset L0+L1 */ cipher_block_xor (c->u_mode.ocb.L0L1, c->u_mode.ocb.L[0], c->u_mode.ocb.L[1], OCB_BLOCK_LEN); /* Cleanup */ wipememory (ktop, sizeof ktop); if (burn > 0) _gcry_burn_stack (burn + 4*sizeof(void*)); } /* Set the nonce for OCB. This requires that the key has been set. Using it again resets start a new encryption cycle using the same key. */ gcry_err_code_t _gcry_cipher_ocb_set_nonce (gcry_cipher_hd_t c, const unsigned char *nonce, size_t noncelen) { unsigned char ktop[OCB_BLOCK_LEN]; unsigned char stretch[OCB_BLOCK_LEN + 8]; unsigned int bottom; unsigned int burn = 0; unsigned int nburn; /* Check args. */ if (!c->marks.key) return GPG_ERR_INV_STATE; /* Key must have been set first. */ switch (c->u_mode.ocb.taglen) { case 8: case 12: case 16: break; default: return GPG_ERR_BUG; /* Invalid tag length. */ } if (c->spec->blocksize != OCB_BLOCK_LEN) return GPG_ERR_CIPHER_ALGO; if (!nonce) return GPG_ERR_INV_ARG; /* 120 bit is the allowed maximum. In addition we impose a minimum of 64 bit. */ if (noncelen > (120/8) || noncelen < (64/8) || noncelen >= OCB_BLOCK_LEN) return GPG_ERR_INV_LENGTH; /* Prepare the nonce. */ memset (ktop, 0, OCB_BLOCK_LEN); buf_cpy (ktop + (OCB_BLOCK_LEN - noncelen), nonce, noncelen); ktop[0] = ((c->u_mode.ocb.taglen * 8) % 128) << 1; ktop[OCB_BLOCK_LEN - noncelen - 1] |= 1; bottom = ktop[OCB_BLOCK_LEN - 1] & 0x3f; ktop[OCB_BLOCK_LEN - 1] &= 0xc0; /* Zero the bottom bits. */ nburn = c->spec->encrypt (&c->context.c, ktop, ktop); burn = nburn > burn ? nburn : burn; /* Stretch = Ktop || (Ktop[1..64] xor Ktop[9..72]) */ cipher_block_cpy (stretch, ktop, OCB_BLOCK_LEN); cipher_block_xor (stretch + OCB_BLOCK_LEN, ktop, ktop + 1, 8); /* Offset_0 = Stretch[1+bottom..128+bottom] (We use the IV field to store the offset) */ bit_copy (c->u_iv.iv, stretch, bottom); c->marks.iv = 1; /* Checksum_0 = zeros(128) (We use the CTR field to store the checksum) */ memset (c->u_ctr.ctr, 0, OCB_BLOCK_LEN); /* Clear AAD buffer. */ memset (c->u_mode.ocb.aad_offset, 0, OCB_BLOCK_LEN); memset (c->u_mode.ocb.aad_sum, 0, OCB_BLOCK_LEN); /* Setup other values. 
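 * Reset the per message OCB state: block counters, leftover AAD bytes and
 * the AAD/data finalized flags, so that the new nonce starts a fresh
 * encryption cycle with the same key.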
*/ memset (c->lastiv, 0, sizeof(c->lastiv)); c->unused = 0; c->marks.tag = 0; c->marks.finalize = 0; c->u_mode.ocb.data_nblocks = 0; c->u_mode.ocb.aad_nblocks = 0; c->u_mode.ocb.aad_nleftover = 0; c->u_mode.ocb.data_finalized = 0; c->u_mode.ocb.aad_finalized = 0; /* log_printhex ("L_* ", c->u_mode.ocb.L_star, OCB_BLOCK_LEN); */ /* log_printhex ("L_$ ", c->u_mode.ocb.L_dollar, OCB_BLOCK_LEN); */ /* log_printhex ("L_0 ", c->u_mode.ocb.L[0], OCB_BLOCK_LEN); */ /* log_printhex ("L_1 ", c->u_mode.ocb.L[1], OCB_BLOCK_LEN); */ /* log_debug ( "bottom : %u (decimal)\n", bottom); */ /* log_printhex ("Ktop ", ktop, OCB_BLOCK_LEN); */ /* log_printhex ("Stretch ", stretch, sizeof stretch); */ /* log_printhex ("Offset_0 ", c->u_iv.iv, OCB_BLOCK_LEN); */ /* Cleanup */ wipememory (ktop, sizeof ktop); wipememory (stretch, sizeof stretch); if (burn > 0) _gcry_burn_stack (burn + 4*sizeof(void*)); return 0; } /* Process additional authentication data. This implementation allows to add additional authentication data at any time before the final gcry_cipher_gettag. */ gcry_err_code_t _gcry_cipher_ocb_authenticate (gcry_cipher_hd_t c, const unsigned char *abuf, size_t abuflen) { const size_t table_maxblks = 1 << OCB_L_TABLE_SIZE; const u32 table_size_mask = ((1 << OCB_L_TABLE_SIZE) - 1); unsigned char l_tmp[OCB_BLOCK_LEN]; unsigned int burn = 0; unsigned int nburn; size_t n; /* Check that a nonce and thus a key has been set and that we have not yet computed the tag. We also return an error if the aad has been finalized (i.e. a short block has been processed). */ if (!c->marks.iv || c->marks.tag || c->u_mode.ocb.aad_finalized) return GPG_ERR_INV_STATE; /* Check correct usage and arguments. */ if (c->spec->blocksize != OCB_BLOCK_LEN) return GPG_ERR_CIPHER_ALGO; /* Process remaining data from the last call first. */ if (c->u_mode.ocb.aad_nleftover) { n = abuflen; if (n > OCB_BLOCK_LEN - c->u_mode.ocb.aad_nleftover) n = OCB_BLOCK_LEN - c->u_mode.ocb.aad_nleftover; buf_cpy (&c->u_mode.ocb.aad_leftover[c->u_mode.ocb.aad_nleftover], abuf, n); c->u_mode.ocb.aad_nleftover += n; abuf += n; abuflen -= n; if (c->u_mode.ocb.aad_nleftover == OCB_BLOCK_LEN) { c->u_mode.ocb.aad_nblocks++; if ((c->u_mode.ocb.aad_nblocks % table_maxblks) == 0) { /* Table overflow, L needs to be generated. */ ocb_get_L_big(c, c->u_mode.ocb.aad_nblocks + 1, l_tmp); } else { cipher_block_cpy (l_tmp, ocb_get_l (c, c->u_mode.ocb.aad_nblocks), OCB_BLOCK_LEN); } /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ cipher_block_xor_1 (c->u_mode.ocb.aad_offset, l_tmp, OCB_BLOCK_LEN); /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ cipher_block_xor (l_tmp, c->u_mode.ocb.aad_offset, c->u_mode.ocb.aad_leftover, OCB_BLOCK_LEN); nburn = c->spec->encrypt (&c->context.c, l_tmp, l_tmp); burn = nburn > burn ? nburn : burn; cipher_block_xor_1 (c->u_mode.ocb.aad_sum, l_tmp, OCB_BLOCK_LEN); c->u_mode.ocb.aad_nleftover = 0; } } if (!abuflen) { if (burn > 0) _gcry_burn_stack (burn + 4*sizeof(void*)); return 0; } /* Full blocks handling. */ while (abuflen >= OCB_BLOCK_LEN) { size_t nblks = abuflen / OCB_BLOCK_LEN; size_t nmaxblks; /* Check how many blocks to process till table overflow. */ nmaxblks = (c->u_mode.ocb.aad_nblocks + 1) % table_maxblks; nmaxblks = (table_maxblks - nmaxblks) % table_maxblks; if (nmaxblks == 0) { /* Table overflow, generate L and process one block. 
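 * The block index is about to become a multiple of 2^OCB_L_TABLE_SIZE, so
 * ntz(i) is too large for the precomputed table and L_{ntz(i)} is derived
 * by repeated doubling in ocb_get_L_big.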
*/ c->u_mode.ocb.aad_nblocks++; ocb_get_L_big(c, c->u_mode.ocb.aad_nblocks, l_tmp); /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ cipher_block_xor_1 (c->u_mode.ocb.aad_offset, l_tmp, OCB_BLOCK_LEN); /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ cipher_block_xor (l_tmp, c->u_mode.ocb.aad_offset, abuf, OCB_BLOCK_LEN); nburn = c->spec->encrypt (&c->context.c, l_tmp, l_tmp); burn = nburn > burn ? nburn : burn; cipher_block_xor_1 (c->u_mode.ocb.aad_sum, l_tmp, OCB_BLOCK_LEN); abuf += OCB_BLOCK_LEN; abuflen -= OCB_BLOCK_LEN; nblks--; /* With overflow handled, retry loop again. Next overflow will * happen after 65535 blocks. */ continue; } nblks = nblks < nmaxblks ? nblks : nmaxblks; /* Use a bulk method if available. */ if (nblks && c->bulk.ocb_auth) { size_t nleft; size_t ndone; nleft = c->bulk.ocb_auth (c, abuf, nblks); ndone = nblks - nleft; abuf += ndone * OCB_BLOCK_LEN; abuflen -= ndone * OCB_BLOCK_LEN; nblks = nleft; } /* Hash all full blocks. */ while (nblks) { c->u_mode.ocb.aad_nblocks++; gcry_assert(c->u_mode.ocb.aad_nblocks & table_size_mask); /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ cipher_block_xor_1 (c->u_mode.ocb.aad_offset, ocb_get_l (c, c->u_mode.ocb.aad_nblocks), OCB_BLOCK_LEN); /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ cipher_block_xor (l_tmp, c->u_mode.ocb.aad_offset, abuf, OCB_BLOCK_LEN); nburn = c->spec->encrypt (&c->context.c, l_tmp, l_tmp); burn = nburn > burn ? nburn : burn; cipher_block_xor_1 (c->u_mode.ocb.aad_sum, l_tmp, OCB_BLOCK_LEN); abuf += OCB_BLOCK_LEN; abuflen -= OCB_BLOCK_LEN; nblks--; } } /* Store away the remaining data. */ if (abuflen) { n = abuflen; if (n > OCB_BLOCK_LEN - c->u_mode.ocb.aad_nleftover) n = OCB_BLOCK_LEN - c->u_mode.ocb.aad_nleftover; buf_cpy (&c->u_mode.ocb.aad_leftover[c->u_mode.ocb.aad_nleftover], abuf, n); c->u_mode.ocb.aad_nleftover += n; abuf += n; abuflen -= n; } gcry_assert (!abuflen); if (burn > 0) _gcry_burn_stack (burn + 4*sizeof(void*)); return 0; } /* Hash final partial AAD block. */ static void ocb_aad_finalize (gcry_cipher_hd_t c) { unsigned char l_tmp[OCB_BLOCK_LEN]; unsigned int burn = 0; unsigned int nburn; /* Check that a nonce and thus a key has been set and that we have not yet computed the tag. We also skip this if the aad has been finalized. */ if (!c->marks.iv || c->marks.tag || c->u_mode.ocb.aad_finalized) return; if (c->spec->blocksize != OCB_BLOCK_LEN) return; /* Ooops. */ /* Hash final partial block if any. */ if (c->u_mode.ocb.aad_nleftover) { /* Offset_* = Offset_m xor L_* */ cipher_block_xor_1 (c->u_mode.ocb.aad_offset, c->u_mode.ocb.L_star, OCB_BLOCK_LEN); /* CipherInput = (A_* || 1 || zeros(127-bitlen(A_*))) xor Offset_* */ buf_cpy (l_tmp, c->u_mode.ocb.aad_leftover, c->u_mode.ocb.aad_nleftover); memset (l_tmp + c->u_mode.ocb.aad_nleftover, 0, OCB_BLOCK_LEN - c->u_mode.ocb.aad_nleftover); l_tmp[c->u_mode.ocb.aad_nleftover] = 0x80; cipher_block_xor_1 (l_tmp, c->u_mode.ocb.aad_offset, OCB_BLOCK_LEN); /* Sum = Sum_m xor ENCIPHER(K, CipherInput) */ nburn = c->spec->encrypt (&c->context.c, l_tmp, l_tmp); burn = nburn > burn ? nburn : burn; cipher_block_xor_1 (c->u_mode.ocb.aad_sum, l_tmp, OCB_BLOCK_LEN); c->u_mode.ocb.aad_nleftover = 0; } /* Mark AAD as finalized so that gcry_cipher_ocb_authenticate can * return an erro when called again. */ c->u_mode.ocb.aad_finalized = 1; if (burn > 0) _gcry_burn_stack (burn + 4*sizeof(void*)); } /* Checksumming for encrypt and decrypt. 
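 * (The checksum helper itself follows after the illustrative sketch
 * below.)  */

/* Illustrative sketch, not part of this change: the chunk splitting policy
 * introduced by the patch.  Data is still processed in 24 KiB pieces, but
 * splitting only starts once more than 32 KiB is left, so the final piece
 * cannot end up being very short.  The helper name is hypothetical;
 * ocb_crypt below (and the GCM and Poly1305 paths) open-code the same
 * test. */
static inline size_t
example_chunk_len (size_t remaining)
{
  return remaining > 32 * 1024 ? 24 * 1024 : remaining;
}

/* Checksumming helper for encrypt and decrypt.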
*/
static void
ocb_checksum (unsigned char *chksum, const unsigned char *plainbuf,
              size_t nblks)
{
  while (nblks > 0)
    {
      /* Checksum_i = Checksum_{i-1} xor P_i  */
      cipher_block_xor_1(chksum, plainbuf, OCB_BLOCK_LEN);

      plainbuf += OCB_BLOCK_LEN;
      nblks--;
    }
}


/* Common code for encrypt and decrypt.  */
static gcry_err_code_t
ocb_crypt (gcry_cipher_hd_t c, int encrypt,
           unsigned char *outbuf, size_t outbuflen,
           const unsigned char *inbuf, size_t inbuflen)
{
  const size_t table_maxblks = 1 << OCB_L_TABLE_SIZE;
  const u32 table_size_mask = ((1 << OCB_L_TABLE_SIZE) - 1);
  unsigned char l_tmp[OCB_BLOCK_LEN];
  unsigned int burn = 0;
  unsigned int nburn;
  gcry_cipher_encrypt_t crypt_fn =
      encrypt ? c->spec->encrypt : c->spec->decrypt;

  /* Check that a nonce and thus a key has been set and that we are
     not yet in end of data state. */
  if (!c->marks.iv || c->u_mode.ocb.data_finalized)
    return GPG_ERR_INV_STATE;

  /* Check correct usage and arguments. */
  if (c->spec->blocksize != OCB_BLOCK_LEN)
    return GPG_ERR_CIPHER_ALGO;
  if (outbuflen < inbuflen)
    return GPG_ERR_BUFFER_TOO_SHORT;
  if (c->marks.finalize)
    ; /* Allow arbitrary length. */
  else if ((inbuflen % OCB_BLOCK_LEN))
    return GPG_ERR_INV_LENGTH;  /* We support only full blocks for now. */

  /* Full blocks handling. */
  while (inbuflen >= OCB_BLOCK_LEN)
    {
      size_t nblks = inbuflen / OCB_BLOCK_LEN;
      size_t nmaxblks;

      /* Check how many blocks to process till table overflow. */
      nmaxblks = (c->u_mode.ocb.data_nblocks + 1) % table_maxblks;
      nmaxblks = (table_maxblks - nmaxblks) % table_maxblks;

      if (nmaxblks == 0)
        {
          /* Table overflow, generate L and process one block. */
          c->u_mode.ocb.data_nblocks++;
          ocb_get_L_big(c, c->u_mode.ocb.data_nblocks, l_tmp);

          if (encrypt)
            {
              /* Checksum_i = Checksum_{i-1} xor P_i  */
              ocb_checksum (c->u_ctr.ctr, inbuf, 1);
            }

          /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
          cipher_block_xor_1 (c->u_iv.iv, l_tmp, OCB_BLOCK_LEN);
          /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
          cipher_block_xor (outbuf, c->u_iv.iv, inbuf, OCB_BLOCK_LEN);
          nburn = crypt_fn (&c->context.c, outbuf, outbuf);
          burn = nburn > burn ? nburn : burn;
          cipher_block_xor_1 (outbuf, c->u_iv.iv, OCB_BLOCK_LEN);

          if (!encrypt)
            {
              /* Checksum_i = Checksum_{i-1} xor P_i  */
              ocb_checksum (c->u_ctr.ctr, outbuf, 1);
            }

          inbuf += OCB_BLOCK_LEN;
          inbuflen -= OCB_BLOCK_LEN;
          outbuf += OCB_BLOCK_LEN;
          outbuflen -= OCB_BLOCK_LEN;
          nblks--;

          /* With overflow handled, retry loop again. Next overflow will
           * happen after 65535 blocks. */
          continue;
        }

      nblks = nblks < nmaxblks ? nblks : nmaxblks;

      /* Since checksum xoring is done before/after encryption/decryption,
-        process input in 24KiB chunks to keep data loaded in L1 cache for
-        checksumming. */
-      if (nblks > 24 * 1024 / OCB_BLOCK_LEN)
+        process input in 24KiB chunks to keep data loaded in L1 cache for
+        checksumming. However only do splitting if input is large enough
+        so that last chunks does not end up being short. */
+      if (nblks > 32 * 1024 / OCB_BLOCK_LEN)
        nblks = 24 * 1024 / OCB_BLOCK_LEN;

      /* Use a bulk method if available. */
      if (nblks && c->bulk.ocb_crypt)
        {
          size_t nleft;
          size_t ndone;

          nleft = c->bulk.ocb_crypt (c, outbuf, inbuf, nblks, encrypt);
          ndone = nblks - nleft;

          inbuf += ndone * OCB_BLOCK_LEN;
          outbuf += ndone * OCB_BLOCK_LEN;
          inbuflen -= ndone * OCB_BLOCK_LEN;
          outbuflen -= ndone * OCB_BLOCK_LEN;
          nblks = nleft;
        }

      if (nblks)
        {
          size_t nblks_chksum = nblks;

          if (encrypt)
            {
              /* Checksum_i = Checksum_{i-1} xor P_i  */
              ocb_checksum (c->u_ctr.ctr, inbuf, nblks_chksum);
            }

          /* Encrypt all full blocks.
*/
          while (nblks)
            {
              c->u_mode.ocb.data_nblocks++;

              gcry_assert(c->u_mode.ocb.data_nblocks & table_size_mask);

              /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
              cipher_block_xor_1 (c->u_iv.iv,
                                  ocb_get_l (c, c->u_mode.ocb.data_nblocks),
                                  OCB_BLOCK_LEN);
              /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
              cipher_block_xor (outbuf, c->u_iv.iv, inbuf, OCB_BLOCK_LEN);
              nburn = crypt_fn (&c->context.c, outbuf, outbuf);
              burn = nburn > burn ? nburn : burn;
              cipher_block_xor_1 (outbuf, c->u_iv.iv, OCB_BLOCK_LEN);

              inbuf += OCB_BLOCK_LEN;
              inbuflen -= OCB_BLOCK_LEN;
              outbuf += OCB_BLOCK_LEN;
              outbuflen -= OCB_BLOCK_LEN;
              nblks--;
            }

          if (!encrypt)
            {
              /* Checksum_i = Checksum_{i-1} xor P_i  */
              ocb_checksum (c->u_ctr.ctr,
                            outbuf - nblks_chksum * OCB_BLOCK_LEN,
                            nblks_chksum);
            }
        }
    }

  /* Encrypt final partial block.  Note that we expect INBUFLEN to be
     shorter than OCB_BLOCK_LEN (see above).  */
  if (inbuflen)
    {
      unsigned char pad[OCB_BLOCK_LEN];

      /* Offset_* = Offset_m xor L_*  */
      cipher_block_xor_1 (c->u_iv.iv, c->u_mode.ocb.L_star, OCB_BLOCK_LEN);
      /* Pad = ENCIPHER(K, Offset_*) */
      nburn = c->spec->encrypt (&c->context.c, pad, c->u_iv.iv);
      burn = nburn > burn ? nburn : burn;

      if (encrypt)
        {
          /* Checksum_* = Checksum_m xor (P_* || 1 || zeros(127-bitlen(P_*))) */
          /* Note that INBUFLEN is less than OCB_BLOCK_LEN.  */
          buf_cpy (l_tmp, inbuf, inbuflen);
          memset (l_tmp + inbuflen, 0, OCB_BLOCK_LEN - inbuflen);
          l_tmp[inbuflen] = 0x80;
          cipher_block_xor_1 (c->u_ctr.ctr, l_tmp, OCB_BLOCK_LEN);
          /* C_* = P_* xor Pad[1..bitlen(P_*)] */
          buf_xor (outbuf, inbuf, pad, inbuflen);
        }
      else
        {
          /* P_* = C_* xor Pad[1..bitlen(C_*)] */
          /* Checksum_* = Checksum_m xor (P_* || 1 || zeros(127-bitlen(P_*))) */
          cipher_block_cpy (l_tmp, pad, OCB_BLOCK_LEN);
          buf_cpy (l_tmp, inbuf, inbuflen);
          cipher_block_xor_1 (l_tmp, pad, OCB_BLOCK_LEN);
          l_tmp[inbuflen] = 0x80;
          buf_cpy (outbuf, l_tmp, inbuflen);
          cipher_block_xor_1 (c->u_ctr.ctr, l_tmp, OCB_BLOCK_LEN);
        }
    }

  /* Compute the tag if the finalize flag has been set.  */
  if (c->marks.finalize)
    {
      /* Tag = ENCIPHER(K, Checksum xor Offset xor L_$) xor HASH(K,A) */
      cipher_block_xor (c->u_mode.ocb.tag, c->u_ctr.ctr, c->u_iv.iv,
                        OCB_BLOCK_LEN);
      cipher_block_xor_1 (c->u_mode.ocb.tag, c->u_mode.ocb.L_dollar,
                          OCB_BLOCK_LEN);
      nburn = c->spec->encrypt (&c->context.c,
                                c->u_mode.ocb.tag, c->u_mode.ocb.tag);
      burn = nburn > burn ? nburn : burn;

      c->u_mode.ocb.data_finalized = 1;
      /* Note that the final part of the tag computation is done by
         _gcry_cipher_ocb_get_tag.  */
    }

  if (burn > 0)
    _gcry_burn_stack (burn + 4*sizeof(void*));

  return 0;
}


/* Encrypt (INBUF,INBUFLEN) in OCB mode to OUTBUF.  OUTBUFLEN gives the
   allocated size of OUTBUF.  This function accepts only multiples of a
   full block unless gcry_cipher_final has been called in which case the
   next block may have any length.  */
gcry_err_code_t
_gcry_cipher_ocb_encrypt (gcry_cipher_hd_t c,
                          unsigned char *outbuf, size_t outbuflen,
                          const unsigned char *inbuf, size_t inbuflen)
{
  return ocb_crypt (c, 1, outbuf, outbuflen, inbuf, inbuflen);
}


/* Decrypt (INBUF,INBUFLEN) in OCB mode to OUTBUF.  OUTBUFLEN gives the
   allocated size of OUTBUF.  This function accepts only multiples of a
   full block unless gcry_cipher_final has been called in which case the
   next block may have any length.  */
gcry_err_code_t
_gcry_cipher_ocb_decrypt (gcry_cipher_hd_t c,
                          unsigned char *outbuf, size_t outbuflen,
                          const unsigned char *inbuf, size_t inbuflen)
{
  return ocb_crypt (c, 0, outbuf, outbuflen, inbuf, inbuflen);
}


/* Compute the tag.  The last data operation has already done some part
   of it.
To allow adding AAD even after having done all data, we finish the tag computation only here. */ static void compute_tag_if_needed (gcry_cipher_hd_t c) { if (!c->marks.tag) { ocb_aad_finalize (c); cipher_block_xor_1 (c->u_mode.ocb.tag, c->u_mode.ocb.aad_sum, OCB_BLOCK_LEN); c->marks.tag = 1; } } /* Copy the already computed tag to OUTTAG. OUTTAGSIZE is the allocated size of OUTTAG; the function returns an error if that is too short to hold the tag. */ gcry_err_code_t _gcry_cipher_ocb_get_tag (gcry_cipher_hd_t c, unsigned char *outtag, size_t outtagsize) { if (c->u_mode.ocb.taglen > outtagsize) return GPG_ERR_BUFFER_TOO_SHORT; if (!c->u_mode.ocb.data_finalized) return GPG_ERR_INV_STATE; /* Data has not yet been finalized. */ compute_tag_if_needed (c); memcpy (outtag, c->u_mode.ocb.tag, c->u_mode.ocb.taglen); return 0; } /* Check that the tag (INTAG,TAGLEN) matches the computed tag for the handle C. */ gcry_err_code_t _gcry_cipher_ocb_check_tag (gcry_cipher_hd_t c, const unsigned char *intag, size_t taglen) { size_t n; if (!c->u_mode.ocb.data_finalized) return GPG_ERR_INV_STATE; /* Data has not yet been finalized. */ compute_tag_if_needed (c); n = c->u_mode.ocb.taglen; if (taglen < n) n = taglen; if (!buf_eq_const (intag, c->u_mode.ocb.tag, n) || c->u_mode.ocb.taglen != taglen) return GPG_ERR_CHECKSUM; return 0; } diff --git a/cipher/cipher-poly1305.c b/cipher/cipher-poly1305.c index bb475236..5cd3561b 100644 --- a/cipher/cipher-poly1305.c +++ b/cipher/cipher-poly1305.c @@ -1,375 +1,379 @@ /* cipher-poly1305.c - Poly1305 based AEAD cipher mode, RFC-8439 * Copyright (C) 2014 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser general Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . */ #include #include #include #include #include #include "g10lib.h" #include "cipher.h" #include "bufhelp.h" #include "./cipher-internal.h" #include "./poly1305-internal.h" static inline int poly1305_bytecounter_add (u32 ctr[2], size_t add) { int overflow = 0; if (sizeof(add) > sizeof(u32)) { u32 high_add = ((add >> 31) >> 1) & 0xffffffff; ctr[1] += high_add; if (ctr[1] < high_add) overflow = 1; } ctr[0] += add; if (ctr[0] >= add) return overflow; ctr[1] += 1; return (ctr[1] < 1) || overflow; } static void poly1305_fill_bytecounts (gcry_cipher_hd_t c) { u32 lenbuf[4]; lenbuf[0] = le_bswap32(c->u_mode.poly1305.aadcount[0]); lenbuf[1] = le_bswap32(c->u_mode.poly1305.aadcount[1]); lenbuf[2] = le_bswap32(c->u_mode.poly1305.datacount[0]); lenbuf[3] = le_bswap32(c->u_mode.poly1305.datacount[1]); _gcry_poly1305_update (&c->u_mode.poly1305.ctx, (byte*)lenbuf, sizeof(lenbuf)); wipememory(lenbuf, sizeof(lenbuf)); } static void poly1305_do_padding (gcry_cipher_hd_t c, u32 ctr[2]) { static const byte zero_padding_buf[15] = {}; u32 padding_count; /* Padding to 16 byte boundary. 
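 * RFC 8439 requires both the AAD and the ciphertext to be zero padded to a
 * multiple of 16 bytes before being fed to Poly1305; CTR[0] holds the low
 * word of the running byte count of the stream being padded.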
*/ if (ctr[0] % 16 > 0) { padding_count = 16 - ctr[0] % 16; _gcry_poly1305_update (&c->u_mode.poly1305.ctx, zero_padding_buf, padding_count); } } static void poly1305_aad_finish (gcry_cipher_hd_t c) { /* After AAD, feed padding bytes so we get 16 byte alignment. */ poly1305_do_padding (c, c->u_mode.poly1305.aadcount); /* Start of encryption marks end of AAD stream. */ c->u_mode.poly1305.aad_finalized = 1; c->u_mode.poly1305.datacount[0] = 0; c->u_mode.poly1305.datacount[1] = 0; } static gcry_err_code_t poly1305_set_zeroiv (gcry_cipher_hd_t c) { byte zero[8] = { 0, }; return _gcry_cipher_poly1305_setiv (c, zero, sizeof(zero)); } gcry_err_code_t _gcry_cipher_poly1305_authenticate (gcry_cipher_hd_t c, const byte * aadbuf, size_t aadbuflen) { if (c->u_mode.poly1305.bytecount_over_limits) return GPG_ERR_INV_LENGTH; if (c->u_mode.poly1305.aad_finalized) return GPG_ERR_INV_STATE; if (c->marks.tag) return GPG_ERR_INV_STATE; if (!c->marks.iv) poly1305_set_zeroiv(c); if (poly1305_bytecounter_add(c->u_mode.poly1305.aadcount, aadbuflen)) { c->u_mode.poly1305.bytecount_over_limits = 1; return GPG_ERR_INV_LENGTH; } _gcry_poly1305_update (&c->u_mode.poly1305.ctx, aadbuf, aadbuflen); return 0; } gcry_err_code_t _gcry_cipher_poly1305_encrypt (gcry_cipher_hd_t c, byte *outbuf, size_t outbuflen, const byte *inbuf, size_t inbuflen) { gcry_err_code_t err; if (outbuflen < inbuflen) return GPG_ERR_BUFFER_TOO_SHORT; if (c->marks.tag) return GPG_ERR_INV_STATE; if (c->u_mode.poly1305.bytecount_over_limits) return GPG_ERR_INV_LENGTH; if (!c->marks.iv) { err = poly1305_set_zeroiv(c); if (err) return err; } if (!c->u_mode.poly1305.aad_finalized) poly1305_aad_finish(c); if (poly1305_bytecounter_add(c->u_mode.poly1305.datacount, inbuflen)) { c->u_mode.poly1305.bytecount_over_limits = 1; return GPG_ERR_INV_LENGTH; } if (LIKELY(inbuflen > 0) && LIKELY(c->spec->algo == GCRY_CIPHER_CHACHA20)) { return _gcry_chacha20_poly1305_encrypt (c, outbuf, inbuf, inbuflen); } while (inbuflen) { size_t currlen = inbuflen; /* Since checksumming is done after encryption, process input in 24KiB - * chunks to keep data loaded in L1 cache for checksumming. */ - if (currlen > 24 * 1024) + * chunks to keep data loaded in L1 cache for checksumming. However + * only do splitting if input is large enough so that last chunks does + * not end up being short. 
*/ + if (currlen > 32 * 1024) currlen = 24 * 1024; c->spec->stencrypt(&c->context.c, outbuf, (byte*)inbuf, currlen); _gcry_poly1305_update (&c->u_mode.poly1305.ctx, outbuf, currlen); outbuf += currlen; inbuf += currlen; outbuflen -= currlen; inbuflen -= currlen; } return 0; } gcry_err_code_t _gcry_cipher_poly1305_decrypt (gcry_cipher_hd_t c, byte *outbuf, size_t outbuflen, const byte *inbuf, size_t inbuflen) { gcry_err_code_t err; if (outbuflen < inbuflen) return GPG_ERR_BUFFER_TOO_SHORT; if (c->marks.tag) return GPG_ERR_INV_STATE; if (c->u_mode.poly1305.bytecount_over_limits) return GPG_ERR_INV_LENGTH; if (!c->marks.iv) { err = poly1305_set_zeroiv(c); if (err) return err; } if (!c->u_mode.poly1305.aad_finalized) poly1305_aad_finish(c); if (poly1305_bytecounter_add(c->u_mode.poly1305.datacount, inbuflen)) { c->u_mode.poly1305.bytecount_over_limits = 1; return GPG_ERR_INV_LENGTH; } if (LIKELY(inbuflen > 0) && LIKELY(c->spec->algo == GCRY_CIPHER_CHACHA20)) { return _gcry_chacha20_poly1305_decrypt (c, outbuf, inbuf, inbuflen); } while (inbuflen) { size_t currlen = inbuflen; /* Since checksumming is done before decryption, process input in 24KiB - * chunks to keep data loaded in L1 cache for decryption. */ - if (currlen > 24 * 1024) + * chunks to keep data loaded in L1 cache for decryption. However only + * do splitting if input is large enough so that last chunks does not + * end up being short. */ + if (currlen > 32 * 1024) currlen = 24 * 1024; _gcry_poly1305_update (&c->u_mode.poly1305.ctx, inbuf, currlen); c->spec->stdecrypt(&c->context.c, outbuf, (byte*)inbuf, currlen); outbuf += currlen; inbuf += currlen; outbuflen -= currlen; inbuflen -= currlen; } return 0; } static gcry_err_code_t _gcry_cipher_poly1305_tag (gcry_cipher_hd_t c, byte * outbuf, size_t outbuflen, int check) { gcry_err_code_t err; if (outbuflen < POLY1305_TAGLEN) return GPG_ERR_BUFFER_TOO_SHORT; if (c->u_mode.poly1305.bytecount_over_limits) return GPG_ERR_INV_LENGTH; if (!c->marks.iv) { err = poly1305_set_zeroiv(c); if (err) return err; } if (!c->u_mode.poly1305.aad_finalized) poly1305_aad_finish(c); if (!c->marks.tag) { /* After data, feed padding bytes so we get 16 byte alignment. */ poly1305_do_padding (c, c->u_mode.poly1305.datacount); /* Write byte counts to poly1305. */ poly1305_fill_bytecounts(c); _gcry_poly1305_finish(&c->u_mode.poly1305.ctx, c->u_iv.iv); c->marks.tag = 1; } if (!check) { memcpy (outbuf, c->u_iv.iv, POLY1305_TAGLEN); } else { /* OUTBUFLEN gives the length of the user supplied tag in OUTBUF * and thus we need to compare its length first. 
*/ if (outbuflen != POLY1305_TAGLEN || !buf_eq_const (outbuf, c->u_iv.iv, POLY1305_TAGLEN)) return GPG_ERR_CHECKSUM; } return 0; } gcry_err_code_t _gcry_cipher_poly1305_get_tag (gcry_cipher_hd_t c, unsigned char *outtag, size_t taglen) { return _gcry_cipher_poly1305_tag (c, outtag, taglen, 0); } gcry_err_code_t _gcry_cipher_poly1305_check_tag (gcry_cipher_hd_t c, const unsigned char *intag, size_t taglen) { return _gcry_cipher_poly1305_tag (c, (unsigned char *) intag, taglen, 1); } void _gcry_cipher_poly1305_setkey (gcry_cipher_hd_t c) { c->u_mode.poly1305.aadcount[0] = 0; c->u_mode.poly1305.aadcount[1] = 0; c->u_mode.poly1305.datacount[0] = 0; c->u_mode.poly1305.datacount[1] = 0; c->u_mode.poly1305.bytecount_over_limits = 0; c->u_mode.poly1305.aad_finalized = 0; c->marks.tag = 0; c->marks.iv = 0; } gcry_err_code_t _gcry_cipher_poly1305_setiv (gcry_cipher_hd_t c, const byte *iv, size_t ivlen) { byte tmpbuf[64]; /* size of ChaCha20 block */ gcry_err_code_t err; /* IV must be 96-bits */ if (!iv && ivlen != (96 / 8)) return GPG_ERR_INV_ARG; memset(&c->u_mode.poly1305.ctx, 0, sizeof(c->u_mode.poly1305.ctx)); c->u_mode.poly1305.aadcount[0] = 0; c->u_mode.poly1305.aadcount[1] = 0; c->u_mode.poly1305.datacount[0] = 0; c->u_mode.poly1305.datacount[1] = 0; c->u_mode.poly1305.bytecount_over_limits = 0; c->u_mode.poly1305.aad_finalized = 0; c->marks.tag = 0; c->marks.iv = 0; /* Set up IV for stream cipher. */ c->spec->setiv (&c->context.c, iv, ivlen); /* Get the first block from ChaCha20. */ memset(tmpbuf, 0, sizeof(tmpbuf)); c->spec->stencrypt(&c->context.c, tmpbuf, tmpbuf, sizeof(tmpbuf)); /* Use the first 32-bytes as Poly1305 key. */ err = _gcry_poly1305_init (&c->u_mode.poly1305.ctx, tmpbuf, POLY1305_KEYLEN); wipememory(tmpbuf, sizeof(tmpbuf)); if (err) return err; c->marks.iv = 1; return 0; }
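
/* Illustrative usage sketch, not part of the patch: how a caller drives the
 * ChaCha20-Poly1305 AEAD path touched above through the public libgcrypt
 * API (gcry_cipher_open and friends).  Key, nonce and buffer contents are
 * placeholders and error checking is omitted for brevity. */
#include <gcrypt.h>

static void
example_chacha20_poly1305 (void)
{
  gcry_cipher_hd_t hd;
  const unsigned char key[32] = { 0 };     /* example key */
  const unsigned char nonce[12] = { 0 };   /* 96-bit IV, as required */
  const unsigned char aad[3] = { 'a', 'a', 'd' };
  unsigned char buf[64] = { 0 };           /* plaintext, encrypted in place */
  unsigned char tag[16];                   /* POLY1305_TAGLEN */

  gcry_cipher_open (&hd, GCRY_CIPHER_CHACHA20, GCRY_CIPHER_MODE_POLY1305, 0);
  gcry_cipher_setkey (hd, key, sizeof key);
  gcry_cipher_setiv (hd, nonce, sizeof nonce);        /* must follow setkey */
  gcry_cipher_authenticate (hd, aad, sizeof aad);     /* AAD before data */
  gcry_cipher_encrypt (hd, buf, sizeof buf, NULL, 0); /* in-place encryption */
  gcry_cipher_gettag (hd, tag, sizeof tag);
  gcry_cipher_close (hd);
}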