diff --git a/cipher/keccak-armv7-neon.S b/cipher/keccak-armv7-neon.S index 0bec8d50..28a284a1 100644 --- a/cipher/keccak-armv7-neon.S +++ b/cipher/keccak-armv7-neon.S @@ -1,945 +1,945 @@ /* keccak-armv7-neon.S - ARMv7/NEON implementation of Keccak * * Copyright (C) 2015 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . */ #include #if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) && \ defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \ defined(HAVE_GCC_INLINE_ASM_NEON) /* Based on public-domain/CC0 implementation from SUPERCOP package * (keccakc1024/inplace-armv7a-neon/keccak2.s) * * Original copyright header follows: */ @ The Keccak sponge function, designed by Guido Bertoni, Joan Daemen, @ Michaƫl Peeters and Gilles Van Assche. For more information, feedback or @ questions, please refer to our website: http://keccak.noekeon.org/ @ @ Implementation by Ronny Van Keer, hereby denoted as "the implementer". @ @ To the extent possible under law, the implementer has waived all copyright @ and related or neighboring rights to the source code in this file. @ http://creativecommons.org/publicdomain/zero/1.0/ .text .syntax unified .fpu neon .arm .extern _gcry_keccak_round_consts_64bit; #ifdef __PIC__ # define GET_DATA_POINTER(reg, name, rtmp) \ ldr reg, 1f; \ ldr rtmp, 2f; \ b 3f; \ 1: .word _GLOBAL_OFFSET_TABLE_-(3f+8); \ 2: .word name(GOT); \ 3: add reg, pc, reg; \ ldr reg, [reg, rtmp]; #else # define GET_DATA_POINTER(reg, name, rtmp) ldr reg, =name #endif @// --- offsets in state .equ Aba, 0*8 .equ Aga, 1*8 .equ Aka, 2*8 .equ Ama, 3*8 .equ Asa, 4*8 @// --- macros .macro KeccakThetaRhoPiChiIota argA1, argA2, argA3, argA4, argA5 @Prepare Theta @Ca = Aba^Aga^Aka^Ama^Asa@ @Ce = Abe^Age^Ake^Ame^Ase@ @Ci = Abi^Agi^Aki^Ami^Asi@ @Co = Abo^Ago^Ako^Amo^Aso@ @Cu = Abu^Agu^Aku^Amu^Asu@ @De = Ca^ROL64(Ci, 1)@ @Di = Ce^ROL64(Co, 1)@ @Do = Ci^ROL64(Cu, 1)@ @Du = Co^ROL64(Ca, 1)@ @Da = Cu^ROL64(Ce, 1)@ veor.64 q4, q6, q7 veor.64 q5, q9, q10 veor.64 d8, d8, d9 veor.64 d10, d10, d11 veor.64 d1, d8, d16 veor.64 d2, d10, d17 veor.64 q4, q11, q12 veor.64 q5, q14, q15 veor.64 d8, d8, d9 veor.64 d10, d10, d11 veor.64 d3, d8, d26 vadd.u64 q4, q1, q1 veor.64 d4, d10, d27 vmov.64 d0, d5 vsri.64 q4, q1, #63 vadd.u64 q5, q2, q2 veor.64 q4, q4, q0 vsri.64 q5, q2, #63 vadd.u64 d7, d1, d1 veor.64 \argA2, \argA2, d8 veor.64 q5, q5, q1 vsri.64 d7, d1, #63 vshl.u64 d1, \argA2, #44 veor.64 \argA3, \argA3, d9 veor.64 d7, d7, d4 @Ba = argA1^Da@ @Be = ROL64((argA2^De), 44)@ @Bi = ROL64((argA3^Di), 43)@ @Bo = ROL64((argA4^Do), 21)@ @Bu = ROL64((argA5^Du), 14)@ @argA2 = Be ^((~Bi)& Bo )@ @argA3 = Bi ^((~Bo)& Bu )@ @argA4 = Bo ^((~Bu)& Ba )@ @argA5 = Bu ^((~Ba)& Be )@ @argA1 = Ba ^((~Be)& Bi )@ argA1 ^= KeccakF1600RoundConstants[i+round]@ vsri.64 d1, \argA2, #64-44 vshl.u64 d2, \argA3, #43 vldr.64 d0, [sp, #\argA1] veor.64 \argA4, \argA4, d10 vsri.64 d2, \argA3, #64-43 vshl.u64 d3, \argA4, #21 veor.64 \argA5, \argA5, d11 
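@ Note on the rotate idiom used throughout these macros: each vshl.u64 #n /
@ vsri.64 #(64-n) pair implements ROL64(x, n), the 64-bit rotate left named
@ in the comments above (vshl shifts a copy left by n, vsri shifts right by
@ 64-n and inserts the high bits back on the right).  In C terms, with rol64
@ as a hypothetical helper name and u64 a 64-bit unsigned type:
@
@   static inline u64 rol64 (u64 x, unsigned int n)
@   {
@     return (x << n) | (x >> (64 - n));  /* valid for 0 < n < 64 */
@   }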
veor.64 d0, d0, d7 vsri.64 d3, \argA4, #64-21 vbic.64 d5, d2, d1 vshl.u64 d4, \argA5, #14 vbic.64 \argA2, d3, d2 vld1.64 d6, [ip]! veor.64 d5, d0 vsri.64 d4, \argA5, #64-14 veor.64 d5, d6 vbic.64 \argA5, d1, d0 vbic.64 \argA3, d4, d3 vbic.64 \argA4, d0, d4 veor.64 \argA2, d1 vstr.64 d5, [sp, #\argA1] veor.64 \argA3, d2 veor.64 \argA4, d3 veor.64 \argA5, d4 .endm .macro KeccakThetaRhoPiChi1 argA1, argA2, argA3, argA4, argA5 @d2 = ROL64((argA1^Da), 3)@ @d3 = ROL64((argA2^De), 45)@ @d4 = ROL64((argA3^Di), 61)@ @d0 = ROL64((argA4^Do), 28)@ @d1 = ROL64((argA5^Du), 20)@ @argA1 = Ba ^((~Be)& Bi )@ Ca ^= argA1@ @argA2 = Be ^((~Bi)& Bo )@ @argA3 = Bi ^((~Bo)& Bu )@ @argA4 = Bo ^((~Bu)& Ba )@ @argA5 = Bu ^((~Ba)& Be )@ veor.64 \argA2, \argA2, d8 veor.64 \argA3, \argA3, d9 vshl.u64 d3, \argA2, #45 vldr.64 d6, [sp, #\argA1] vshl.u64 d4, \argA3, #61 veor.64 \argA4, \argA4, d10 vsri.64 d3, \argA2, #64-45 veor.64 \argA5, \argA5, d11 vsri.64 d4, \argA3, #64-61 vshl.u64 d0, \argA4, #28 veor.64 d6, d6, d7 vshl.u64 d1, \argA5, #20 vbic.64 \argA3, d4, d3 vsri.64 d0, \argA4, #64-28 vbic.64 \argA4, d0, d4 vshl.u64 d2, d6, #3 vsri.64 d1, \argA5, #64-20 veor.64 \argA4, d3 vsri.64 d2, d6, #64-3 vbic.64 \argA5, d1, d0 vbic.64 d6, d2, d1 vbic.64 \argA2, d3, d2 veor.64 d6, d0 veor.64 \argA2, d1 vstr.64 d6, [sp, #\argA1] veor.64 \argA3, d2 veor.64 d5, d6 veor.64 \argA5, d4 .endm .macro KeccakThetaRhoPiChi2 argA1, argA2, argA3, argA4, argA5 @d4 = ROL64((argA1^Da), 18)@ @d0 = ROL64((argA2^De), 1)@ @d1 = ROL64((argA3^Di), 6)@ @d2 = ROL64((argA4^Do), 25)@ @d3 = ROL64((argA5^Du), 8)@ @argA1 = Ba ^((~Be)& Bi )@ Ca ^= argA1@ @argA2 = Be ^((~Bi)& Bo )@ @argA3 = Bi ^((~Bo)& Bu )@ @argA4 = Bo ^((~Bu)& Ba )@ @argA5 = Bu ^((~Ba)& Be )@ veor.64 \argA3, \argA3, d9 veor.64 \argA4, \argA4, d10 vshl.u64 d1, \argA3, #6 vldr.64 d6, [sp, #\argA1] vshl.u64 d2, \argA4, #25 veor.64 \argA5, \argA5, d11 vsri.64 d1, \argA3, #64-6 veor.64 \argA2, \argA2, d8 vsri.64 d2, \argA4, #64-25 vext.8 d3, \argA5, \argA5, #7 veor.64 d6, d6, d7 vbic.64 \argA3, d2, d1 vadd.u64 d0, \argA2, \argA2 vbic.64 \argA4, d3, d2 vsri.64 d0, \argA2, #64-1 vshl.u64 d4, d6, #18 veor.64 \argA2, d1, \argA4 veor.64 \argA3, d0 vsri.64 d4, d6, #64-18 vstr.64 \argA3, [sp, #\argA1] veor.64 d5, \argA3 vbic.64 \argA5, d1, d0 vbic.64 \argA3, d4, d3 vbic.64 \argA4, d0, d4 veor.64 \argA3, d2 veor.64 \argA4, d3 veor.64 \argA5, d4 .endm .macro KeccakThetaRhoPiChi3 argA1, argA2, argA3, argA4, argA5 @d1 = ROL64((argA1^Da), 36)@ @d2 = ROL64((argA2^De), 10)@ @d3 = ROL64((argA3^Di), 15)@ @d4 = ROL64((argA4^Do), 56)@ @d0 = ROL64((argA5^Du), 27)@ @argA1 = Ba ^((~Be)& Bi )@ Ca ^= argA1@ @argA2 = Be ^((~Bi)& Bo )@ @argA3 = Bi ^((~Bo)& Bu )@ @argA4 = Bo ^((~Bu)& Ba )@ @argA5 = Bu ^((~Ba)& Be )@ veor.64 \argA2, \argA2, d8 veor.64 \argA3, \argA3, d9 vshl.u64 d2, \argA2, #10 vldr.64 d6, [sp, #\argA1] vshl.u64 d3, \argA3, #15 veor.64 \argA4, \argA4, d10 vsri.64 d2, \argA2, #64-10 vsri.64 d3, \argA3, #64-15 veor.64 \argA5, \argA5, d11 vext.8 d4, \argA4, \argA4, #1 vbic.64 \argA2, d3, d2 vshl.u64 d0, \argA5, #27 veor.64 d6, d6, d7 vbic.64 \argA3, d4, d3 vsri.64 d0, \argA5, #64-27 vshl.u64 d1, d6, #36 veor.64 \argA3, d2 vbic.64 \argA4, d0, d4 vsri.64 d1, d6, #64-36 veor.64 \argA4, d3 vbic.64 d6, d2, d1 vbic.64 \argA5, d1, d0 veor.64 d6, d0 veor.64 \argA2, d1 vstr.64 d6, [sp, #\argA1] veor.64 d5, d6 veor.64 \argA5, d4 .endm .macro KeccakThetaRhoPiChi4 argA1, argA2, argA3, argA4, argA5 @d3 = ROL64((argA1^Da), 41)@ @d4 = ROL64((argA2^De), 2)@ @d0 = ROL64((argA3^Di), 62)@ @d1 = ROL64((argA4^Do), 55)@ @d2 = 
ROL64((argA5^Du), 39)@ @argA1 = Ba ^((~Be)& Bi )@ Ca ^= argA1@ @argA2 = Be ^((~Bi)& Bo )@ @argA3 = Bi ^((~Bo)& Bu )@ @argA4 = Bo ^((~Bu)& Ba )@ @argA5 = Bu ^((~Ba)& Be )@ veor.64 \argA2, \argA2, d8 veor.64 \argA3, \argA3, d9 vshl.u64 d4, \argA2, #2 veor.64 \argA5, \argA5, d11 vshl.u64 d0, \argA3, #62 vldr.64 d6, [sp, #\argA1] vsri.64 d4, \argA2, #64-2 veor.64 \argA4, \argA4, d10 vsri.64 d0, \argA3, #64-62 vshl.u64 d1, \argA4, #55 veor.64 d6, d6, d7 vshl.u64 d2, \argA5, #39 vsri.64 d1, \argA4, #64-55 vbic.64 \argA4, d0, d4 vsri.64 d2, \argA5, #64-39 vbic.64 \argA2, d1, d0 vshl.u64 d3, d6, #41 veor.64 \argA5, d4, \argA2 vbic.64 \argA2, d2, d1 vsri.64 d3, d6, #64-41 veor.64 d6, d0, \argA2 vbic.64 \argA2, d3, d2 vbic.64 \argA3, d4, d3 veor.64 \argA2, d1 vstr.64 d6, [sp, #\argA1] veor.64 d5, d6 veor.64 \argA3, d2 veor.64 \argA4, d3 .endm @// --- code @not callable from C! .p2align 3 .type KeccakF_armv7a_neon_asm,%function; KeccakF_armv7a_neon_asm: @ .LroundLoop: KeccakThetaRhoPiChiIota Aba, d13, d19, d25, d31 KeccakThetaRhoPiChi1 Aka, d15, d21, d22, d28 KeccakThetaRhoPiChi2 Asa, d12, d18, d24, d30 KeccakThetaRhoPiChi3 Aga, d14, d20, d26, d27 KeccakThetaRhoPiChi4 Ama, d16, d17, d23, d29 KeccakThetaRhoPiChiIota Aba, d15, d18, d26, d29 KeccakThetaRhoPiChi1 Asa, d14, d17, d25, d28 KeccakThetaRhoPiChi2 Ama, d13, d21, d24, d27 KeccakThetaRhoPiChi3 Aka, d12, d20, d23, d31 KeccakThetaRhoPiChi4 Aga, d16, d19, d22, d30 KeccakThetaRhoPiChiIota Aba, d14, d21, d23, d30 KeccakThetaRhoPiChi1 Ama, d12, d19, d26, d28 KeccakThetaRhoPiChi2 Aga, d15, d17, d24, d31 KeccakThetaRhoPiChi3 Asa, d13, d20, d22, d29 KeccakThetaRhoPiChi4 Aka, d16, d18, d25, d27 KeccakThetaRhoPiChiIota Aba, d12, d17, d22, d27 KeccakThetaRhoPiChi1 Aga, d13, d18, d23, d28 KeccakThetaRhoPiChi2 Aka, d14, d19, d24, d29 ldr r0, [ip] KeccakThetaRhoPiChi3 Ama, d15, d20, d25, d30 cmp r0, #0xFFFFFFFF KeccakThetaRhoPiChi4 Asa, d16, d21, d26, d31 bne .LroundLoop sub ip, #(8*24) bx lr .p2align 2 .ltorg .size KeccakF_armv7a_neon_asm,.-KeccakF_armv7a_neon_asm; @//unsigned _gcry_keccak_permute_armv7_neon(u64 *state) callable from C .p2align 3 .global _gcry_keccak_permute_armv7_neon .type _gcry_keccak_permute_armv7_neon,%function; _gcry_keccak_permute_armv7_neon: push {ip, lr} vpush {q4-q7} sub sp,sp, #5*8 vldr.64 d0, [r0, #0*8] vldr.64 d12, [r0, #1*8] vldr.64 d17, [r0, #2*8] vldr.64 d22, [r0, #3*8] vldr.64 d27, [r0, #4*8] GET_DATA_POINTER(ip, _gcry_keccak_round_consts_64bit, lr); vldr.64 d1, [r0, #5*8] vldr.64 d13, [r0, #6*8] vldr.64 d18, [r0, #7*8] vldr.64 d23, [r0, #8*8] vldr.64 d28, [r0, #9*8] vldr.64 d2, [r0, #10*8] vldr.64 d14, [r0, #11*8] vldr.64 d19, [r0, #12*8] vldr.64 d24, [r0, #13*8] vldr.64 d29, [r0, #14*8] vldr.64 d3, [r0, #15*8] vldr.64 d15, [r0, #16*8] vldr.64 d20, [r0, #17*8] vldr.64 d25, [r0, #18*8] vldr.64 d30, [r0, #19*8] vldr.64 d4, [r0, #20*8] vldr.64 d16, [r0, #21*8] vldr.64 d21, [r0, #22*8] vldr.64 d26, [r0, #23*8] vldr.64 d31, [r0, #24*8] vstr.64 d0, [sp, #Aba] vstr.64 d1, [sp, #Aga] veor.64 q0, q0, q1 vstr.64 d2, [sp, #Aka] veor.64 d5, d0, d1 vstr.64 d3, [sp, #Ama] mov r1, r0 vstr.64 d4, [sp, #Asa] veor.64 d5, d5, d4 bl KeccakF_armv7a_neon_asm vpop.64 { d0- d4 } vstr.64 d0, [r1, #0*8] vstr.64 d12, [r1, #1*8] vstr.64 d17, [r1, #2*8] vstr.64 d22, [r1, #3*8] vstr.64 d27, [r1, #4*8] vstr.64 d1, [r1, #5*8] vstr.64 d13, [r1, #6*8] vstr.64 d18, [r1, #7*8] vstr.64 d23, [r1, #8*8] vstr.64 d28, [r1, #9*8] vstr.64 d2, [r1, #10*8] vstr.64 d14, [r1, #11*8] vstr.64 d19, [r1, #12*8] vstr.64 d24, [r1, #13*8] vstr.64 d29, [r1, #14*8] vstr.64 d3, 
[r1, #15*8] vstr.64 d15, [r1, #16*8] vstr.64 d20, [r1, #17*8] vstr.64 d25, [r1, #18*8] vstr.64 d30, [r1, #19*8] vstr.64 d4, [r1, #20*8] vstr.64 d16, [r1, #21*8] vstr.64 d21, [r1, #22*8] vstr.64 d26, [r1, #23*8] vstr.64 d31, [r1, #24*8] mov r0, #112 vpop {q4-q7} pop {ip, pc} .p2align 2 .ltorg .size _gcry_keccak_permute_armv7_neon,.-_gcry_keccak_permute_armv7_neon; -@//unsigned _gcry_keccak_permute_armv7_neon(u64 *state, @r4 -@ int pos, @r1 -@ const byte *lanes, @r2 -@ unsigned int nlanes, @r3 -@ int blocklanes) @ r5 callable from C +@//unsigned _gcry_keccak_absorb_lanes64_armv7_neon(u64 *state, @r4 +@ int pos, @r1 +@ const byte *lanes, @r2 +@ size_t nlanes, @r3 +@ int blocklanes) @ r5 callable from C .p2align 3 .global _gcry_keccak_absorb_lanes64_armv7_neon .type _gcry_keccak_absorb_lanes64_armv7_neon,%function; _gcry_keccak_absorb_lanes64_armv7_neon: cmp r3, #0 @ nlanes == 0 itt eq moveq r0, #0 bxeq lr push {r4-r5, ip, lr} beq .Lout mov r4, r0 ldr r5, [sp, #(4*4)] vpush {q4-q7} @ load state vldr.64 d0, [r4, #0*8] vldr.64 d12, [r4, #1*8] vldr.64 d17, [r4, #2*8] vldr.64 d22, [r4, #3*8] vldr.64 d27, [r4, #4*8] GET_DATA_POINTER(ip, _gcry_keccak_round_consts_64bit, lr); vldr.64 d1, [r4, #5*8] vldr.64 d13, [r4, #6*8] vldr.64 d18, [r4, #7*8] vldr.64 d23, [r4, #8*8] vldr.64 d28, [r4, #9*8] vldr.64 d2, [r4, #10*8] vldr.64 d14, [r4, #11*8] vldr.64 d19, [r4, #12*8] vldr.64 d24, [r4, #13*8] vldr.64 d29, [r4, #14*8] vldr.64 d3, [r4, #15*8] vldr.64 d15, [r4, #16*8] vldr.64 d20, [r4, #17*8] vldr.64 d25, [r4, #18*8] vldr.64 d30, [r4, #19*8] vldr.64 d4, [r4, #20*8] vldr.64 d16, [r4, #21*8] vldr.64 d21, [r4, #22*8] vldr.64 d26, [r4, #23*8] vldr.64 d31, [r4, #24*8] .Lmain_loop: @ detect absorb mode (full blocks vs lanes) cmp r1, #0 @ pos != 0 bne .Llanes_loop .Lmain_loop_pos0: @ full blocks mode @ switch (blocksize) cmp r5, #21 beq .Lfull_block_21 cmp r5, #18 beq .Lfull_block_18 cmp r5, #17 beq .Lfull_block_17 cmp r5, #13 beq .Lfull_block_13 cmp r5, #9 beq .Lfull_block_9 @ unknown blocksize b .Llanes_loop .Lfull_block_21: @ SHAKE128 cmp r3, #21 @ nlanes < blocklanes blo .Llanes_loop sub sp,sp, #5*8 vld1.64 {d5-d8}, [r2]! veor d0, d5 vld1.64 {d9-d11}, [r2]! veor d12, d6 veor d17, d7 veor d22, d8 vld1.64 {d5-d8}, [r2]! veor d27, d9 veor d1, d10 veor d13, d11 vld1.64 {d9-d11}, [r2]! veor d18, d5 veor d23, d6 veor d28, d7 veor d2, d8 vld1.64 {d5-d8}, [r2]! veor d14, d9 veor d19, d10 veor d24, d11 vld1.64 {d9-d11}, [r2]! veor d29, d5 veor d3, d6 veor d15, d7 veor d20, d8 veor d25, d9 veor d30, d10 veor d4, d11 vstr.64 d0, [sp, #Aba] vstr.64 d1, [sp, #Aga] veor.64 q0, q0, q1 vstr.64 d2, [sp, #Aka] veor.64 d5, d0, d1 vstr.64 d3, [sp, #Ama] vstr.64 d4, [sp, #Asa] veor.64 d5, d5, d4 bl KeccakF_armv7a_neon_asm subs r3, #21 @ nlanes -= 21 vpop.64 { d0-d4 } beq .Ldone b .Lfull_block_21 .Lfull_block_18: @ SHA3-224 cmp r3, #18 @ nlanes < blocklanes blo .Llanes_loop sub sp,sp, #5*8 vld1.64 {d5-d8}, [r2]! veor d0, d5 vld1.64 {d9-d11}, [r2]! veor d12, d6 veor d17, d7 veor d22, d8 vld1.64 {d5-d8}, [r2]! veor d27, d9 veor d1, d10 veor d13, d11 vld1.64 {d9-d11}, [r2]! veor d18, d5 veor d23, d6 veor d28, d7 veor d2, d8 vld1.64 {d5-d8}, [r2]! 
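@ Each .Lfull_block_N path is the NEON unrolling of a full-block absorb:
@ XOR N little-endian 64-bit input lanes into the state, then run the
@ permutation (KeccakF_armv7a_neon_asm).  Roughly the equivalent of the
@ generic C code, using buf_get_le64 as in bufhelp.h:
@
@   for (i = 0; i < blocklanes; i++)
@     state[i] ^= buf_get_le64 (lanes + 8 * i);
@   /* ...then permute the state... */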
veor d14, d9 veor d19, d10 veor d24, d11 veor d29, d5 veor d3, d6 veor d15, d7 veor d20, d8 vstr.64 d0, [sp, #Aba] vstr.64 d1, [sp, #Aga] veor.64 q0, q0, q1 vstr.64 d2, [sp, #Aka] veor.64 d5, d0, d1 vstr.64 d3, [sp, #Ama] vstr.64 d4, [sp, #Asa] veor.64 d5, d5, d4 bl KeccakF_armv7a_neon_asm subs r3, #18 @ nlanes -= 18 vpop.64 { d0-d4 } beq .Ldone b .Lfull_block_18 .Lfull_block_17: @ SHA3-256 & SHAKE256 cmp r3, #17 @ nlanes < blocklanes blo .Llanes_loop sub sp,sp, #5*8 vld1.64 {d5-d8}, [r2]! veor d0, d5 vld1.64 {d9-d11}, [r2]! veor d12, d6 veor d17, d7 veor d22, d8 vld1.64 {d5-d8}, [r2]! veor d27, d9 veor d1, d10 veor d13, d11 vld1.64 {d9-d11}, [r2]! veor d18, d5 veor d23, d6 veor d28, d7 veor d2, d8 vld1.64 {d5-d7}, [r2]! veor d14, d9 veor d19, d10 veor d24, d11 veor d29, d5 veor d3, d6 veor d15, d7 vstr.64 d0, [sp, #Aba] vstr.64 d1, [sp, #Aga] veor.64 q0, q0, q1 vstr.64 d2, [sp, #Aka] veor.64 d5, d0, d1 vstr.64 d3, [sp, #Ama] vstr.64 d4, [sp, #Asa] veor.64 d5, d5, d4 bl KeccakF_armv7a_neon_asm subs r3, #17 @ nlanes -= 17 vpop.64 { d0-d4 } beq .Ldone b .Lfull_block_17 .Lfull_block_13: @ SHA3-384 cmp r3, #13 @ nlanes < blocklanes blo .Llanes_loop sub sp,sp, #5*8 vld1.64 {d5-d8}, [r2]! veor d0, d5 vld1.64 {d9-d11}, [r2]! veor d12, d6 veor d17, d7 veor d22, d8 vld1.64 {d5-d8}, [r2]! veor d27, d9 veor d1, d10 veor d13, d11 vld1.64 {d9-d10}, [r2]! veor d18, d5 veor d23, d6 veor d28, d7 veor d2, d8 veor d14, d9 veor d19, d10 vstr.64 d0, [sp, #Aba] vstr.64 d1, [sp, #Aga] veor.64 q0, q0, q1 vstr.64 d2, [sp, #Aka] veor.64 d5, d0, d1 vstr.64 d3, [sp, #Ama] vstr.64 d4, [sp, #Asa] veor.64 d5, d5, d4 bl KeccakF_armv7a_neon_asm subs r3, #13 @ nlanes -= 13 vpop.64 { d0-d4 } beq .Ldone b .Lfull_block_13 .Lfull_block_9: @ SHA3-512 cmp r3, #9 @ nlanes < blocklanes blo .Llanes_loop sub sp,sp, #5*8 vld1.64 {d5-d8}, [r2]! veor d0, d5 vld1.64 {d9-d11}, [r2]! veor d12, d6 veor d17, d7 veor d22, d8 vld1.64 {d5-d6}, [r2]! 
veor d27, d9 veor d1, d10 veor d13, d11 veor d18, d5 veor d23, d6 vstr.64 d0, [sp, #Aba] vstr.64 d1, [sp, #Aga] veor.64 q0, q0, q1 vstr.64 d2, [sp, #Aka] veor.64 d5, d0, d1 vstr.64 d3, [sp, #Ama] vstr.64 d4, [sp, #Asa] veor.64 d5, d5, d4 bl KeccakF_armv7a_neon_asm subs r3, #9 @ nlanes -= 9 vpop.64 { d0-d4 } beq .Ldone b .Lfull_block_9 .Llanes_loop: @ per-lane mode @ switch (pos) ldrb r0, [pc, r1] add pc, pc, r0, lsl #2 .Lswitch_table: .byte (.Llane0-.Lswitch_table-4)/4 .byte (.Llane1-.Lswitch_table-4)/4 .byte (.Llane2-.Lswitch_table-4)/4 .byte (.Llane3-.Lswitch_table-4)/4 .byte (.Llane4-.Lswitch_table-4)/4 .byte (.Llane5-.Lswitch_table-4)/4 .byte (.Llane6-.Lswitch_table-4)/4 .byte (.Llane7-.Lswitch_table-4)/4 .byte (.Llane8-.Lswitch_table-4)/4 .byte (.Llane9-.Lswitch_table-4)/4 .byte (.Llane10-.Lswitch_table-4)/4 .byte (.Llane11-.Lswitch_table-4)/4 .byte (.Llane12-.Lswitch_table-4)/4 .byte (.Llane13-.Lswitch_table-4)/4 .byte (.Llane14-.Lswitch_table-4)/4 .byte (.Llane15-.Lswitch_table-4)/4 .byte (.Llane16-.Lswitch_table-4)/4 .byte (.Llane17-.Lswitch_table-4)/4 .byte (.Llane18-.Lswitch_table-4)/4 .byte (.Llane19-.Lswitch_table-4)/4 .byte (.Llane20-.Lswitch_table-4)/4 .byte (.Llane21-.Lswitch_table-4)/4 .byte (.Llane22-.Lswitch_table-4)/4 .byte (.Llane23-.Lswitch_table-4)/4 .byte (.Llane24-.Lswitch_table-4)/4 .p2align 2 #define ABSORB_LANE(label, vreg) \ label: \ add r1, #1; \ vld1.64 d5, [r2]!; \ cmp r1, r5; /* pos == blocklanes */ \ veor vreg, vreg, d5; \ beq .Llanes_permute; \ subs r3, #1; \ beq .Ldone; ABSORB_LANE(.Llane0, d0) ABSORB_LANE(.Llane1, d12) ABSORB_LANE(.Llane2, d17) ABSORB_LANE(.Llane3, d22) ABSORB_LANE(.Llane4, d27) ABSORB_LANE(.Llane5, d1) ABSORB_LANE(.Llane6, d13) ABSORB_LANE(.Llane7, d18) ABSORB_LANE(.Llane8, d23) ABSORB_LANE(.Llane9, d28) ABSORB_LANE(.Llane10, d2) ABSORB_LANE(.Llane11, d14) ABSORB_LANE(.Llane12, d19) ABSORB_LANE(.Llane13, d24) ABSORB_LANE(.Llane14, d29) ABSORB_LANE(.Llane15, d3) ABSORB_LANE(.Llane16, d15) ABSORB_LANE(.Llane17, d20) ABSORB_LANE(.Llane18, d25) ABSORB_LANE(.Llane19, d30) ABSORB_LANE(.Llane20, d4) ABSORB_LANE(.Llane21, d16) ABSORB_LANE(.Llane22, d21) ABSORB_LANE(.Llane23, d26) ABSORB_LANE(.Llane24, d31) b .Llanes_loop .Llanes_permute: sub sp,sp, #5*8 vstr.64 d0, [sp, #Aba] vstr.64 d1, [sp, #Aga] veor.64 q0, q0, q1 vstr.64 d2, [sp, #Aka] veor.64 d5, d0, d1 vstr.64 d3, [sp, #Ama] vstr.64 d4, [sp, #Asa] veor.64 d5, d5, d4 bl KeccakF_armv7a_neon_asm mov r1, #0 @ pos <= 0 subs r3, #1 vpop.64 { d0-d4 } beq .Ldone b .Lmain_loop_pos0 .Ldone: @ save state vstr.64 d0, [r4, #0*8] vstr.64 d12, [r4, #1*8] vstr.64 d17, [r4, #2*8] vstr.64 d22, [r4, #3*8] vstr.64 d27, [r4, #4*8] vstr.64 d1, [r4, #5*8] vstr.64 d13, [r4, #6*8] vstr.64 d18, [r4, #7*8] vstr.64 d23, [r4, #8*8] vstr.64 d28, [r4, #9*8] vstr.64 d2, [r4, #10*8] vstr.64 d14, [r4, #11*8] vstr.64 d19, [r4, #12*8] vstr.64 d24, [r4, #13*8] vstr.64 d29, [r4, #14*8] vstr.64 d3, [r4, #15*8] vstr.64 d15, [r4, #16*8] vstr.64 d20, [r4, #17*8] vstr.64 d25, [r4, #18*8] vstr.64 d30, [r4, #19*8] vstr.64 d4, [r4, #20*8] vstr.64 d16, [r4, #21*8] vstr.64 d21, [r4, #22*8] vstr.64 d26, [r4, #23*8] vstr.64 d31, [r4, #24*8] mov r0, #120 vpop {q4-q7} .Lout: pop {r4-r5, ip, pc} .p2align 2 .ltorg .size _gcry_keccak_absorb_lanes64_armv7_neon,.-_gcry_keccak_absorb_lanes64_armv7_neon; #endif diff --git a/cipher/keccak.c b/cipher/keccak.c index e7e42473..6c385f71 100644 --- a/cipher/keccak.c +++ b/cipher/keccak.c @@ -1,1644 +1,1644 @@ /* keccak.c - SHA3 hash functions * Copyright (C) 2015 g10 Code GmbH * * This file is part of 
Libgcrypt.
 *
 * Libgcrypt is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser general Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * Libgcrypt is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, see <http://www.gnu.org/licenses/>.
 */

#include <config.h>
#include <string.h>
#include "g10lib.h"
#include "bithelp.h"
#include "bufhelp.h"
#include "cipher.h"
#include "hash-common.h"

/* USE_64BIT indicates whether to use 64-bit generic implementation.
 * USE_32BIT indicates whether to use 32-bit generic implementation. */
#undef USE_64BIT
#if defined(__x86_64__) || SIZEOF_UNSIGNED_LONG == 8
# define USE_64BIT 1
#else
# define USE_32BIT 1
#endif

/* USE_64BIT_BMI2 indicates whether to compile with 64-bit Intel BMI2 code. */
#undef USE_64BIT_BMI2
#if defined(USE_64BIT) && defined(HAVE_GCC_INLINE_ASM_BMI2) && \
    defined(HAVE_CPU_ARCH_X86)
# define USE_64BIT_BMI2 1
#endif

/* USE_64BIT_SHLD indicates whether to compile with 64-bit Intel SHLD code. */
#undef USE_64BIT_SHLD
#if defined(USE_64BIT) && defined (__GNUC__) && defined(__x86_64__) && \
    defined(HAVE_CPU_ARCH_X86)
# define USE_64BIT_SHLD 1
#endif

/* USE_32BIT_BMI2 indicates whether to compile with 32-bit Intel BMI2 code. */
#undef USE_32BIT_BMI2
#if defined(USE_32BIT) && defined(HAVE_GCC_INLINE_ASM_BMI2) && \
    defined(HAVE_CPU_ARCH_X86)
# define USE_32BIT_BMI2 1
#endif

/* USE_64BIT_AVX512 indicates whether to compile with Intel AVX512 code. */
#undef USE_64BIT_AVX512
#if defined(USE_64BIT) && defined(__x86_64__) && \
    defined(HAVE_GCC_INLINE_ASM_AVX512) && \
    (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
# define USE_64BIT_AVX512 1
#endif

/* USE_64BIT_ARM_NEON indicates whether to enable 64-bit ARM/NEON assembly
 * code. */
#undef USE_64BIT_ARM_NEON
#ifdef ENABLE_NEON_SUPPORT
# if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) \
     && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) \
     && defined(HAVE_GCC_INLINE_ASM_NEON)
# define USE_64BIT_ARM_NEON 1
# endif
#endif /*ENABLE_NEON_SUPPORT*/

/* USE_S390X_CRYPTO indicates whether to enable zSeries code. */
#undef USE_S390X_CRYPTO
#if defined(HAVE_GCC_INLINE_ASM_S390X)
# define USE_S390X_CRYPTO 1
#endif /* USE_S390X_CRYPTO */

/* x86-64 vector register assembly implementations use SystemV ABI, ABI
 * conversion needed on Win64 through function attribute.
*/ #undef ASM_FUNC_ABI #if defined(USE_64BIT_AVX512) && defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS) # define ASM_FUNC_ABI __attribute__((sysv_abi)) #else # define ASM_FUNC_ABI #endif #if defined(USE_64BIT) || defined(USE_64BIT_ARM_NEON) # define NEED_COMMON64 1 #endif #ifdef USE_32BIT # define NEED_COMMON32BI 1 #endif #define SHA3_DELIMITED_SUFFIX 0x06 #define SHAKE_DELIMITED_SUFFIX 0x1F typedef struct { union { #ifdef NEED_COMMON64 u64 state64[25]; #endif #ifdef NEED_COMMON32BI u32 state32bi[50]; #endif } u; } KECCAK_STATE; typedef struct { unsigned int (*permute)(KECCAK_STATE *hd); unsigned int (*absorb)(KECCAK_STATE *hd, int pos, const byte *lanes, - unsigned int nlanes, int blocklanes); + size_t nlanes, int blocklanes); unsigned int (*extract) (KECCAK_STATE *hd, unsigned int pos, byte *outbuf, unsigned int outlen); } keccak_ops_t; typedef struct KECCAK_CONTEXT_S { KECCAK_STATE state; unsigned int outlen; unsigned int blocksize; unsigned int count; unsigned int suffix; const keccak_ops_t *ops; #ifdef USE_S390X_CRYPTO unsigned int kimd_func; unsigned int buf_pos; byte buf[1344 / 8]; /* SHAKE128 requires biggest buffer, 1344 bits. */ #endif } KECCAK_CONTEXT; #ifdef NEED_COMMON64 const u64 _gcry_keccak_round_consts_64bit[24 + 1] = { U64_C(0x0000000000000001), U64_C(0x0000000000008082), U64_C(0x800000000000808A), U64_C(0x8000000080008000), U64_C(0x000000000000808B), U64_C(0x0000000080000001), U64_C(0x8000000080008081), U64_C(0x8000000000008009), U64_C(0x000000000000008A), U64_C(0x0000000000000088), U64_C(0x0000000080008009), U64_C(0x000000008000000A), U64_C(0x000000008000808B), U64_C(0x800000000000008B), U64_C(0x8000000000008089), U64_C(0x8000000000008003), U64_C(0x8000000000008002), U64_C(0x8000000000000080), U64_C(0x000000000000800A), U64_C(0x800000008000000A), U64_C(0x8000000080008081), U64_C(0x8000000000008080), U64_C(0x0000000080000001), U64_C(0x8000000080008008), U64_C(0xFFFFFFFFFFFFFFFF) }; static unsigned int keccak_extract64(KECCAK_STATE *hd, unsigned int pos, byte *outbuf, unsigned int outlen) { unsigned int i; /* NOTE: when pos == 0, hd and outbuf may point to same memory (SHA-3). */ for (i = pos; i < pos + outlen / 8 + !!(outlen % 8); i++) { u64 tmp = hd->u.state64[i]; buf_put_le64(outbuf, tmp); outbuf += 8; } return 0; } #endif /* NEED_COMMON64 */ #ifdef NEED_COMMON32BI static const u32 round_consts_32bit[2 * 24] = { 0x00000001UL, 0x00000000UL, 0x00000000UL, 0x00000089UL, 0x00000000UL, 0x8000008bUL, 0x00000000UL, 0x80008080UL, 0x00000001UL, 0x0000008bUL, 0x00000001UL, 0x00008000UL, 0x00000001UL, 0x80008088UL, 0x00000001UL, 0x80000082UL, 0x00000000UL, 0x0000000bUL, 0x00000000UL, 0x0000000aUL, 0x00000001UL, 0x00008082UL, 0x00000000UL, 0x00008003UL, 0x00000001UL, 0x0000808bUL, 0x00000001UL, 0x8000000bUL, 0x00000001UL, 0x8000008aUL, 0x00000001UL, 0x80000081UL, 0x00000000UL, 0x80000081UL, 0x00000000UL, 0x80000008UL, 0x00000000UL, 0x00000083UL, 0x00000000UL, 0x80008003UL, 0x00000001UL, 0x80008088UL, 0x00000000UL, 0x80000088UL, 0x00000001UL, 0x00008000UL, 0x00000000UL, 0x80008082UL }; static unsigned int keccak_extract32bi(KECCAK_STATE *hd, unsigned int pos, byte *outbuf, unsigned int outlen) { unsigned int i; u32 x0; u32 x1; u32 t; /* NOTE: when pos == 0, hd and outbuf may point to same memory (SHA-3). 
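 *
 * The 32-bit code keeps every 64-bit lane bit-interleaved: one 32-bit word
 * holds the even-numbered bits and the other the odd-numbered bits, so a
 * 64-bit rotation can be carried out as two 32-bit rotations.  The swap
 * sequence below undoes that interleaving and re-packs each lane into its
 * canonical little-endian byte order before it is written to outbuf.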
*/ for (i = pos; i < pos + outlen / 8 + !!(outlen % 8); i++) { x0 = hd->u.state32bi[i * 2 + 0]; x1 = hd->u.state32bi[i * 2 + 1]; t = (x0 & 0x0000FFFFUL) + (x1 << 16); x1 = (x0 >> 16) + (x1 & 0xFFFF0000UL); x0 = t; t = (x0 ^ (x0 >> 8)) & 0x0000FF00UL; x0 = x0 ^ t ^ (t << 8); t = (x0 ^ (x0 >> 4)) & 0x00F000F0UL; x0 = x0 ^ t ^ (t << 4); t = (x0 ^ (x0 >> 2)) & 0x0C0C0C0CUL; x0 = x0 ^ t ^ (t << 2); t = (x0 ^ (x0 >> 1)) & 0x22222222UL; x0 = x0 ^ t ^ (t << 1); t = (x1 ^ (x1 >> 8)) & 0x0000FF00UL; x1 = x1 ^ t ^ (t << 8); t = (x1 ^ (x1 >> 4)) & 0x00F000F0UL; x1 = x1 ^ t ^ (t << 4); t = (x1 ^ (x1 >> 2)) & 0x0C0C0C0CUL; x1 = x1 ^ t ^ (t << 2); t = (x1 ^ (x1 >> 1)) & 0x22222222UL; x1 = x1 ^ t ^ (t << 1); buf_put_le32(&outbuf[0], x0); buf_put_le32(&outbuf[4], x1); outbuf += 8; } return 0; } static inline void keccak_absorb_lane32bi(u32 *lane, u32 x0, u32 x1) { u32 t; t = (x0 ^ (x0 >> 1)) & 0x22222222UL; x0 = x0 ^ t ^ (t << 1); t = (x0 ^ (x0 >> 2)) & 0x0C0C0C0CUL; x0 = x0 ^ t ^ (t << 2); t = (x0 ^ (x0 >> 4)) & 0x00F000F0UL; x0 = x0 ^ t ^ (t << 4); t = (x0 ^ (x0 >> 8)) & 0x0000FF00UL; x0 = x0 ^ t ^ (t << 8); t = (x1 ^ (x1 >> 1)) & 0x22222222UL; x1 = x1 ^ t ^ (t << 1); t = (x1 ^ (x1 >> 2)) & 0x0C0C0C0CUL; x1 = x1 ^ t ^ (t << 2); t = (x1 ^ (x1 >> 4)) & 0x00F000F0UL; x1 = x1 ^ t ^ (t << 4); t = (x1 ^ (x1 >> 8)) & 0x0000FF00UL; x1 = x1 ^ t ^ (t << 8); lane[0] ^= (x0 & 0x0000FFFFUL) + (x1 << 16); lane[1] ^= (x0 >> 16) + (x1 & 0xFFFF0000UL); } #endif /* NEED_COMMON32BI */ /* Construct generic 64-bit implementation. */ #ifdef USE_64BIT #if __GNUC__ >= 4 && defined(__x86_64__) static inline void absorb_lanes64_8(u64 *dst, const byte *in) { asm ("movdqu 0*16(%[dst]), %%xmm0\n\t" "movdqu 0*16(%[in]), %%xmm4\n\t" "movdqu 1*16(%[dst]), %%xmm1\n\t" "movdqu 1*16(%[in]), %%xmm5\n\t" "movdqu 2*16(%[dst]), %%xmm2\n\t" "movdqu 3*16(%[dst]), %%xmm3\n\t" "pxor %%xmm4, %%xmm0\n\t" "pxor %%xmm5, %%xmm1\n\t" "movdqu 2*16(%[in]), %%xmm4\n\t" "movdqu 3*16(%[in]), %%xmm5\n\t" "movdqu %%xmm0, 0*16(%[dst])\n\t" "pxor %%xmm4, %%xmm2\n\t" "movdqu %%xmm1, 1*16(%[dst])\n\t" "pxor %%xmm5, %%xmm3\n\t" "movdqu %%xmm2, 2*16(%[dst])\n\t" "movdqu %%xmm3, 3*16(%[dst])\n\t" : : [dst] "r" (dst), [in] "r" (in) : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "memory"); } static inline void absorb_lanes64_4(u64 *dst, const byte *in) { asm ("movdqu 0*16(%[dst]), %%xmm0\n\t" "movdqu 0*16(%[in]), %%xmm4\n\t" "movdqu 1*16(%[dst]), %%xmm1\n\t" "movdqu 1*16(%[in]), %%xmm5\n\t" "pxor %%xmm4, %%xmm0\n\t" "pxor %%xmm5, %%xmm1\n\t" "movdqu %%xmm0, 0*16(%[dst])\n\t" "movdqu %%xmm1, 1*16(%[dst])\n\t" : : [dst] "r" (dst), [in] "r" (in) : "xmm0", "xmm1", "xmm4", "xmm5", "memory"); } static inline void absorb_lanes64_2(u64 *dst, const byte *in) { asm ("movdqu 0*16(%[dst]), %%xmm0\n\t" "movdqu 0*16(%[in]), %%xmm4\n\t" "pxor %%xmm4, %%xmm0\n\t" "movdqu %%xmm0, 0*16(%[dst])\n\t" : : [dst] "r" (dst), [in] "r" (in) : "xmm0", "xmm4", "memory"); } #else /* __x86_64__ */ static inline void absorb_lanes64_8(u64 *dst, const byte *in) { dst[0] ^= buf_get_le64(in + 8 * 0); dst[1] ^= buf_get_le64(in + 8 * 1); dst[2] ^= buf_get_le64(in + 8 * 2); dst[3] ^= buf_get_le64(in + 8 * 3); dst[4] ^= buf_get_le64(in + 8 * 4); dst[5] ^= buf_get_le64(in + 8 * 5); dst[6] ^= buf_get_le64(in + 8 * 6); dst[7] ^= buf_get_le64(in + 8 * 7); } static inline void absorb_lanes64_4(u64 *dst, const byte *in) { dst[0] ^= buf_get_le64(in + 8 * 0); dst[1] ^= buf_get_le64(in + 8 * 1); dst[2] ^= buf_get_le64(in + 8 * 2); dst[3] ^= buf_get_le64(in + 8 * 3); } static inline void absorb_lanes64_2(u64 
*dst, const byte *in) { dst[0] ^= buf_get_le64(in + 8 * 0); dst[1] ^= buf_get_le64(in + 8 * 1); } #endif /* !__x86_64__ */ static inline void absorb_lanes64_1(u64 *dst, const byte *in) { dst[0] ^= buf_get_le64(in + 8 * 0); } # define ANDN64(x, y) (~(x) & (y)) # define ROL64(x, n) (((x) << ((unsigned int)n & 63)) | \ ((x) >> ((64 - (unsigned int)(n)) & 63))) # define KECCAK_F1600_PERMUTE_FUNC_NAME keccak_f1600_state_permute64 # define KECCAK_F1600_ABSORB_FUNC_NAME keccak_absorb_lanes64 # include "keccak_permute_64.h" # undef ANDN64 # undef ROL64 # undef KECCAK_F1600_PERMUTE_FUNC_NAME # undef KECCAK_F1600_ABSORB_FUNC_NAME static const keccak_ops_t keccak_generic64_ops = { .permute = keccak_f1600_state_permute64, .absorb = keccak_absorb_lanes64, .extract = keccak_extract64, }; #endif /* USE_64BIT */ /* Construct 64-bit Intel SHLD implementation. */ #ifdef USE_64BIT_SHLD # define ANDN64(x, y) (~(x) & (y)) # define ROL64(x, n) ({ \ u64 tmp = (x); \ asm ("shldq %1, %0, %0" \ : "+r" (tmp) \ : "J" ((n) & 63) \ : "cc"); \ tmp; }) # define KECCAK_F1600_PERMUTE_FUNC_NAME keccak_f1600_state_permute64_shld # define KECCAK_F1600_ABSORB_FUNC_NAME keccak_absorb_lanes64_shld # include "keccak_permute_64.h" # undef ANDN64 # undef ROL64 # undef KECCAK_F1600_PERMUTE_FUNC_NAME # undef KECCAK_F1600_ABSORB_FUNC_NAME static const keccak_ops_t keccak_shld_64_ops = { .permute = keccak_f1600_state_permute64_shld, .absorb = keccak_absorb_lanes64_shld, .extract = keccak_extract64, }; #endif /* USE_64BIT_SHLD */ /* Construct 64-bit Intel BMI2 implementation. */ #ifdef USE_64BIT_BMI2 # define ANDN64(x, y) ({ \ u64 tmp; \ asm ("andnq %2, %1, %0" \ : "=r" (tmp) \ : "r0" (x), "rm" (y)); \ tmp; }) # define ROL64(x, n) ({ \ u64 tmp; \ asm ("rorxq %2, %1, %0" \ : "=r" (tmp) \ : "rm0" (x), "J" (64 - ((n) & 63))); \ tmp; }) # define KECCAK_F1600_PERMUTE_FUNC_NAME keccak_f1600_state_permute64_bmi2 # define KECCAK_F1600_ABSORB_FUNC_NAME keccak_absorb_lanes64_bmi2 # include "keccak_permute_64.h" # undef ANDN64 # undef ROL64 # undef KECCAK_F1600_PERMUTE_FUNC_NAME # undef KECCAK_F1600_ABSORB_FUNC_NAME static const keccak_ops_t keccak_bmi2_64_ops = { .permute = keccak_f1600_state_permute64_bmi2, .absorb = keccak_absorb_lanes64_bmi2, .extract = keccak_extract64, }; #endif /* USE_64BIT_BMI2 */ /* 64-bit Intel AVX512 implementation. 
*/ #ifdef USE_64BIT_AVX512 extern ASM_FUNC_ABI unsigned int _gcry_keccak_f1600_state_permute64_avx512(u64 *state, const u64 *rconst); extern ASM_FUNC_ABI unsigned int _gcry_keccak_absorb_blocks_avx512(u64 *state, const u64 *rconst, const byte *lanes, size_t nlanes, size_t blocklanes, const byte **new_lanes); static unsigned int keccak_f1600_state_permute64_avx512(KECCAK_STATE *hd) { return _gcry_keccak_f1600_state_permute64_avx512 ( hd->u.state64, _gcry_keccak_round_consts_64bit); } static unsigned int keccak_absorb_lanes64_avx512(KECCAK_STATE *hd, int pos, const byte *lanes, - unsigned int nlanes, int blocklanes) + size_t nlanes, int blocklanes) { while (nlanes) { - if (pos == 0 && blocklanes > 0 && nlanes >= (unsigned int)blocklanes) + if (pos == 0 && blocklanes > 0 && nlanes >= (size_t)blocklanes) { nlanes = _gcry_keccak_absorb_blocks_avx512 ( hd->u.state64, _gcry_keccak_round_consts_64bit, lanes, nlanes, blocklanes, &lanes); } while (nlanes) { hd->u.state64[pos] ^= buf_get_le64 (lanes); lanes += 8; nlanes--; if (++pos == blocklanes) { keccak_f1600_state_permute64_avx512 (hd); pos = 0; break; } } } return 0; } static const keccak_ops_t keccak_avx512_64_ops = { .permute = keccak_f1600_state_permute64_avx512, .absorb = keccak_absorb_lanes64_avx512, .extract = keccak_extract64, }; #endif /* USE_64BIT_AVX512 */ /* 64-bit ARMv7/NEON implementation. */ #ifdef USE_64BIT_ARM_NEON unsigned int _gcry_keccak_permute_armv7_neon(u64 *state); unsigned int _gcry_keccak_absorb_lanes64_armv7_neon(u64 *state, int pos, const byte *lanes, - unsigned int nlanes, + size_t nlanes, int blocklanes); static unsigned int keccak_permute64_armv7_neon(KECCAK_STATE *hd) { return _gcry_keccak_permute_armv7_neon(hd->u.state64); } static unsigned int keccak_absorb_lanes64_armv7_neon(KECCAK_STATE *hd, int pos, const byte *lanes, - unsigned int nlanes, int blocklanes) + size_t nlanes, int blocklanes) { if (blocklanes < 0) { /* blocklanes == -1, permutationless absorb from keccak_final. */ while (nlanes) { hd->u.state64[pos] ^= buf_get_le64(lanes); lanes += 8; nlanes--; } return 0; } else { return _gcry_keccak_absorb_lanes64_armv7_neon(hd->u.state64, pos, lanes, nlanes, blocklanes); } } static const keccak_ops_t keccak_armv7_neon_64_ops = { .permute = keccak_permute64_armv7_neon, .absorb = keccak_absorb_lanes64_armv7_neon, .extract = keccak_extract64, }; #endif /* USE_64BIT_ARM_NEON */ /* Construct generic 32-bit implementation. */ #ifdef USE_32BIT # define ANDN32(x, y) (~(x) & (y)) # define ROL32(x, n) (((x) << ((unsigned int)n & 31)) | \ ((x) >> ((32 - (unsigned int)(n)) & 31))) # define KECCAK_F1600_PERMUTE_FUNC_NAME keccak_f1600_state_permute32bi # include "keccak_permute_32.h" # undef ANDN32 # undef ROL32 # undef KECCAK_F1600_PERMUTE_FUNC_NAME static unsigned int keccak_absorb_lanes32bi(KECCAK_STATE *hd, int pos, const byte *lanes, - unsigned int nlanes, int blocklanes) + size_t nlanes, int blocklanes) { unsigned int burn = 0; while (nlanes) { keccak_absorb_lane32bi(&hd->u.state32bi[pos * 2], buf_get_le32(lanes + 0), buf_get_le32(lanes + 4)); lanes += 8; nlanes--; if (++pos == blocklanes) { burn = keccak_f1600_state_permute32bi(hd); pos = 0; } } return burn; } static const keccak_ops_t keccak_generic32bi_ops = { .permute = keccak_f1600_state_permute32bi, .absorb = keccak_absorb_lanes32bi, .extract = keccak_extract32bi, }; #endif /* USE_32BIT */ /* Construct 32-bit Intel BMI2 implementation. 
*/ #ifdef USE_32BIT_BMI2 # define ANDN32(x, y) ({ \ u32 tmp; \ asm ("andnl %2, %1, %0" \ : "=r" (tmp) \ : "r0" (x), "rm" (y)); \ tmp; }) # define ROL32(x, n) ({ \ u32 tmp; \ asm ("rorxl %2, %1, %0" \ : "=r" (tmp) \ : "rm0" (x), "J" (32 - ((n) & 31))); \ tmp; }) # define KECCAK_F1600_PERMUTE_FUNC_NAME keccak_f1600_state_permute32bi_bmi2 # include "keccak_permute_32.h" # undef ANDN32 # undef ROL32 # undef KECCAK_F1600_PERMUTE_FUNC_NAME static inline u32 pext(u32 x, u32 mask) { u32 tmp; asm ("pextl %2, %1, %0" : "=r" (tmp) : "r0" (x), "rm" (mask)); return tmp; } static inline u32 pdep(u32 x, u32 mask) { u32 tmp; asm ("pdepl %2, %1, %0" : "=r" (tmp) : "r0" (x), "rm" (mask)); return tmp; } static inline void keccak_absorb_lane32bi_bmi2(u32 *lane, u32 x0, u32 x1) { x0 = pdep(pext(x0, 0x55555555), 0x0000ffff) | (pext(x0, 0xaaaaaaaa) << 16); x1 = pdep(pext(x1, 0x55555555), 0x0000ffff) | (pext(x1, 0xaaaaaaaa) << 16); lane[0] ^= (x0 & 0x0000FFFFUL) + (x1 << 16); lane[1] ^= (x0 >> 16) + (x1 & 0xFFFF0000UL); } static unsigned int keccak_absorb_lanes32bi_bmi2(KECCAK_STATE *hd, int pos, const byte *lanes, - unsigned int nlanes, int blocklanes) + size_t nlanes, int blocklanes) { unsigned int burn = 0; while (nlanes) { keccak_absorb_lane32bi_bmi2(&hd->u.state32bi[pos * 2], buf_get_le32(lanes + 0), buf_get_le32(lanes + 4)); lanes += 8; nlanes--; if (++pos == blocklanes) { burn = keccak_f1600_state_permute32bi_bmi2(hd); pos = 0; } } return burn; } static unsigned int keccak_extract32bi_bmi2(KECCAK_STATE *hd, unsigned int pos, byte *outbuf, unsigned int outlen) { unsigned int i; u32 x0; u32 x1; u32 t; /* NOTE: when pos == 0, hd and outbuf may point to same memory (SHA-3). */ for (i = pos; i < pos + outlen / 8 + !!(outlen % 8); i++) { x0 = hd->u.state32bi[i * 2 + 0]; x1 = hd->u.state32bi[i * 2 + 1]; t = (x0 & 0x0000FFFFUL) + (x1 << 16); x1 = (x0 >> 16) + (x1 & 0xFFFF0000UL); x0 = t; x0 = pdep(pext(x0, 0xffff0001), 0xaaaaaaab) | pdep(x0 >> 1, 0x55555554); x1 = pdep(pext(x1, 0xffff0001), 0xaaaaaaab) | pdep(x1 >> 1, 0x55555554); buf_put_le32(&outbuf[0], x0); buf_put_le32(&outbuf[4], x1); outbuf += 8; } return 0; } static const keccak_ops_t keccak_bmi2_32bi_ops = { .permute = keccak_f1600_state_permute32bi_bmi2, .absorb = keccak_absorb_lanes32bi_bmi2, .extract = keccak_extract32bi_bmi2, }; #endif /* USE_32BIT_BMI2 */ #ifdef USE_S390X_CRYPTO #include "asm-inline-s390x.h" static inline void keccak_bwrite_s390x (void *context, const byte *in, size_t inlen) { KECCAK_CONTEXT *ctx = context; /* Write full-blocks. */ kimd_execute (ctx->kimd_func, &ctx->state, in, inlen); return; } static inline void keccak_final_s390x (void *context) { KECCAK_CONTEXT *ctx = context; if (ctx->suffix == SHA3_DELIMITED_SUFFIX) { klmd_execute (ctx->kimd_func, &ctx->state, ctx->buf, ctx->count); } else { klmd_shake_execute (ctx->kimd_func, &ctx->state, NULL, 0, ctx->buf, ctx->count); ctx->count = 0; ctx->buf_pos = 0; } return; } static inline void keccak_bextract_s390x (void *context, byte *out, size_t outlen) { KECCAK_CONTEXT *ctx = context; /* Extract full-blocks. */ klmd_shake_execute (ctx->kimd_func | KLMD_PADDING_STATE, &ctx->state, out, outlen, NULL, 0); return; } static void keccak_write_s390x (void *context, const byte *inbuf, size_t inlen) { KECCAK_CONTEXT *hd = context; const size_t blocksize = hd->blocksize; size_t inblocks; size_t copylen; while (hd->count) { if (hd->count == blocksize) /* Flush the buffer. 
*/ { keccak_bwrite_s390x (hd, hd->buf, blocksize); hd->count = 0; } else { copylen = inlen; if (copylen > blocksize - hd->count) copylen = blocksize - hd->count; if (copylen == 0) break; buf_cpy (&hd->buf[hd->count], inbuf, copylen); hd->count += copylen; inbuf += copylen; inlen -= copylen; } } if (inlen == 0) return; if (inlen >= blocksize) { inblocks = inlen / blocksize; keccak_bwrite_s390x (hd, inbuf, inblocks * blocksize); hd->count = 0; inlen -= inblocks * blocksize; inbuf += inblocks * blocksize; } if (inlen) { buf_cpy (hd->buf, inbuf, inlen); hd->count = inlen; } } static void keccak_extract_s390x (void *context, void *outbuf_arg, size_t outlen) { KECCAK_CONTEXT *hd = context; const size_t blocksize = hd->blocksize; byte *outbuf = outbuf_arg; while (outlen) { gcry_assert(hd->count == 0 || hd->buf_pos < hd->count); if (hd->buf_pos < hd->count && outlen) { size_t copylen = hd->count - hd->buf_pos; if (copylen > outlen) copylen = outlen; buf_cpy (outbuf, &hd->buf[hd->buf_pos], copylen); outbuf += copylen; outlen -= copylen; hd->buf_pos += copylen; } if (hd->buf_pos == hd->count) { hd->buf_pos = 0; hd->count = 0; } if (outlen == 0) return; if (outlen >= blocksize) { size_t outblocks = outlen / blocksize; keccak_bextract_s390x (context, outbuf, outblocks * blocksize); outlen -= outblocks * blocksize; outbuf += outblocks * blocksize; if (outlen == 0) return; } keccak_bextract_s390x (context, hd->buf, blocksize); hd->count = blocksize; } } #endif /* USE_S390X_CRYPTO */ static void keccak_write (void *context, const void *inbuf_arg, size_t inlen) { KECCAK_CONTEXT *ctx = context; const size_t bsize = ctx->blocksize; const size_t blocklanes = bsize / 8; const byte *inbuf = inbuf_arg; unsigned int nburn, burn = 0; unsigned int count, i; - unsigned int pos, nlanes; + unsigned int pos; + size_t nlanes; #ifdef USE_S390X_CRYPTO if (ctx->kimd_func) { keccak_write_s390x (context, inbuf, inlen); return; } #endif count = ctx->count; if (inlen && (count % 8)) { byte lane[8] = { 0, }; /* Complete absorbing partial input lane. */ pos = count / 8; for (i = count % 8; inlen && i < 8; i++) { lane[i] = *inbuf++; inlen--; count++; } if (count == bsize) count = 0; nburn = ctx->ops->absorb(&ctx->state, pos, lane, 1, (count % 8) ? -1 : blocklanes); burn = nburn > burn ? nburn : burn; } /* Absorb full input lanes. */ pos = count / 8; nlanes = inlen / 8; if (nlanes > 0) { nburn = ctx->ops->absorb(&ctx->state, pos, inbuf, nlanes, blocklanes); burn = nburn > burn ? nburn : burn; inlen -= nlanes * 8; inbuf += nlanes * 8; - count += nlanes * 8; - count = count % bsize; + count = ((size_t) count + nlanes * 8) % bsize; } if (inlen) { byte lane[8] = { 0, }; /* Absorb remaining partial input lane. */ pos = count / 8; for (i = count % 8; inlen && i < 8; i++) { lane[i] = *inbuf++; inlen--; count++; } nburn = ctx->ops->absorb(&ctx->state, pos, lane, 1, -1); burn = nburn > burn ? nburn : burn; gcry_assert(count < bsize); } ctx->count = count; if (burn) _gcry_burn_stack (burn); } static void keccak_init (int algo, void *context, unsigned int flags) { KECCAK_CONTEXT *ctx = context; KECCAK_STATE *hd = &ctx->state; unsigned int features = _gcry_get_hw_features (); (void)flags; (void)features; memset (hd, 0, sizeof *hd); ctx->count = 0; /* Select generic implementation. */ #ifdef USE_64BIT ctx->ops = &keccak_generic64_ops; #elif defined USE_32BIT ctx->ops = &keccak_generic32bi_ops; #endif /* Select optimized implementation based in hw features. 
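 * The else-if chain below picks the first match, so the preference order
 * is AVX512, then ARM/NEON, then BMI2, then SHLD; when nothing matches,
 * the generic implementation selected above stays in place.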
*/ if (0) {} #ifdef USE_64BIT_AVX512 else if (features & HWF_INTEL_AVX512) ctx->ops = &keccak_avx512_64_ops; #endif #ifdef USE_64BIT_ARM_NEON else if (features & HWF_ARM_NEON) ctx->ops = &keccak_armv7_neon_64_ops; #endif #ifdef USE_64BIT_BMI2 else if (features & HWF_INTEL_BMI2) ctx->ops = &keccak_bmi2_64_ops; #endif #ifdef USE_32BIT_BMI2 else if (features & HWF_INTEL_BMI2) ctx->ops = &keccak_bmi2_32bi_ops; #endif #ifdef USE_64BIT_SHLD else if (features & HWF_INTEL_FAST_SHLD) ctx->ops = &keccak_shld_64_ops; #endif /* Set input block size, in Keccak terms this is called 'rate'. */ switch (algo) { case GCRY_MD_SHA3_224: ctx->suffix = SHA3_DELIMITED_SUFFIX; ctx->blocksize = 1152 / 8; ctx->outlen = 224 / 8; break; case GCRY_MD_SHA3_256: ctx->suffix = SHA3_DELIMITED_SUFFIX; ctx->blocksize = 1088 / 8; ctx->outlen = 256 / 8; break; case GCRY_MD_SHA3_384: ctx->suffix = SHA3_DELIMITED_SUFFIX; ctx->blocksize = 832 / 8; ctx->outlen = 384 / 8; break; case GCRY_MD_SHA3_512: ctx->suffix = SHA3_DELIMITED_SUFFIX; ctx->blocksize = 576 / 8; ctx->outlen = 512 / 8; break; case GCRY_MD_SHAKE128: ctx->suffix = SHAKE_DELIMITED_SUFFIX; ctx->blocksize = 1344 / 8; ctx->outlen = 0; break; case GCRY_MD_SHAKE256: ctx->suffix = SHAKE_DELIMITED_SUFFIX; ctx->blocksize = 1088 / 8; ctx->outlen = 0; break; default: BUG(); } #ifdef USE_S390X_CRYPTO ctx->kimd_func = 0; if ((features & HWF_S390X_MSA) != 0) { unsigned int kimd_func = 0; switch (algo) { case GCRY_MD_SHA3_224: kimd_func = KMID_FUNCTION_SHA3_224; break; case GCRY_MD_SHA3_256: kimd_func = KMID_FUNCTION_SHA3_256; break; case GCRY_MD_SHA3_384: kimd_func = KMID_FUNCTION_SHA3_384; break; case GCRY_MD_SHA3_512: kimd_func = KMID_FUNCTION_SHA3_512; break; case GCRY_MD_SHAKE128: kimd_func = KMID_FUNCTION_SHAKE128; break; case GCRY_MD_SHAKE256: kimd_func = KMID_FUNCTION_SHAKE256; break; } if ((kimd_query () & km_function_to_mask (kimd_func)) && (klmd_query () & km_function_to_mask (kimd_func))) { ctx->kimd_func = kimd_func; } } #endif } static void sha3_224_init (void *context, unsigned int flags) { keccak_init (GCRY_MD_SHA3_224, context, flags); } static void sha3_256_init (void *context, unsigned int flags) { keccak_init (GCRY_MD_SHA3_256, context, flags); } static void sha3_384_init (void *context, unsigned int flags) { keccak_init (GCRY_MD_SHA3_384, context, flags); } static void sha3_512_init (void *context, unsigned int flags) { keccak_init (GCRY_MD_SHA3_512, context, flags); } static void shake128_init (void *context, unsigned int flags) { keccak_init (GCRY_MD_SHAKE128, context, flags); } static void shake256_init (void *context, unsigned int flags) { keccak_init (GCRY_MD_SHAKE256, context, flags); } /* The routine final terminates the computation and * returns the digest. * The handle is prepared for a new cycle, but adding bytes to the * handle will the destroy the returned buffer. * Returns: 64 bytes representing the digest. When used for sha384, * we take the leftmost 48 of those bytes. 
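 *
 * Padding follows the pad10*1 rule of FIPS 202: the delimited suffix byte
 * (0x06 for SHA3, 0x1F for SHAKE, i.e. the domain-separation bits plus the
 * first padding '1' bit) is XORed into the block byte at offset 'count',
 * and 0x80 (the final padding '1' bit) into the last byte of the block.
 * For example, with SHA3-256 (136-byte block) and 5 leftover input bytes,
 * byte 5 gets 0x06 and byte 135 gets 0x80; both land in the same byte when
 * only one byte of the block remains.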
*/ static void keccak_final (void *context) { KECCAK_CONTEXT *ctx = context; KECCAK_STATE *hd = &ctx->state; const size_t bsize = ctx->blocksize; const byte suffix = ctx->suffix; unsigned int nburn, burn = 0; unsigned int lastbytes; byte lane[8]; #ifdef USE_S390X_CRYPTO if (ctx->kimd_func) { keccak_final_s390x (context); return; } #endif lastbytes = ctx->count; /* Do the padding and switch to the squeezing phase */ /* Absorb the last few bits and add the first bit of padding (which coincides with the delimiter in delimited suffix) */ buf_put_le64(lane, (u64)suffix << ((lastbytes % 8) * 8)); nburn = ctx->ops->absorb(&ctx->state, lastbytes / 8, lane, 1, -1); burn = nburn > burn ? nburn : burn; /* Add the second bit of padding. */ buf_put_le64(lane, (u64)0x80 << (((bsize - 1) % 8) * 8)); nburn = ctx->ops->absorb(&ctx->state, (bsize - 1) / 8, lane, 1, -1); burn = nburn > burn ? nburn : burn; if (suffix == SHA3_DELIMITED_SUFFIX) { /* Switch to the squeezing phase. */ nburn = ctx->ops->permute(hd); burn = nburn > burn ? nburn : burn; /* Squeeze out the SHA3 digest. */ nburn = ctx->ops->extract(hd, 0, (void *)hd, ctx->outlen); burn = nburn > burn ? nburn : burn; } else { /* Output for SHAKE can now be read with md_extract(). */ ctx->count = 0; } wipememory(lane, sizeof(lane)); if (burn) _gcry_burn_stack (burn); } static byte * keccak_read (void *context) { KECCAK_CONTEXT *ctx = (KECCAK_CONTEXT *) context; KECCAK_STATE *hd = &ctx->state; return (byte *)&hd->u; } static void keccak_extract (void *context, void *out, size_t outlen) { KECCAK_CONTEXT *ctx = context; KECCAK_STATE *hd = &ctx->state; const size_t bsize = ctx->blocksize; unsigned int nburn, burn = 0; byte *outbuf = out; unsigned int nlanes; unsigned int nleft; unsigned int count; unsigned int i; byte lane[8]; #ifdef USE_S390X_CRYPTO if (ctx->kimd_func) { keccak_extract_s390x (context, out, outlen); return; } #endif count = ctx->count; while (count && outlen && (outlen < 8 || count % 8)) { /* Extract partial lane. */ nburn = ctx->ops->extract(hd, count / 8, lane, 8); burn = nburn > burn ? nburn : burn; for (i = count % 8; outlen && i < 8; i++) { *outbuf++ = lane[i]; outlen--; count++; } gcry_assert(count <= bsize); if (count == bsize) count = 0; } if (outlen >= 8 && count) { /* Extract tail of partial block. */ nlanes = outlen / 8; nleft = (bsize - count) / 8; nlanes = nlanes < nleft ? nlanes : nleft; nburn = ctx->ops->extract(hd, count / 8, outbuf, nlanes * 8); burn = nburn > burn ? nburn : burn; outlen -= nlanes * 8; outbuf += nlanes * 8; count += nlanes * 8; gcry_assert(count <= bsize); if (count == bsize) count = 0; } while (outlen >= bsize) { gcry_assert(count == 0); /* Squeeze more. */ nburn = ctx->ops->permute(hd); burn = nburn > burn ? nburn : burn; /* Extract full block. */ nburn = ctx->ops->extract(hd, 0, outbuf, bsize); burn = nburn > burn ? nburn : burn; outlen -= bsize; outbuf += bsize; } if (outlen) { gcry_assert(outlen < bsize); if (count == 0) { /* Squeeze more. */ nburn = ctx->ops->permute(hd); burn = nburn > burn ? nburn : burn; } if (outlen >= 8) { /* Extract head of partial block. */ nlanes = outlen / 8; nburn = ctx->ops->extract(hd, count / 8, outbuf, nlanes * 8); burn = nburn > burn ? nburn : burn; outlen -= nlanes * 8; outbuf += nlanes * 8; count += nlanes * 8; gcry_assert(count < bsize); } if (outlen) { /* Extract head of partial lane. */ nburn = ctx->ops->extract(hd, count / 8, lane, 8); burn = nburn > burn ? 
nburn : burn; for (i = count % 8; outlen && i < 8; i++) { *outbuf++ = lane[i]; outlen--; count++; } gcry_assert(count < bsize); } } ctx->count = count; if (burn) _gcry_burn_stack (burn); } /* Variant of the above shortcut function using multiple buffers. */ static void _gcry_sha3_hash_buffers (void *outbuf, size_t nbytes, const gcry_buffer_t *iov, int iovcnt, const gcry_md_spec_t *spec) { KECCAK_CONTEXT hd; spec->init (&hd, 0); for (;iovcnt > 0; iov++, iovcnt--) keccak_write (&hd, (const char*)iov[0].data + iov[0].off, iov[0].len); keccak_final (&hd); if (spec->mdlen > 0) memcpy (outbuf, keccak_read (&hd), spec->mdlen); else keccak_extract (&hd, outbuf, nbytes); } static void _gcry_sha3_224_hash_buffers (void *outbuf, size_t nbytes, const gcry_buffer_t *iov, int iovcnt) { _gcry_sha3_hash_buffers (outbuf, nbytes, iov, iovcnt, &_gcry_digest_spec_sha3_224); } static void _gcry_sha3_256_hash_buffers (void *outbuf, size_t nbytes, const gcry_buffer_t *iov, int iovcnt) { _gcry_sha3_hash_buffers (outbuf, nbytes, iov, iovcnt, &_gcry_digest_spec_sha3_256); } static void _gcry_sha3_384_hash_buffers (void *outbuf, size_t nbytes, const gcry_buffer_t *iov, int iovcnt) { _gcry_sha3_hash_buffers (outbuf, nbytes, iov, iovcnt, &_gcry_digest_spec_sha3_384); } static void _gcry_sha3_512_hash_buffers (void *outbuf, size_t nbytes, const gcry_buffer_t *iov, int iovcnt) { _gcry_sha3_hash_buffers (outbuf, nbytes, iov, iovcnt, &_gcry_digest_spec_sha3_512); } static void _gcry_shake128_hash_buffers (void *outbuf, size_t nbytes, const gcry_buffer_t *iov, int iovcnt) { _gcry_sha3_hash_buffers (outbuf, nbytes, iov, iovcnt, &_gcry_digest_spec_shake128); } static void _gcry_shake256_hash_buffers (void *outbuf, size_t nbytes, const gcry_buffer_t *iov, int iovcnt) { _gcry_sha3_hash_buffers (outbuf, nbytes, iov, iovcnt, &_gcry_digest_spec_shake256); } /* Self-test section. 
*/ static gpg_err_code_t selftests_keccak (int algo, int extended, selftest_report_func_t report) { const char *what; const char *errtxt; const char *short_hash; const char *long_hash; const char *one_million_a_hash; int hash_len; switch (algo) { default: BUG(); case GCRY_MD_SHA3_224: short_hash = "\xe6\x42\x82\x4c\x3f\x8c\xf2\x4a\xd0\x92\x34\xee\x7d\x3c\x76\x6f" "\xc9\xa3\xa5\x16\x8d\x0c\x94\xad\x73\xb4\x6f\xdf"; long_hash = "\x54\x3e\x68\x68\xe1\x66\x6c\x1a\x64\x36\x30\xdf\x77\x36\x7a\xe5" "\xa6\x2a\x85\x07\x0a\x51\xc1\x4c\xbf\x66\x5c\xbc"; one_million_a_hash = "\xd6\x93\x35\xb9\x33\x25\x19\x2e\x51\x6a\x91\x2e\x6d\x19\xa1\x5c" "\xb5\x1c\x6e\xd5\xc1\x52\x43\xe7\xa7\xfd\x65\x3c"; hash_len = 28; break; case GCRY_MD_SHA3_256: short_hash = "\x3a\x98\x5d\xa7\x4f\xe2\x25\xb2\x04\x5c\x17\x2d\x6b\xd3\x90\xbd" "\x85\x5f\x08\x6e\x3e\x9d\x52\x5b\x46\xbf\xe2\x45\x11\x43\x15\x32"; long_hash = "\x91\x6f\x60\x61\xfe\x87\x97\x41\xca\x64\x69\xb4\x39\x71\xdf\xdb" "\x28\xb1\xa3\x2d\xc3\x6c\xb3\x25\x4e\x81\x2b\xe2\x7a\xad\x1d\x18"; one_million_a_hash = "\x5c\x88\x75\xae\x47\x4a\x36\x34\xba\x4f\xd5\x5e\xc8\x5b\xff\xd6" "\x61\xf3\x2a\xca\x75\xc6\xd6\x99\xd0\xcd\xcb\x6c\x11\x58\x91\xc1"; hash_len = 32; break; case GCRY_MD_SHA3_384: short_hash = "\xec\x01\x49\x82\x88\x51\x6f\xc9\x26\x45\x9f\x58\xe2\xc6\xad\x8d" "\xf9\xb4\x73\xcb\x0f\xc0\x8c\x25\x96\xda\x7c\xf0\xe4\x9b\xe4\xb2" "\x98\xd8\x8c\xea\x92\x7a\xc7\xf5\x39\xf1\xed\xf2\x28\x37\x6d\x25"; long_hash = "\x79\x40\x7d\x3b\x59\x16\xb5\x9c\x3e\x30\xb0\x98\x22\x97\x47\x91" "\xc3\x13\xfb\x9e\xcc\x84\x9e\x40\x6f\x23\x59\x2d\x04\xf6\x25\xdc" "\x8c\x70\x9b\x98\xb4\x3b\x38\x52\xb3\x37\x21\x61\x79\xaa\x7f\xc7"; one_million_a_hash = "\xee\xe9\xe2\x4d\x78\xc1\x85\x53\x37\x98\x34\x51\xdf\x97\xc8\xad" "\x9e\xed\xf2\x56\xc6\x33\x4f\x8e\x94\x8d\x25\x2d\x5e\x0e\x76\x84" "\x7a\xa0\x77\x4d\xdb\x90\xa8\x42\x19\x0d\x2c\x55\x8b\x4b\x83\x40"; hash_len = 48; break; case GCRY_MD_SHA3_512: short_hash = "\xb7\x51\x85\x0b\x1a\x57\x16\x8a\x56\x93\xcd\x92\x4b\x6b\x09\x6e" "\x08\xf6\x21\x82\x74\x44\xf7\x0d\x88\x4f\x5d\x02\x40\xd2\x71\x2e" "\x10\xe1\x16\xe9\x19\x2a\xf3\xc9\x1a\x7e\xc5\x76\x47\xe3\x93\x40" "\x57\x34\x0b\x4c\xf4\x08\xd5\xa5\x65\x92\xf8\x27\x4e\xec\x53\xf0"; long_hash = "\xaf\xeb\xb2\xef\x54\x2e\x65\x79\xc5\x0c\xad\x06\xd2\xe5\x78\xf9" "\xf8\xdd\x68\x81\xd7\xdc\x82\x4d\x26\x36\x0f\xee\xbf\x18\xa4\xfa" "\x73\xe3\x26\x11\x22\x94\x8e\xfc\xfd\x49\x2e\x74\xe8\x2e\x21\x89" "\xed\x0f\xb4\x40\xd1\x87\xf3\x82\x27\x0c\xb4\x55\xf2\x1d\xd1\x85"; one_million_a_hash = "\x3c\x3a\x87\x6d\xa1\x40\x34\xab\x60\x62\x7c\x07\x7b\xb9\x8f\x7e" "\x12\x0a\x2a\x53\x70\x21\x2d\xff\xb3\x38\x5a\x18\xd4\xf3\x88\x59" "\xed\x31\x1d\x0a\x9d\x51\x41\xce\x9c\xc5\xc6\x6e\xe6\x89\xb2\x66" "\xa8\xaa\x18\xac\xe8\x28\x2a\x0e\x0d\xb5\x96\xc9\x0b\x0a\x7b\x87"; hash_len = 64; break; case GCRY_MD_SHAKE128: short_hash = "\x58\x81\x09\x2d\xd8\x18\xbf\x5c\xf8\xa3\xdd\xb7\x93\xfb\xcb\xa7" "\x40\x97\xd5\xc5\x26\xa6\xd3\x5f\x97\xb8\x33\x51\x94\x0f\x2c\xc8"; long_hash = "\x7b\x6d\xf6\xff\x18\x11\x73\xb6\xd7\x89\x8d\x7f\xf6\x3f\xb0\x7b" "\x7c\x23\x7d\xaf\x47\x1a\x5a\xe5\x60\x2a\xdb\xcc\xef\x9c\xcf\x4b"; one_million_a_hash = "\x9d\x22\x2c\x79\xc4\xff\x9d\x09\x2c\xf6\xca\x86\x14\x3a\xa4\x11" "\xe3\x69\x97\x38\x08\xef\x97\x09\x32\x55\x82\x6c\x55\x72\xef\x58"; hash_len = 32; break; case GCRY_MD_SHAKE256: short_hash = "\x48\x33\x66\x60\x13\x60\xa8\x77\x1c\x68\x63\x08\x0c\xc4\x11\x4d" "\x8d\xb4\x45\x30\xf8\xf1\xe1\xee\x4f\x94\xea\x37\xe7\x8b\x57\x39"; long_hash = "\x98\xbe\x04\x51\x6c\x04\xcc\x73\x59\x3f\xef\x3e\xd0\x35\x2e\xa9" 
"\xf6\x44\x39\x42\xd6\x95\x0e\x29\xa3\x72\xa6\x81\xc3\xde\xaf\x45"; one_million_a_hash = "\x35\x78\xa7\xa4\xca\x91\x37\x56\x9c\xdf\x76\xed\x61\x7d\x31\xbb" "\x99\x4f\xca\x9c\x1b\xbf\x8b\x18\x40\x13\xde\x82\x34\xdf\xd1\x3a"; hash_len = 32; break; } what = "short string"; errtxt = _gcry_hash_selftest_check_one (algo, 0, "abc", 3, short_hash, hash_len); if (errtxt) goto failed; if (extended) { what = "long string"; errtxt = _gcry_hash_selftest_check_one (algo, 0, "abcdefghbcdefghicdefghijdefghijkefghijklfghijklmghijklmn" "hijklmnoijklmnopjklmnopqklmnopqrlmnopqrsmnopqrstnopqrstu", 112, long_hash, hash_len); if (errtxt) goto failed; what = "one million \"a\""; errtxt = _gcry_hash_selftest_check_one (algo, 1, NULL, 0, one_million_a_hash, hash_len); if (errtxt) goto failed; } return 0; /* Succeeded. */ failed: if (report) report ("digest", algo, what, errtxt); return GPG_ERR_SELFTEST_FAILED; } /* Run a full self-test for ALGO and return 0 on success. */ static gpg_err_code_t run_selftests (int algo, int extended, selftest_report_func_t report) { gpg_err_code_t ec; switch (algo) { case GCRY_MD_SHA3_224: case GCRY_MD_SHA3_256: case GCRY_MD_SHA3_384: case GCRY_MD_SHA3_512: case GCRY_MD_SHAKE128: case GCRY_MD_SHAKE256: ec = selftests_keccak (algo, extended, report); break; default: ec = GPG_ERR_DIGEST_ALGO; break; } return ec; } static const byte sha3_224_asn[] = { 0x30 }; static const gcry_md_oid_spec_t oid_spec_sha3_224[] = { { "2.16.840.1.101.3.4.2.7" }, /* PKCS#1 sha3_224WithRSAEncryption */ { "?" }, { NULL } }; static const byte sha3_256_asn[] = { 0x30 }; static const gcry_md_oid_spec_t oid_spec_sha3_256[] = { { "2.16.840.1.101.3.4.2.8" }, /* PKCS#1 sha3_256WithRSAEncryption */ { "?" }, { NULL } }; static const byte sha3_384_asn[] = { 0x30 }; static const gcry_md_oid_spec_t oid_spec_sha3_384[] = { { "2.16.840.1.101.3.4.2.9" }, /* PKCS#1 sha3_384WithRSAEncryption */ { "?" }, { NULL } }; static const byte sha3_512_asn[] = { 0x30 }; static const gcry_md_oid_spec_t oid_spec_sha3_512[] = { { "2.16.840.1.101.3.4.2.10" }, /* PKCS#1 sha3_512WithRSAEncryption */ { "?" }, { NULL } }; static const byte shake128_asn[] = { 0x30 }; static const gcry_md_oid_spec_t oid_spec_shake128[] = { { "2.16.840.1.101.3.4.2.11" }, /* PKCS#1 shake128WithRSAEncryption */ { "?" }, { NULL } }; static const byte shake256_asn[] = { 0x30 }; static const gcry_md_oid_spec_t oid_spec_shake256[] = { { "2.16.840.1.101.3.4.2.12" }, /* PKCS#1 shake256WithRSAEncryption */ { "?" 
}, { NULL } }; const gcry_md_spec_t _gcry_digest_spec_sha3_224 = { GCRY_MD_SHA3_224, {0, 1}, "SHA3-224", sha3_224_asn, DIM (sha3_224_asn), oid_spec_sha3_224, 28, sha3_224_init, keccak_write, keccak_final, keccak_read, NULL, _gcry_sha3_224_hash_buffers, sizeof (KECCAK_CONTEXT), run_selftests }; const gcry_md_spec_t _gcry_digest_spec_sha3_256 = { GCRY_MD_SHA3_256, {0, 1}, "SHA3-256", sha3_256_asn, DIM (sha3_256_asn), oid_spec_sha3_256, 32, sha3_256_init, keccak_write, keccak_final, keccak_read, NULL, _gcry_sha3_256_hash_buffers, sizeof (KECCAK_CONTEXT), run_selftests }; const gcry_md_spec_t _gcry_digest_spec_sha3_384 = { GCRY_MD_SHA3_384, {0, 1}, "SHA3-384", sha3_384_asn, DIM (sha3_384_asn), oid_spec_sha3_384, 48, sha3_384_init, keccak_write, keccak_final, keccak_read, NULL, _gcry_sha3_384_hash_buffers, sizeof (KECCAK_CONTEXT), run_selftests }; const gcry_md_spec_t _gcry_digest_spec_sha3_512 = { GCRY_MD_SHA3_512, {0, 1}, "SHA3-512", sha3_512_asn, DIM (sha3_512_asn), oid_spec_sha3_512, 64, sha3_512_init, keccak_write, keccak_final, keccak_read, NULL, _gcry_sha3_512_hash_buffers, sizeof (KECCAK_CONTEXT), run_selftests }; const gcry_md_spec_t _gcry_digest_spec_shake128 = { GCRY_MD_SHAKE128, {0, 1}, "SHAKE128", shake128_asn, DIM (shake128_asn), oid_spec_shake128, 0, shake128_init, keccak_write, keccak_final, NULL, keccak_extract, _gcry_shake128_hash_buffers, sizeof (KECCAK_CONTEXT), run_selftests }; const gcry_md_spec_t _gcry_digest_spec_shake256 = { GCRY_MD_SHAKE256, {0, 1}, "SHAKE256", shake256_asn, DIM (shake256_asn), oid_spec_shake256, 0, shake256_init, keccak_write, keccak_final, NULL, keccak_extract, _gcry_shake256_hash_buffers, sizeof (KECCAK_CONTEXT), run_selftests }; diff --git a/cipher/keccak_permute_64.h b/cipher/keccak_permute_64.h index b28c871e..45ef462f 100644 --- a/cipher/keccak_permute_64.h +++ b/cipher/keccak_permute_64.h @@ -1,385 +1,385 @@ /* keccak_permute_64.h - Keccak permute function (simple 64bit) * Copyright (C) 2015 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser general Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . */ /* The code is based on public-domain/CC0 "keccakc1024/simple/Keccak-simple.c" * implementation by Ronny Van Keer from SUPERCOP toolkit package. */ /* Function that computes the Keccak-f[1600] permutation on the given state. 
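 *
 * The loop body below processes two rounds per iteration, ping-ponging
 * between the A* and E* variable sets so no state copy is needed between
 * rounds; twelve iterations consume the 24 round constants.  The returned
 * value is an estimate of the stack depth used, which callers pass to
 * _gcry_burn_stack.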
*/ static unsigned int KECCAK_F1600_PERMUTE_FUNC_NAME(KECCAK_STATE *hd) { const u64 *round_consts = _gcry_keccak_round_consts_64bit; const u64 *round_consts_end = _gcry_keccak_round_consts_64bit + 24; u64 Aba, Abe, Abi, Abo, Abu; u64 Aga, Age, Agi, Ago, Agu; u64 Aka, Ake, Aki, Ako, Aku; u64 Ama, Ame, Ami, Amo, Amu; u64 Asa, Ase, Asi, Aso, Asu; u64 BCa, BCe, BCi, BCo, BCu; u64 Da, De, Di, Do, Du; u64 Eba, Ebe, Ebi, Ebo, Ebu; u64 Ega, Ege, Egi, Ego, Egu; u64 Eka, Eke, Eki, Eko, Eku; u64 Ema, Eme, Emi, Emo, Emu; u64 Esa, Ese, Esi, Eso, Esu; u64 *state = hd->u.state64; Aba = state[0]; Abe = state[1]; Abi = state[2]; Abo = state[3]; Abu = state[4]; Aga = state[5]; Age = state[6]; Agi = state[7]; Ago = state[8]; Agu = state[9]; Aka = state[10]; Ake = state[11]; Aki = state[12]; Ako = state[13]; Aku = state[14]; Ama = state[15]; Ame = state[16]; Ami = state[17]; Amo = state[18]; Amu = state[19]; Asa = state[20]; Ase = state[21]; Asi = state[22]; Aso = state[23]; Asu = state[24]; do { /* prepareTheta */ BCa = Aba ^ Aga ^ Aka ^ Ama ^ Asa; BCe = Abe ^ Age ^ Ake ^ Ame ^ Ase; BCi = Abi ^ Agi ^ Aki ^ Ami ^ Asi; BCo = Abo ^ Ago ^ Ako ^ Amo ^ Aso; BCu = Abu ^ Agu ^ Aku ^ Amu ^ Asu; /* thetaRhoPiChiIotaPrepareTheta(round , A, E) */ Da = BCu ^ ROL64(BCe, 1); De = BCa ^ ROL64(BCi, 1); Di = BCe ^ ROL64(BCo, 1); Do = BCi ^ ROL64(BCu, 1); Du = BCo ^ ROL64(BCa, 1); Aba ^= Da; BCa = Aba; Age ^= De; BCe = ROL64(Age, 44); Aki ^= Di; BCi = ROL64(Aki, 43); Amo ^= Do; BCo = ROL64(Amo, 21); Asu ^= Du; BCu = ROL64(Asu, 14); Eba = BCa ^ ANDN64(BCe, BCi); Eba ^= *(round_consts++); Ebe = BCe ^ ANDN64(BCi, BCo); Ebi = BCi ^ ANDN64(BCo, BCu); Ebo = BCo ^ ANDN64(BCu, BCa); Ebu = BCu ^ ANDN64(BCa, BCe); Abo ^= Do; BCa = ROL64(Abo, 28); Agu ^= Du; BCe = ROL64(Agu, 20); Aka ^= Da; BCi = ROL64(Aka, 3); Ame ^= De; BCo = ROL64(Ame, 45); Asi ^= Di; BCu = ROL64(Asi, 61); Ega = BCa ^ ANDN64(BCe, BCi); Ege = BCe ^ ANDN64(BCi, BCo); Egi = BCi ^ ANDN64(BCo, BCu); Ego = BCo ^ ANDN64(BCu, BCa); Egu = BCu ^ ANDN64(BCa, BCe); Abe ^= De; BCa = ROL64(Abe, 1); Agi ^= Di; BCe = ROL64(Agi, 6); Ako ^= Do; BCi = ROL64(Ako, 25); Amu ^= Du; BCo = ROL64(Amu, 8); Asa ^= Da; BCu = ROL64(Asa, 18); Eka = BCa ^ ANDN64(BCe, BCi); Eke = BCe ^ ANDN64(BCi, BCo); Eki = BCi ^ ANDN64(BCo, BCu); Eko = BCo ^ ANDN64(BCu, BCa); Eku = BCu ^ ANDN64(BCa, BCe); Abu ^= Du; BCa = ROL64(Abu, 27); Aga ^= Da; BCe = ROL64(Aga, 36); Ake ^= De; BCi = ROL64(Ake, 10); Ami ^= Di; BCo = ROL64(Ami, 15); Aso ^= Do; BCu = ROL64(Aso, 56); Ema = BCa ^ ANDN64(BCe, BCi); Eme = BCe ^ ANDN64(BCi, BCo); Emi = BCi ^ ANDN64(BCo, BCu); Emo = BCo ^ ANDN64(BCu, BCa); Emu = BCu ^ ANDN64(BCa, BCe); Abi ^= Di; BCa = ROL64(Abi, 62); Ago ^= Do; BCe = ROL64(Ago, 55); Aku ^= Du; BCi = ROL64(Aku, 39); Ama ^= Da; BCo = ROL64(Ama, 41); Ase ^= De; BCu = ROL64(Ase, 2); Esa = BCa ^ ANDN64(BCe, BCi); Ese = BCe ^ ANDN64(BCi, BCo); Esi = BCi ^ ANDN64(BCo, BCu); Eso = BCo ^ ANDN64(BCu, BCa); Esu = BCu ^ ANDN64(BCa, BCe); /* prepareTheta */ BCa = Eba ^ Ega ^ Eka ^ Ema ^ Esa; BCe = Ebe ^ Ege ^ Eke ^ Eme ^ Ese; BCi = Ebi ^ Egi ^ Eki ^ Emi ^ Esi; BCo = Ebo ^ Ego ^ Eko ^ Emo ^ Eso; BCu = Ebu ^ Egu ^ Eku ^ Emu ^ Esu; /* thetaRhoPiChiIotaPrepareTheta(round+1, E, A) */ Da = BCu ^ ROL64(BCe, 1); De = BCa ^ ROL64(BCi, 1); Di = BCe ^ ROL64(BCo, 1); Do = BCi ^ ROL64(BCu, 1); Du = BCo ^ ROL64(BCa, 1); Eba ^= Da; BCa = Eba; Ege ^= De; BCe = ROL64(Ege, 44); Eki ^= Di; BCi = ROL64(Eki, 43); Emo ^= Do; BCo = ROL64(Emo, 21); Esu ^= Du; BCu = ROL64(Esu, 14); Aba = BCa ^ ANDN64(BCe, BCi); Aba ^= *(round_consts++); Abe = BCe ^ 
ANDN64(BCi, BCo); Abi = BCi ^ ANDN64(BCo, BCu); Abo = BCo ^ ANDN64(BCu, BCa); Abu = BCu ^ ANDN64(BCa, BCe); Ebo ^= Do; BCa = ROL64(Ebo, 28); Egu ^= Du; BCe = ROL64(Egu, 20); Eka ^= Da; BCi = ROL64(Eka, 3); Eme ^= De; BCo = ROL64(Eme, 45); Esi ^= Di; BCu = ROL64(Esi, 61); Aga = BCa ^ ANDN64(BCe, BCi); Age = BCe ^ ANDN64(BCi, BCo); Agi = BCi ^ ANDN64(BCo, BCu); Ago = BCo ^ ANDN64(BCu, BCa); Agu = BCu ^ ANDN64(BCa, BCe); Ebe ^= De; BCa = ROL64(Ebe, 1); Egi ^= Di; BCe = ROL64(Egi, 6); Eko ^= Do; BCi = ROL64(Eko, 25); Emu ^= Du; BCo = ROL64(Emu, 8); Esa ^= Da; BCu = ROL64(Esa, 18); Aka = BCa ^ ANDN64(BCe, BCi); Ake = BCe ^ ANDN64(BCi, BCo); Aki = BCi ^ ANDN64(BCo, BCu); Ako = BCo ^ ANDN64(BCu, BCa); Aku = BCu ^ ANDN64(BCa, BCe); Ebu ^= Du; BCa = ROL64(Ebu, 27); Ega ^= Da; BCe = ROL64(Ega, 36); Eke ^= De; BCi = ROL64(Eke, 10); Emi ^= Di; BCo = ROL64(Emi, 15); Eso ^= Do; BCu = ROL64(Eso, 56); Ama = BCa ^ ANDN64(BCe, BCi); Ame = BCe ^ ANDN64(BCi, BCo); Ami = BCi ^ ANDN64(BCo, BCu); Amo = BCo ^ ANDN64(BCu, BCa); Amu = BCu ^ ANDN64(BCa, BCe); Ebi ^= Di; BCa = ROL64(Ebi, 62); Ego ^= Do; BCe = ROL64(Ego, 55); Eku ^= Du; BCi = ROL64(Eku, 39); Ema ^= Da; BCo = ROL64(Ema, 41); Ese ^= De; BCu = ROL64(Ese, 2); Asa = BCa ^ ANDN64(BCe, BCi); Ase = BCe ^ ANDN64(BCi, BCo); Asi = BCi ^ ANDN64(BCo, BCu); Aso = BCo ^ ANDN64(BCu, BCa); Asu = BCu ^ ANDN64(BCa, BCe); } while (round_consts < round_consts_end); state[0] = Aba; state[1] = Abe; state[2] = Abi; state[3] = Abo; state[4] = Abu; state[5] = Aga; state[6] = Age; state[7] = Agi; state[8] = Ago; state[9] = Agu; state[10] = Aka; state[11] = Ake; state[12] = Aki; state[13] = Ako; state[14] = Aku; state[15] = Ama; state[16] = Ame; state[17] = Ami; state[18] = Amo; state[19] = Amu; state[20] = Asa; state[21] = Ase; state[22] = Asi; state[23] = Aso; state[24] = Asu; return sizeof(void *) * 4 + sizeof(u64) * 12 * 5; } static unsigned int KECCAK_F1600_ABSORB_FUNC_NAME(KECCAK_STATE *hd, int pos, const byte *lanes, - unsigned int nlanes, int blocklanes) + size_t nlanes, int blocklanes) { unsigned int burn = 0; while (nlanes) { switch (blocklanes) { case 21: /* SHAKE128 */ while (pos == 0 && nlanes >= 21) { nlanes -= 21; absorb_lanes64_8(&hd->u.state64[0], lanes); lanes += 8 * 8; absorb_lanes64_8(&hd->u.state64[8], lanes); lanes += 8 * 8; absorb_lanes64_4(&hd->u.state64[16], lanes); lanes += 8 * 4; absorb_lanes64_1(&hd->u.state64[20], lanes); lanes += 8 * 1; burn = KECCAK_F1600_PERMUTE_FUNC_NAME(hd); } break; case 18: /* SHA3-224 */ while (pos == 0 && nlanes >= 18) { nlanes -= 18; absorb_lanes64_8(&hd->u.state64[0], lanes); lanes += 8 * 8; absorb_lanes64_8(&hd->u.state64[8], lanes); lanes += 8 * 8; absorb_lanes64_2(&hd->u.state64[16], lanes); lanes += 8 * 2; burn = KECCAK_F1600_PERMUTE_FUNC_NAME(hd); } break; case 17: /* SHA3-256 & SHAKE256 */ while (pos == 0 && nlanes >= 17) { nlanes -= 17; absorb_lanes64_8(&hd->u.state64[0], lanes); lanes += 8 * 8; absorb_lanes64_8(&hd->u.state64[8], lanes); lanes += 8 * 8; absorb_lanes64_1(&hd->u.state64[16], lanes); lanes += 8 * 1; burn = KECCAK_F1600_PERMUTE_FUNC_NAME(hd); } break; case 13: /* SHA3-384 */ while (pos == 0 && nlanes >= 13) { nlanes -= 13; absorb_lanes64_8(&hd->u.state64[0], lanes); lanes += 8 * 8; absorb_lanes64_4(&hd->u.state64[8], lanes); lanes += 8 * 4; absorb_lanes64_1(&hd->u.state64[12], lanes); lanes += 8 * 1; burn = KECCAK_F1600_PERMUTE_FUNC_NAME(hd); } break; case 9: /* SHA3-512 */ while (pos == 0 && nlanes >= 9) { nlanes -= 9; absorb_lanes64_8(&hd->u.state64[0], lanes); lanes += 8 * 8; 
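/* The SHA3-512 fast path absorbs one full 9-lane block (72 bytes, i.e. the 576-bit SHA3-512 rate) per permutation: eight lanes via absorb_lanes64_8 above and the remaining lane via absorb_lanes64_1 below. */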
absorb_lanes64_1(&hd->u.state64[8], lanes); lanes += 8 * 1; burn = KECCAK_F1600_PERMUTE_FUNC_NAME(hd); } break; } while (nlanes) { hd->u.state64[pos] ^= buf_get_le64(lanes); lanes += 8; nlanes--; if (++pos == blocklanes) { burn = KECCAK_F1600_PERMUTE_FUNC_NAME(hd); pos = 0; break; } } } return burn; }
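The case labels in the absorb routine above (21, 18, 17, 13, 9) are the block size expressed in 64-bit lanes, i.e. the sponge rate that remains after reserving the capacity. A minimal sketch of the derivation follows; the helper name is made up for illustration and is not part of libgcrypt:

#include <stdio.h>

/* Rate in 64-bit lanes of a 1600-bit Keccak state with the given
   capacity; hypothetical helper, for illustration only. */
static unsigned int keccak_block_lanes (unsigned int capacity_bits)
{
  return (1600 - capacity_bits) / 64;
}

int main (void)
{
  /* SHA3-N reserves a capacity of 2*N bits; SHAKE128 and SHAKE256
     reserve 256 and 512 bits.  The results match the case labels in
     the absorb loop above. */
  printf ("SHAKE128: %u\n", keccak_block_lanes (2 * 128)); /* 21 */
  printf ("SHA3-224: %u\n", keccak_block_lanes (2 * 224)); /* 18 */
  printf ("SHA3-256: %u\n", keccak_block_lanes (2 * 256)); /* 17 */
  printf ("SHAKE256: %u\n", keccak_block_lanes (2 * 256)); /* 17 */
  printf ("SHA3-384: %u\n", keccak_block_lanes (2 * 384)); /* 13 */
  printf ("SHA3-512: %u\n", keccak_block_lanes (2 * 512)); /*  9 */
  return 0;
}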
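For readers who want to check the unrolled permutation above against the specification, here is a compact, loop-based Keccak-f[1600] reference in portable C. It is an independent illustrative sketch, not libgcrypt code: ROL64 here is a plain 64-bit rotate, the (~x & y) expression in the chi step corresponds to what the in-tree ANDN64(x, y) macro computes, and the rotation offsets and lane permutation follow the usual compact reference formulation.

#include <stdint.h>

#define ROL64(x, n)  (((x) << (n)) | ((x) >> (64 - (n))))

static const uint64_t rc[24] = {
  0x0000000000000001ULL, 0x0000000000008082ULL, 0x800000000000808aULL,
  0x8000000080008000ULL, 0x000000000000808bULL, 0x0000000080000001ULL,
  0x8000000080008081ULL, 0x8000000000008009ULL, 0x000000000000008aULL,
  0x0000000000000088ULL, 0x0000000080008009ULL, 0x000000008000000aULL,
  0x000000008000808bULL, 0x800000000000008bULL, 0x8000000000008089ULL,
  0x8000000000008003ULL, 0x8000000000008002ULL, 0x8000000000000080ULL,
  0x000000000000800aULL, 0x800000008000000aULL, 0x8000000080008081ULL,
  0x8000000000008080ULL, 0x0000000080000001ULL, 0x8000000080008008ULL
};

static const unsigned rho[24] = {
   1,  3,  6, 10, 15, 21, 28, 36, 45, 55,  2, 14,
  27, 41, 56,  8, 25, 43, 62, 18, 39, 61, 20, 44
};
static const unsigned pi[24] = {
  10,  7, 11, 17, 18,  3,  5, 16,  8, 21, 24,  4,
  15, 23, 19, 13, 12,  2, 20, 14, 22,  9,  6,  1
};

static void keccak_f1600 (uint64_t a[25])
{
  uint64_t c[5], d, t;
  unsigned i, j, round;

  for (round = 0; round < 24; round++)
    {
      /* Theta: column parities folded back into every lane. */
      for (i = 0; i < 5; i++)
        c[i] = a[i] ^ a[i + 5] ^ a[i + 10] ^ a[i + 15] ^ a[i + 20];
      for (i = 0; i < 5; i++)
        {
          d = c[(i + 4) % 5] ^ ROL64 (c[(i + 1) % 5], 1);
          for (j = 0; j < 25; j += 5)
            a[j + i] ^= d;
        }

      /* Rho and Pi: rotate each lane and move it to its new position. */
      t = a[1];
      for (i = 0; i < 24; i++)
        {
          d = a[pi[i]];
          a[pi[i]] = ROL64 (t, rho[i]);
          t = d;
        }

      /* Chi: non-linear mixing within each row. */
      for (j = 0; j < 25; j += 5)
        {
          for (i = 0; i < 5; i++)
            c[i] = a[j + i];
          for (i = 0; i < 5; i++)
            a[j + i] = c[i] ^ (~c[(i + 1) % 5] & c[(i + 2) % 5]);
        }

      /* Iota: round constant into lane (0,0). */
      a[0] ^= rc[round];
    }
}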
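Finally, a usage sketch of how the specs registered in keccak.c above are reached through the public libgcrypt API (initialization checks and error handling are trimmed for brevity). The SHA3 entries carry a fixed digest length, while the SHAKE entries have a digest length of 0, a NULL read callback and keccak_extract as their extract callback, so their output is pulled with gcry_md_extract:

#include <stdio.h>
#include <gcrypt.h>

int main (void)
{
  unsigned char sha3[32];
  unsigned char xof[64];
  gcry_md_hd_t hd;
  int i;

  gcry_check_version (NULL);   /* library initialization, result ignored here */

  /* Fixed-length SHA3-256 via the one-shot helper; the 32-byte length
     comes from the mdlen field of _gcry_digest_spec_sha3_256. */
  gcry_md_hash_buffer (GCRY_MD_SHA3_256, sha3, "abc", 3);

  /* Variable-length SHAKE128: write the message, then extract as many
     output bytes as needed. */
  gcry_md_open (&hd, GCRY_MD_SHAKE128, 0);
  gcry_md_write (hd, "abc", 3);
  gcry_md_extract (hd, GCRY_MD_SHAKE128, xof, sizeof (xof));
  gcry_md_close (hd);

  for (i = 0; i < 32; i++)
    printf ("%02x", sha3[i]);  /* should match the "abc" selftest vector */
  printf ("\n");
  return 0;
}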