diff --git a/cipher/cipher-gcm-ppc.c b/cipher/cipher-gcm-ppc.c
index 2f60c09d..4f75e95c 100644
--- a/cipher/cipher-gcm-ppc.c
+++ b/cipher/cipher-gcm-ppc.c
@@ -1,544 +1,551 @@
 /* cipher-gcm-ppc.c  -  Power 8 vpmsum accelerated Galois Counter Mode
  *                      implementation
  * Copyright (C) 2019 Shawn Landden <shawn@git.icu>
  *
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser general Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
  * Libgcrypt is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
  * License along with this program; if not, see <http://www.gnu.org/licenses/>.
  *
  * Based on GHASH implementation by Andy Polyakov from CRYPTOGAMS
  * distribution (ppc/ghashp8-ppc.pl). Specifically, it uses his register
  * allocation (which then defers to your compiler's register allocation),
  * instead of re-implementing Gerald Estrin's Scheme of parallelized
  * multiplication of polynomials, as I did not understand this algorithm at
  * the time.
  *
  * Original copyright license follows:
  *
  *  Copyright (c) 2006, CRYPTOGAMS by <appro@openssl.org>
  *  All rights reserved.
  *
  *  Redistribution and use in source and binary forms, with or without
  *  modification, are permitted provided that the following conditions
  *  are met:
  *
  *        * Redistributions of source code must retain copyright notices,
  *          this list of conditions and the following disclaimer.
  *
  *        * Redistributions in binary form must reproduce the above
  *          copyright notice, this list of conditions and the following
  *          disclaimer in the documentation and/or other materials
  *          provided with the distribution.
  *
  *        * Neither the name of the CRYPTOGAMS nor the names of its
  *          copyright holder and contributors may be used to endorse or
  *          promote products derived from this software without specific
  *          prior written permission.
  *
  *  ALTERNATIVELY, provided that this notice is retained in full, this
  *  product may be distributed under the terms of the GNU General Public
  *  License (GPL), in which case the provisions of the GPL apply INSTEAD OF
  *  those given above.
  *
  *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
  *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  *  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  *  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  *  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  * SPDX-License-Identifier: (BSD-3-Clause OR GPL-2.0-only)
  */
 
 #include <config.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <errno.h>
 #include <stdint.h>
 
 #include "g10lib.h"
 #include "cipher.h"
 #include "bufhelp.h"
 #include "./cipher-internal.h"
 
 #ifdef GCM_USE_PPC_VPMSUM
 
 #include <altivec.h>
 
 #define ALWAYS_INLINE inline __attribute__((always_inline))
 #define NO_INSTRUMENT_FUNCTION __attribute__((no_instrument_function))
 
 #define ASM_FUNC_ATTR        NO_INSTRUMENT_FUNCTION
 #define ASM_FUNC_ATTR_INLINE ASM_FUNC_ATTR ALWAYS_INLINE
 
+#define ALIGNED_16 __attribute__ ((aligned (16)))
+
 typedef vector unsigned char vector16x_u8;
 typedef vector signed char vector16x_s8;
 typedef vector unsigned long long vector2x_u64;
 typedef vector unsigned long long block;
 
 static ASM_FUNC_ATTR_INLINE block
 asm_xor(block a, block b)
 {
   block r;
   __asm__ volatile ("xxlxor %x0, %x1, %x2"
 		    : "=wa" (r)
 		    : "wa" (a), "wa" (b));
   return r;
 }
 
 static ASM_FUNC_ATTR_INLINE block
 asm_vpmsumd(block a, block b)
 {
   block r;
   __asm__ volatile ("vpmsumd %0, %1, %2"
 		    : "=v" (r)
 		    : "v" (a), "v" (b));
   return r;
 }
 
 static ASM_FUNC_ATTR_INLINE block
 asm_swap_u64(block a)
 {
   block r;
   __asm__ volatile ("xxswapd %x0, %x1"
 		    : "=wa" (r)
 		    : "wa" (a));
   return r;
 }
 
 static ASM_FUNC_ATTR_INLINE block
 asm_mergelo(block l, block r)
 {
   block ret;
   __asm__ volatile ("xxmrgld %x0, %x1, %x2\n\t"
 		    : "=wa" (ret)
 		    : "wa" (l), "wa" (r));
   return ret;
 }
 
 static ASM_FUNC_ATTR_INLINE block
 asm_mergehi(block l, block r)
 {
   block ret;
   __asm__ volatile ("xxmrghd %x0, %x1, %x2\n\t"
 		    : "=wa" (ret)
 		    : "wa" (l), "wa" (r));
   return ret;
 }
 
 static ASM_FUNC_ATTR_INLINE block
 asm_rot_block_left(block a)
 {
   block r;
   block zero = { 0, 0 };
   __asm__ volatile ("xxmrgld %x0, %x1, %x2"
 		    : "=wa" (r)
 		    : "wa" (a), "wa" (zero));
   return r;
 }
 
 static ASM_FUNC_ATTR_INLINE block
 asm_rot_block_right(block a)
 {
   block r;
   block zero = { 0, 0 };
   __asm__ volatile ("xxsldwi %x0, %x2, %x1, 2"
 		    : "=wa" (r)
 		    : "wa" (a), "wa" (zero));
   return r;
 }
 
 /* vsl is a slightly strange function in the way the shift is passed... */
 static ASM_FUNC_ATTR_INLINE block
 asm_ashl_128(block a, vector16x_u8 shift)
 {
   block r;
   __asm__ volatile ("vsl %0, %1, %2"
 		    : "=v" (r)
 		    : "v" (a), "v" (shift));
   return r;
 }
 
 #define STORE_TABLE(gcm_table, slot, vec) \
   vec_store_he (((block)vec), slot * 16, (unsigned char *)(gcm_table));
 
 static ASM_FUNC_ATTR_INLINE void
 vec_store_he(block vec, unsigned long offset, unsigned char *ptr)
 {
-#ifndef WORDS_BIGENDIAN
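-  /* GCC vec_vsx_ld is generating two instructions on little-endian. Use
-   * lxvd2x directly instead. */
+  /* GCC vec_vsx_st is generating two instructions on little-endian. Use
+   * stxvd2x directly instead. */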
 #if __GNUC__ >= 4
   if (__builtin_constant_p (offset) && offset == 0)
     __asm__ volatile ("stxvd2x %x0, 0, %1\n\t"
 		    :
 		    : "wa" (vec), "r" ((uintptr_t)ptr)
 		    : "memory", "r0");
   else
 #endif
     __asm__ volatile ("stxvd2x %x0, %1, %2\n\t"
 		      :
 		      : "wa" (vec), "r" (offset), "r" ((uintptr_t)ptr)
 		      : "memory", "r0");
-#else
-  vec_vsx_st ((vector16x_u8)vec, offset, ptr);
-#endif
 }
 
 #define VEC_LOAD_BE(in_ptr, bswap_const) \
   vec_be_swap(vec_load_he (0, (const unsigned char *)(in_ptr)), bswap_const)
 
 static ASM_FUNC_ATTR_INLINE block
 vec_load_he(unsigned long offset, const unsigned char *ptr)
 {
-#ifndef WORDS_BIGENDIAN
   block vec;
   /* GCC vec_vsx_ld is generating two instructions on little-endian. Use
    * lxvd2x directly instead. */
 #if __GNUC__ >= 4
   if (__builtin_constant_p (offset) && offset == 0)
     __asm__ volatile ("lxvd2x %x0, 0, %1\n\t"
 		    : "=wa" (vec)
 		    : "r" ((uintptr_t)ptr)
 		    : "memory", "r0");
   else
 #endif
     __asm__ volatile ("lxvd2x %x0, %1, %2\n\t"
 		      : "=wa" (vec)
 		      : "r" (offset), "r" ((uintptr_t)ptr)
 		      : "memory", "r0");
   return vec;
-#else
-  return vec_vsx_ld (offset, ptr);
-#endif
 }
 
 static ASM_FUNC_ATTR_INLINE block
 vec_be_swap(block vec, vector16x_u8 be_bswap_const)
 {
 #ifndef WORDS_BIGENDIAN
   __asm__ volatile ("vperm %0, %1, %1, %2\n\t"
 		    : "=v" (vec)
 		    : "v" (vec), "v" (be_bswap_const));
 #else
   (void)be_bswap_const;
 #endif
   return vec;
 }
 
+static ASM_FUNC_ATTR_INLINE block
+vec_dup_byte_elem(block vec, int idx)
+{
+#ifndef WORDS_BIGENDIAN /* little-endian: splat byte element IDX as given */
+  return (block)vec_splat((vector16x_s8)vec, idx);
+#else /* big-endian: splat the mirrored byte element, (15 - IDX) & 15 */
+  return (block)vec_splat((vector16x_s8)vec, (15 - idx) & 15);
+#endif
+}
 
 /* Power ghash based on papers:
    "The Galois/Counter Mode of Operation (GCM)"; David A. McGrew, John Viega
    "Intel® Carry-Less Multiplication Instruction and its Usage for Computing
     the GCM Mode - Rev 2.01"; Shay Gueron, Michael E. Kounavis.
 
    After saving the magic c2 constant and pre-formatted version of the key,
    we pre-process the key for parallel hashing. This takes advantage of the
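-   identity of addition over a galois field being identital to XOR, and thus
-   can be parellized (S 2.2, page 3). We multiply and add (galois field
+   identity of addition over a galois field being identical to XOR, and thus
+   can be parallelized (S 2.2, page 3). We multiply and add (galois field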
    versions) the key over multiple iterations and save the result. This can
    later be galois added (XORed) with parallel processed input (Estrin's
    Scheme).
 
    The ghash "key" is a salt. */
 void ASM_FUNC_ATTR
-_gcry_ghash_setup_ppc_vpmsum (uint64_t *gcm_table, void *gcm_key)
+_gcry_ghash_setup_ppc_vpmsum (void *gcm_table_arg, void *gcm_key)
 {
-  static const vector16x_u8 bswap_const =
+  static const vector16x_u8 bswap_const ALIGNED_16 =
     { ~7, ~6, ~5, ~4, ~3, ~2, ~1, ~0, ~15, ~14, ~13, ~12, ~11, ~10, ~9, ~8 };
-  static const vector16x_u8 c2 =
-    { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0b11000010 };
-  static const vector16x_u8 one =
+  static const byte c2[16] ALIGNED_16 =
+    { 0xc2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 };
+  static const vector16x_u8 one ALIGNED_16 =
     { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 };
+  uint64_t *gcm_table = gcm_table_arg;
   block T0, T1, T2;
   block C2, H, H1, H1l, H1h, H2, H2l, H2h;
   block H3l, H3, H3h, H4l, H4, H4h, T3, T4;
   vector16x_s8 most_sig_of_H, t7, carry;
 
   H = VEC_LOAD_BE(gcm_key, bswap_const);
-  most_sig_of_H = vec_splat((vector16x_s8)H, 15);
+  C2 = VEC_LOAD_BE(c2, bswap_const);
+  most_sig_of_H = (vector16x_s8)vec_dup_byte_elem(H, 15);
   t7 = vec_splat_s8(7);
   carry = most_sig_of_H >> t7;
-  carry &= c2; /* only interested in certain carries. */
+  carry &= (vector16x_s8)C2; /* only interested in certain carries. */
   H1 = asm_ashl_128(H, one);
   H1 ^= (block)carry; /* complete the <<< 1 */
 
   T1 = asm_swap_u64 (H1);
   H1l = asm_rot_block_right (T1);
   H1h = asm_rot_block_left (T1);
-  C2 = asm_rot_block_right ((block)c2);
+  C2 = asm_rot_block_right (C2);
 
   STORE_TABLE (gcm_table, 0, C2);
   STORE_TABLE (gcm_table, 1, H1l);
   STORE_TABLE (gcm_table, 2, T1);
   STORE_TABLE (gcm_table, 3, H1h);
 
   /* pre-process coefficients for Gerald Estrin's scheme for parallel
    * multiplication of polynomials
    */
   H2l = asm_vpmsumd (H1l, H1); /* do not need to mask in
                                   because 0 * anything -> 0 */
   H2 = asm_vpmsumd (T1, H1);
   H2h = asm_vpmsumd (H1h, H1);
 
   /* reduce 1 */
   T0 = asm_vpmsumd (H2l, C2);
 
   H2l ^= asm_rot_block_left (H2);
   H2h ^= asm_rot_block_right (H2);
   H2l = asm_swap_u64 (H2l);
   H2l ^= T0;
   /* reduce 2 */
   T0 = asm_swap_u64 (H2l);
   H2l = asm_vpmsumd (H2l, C2);
   H2 = H2l ^ H2h ^ T0;
 
   T2 = asm_swap_u64 (H2);
   H2l = asm_rot_block_right (T2);
   H2h = asm_rot_block_left (T2);
 
   STORE_TABLE (gcm_table, 4, H2l);
   STORE_TABLE (gcm_table, 5, T2);
   STORE_TABLE (gcm_table, 6, H2h);
 
   H3l = asm_vpmsumd (H2l, H1);
   H4l = asm_vpmsumd (H2l, H2);
   H3 = asm_vpmsumd (T2, H1);
   H4 = asm_vpmsumd (T2, H2);
   H3h = asm_vpmsumd (H2h, H1);
   H4h = asm_vpmsumd (H2h, H2);
 
   T3 = asm_vpmsumd (H3l, C2);
   T4 = asm_vpmsumd (H4l, C2);
 
   H3l ^= asm_rot_block_left (H3);
   H3h ^= asm_rot_block_right (H3);
   H4l ^= asm_rot_block_left (H4);
   H4h ^= asm_rot_block_right (H4);
 
   H3 = asm_swap_u64 (H3l);
   H4 = asm_swap_u64 (H4l);
 
   H3 ^= T3;
   H4 ^= T4;
 
   /* We could have also b64 switched reduce and reduce2, however as we are
      using the unrotated H and H2 above to vpmsum, this is marginally better. */
   T3 = asm_swap_u64 (H3);
   T4 = asm_swap_u64 (H4);
 
   H3 = asm_vpmsumd (H3, C2);
   H4 = asm_vpmsumd (H4, C2);
 
   T3 ^= H3h;
   T4 ^= H4h;
   H3 ^= T3;
   H4 ^= T4;
   H3 = asm_swap_u64 (H3);
   H4 = asm_swap_u64 (H4);
 
   H3l = asm_rot_block_right (H3);
   H3h = asm_rot_block_left (H3);
   H4l = asm_rot_block_right (H4);
   H4h = asm_rot_block_left (H4);
 
   STORE_TABLE (gcm_table, 7, H3l);
   STORE_TABLE (gcm_table, 8, H3);
   STORE_TABLE (gcm_table, 9, H3h);
   STORE_TABLE (gcm_table, 10, H4l);
   STORE_TABLE (gcm_table, 11, H4);
   STORE_TABLE (gcm_table, 12, H4h);
 }
 
-void ASM_FUNC_ATTR
-_gcry_ghash_ppc_vpmsum (byte *result, const void *const gcm_table,
+unsigned int ASM_FUNC_ATTR
+_gcry_ghash_ppc_vpmsum (byte *result, void *gcm_table,
 			const byte *buf, const size_t nblocks)
 {
-  static const vector16x_u8 bswap_const =
+  static const vector16x_u8 bswap_const ALIGNED_16 =
     { ~7, ~6, ~5, ~4, ~3, ~2, ~1, ~0, ~15, ~14, ~13, ~12, ~11, ~10, ~9, ~8 };
   block c2, H0l, H0m, H0h, H4l, H4m, H4h, H2m, H3l, H3m, H3h, Hl;
   block Hm, Hh, in, in0, in1, in2, in3, Hm_right, Hl_rotate, cur;
   size_t blocks_remaining = nblocks;
   size_t not_multiple_of_four;
   block t0;
 
   cur = vec_be_swap (vec_load_he (0, result), bswap_const);
 
   c2 = vec_load_he (0, gcm_table);
   H0l = vec_load_he (16, gcm_table);
   H0m = vec_load_he (32, gcm_table);
   H0h = vec_load_he (48, gcm_table);
 
   for (not_multiple_of_four = nblocks % 4; not_multiple_of_four;
        not_multiple_of_four--)
     {
       in = vec_be_swap (vec_load_he (0, buf), bswap_const);
       buf += 16;
       blocks_remaining--;
       cur ^= in;
 
       Hl = asm_vpmsumd (cur, H0l);
       Hm = asm_vpmsumd (cur, H0m);
       Hh = asm_vpmsumd (cur, H0h);
 
       t0 = asm_vpmsumd (Hl, c2);
 
       Hl ^= asm_rot_block_left (Hm);
 
       Hm_right = asm_rot_block_right (Hm);
       Hh ^= Hm_right;
       Hl_rotate = asm_swap_u64 (Hl);
       Hl_rotate ^= t0;
       Hl = asm_swap_u64 (Hl_rotate);
       Hl_rotate = asm_vpmsumd (Hl_rotate, c2);
       Hl ^= Hh;
       Hl ^= Hl_rotate;
 
       cur = Hl;
   }
 
   if (blocks_remaining > 0)
     {
       block Xl, Xm, Xh, Xl1, Xm1, Xh1, Xm2, Xl3, Xm3, Xh3, Xl_rotate;
       block H21l, H21h, merge_l, merge_h;
       block t1, t2;
 
       H2m = vec_load_he (48 + 32, gcm_table);
       H3l = vec_load_he (48 * 2 + 16, gcm_table);
       H3m = vec_load_he (48 * 2 + 32, gcm_table);
       H3h = vec_load_he (48 * 2 + 48, gcm_table);
       H4l = vec_load_he (48 * 3 + 16, gcm_table);
       H4m = vec_load_he (48 * 3 + 32, gcm_table);
       H4h = vec_load_he (48 * 3 + 48, gcm_table);
 
       in0 = vec_load_he (0, buf);
       in1 = vec_load_he (16, buf);
       in2 = vec_load_he (32, buf);
       in3 = vec_load_he (48, buf);
       in0 = vec_be_swap(in0, bswap_const);
       in1 = vec_be_swap(in1, bswap_const);
       in2 = vec_be_swap(in2, bswap_const);
       in3 = vec_be_swap(in3, bswap_const);
 
       Xh = asm_xor (in0, cur);
 
       Xl1 = asm_vpmsumd (in1, H3l);
       Xm1 = asm_vpmsumd (in1, H3m);
       Xh1 = asm_vpmsumd (in1, H3h);
 
       H21l = asm_mergehi (H2m, H0m);
       H21h = asm_mergelo (H2m, H0m);
       merge_l = asm_mergelo (in2, in3);
       merge_h = asm_mergehi (in2, in3);
 
       Xm2 = asm_vpmsumd (in2, H2m);
       Xl3 = asm_vpmsumd (merge_l, H21l);
       Xm3 = asm_vpmsumd (in3, H0m);
       Xh3 = asm_vpmsumd (merge_h, H21h);
 
       Xm2 = asm_xor (Xm2, Xm1);
       Xl3 = asm_xor (Xl3, Xl1);
       Xm3 = asm_xor (Xm3, Xm2);
       Xh3 = asm_xor (Xh3, Xh1);
 
       /* Gerald Estrin's scheme for parallel multiplication of polynomials */
       while (1)
         {
 	  buf += 64;
 	  blocks_remaining -= 4;
 	  if (!blocks_remaining)
 	    break;
 
 	  in0 = vec_load_he (0, buf);
 	  in1 = vec_load_he (16, buf);
 	  in2 = vec_load_he (32, buf);
 	  in3 = vec_load_he (48, buf);
 	  in1 = vec_be_swap(in1, bswap_const);
 	  in2 = vec_be_swap(in2, bswap_const);
 	  in3 = vec_be_swap(in3, bswap_const);
 	  in0 = vec_be_swap(in0, bswap_const);
 
 	  Xl = asm_vpmsumd (Xh, H4l);
 	  Xm = asm_vpmsumd (Xh, H4m);
 	  Xh = asm_vpmsumd (Xh, H4h);
 	  Xl1 = asm_vpmsumd (in1, H3l);
 	  Xm1 = asm_vpmsumd (in1, H3m);
 	  Xh1 = asm_vpmsumd (in1, H3h);
 
 	  Xl = asm_xor (Xl, Xl3);
 	  Xm = asm_xor (Xm, Xm3);
 	  Xh = asm_xor (Xh, Xh3);
 	  merge_l = asm_mergelo (in2, in3);
 	  merge_h = asm_mergehi (in2, in3);
 
 	  t0 = asm_vpmsumd (Xl, c2);
 	  Xl3 = asm_vpmsumd (merge_l, H21l);
 	  Xh3 = asm_vpmsumd (merge_h, H21h);
 
 	  t1 = asm_rot_block_left (Xm);
 	  t2 = asm_rot_block_right (Xm);
 	  Xl = asm_xor(Xl, t1);
 	  Xh = asm_xor(Xh, t2);
 
 	  Xl = asm_swap_u64 (Xl);
 	  Xl = asm_xor(Xl, t0);
 
 	  Xl_rotate = asm_swap_u64 (Xl);
 	  Xm2 = asm_vpmsumd (in2, H2m);
 	  Xm3 = asm_vpmsumd (in3, H0m);
 	  Xl = asm_vpmsumd (Xl, c2);
 
 	  Xl3 = asm_xor (Xl3, Xl1);
 	  Xh3 = asm_xor (Xh3, Xh1);
 	  Xh = asm_xor (Xh, in0);
 	  Xm2 = asm_xor (Xm2, Xm1);
 	  Xh = asm_xor (Xh, Xl_rotate);
 	  Xm3 = asm_xor (Xm3, Xm2);
 	  Xh = asm_xor (Xh, Xl);
 	}
 
       Xl = asm_vpmsumd (Xh, H4l);
       Xm = asm_vpmsumd (Xh, H4m);
       Xh = asm_vpmsumd (Xh, H4h);
 
       Xl = asm_xor (Xl, Xl3);
       Xm = asm_xor (Xm, Xm3);
 
       t0 = asm_vpmsumd (Xl, c2);
 
       Xh = asm_xor (Xh, Xh3);
       t1 = asm_rot_block_left (Xm);
       t2 = asm_rot_block_right (Xm);
       Xl = asm_xor (Xl, t1);
       Xh = asm_xor (Xh, t2);
 
       Xl = asm_swap_u64 (Xl);
       Xl = asm_xor (Xl, t0);
 
       Xl_rotate = asm_swap_u64 (Xl);
       Xl = asm_vpmsumd (Xl, c2);
       Xh = asm_xor (Xh, Xl_rotate);
       cur = asm_xor (Xh, Xl);
     }
 
   vec_store_he (vec_be_swap (cur, bswap_const), 0, result);
+
+  return 0;
 }
 
 #endif /* GCM_USE_PPC_VPMSUM */
diff --git a/cipher/cipher-gcm.c b/cipher/cipher-gcm.c
index 598ea5fb..4ce85408 100644
--- a/cipher/cipher-gcm.c
+++ b/cipher/cipher-gcm.c
@@ -1,1236 +1,1236 @@
 /* cipher-gcm.c  - Generic Galois Counter Mode implementation
  * Copyright (C) 2013 Dmitry Eremin-Solenikov
  * Copyright (C) 2013, 2018-2019 Jussi Kivilinna <jussi.kivilinna@iki.fi>
  *
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser general Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
  * Libgcrypt is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
  * License along with this program; if not, see <http://www.gnu.org/licenses/>.
  */
 
 #include <config.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <errno.h>
 
 #include "g10lib.h"
 #include "cipher.h"
 #include "bufhelp.h"
 #include "./cipher-internal.h"
 
 
 /* Helper macro to force alignment to 16 or 64 bytes.  */
 #ifdef HAVE_GCC_ATTRIBUTE_ALIGNED
 # define ATTR_ALIGNED_64  __attribute__ ((aligned (64)))
 #else
 # define ATTR_ALIGNED_64
 #endif
 
 
 #ifdef GCM_USE_INTEL_PCLMUL
 extern void _gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c);
 
 extern unsigned int _gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result,
                                               const byte *buf, size_t nblocks);
 #endif
 
 #ifdef GCM_USE_ARM_PMULL
 extern void _gcry_ghash_setup_armv8_ce_pmull (void *gcm_key, void *gcm_table);
 
 extern unsigned int _gcry_ghash_armv8_ce_pmull (void *gcm_key, byte *result,
                                                 const byte *buf, size_t nblocks,
                                                 void *gcm_table);
 
 static void
 ghash_setup_armv8_ce_pmull (gcry_cipher_hd_t c)
 {
   _gcry_ghash_setup_armv8_ce_pmull(c->u_mode.gcm.u_ghash_key.key,
                                    c->u_mode.gcm.gcm_table);
 }
 
 static unsigned int
 ghash_armv8_ce_pmull (gcry_cipher_hd_t c, byte *result, const byte *buf,
                       size_t nblocks)
 {
   return _gcry_ghash_armv8_ce_pmull(c->u_mode.gcm.u_ghash_key.key, result, buf,
                                     nblocks, c->u_mode.gcm.gcm_table);
 }
 #endif /* GCM_USE_ARM_PMULL */
 
 #ifdef GCM_USE_ARM_NEON
 extern void _gcry_ghash_setup_armv7_neon (void *gcm_key);
 
 extern unsigned int _gcry_ghash_armv7_neon (void *gcm_key, byte *result,
 					    const byte *buf, size_t nblocks);
 
 static void
 ghash_setup_armv7_neon (gcry_cipher_hd_t c)
 {
   _gcry_ghash_setup_armv7_neon(c->u_mode.gcm.u_ghash_key.key);
 }
 
 static unsigned int
 ghash_armv7_neon (gcry_cipher_hd_t c, byte *result, const byte *buf,
 		  size_t nblocks)
 {
   return _gcry_ghash_armv7_neon(c->u_mode.gcm.u_ghash_key.key, result, buf,
 				nblocks);
 }
 #endif /* GCM_USE_ARM_NEON */
 
 #ifdef GCM_USE_S390X_CRYPTO
 #include "asm-inline-s390x.h"
 
 static unsigned int
 ghash_s390x_kimd (gcry_cipher_hd_t c, byte *result, const byte *buf,
 		  size_t nblocks)
 {
   u128_t params[2];
 
   memcpy (&params[0], result, 16);
   memcpy (&params[1], c->u_mode.gcm.u_ghash_key.key, 16);
 
   kimd_execute (KMID_FUNCTION_GHASH, &params, buf, nblocks * 16);
 
   memcpy (result, &params[0], 16);
   wipememory (params, sizeof(params));
   return 0;
 }
 #endif /* GCM_USE_S390X_CRYPTO*/
 
 #ifdef GCM_USE_PPC_VPMSUM
 extern void _gcry_ghash_setup_ppc_vpmsum (void *gcm_table, void *gcm_key);
 
 /* result is 128-bits */
 extern unsigned int _gcry_ghash_ppc_vpmsum (byte *result, void *gcm_table,
 					    const byte *buf, size_t nblocks);
 
 static void
 ghash_setup_ppc_vpmsum (gcry_cipher_hd_t c)
 {
-  _gcry_ghash_setup_ppc_vpmsum(c->u_mode.gcm.gcm_table, c->u_mode.gcm.u_ghash_key.key);
+  _gcry_ghash_setup_ppc_vpmsum(c->u_mode.gcm.gcm_table,
+			       c->u_mode.gcm.u_ghash_key.key);
 }
 
 static unsigned int
 ghash_ppc_vpmsum (gcry_cipher_hd_t c, byte *result, const byte *buf,
 		  size_t nblocks)
 {
-  _gcry_ghash_ppc_vpmsum(result, c->u_mode.gcm.gcm_table, buf,
-			 nblocks);
-  return 0;
+  return _gcry_ghash_ppc_vpmsum(result, c->u_mode.gcm.gcm_table, buf,
+				nblocks);
 }
 #endif /* GCM_USE_PPC_VPMSUM */
 
 #ifdef GCM_USE_TABLES
 static struct
 {
   volatile u32 counter_head;
   u32 cacheline_align[64 / 4 - 1];
   u16 R[256];
   volatile u32 counter_tail;
 } gcm_table ATTR_ALIGNED_64 =
   {
     0,
     { 0, },
     {
       0x0000, 0x01c2, 0x0384, 0x0246, 0x0708, 0x06ca, 0x048c, 0x054e,
       0x0e10, 0x0fd2, 0x0d94, 0x0c56, 0x0918, 0x08da, 0x0a9c, 0x0b5e,
       0x1c20, 0x1de2, 0x1fa4, 0x1e66, 0x1b28, 0x1aea, 0x18ac, 0x196e,
       0x1230, 0x13f2, 0x11b4, 0x1076, 0x1538, 0x14fa, 0x16bc, 0x177e,
       0x3840, 0x3982, 0x3bc4, 0x3a06, 0x3f48, 0x3e8a, 0x3ccc, 0x3d0e,
       0x3650, 0x3792, 0x35d4, 0x3416, 0x3158, 0x309a, 0x32dc, 0x331e,
       0x2460, 0x25a2, 0x27e4, 0x2626, 0x2368, 0x22aa, 0x20ec, 0x212e,
       0x2a70, 0x2bb2, 0x29f4, 0x2836, 0x2d78, 0x2cba, 0x2efc, 0x2f3e,
       0x7080, 0x7142, 0x7304, 0x72c6, 0x7788, 0x764a, 0x740c, 0x75ce,
       0x7e90, 0x7f52, 0x7d14, 0x7cd6, 0x7998, 0x785a, 0x7a1c, 0x7bde,
       0x6ca0, 0x6d62, 0x6f24, 0x6ee6, 0x6ba8, 0x6a6a, 0x682c, 0x69ee,
       0x62b0, 0x6372, 0x6134, 0x60f6, 0x65b8, 0x647a, 0x663c, 0x67fe,
       0x48c0, 0x4902, 0x4b44, 0x4a86, 0x4fc8, 0x4e0a, 0x4c4c, 0x4d8e,
       0x46d0, 0x4712, 0x4554, 0x4496, 0x41d8, 0x401a, 0x425c, 0x439e,
       0x54e0, 0x5522, 0x5764, 0x56a6, 0x53e8, 0x522a, 0x506c, 0x51ae,
       0x5af0, 0x5b32, 0x5974, 0x58b6, 0x5df8, 0x5c3a, 0x5e7c, 0x5fbe,
       0xe100, 0xe0c2, 0xe284, 0xe346, 0xe608, 0xe7ca, 0xe58c, 0xe44e,
       0xef10, 0xeed2, 0xec94, 0xed56, 0xe818, 0xe9da, 0xeb9c, 0xea5e,
       0xfd20, 0xfce2, 0xfea4, 0xff66, 0xfa28, 0xfbea, 0xf9ac, 0xf86e,
       0xf330, 0xf2f2, 0xf0b4, 0xf176, 0xf438, 0xf5fa, 0xf7bc, 0xf67e,
       0xd940, 0xd882, 0xdac4, 0xdb06, 0xde48, 0xdf8a, 0xddcc, 0xdc0e,
       0xd750, 0xd692, 0xd4d4, 0xd516, 0xd058, 0xd19a, 0xd3dc, 0xd21e,
       0xc560, 0xc4a2, 0xc6e4, 0xc726, 0xc268, 0xc3aa, 0xc1ec, 0xc02e,
       0xcb70, 0xcab2, 0xc8f4, 0xc936, 0xcc78, 0xcdba, 0xcffc, 0xce3e,
       0x9180, 0x9042, 0x9204, 0x93c6, 0x9688, 0x974a, 0x950c, 0x94ce,
       0x9f90, 0x9e52, 0x9c14, 0x9dd6, 0x9898, 0x995a, 0x9b1c, 0x9ade,
       0x8da0, 0x8c62, 0x8e24, 0x8fe6, 0x8aa8, 0x8b6a, 0x892c, 0x88ee,
       0x83b0, 0x8272, 0x8034, 0x81f6, 0x84b8, 0x857a, 0x873c, 0x86fe,
       0xa9c0, 0xa802, 0xaa44, 0xab86, 0xaec8, 0xaf0a, 0xad4c, 0xac8e,
       0xa7d0, 0xa612, 0xa454, 0xa596, 0xa0d8, 0xa11a, 0xa35c, 0xa29e,
       0xb5e0, 0xb422, 0xb664, 0xb7a6, 0xb2e8, 0xb32a, 0xb16c, 0xb0ae,
       0xbbf0, 0xba32, 0xb874, 0xb9b6, 0xbcf8, 0xbd3a, 0xbf7c, 0xbebe,
     },
     0
   };
 
 #define gcmR gcm_table.R
 
 static inline
 void prefetch_table(const void *tab, size_t len)
 {
   const volatile byte *vtab = tab;
   size_t i;
 
   for (i = 0; len - i >= 8 * 32; i += 8 * 32)
     {
       (void)vtab[i + 0 * 32];
       (void)vtab[i + 1 * 32];
       (void)vtab[i + 2 * 32];
       (void)vtab[i + 3 * 32];
       (void)vtab[i + 4 * 32];
       (void)vtab[i + 5 * 32];
       (void)vtab[i + 6 * 32];
       (void)vtab[i + 7 * 32];
     }
   for (; i < len; i += 32)
     {
       (void)vtab[i];
     }
 
   (void)vtab[len - 1];
 }
 
 static inline void
 do_prefetch_tables (const void *gcmM, size_t gcmM_size)
 {
   /* Modify counters to trigger copy-on-write and unsharing if physical pages
    * of look-up table are shared between processes.  Modifying counters also
    * causes checksums for pages to change and hint same-page merging algorithm
    * that these pages are frequently changing.  */
   gcm_table.counter_head++;
   gcm_table.counter_tail++;
 
   /* Prefetch look-up tables to cache.  */
   prefetch_table(gcmM, gcmM_size);
   prefetch_table(&gcm_table, sizeof(gcm_table));
 }
 
 #ifdef GCM_TABLES_USE_U64
 static void
 bshift (u64 * b0, u64 * b1)
 {
   u64 t[2], mask;
 
   t[0] = *b0;
   t[1] = *b1;
   mask = -(t[1] & 1) & 0xe1;
   mask <<= 56;
 
   *b1 = (t[1] >> 1) ^ (t[0] << 63);
   *b0 = (t[0] >> 1) ^ mask;
 }
 
 static void
 do_fillM (unsigned char *h, u64 *M)
 {
   int i, j;
 
   M[0 + 0] = 0;
   M[0 + 16] = 0;
 
   M[8 + 0] = buf_get_be64 (h + 0);
   M[8 + 16] = buf_get_be64 (h + 8);
 
   for (i = 4; i > 0; i /= 2)
     {
       M[i + 0] = M[2 * i + 0];
       M[i + 16] = M[2 * i + 16];
 
       bshift (&M[i], &M[i + 16]);
     }
 
   for (i = 2; i < 16; i *= 2)
     for (j = 1; j < i; j++)
       {
         M[(i + j) + 0] = M[i + 0] ^ M[j + 0];
         M[(i + j) + 16] = M[i + 16] ^ M[j + 16];
       }
 
   for (i = 0; i < 16; i++)
     {
       M[i + 32] = (M[i + 0] >> 4) ^ ((u64) gcmR[(M[i + 16] & 0xf) << 4] << 48);
       M[i + 48] = (M[i + 16] >> 4) ^ (M[i + 0] << 60);
     }
 }
 
 static inline unsigned int
 do_ghash (unsigned char *result, const unsigned char *buf, const u64 *gcmM)
 {
   u64 V[2];
   u64 tmp[2];
   const u64 *M;
   u64 T;
   u32 A;
   int i;
 
   cipher_block_xor (V, result, buf, 16);
   V[0] = be_bswap64 (V[0]);
   V[1] = be_bswap64 (V[1]);
 
   /* First round can be manually tweaked based on fact that 'tmp' is zero. */
   M = &gcmM[(V[1] & 0xf) + 32];
   V[1] >>= 4;
   tmp[0] = M[0];
   tmp[1] = M[16];
   tmp[0] ^= gcmM[(V[1] & 0xf) + 0];
   tmp[1] ^= gcmM[(V[1] & 0xf) + 16];
   V[1] >>= 4;
 
   i = 6;
   while (1)
     {
       M = &gcmM[(V[1] & 0xf) + 32];
       V[1] >>= 4;
 
       A = tmp[1] & 0xff;
       T = tmp[0];
       tmp[0] = (T >> 8) ^ ((u64) gcmR[A] << 48) ^ gcmM[(V[1] & 0xf) + 0];
       tmp[1] = (T << 56) ^ (tmp[1] >> 8) ^ gcmM[(V[1] & 0xf) + 16];
 
       tmp[0] ^= M[0];
       tmp[1] ^= M[16];
 
       if (i == 0)
         break;
 
       V[1] >>= 4;
       --i;
     }
 
   i = 7;
   while (1)
     {
       M = &gcmM[(V[0] & 0xf) + 32];
       V[0] >>= 4;
 
       A = tmp[1] & 0xff;
       T = tmp[0];
       tmp[0] = (T >> 8) ^ ((u64) gcmR[A] << 48) ^ gcmM[(V[0] & 0xf) + 0];
       tmp[1] = (T << 56) ^ (tmp[1] >> 8) ^ gcmM[(V[0] & 0xf) + 16];
 
       tmp[0] ^= M[0];
       tmp[1] ^= M[16];
 
       if (i == 0)
         break;
 
       V[0] >>= 4;
       --i;
     }
 
   buf_put_be64 (result + 0, tmp[0]);
   buf_put_be64 (result + 8, tmp[1]);
 
   return (sizeof(V) + sizeof(T) + sizeof(tmp) +
           sizeof(int)*2 + sizeof(void*)*5);
 }
 
 #else /*!GCM_TABLES_USE_U64*/
 
 static void
 bshift (u32 * M, int i)
 {
   u32 t[4], mask;
 
   t[0] = M[i * 4 + 0];
   t[1] = M[i * 4 + 1];
   t[2] = M[i * 4 + 2];
   t[3] = M[i * 4 + 3];
   mask = -(t[3] & 1) & 0xe1;
 
   M[i * 4 + 3] = (t[3] >> 1) ^ (t[2] << 31);
   M[i * 4 + 2] = (t[2] >> 1) ^ (t[1] << 31);
   M[i * 4 + 1] = (t[1] >> 1) ^ (t[0] << 31);
   M[i * 4 + 0] = (t[0] >> 1) ^ (mask << 24);
 }
 
 static void
 do_fillM (unsigned char *h, u32 *M)
 {
   int i, j;
 
   M[0 * 4 + 0] = 0;
   M[0 * 4 + 1] = 0;
   M[0 * 4 + 2] = 0;
   M[0 * 4 + 3] = 0;
 
   M[8 * 4 + 0] = buf_get_be32 (h + 0);
   M[8 * 4 + 1] = buf_get_be32 (h + 4);
   M[8 * 4 + 2] = buf_get_be32 (h + 8);
   M[8 * 4 + 3] = buf_get_be32 (h + 12);
 
   for (i = 4; i > 0; i /= 2)
     {
       M[i * 4 + 0] = M[2 * i * 4 + 0];
       M[i * 4 + 1] = M[2 * i * 4 + 1];
       M[i * 4 + 2] = M[2 * i * 4 + 2];
       M[i * 4 + 3] = M[2 * i * 4 + 3];
 
       bshift (M, i);
     }
 
   for (i = 2; i < 16; i *= 2)
     for (j = 1; j < i; j++)
       {
         M[(i + j) * 4 + 0] = M[i * 4 + 0] ^ M[j * 4 + 0];
         M[(i + j) * 4 + 1] = M[i * 4 + 1] ^ M[j * 4 + 1];
         M[(i + j) * 4 + 2] = M[i * 4 + 2] ^ M[j * 4 + 2];
         M[(i + j) * 4 + 3] = M[i * 4 + 3] ^ M[j * 4 + 3];
       }
 
   for (i = 0; i < 4 * 16; i += 4)
     {
       M[i + 0 + 64] = (M[i + 0] >> 4)
                       ^ ((u64) gcmR[(M[i + 3] << 4) & 0xf0] << 16);
       M[i + 1 + 64] = (M[i + 1] >> 4) ^ (M[i + 0] << 28);
       M[i + 2 + 64] = (M[i + 2] >> 4) ^ (M[i + 1] << 28);
       M[i + 3 + 64] = (M[i + 3] >> 4) ^ (M[i + 2] << 28);
     }
 }
 
 static inline unsigned int
 do_ghash (unsigned char *result, const unsigned char *buf, const u32 *gcmM)
 {
   byte V[16];
   u32 tmp[4];
   u32 v;
   const u32 *M, *m;
   u32 T[3];
   int i;
 
   cipher_block_xor (V, result, buf, 16); /* V is big-endian */
 
   /* First round can be manually tweaked based on fact that 'tmp' is zero. */
   i = 15;
 
   v = V[i];
   M = &gcmM[(v & 0xf) * 4 + 64];
   v = (v & 0xf0) >> 4;
   m = &gcmM[v * 4];
   v = V[--i];
 
   tmp[0] = M[0] ^ m[0];
   tmp[1] = M[1] ^ m[1];
   tmp[2] = M[2] ^ m[2];
   tmp[3] = M[3] ^ m[3];
 
   while (1)
     {
       M = &gcmM[(v & 0xf) * 4 + 64];
       v = (v & 0xf0) >> 4;
       m = &gcmM[v * 4];
 
       T[0] = tmp[0];
       T[1] = tmp[1];
       T[2] = tmp[2];
       tmp[0] = (T[0] >> 8) ^ ((u32) gcmR[tmp[3] & 0xff] << 16) ^ m[0];
       tmp[1] = (T[0] << 24) ^ (tmp[1] >> 8) ^ m[1];
       tmp[2] = (T[1] << 24) ^ (tmp[2] >> 8) ^ m[2];
       tmp[3] = (T[2] << 24) ^ (tmp[3] >> 8) ^ m[3];
 
       tmp[0] ^= M[0];
       tmp[1] ^= M[1];
       tmp[2] ^= M[2];
       tmp[3] ^= M[3];
 
       if (i == 0)
         break;
 
       v = V[--i];
     }
 
   buf_put_be32 (result + 0, tmp[0]);
   buf_put_be32 (result + 4, tmp[1]);
   buf_put_be32 (result + 8, tmp[2]);
   buf_put_be32 (result + 12, tmp[3]);
 
   return (sizeof(V) + sizeof(T) + sizeof(tmp) +
           sizeof(int)*2 + sizeof(void*)*6);
 }
 #endif /*!GCM_TABLES_USE_U64*/
 
 #define fillM(c) \
   do_fillM (c->u_mode.gcm.u_ghash_key.key, c->u_mode.gcm.gcm_table)
 #define GHASH(c, result, buf) do_ghash (result, buf, c->u_mode.gcm.gcm_table)
 #define prefetch_tables(c) \
   do_prefetch_tables(c->u_mode.gcm.gcm_table, sizeof(c->u_mode.gcm.gcm_table))
 
 #else
 
 static unsigned long
 bshift (unsigned long *b)
 {
   unsigned long c;
   int i;
   c = b[3] & 1;
   for (i = 3; i > 0; i--)
     {
       b[i] = (b[i] >> 1) | (b[i - 1] << 31);
     }
   b[i] >>= 1;
   return c;
 }
 
 /* Table-free GHASH step (this is the branch compiled when GCM_USE_TABLES
  * is not defined): computes RESULT = (RESULT xor BUF) multiplied by HSUB
  * with a bit-by-bit shift-and-xor loop over the 128 bits of HSUB.
  *
  *   hsub   - 16-byte hash key.
  *   result - 16-byte running hash, updated in place.
  *   buf    - 16-byte input block.
  *
  * Returns a byte count built from the sizes of the locals; callers use
  * it as a stack burn depth.
  *
  * NOTE(review): the masks below and the '<< 31' in bshift treat every
  * 'unsigned long' as a 32-bit word -- presumably this path is only
  * correct where unsigned long is 32 bits wide; confirm. */
 static unsigned int
 do_ghash (unsigned char *hsub, unsigned char *result, const unsigned char *buf)
 {
   unsigned long V[4];
   int i, j;
   byte *p;
 
 #ifdef WORDS_BIGENDIAN
   /* NOTE(review): in this branch V is never loaded (the xor of RESULT
    * and BUF only happens in the #else branch), the memset below clears
    * RESULT through P, and T is not declared although sizeof(T) is used
    * in the return statement.  Looks broken for big-endian -- confirm. */
   p = result;
 #else
   unsigned long T[4];
 
   /* V = RESULT xor BUF, then reverse the byte order inside each 32-bit
    * word: first swap adjacent bytes, then swap the 16-bit halves. */
   cipher_block_xor (V, result, buf, 16);
   for (i = 0; i < 4; i++)
     {
       V[i] = (V[i] & 0x00ff00ff) << 8 | (V[i] & 0xff00ff00) >> 8;
       V[i] = (V[i] & 0x0000ffff) << 16 | (V[i] & 0xffff0000) >> 16;
     }
   p = (byte *) T;
 #endif
 
   /* P is the accumulator for the product; start from zero. */
   memset (p, 0, 16);
 
   /* Scan HSUB from the top bit of byte 0 to the bottom bit of byte 15. */
   for (i = 0; i < 16; i++)
     {
       for (j = 0x80; j; j >>= 1)
         {
           /* Key bit set: add (xor) the current V into the accumulator. */
           if (hsub[i] & j)
             cipher_block_xor (p, p, V, 16);
           /* Shift V right by one bit; when a bit falls out of the low
            * end, fold it back by xoring 0xe1000000 into the top word. */
           if (bshift (V))
             V[0] ^= 0xe1000000;
         }
     }
 #ifndef WORDS_BIGENDIAN
   /* Copy the accumulator to RESULT, reversing each group of four bytes
    * again (undoes the word-wise byte swap done on input). */
   for (i = 0, p = (byte *) T; i < 16; i += 4, p += 4)
     {
       result[i + 0] = p[3];
       result[i + 1] = p[2];
       result[i + 2] = p[1];
       result[i + 3] = p[0];
     }
 #endif
 
   return (sizeof(V) + sizeof(T) + sizeof(int)*2 + sizeof(void*)*5);
 }
 
 /* Table-free build: there is no table to fill or prefetch, so those two
  * macros expand to empty statements, and GHASH calls the bitwise
  * do_ghash directly with the hash key stored in handle C. */
 #define fillM(c) do { } while (0)
 #define GHASH(c, result, buf) do_ghash (c->u_mode.gcm.u_ghash_key.key, result, buf)
 #define prefetch_tables(c) do {} while (0)
 
 #endif /* !GCM_USE_TABLES */
 
 
 /* Generic (software) GHASH: fold NBLOCKS consecutive blocks starting at
  * BUF into RESULT using the GHASH macro of this build.
  *
  * Returns 0 when nothing was hashed (or GHASH reported 0); otherwise the
  * burn depth reported by the last GHASH call plus room for five
  * pointers. */
 static unsigned int
 ghash_internal (gcry_cipher_hd_t c, byte *result, const byte *buf,
                 size_t nblocks)
 {
   const unsigned int blocksize = GCRY_GCM_BLOCK_LEN;
   const byte *pos = buf;
   size_t left;
   unsigned int burn = 0;
 
   prefetch_tables (c);
 
   for (left = nblocks; left != 0; left--)
     {
       burn = GHASH (c, result, pos);
       pos += blocksize;
     }
 
   if (!burn)
     return 0;
 
   return burn + 5*sizeof(void*);
 }
 
 
 /* Select the GHASH implementation for handle C and run its setup.
  *
  * Each hardware-accelerated variant that was compiled in is tried in
  * turn against the detected hardware feature flags; the first match
  * stores its function in c->u_mode.gcm.ghash_fn and performs its own
  * key setup.  When none matches (including an s390x machine whose KIMD
  * query does not report GHASH), the generic ghash_internal is used and
  * fillM prepares whatever that implementation needs. */
 static void
 setupM (gcry_cipher_hd_t c)
 {
   /* FEATURES is read by every hardware branch below, so it must be
    * declared whenever at least one of them is compiled in.  This
    * includes GCM_USE_ARM_NEON, which can be enabled without
    * GCM_USE_ARM_PMULL. */
 #if defined(GCM_USE_INTEL_PCLMUL) || defined(GCM_USE_ARM_PMULL) || \
     defined(GCM_USE_ARM_NEON) || defined(GCM_USE_S390X_CRYPTO) || \
     defined(GCM_USE_PPC_VPMSUM)
   unsigned int features = _gcry_get_hw_features ();
 #endif
 
   c->u_mode.gcm.ghash_fn = NULL;
 
   /* Dummy head so that every real candidate can start with 'else if'. */
   if (0)
     ;
 #ifdef GCM_USE_INTEL_PCLMUL
   else if (features & HWF_INTEL_PCLMUL)
     {
       c->u_mode.gcm.ghash_fn = _gcry_ghash_intel_pclmul;
       _gcry_ghash_setup_intel_pclmul (c);
     }
 #endif
 #ifdef GCM_USE_ARM_PMULL
   else if (features & HWF_ARM_PMULL)
     {
       c->u_mode.gcm.ghash_fn = ghash_armv8_ce_pmull;
       ghash_setup_armv8_ce_pmull (c);
     }
 #endif
 #ifdef GCM_USE_ARM_NEON
   else if (features & HWF_ARM_NEON)
     {
       c->u_mode.gcm.ghash_fn = ghash_armv7_neon;
       ghash_setup_armv7_neon (c);
     }
 #endif
 #ifdef GCM_USE_PPC_VPMSUM
   else if (features & HWF_PPC_VCRYPTO)
     {
       c->u_mode.gcm.ghash_fn = ghash_ppc_vpmsum;
       ghash_setup_ppc_vpmsum (c);
     }
 #endif
 #ifdef GCM_USE_S390X_CRYPTO
   else if (features & HWF_S390X_MSA)
     {
       /* Only use KIMD when the facility actually offers GHASH;
        * otherwise ghash_fn stays NULL and the fallback below applies. */
       if (kimd_query () & km_function_to_mask (KMID_FUNCTION_GHASH))
         {
           c->u_mode.gcm.ghash_fn = ghash_s390x_kimd;
         }
     }
 #endif
 
   /* No accelerated implementation selected: use the generic one. */
   if (c->u_mode.gcm.ghash_fn == NULL)
     {
       c->u_mode.gcm.ghash_fn = ghash_internal;
       fillM (c);
     }
 }
 
 
 /* Add ADD to the 64-bit byte counter CTR, stored as two 32-bit words:
  * ctr[0] holds the low half and ctr[1] the high half.
  *
  * The addend is split into its low and high 32-bit parts first.  The
  * carry out of the low word must be detected against the low part of
  * ADD only: comparing the (32-bit) sum with the full-width ADD would
  * always signal a carry once ADD is 2^32 or larger, counting 2^32 bytes
  * too many on top of the high part that was already added. */
 static inline void
 gcm_bytecounter_add (u32 ctr[2], size_t add)
 {
   const u32 low_add = add & 0xffffffff;
 
   if (sizeof(add) > sizeof(u32))
     {
       /* Shift in two steps so that the expression stays valid when
        * size_t is only 32 bits wide. */
       u32 high_add = ((add >> 31) >> 1) & 0xffffffff;
       ctr[1] += high_add;
     }
 
   ctr[0] += low_add;
   if (ctr[0] < low_add)
     ++ctr[1]; /* low word wrapped around */
 }
 
 
 /* Add ADD to the big-endian 32-bit word stored in the last four bytes of
  * the counter block CTR, wrapping modulo 2^32 and leaving the other
  * bytes of the block untouched.  Passing ADD == 0 just reads the word.
  *
  * Returns the updated word as a host-endian value. */
 static inline u32
 gcm_add32_be128 (byte *ctr, unsigned int add)
 {
   /* 'ctr' must be aligned to four bytes for this direct word access. */
   u32 *low_word = (u32 *)(void *)(ctr + GCRY_GCM_BLOCK_LEN - sizeof(u32));
   const u32 sum = be_bswap32(*low_word) + add;
 
   *low_word = be_bswap32(sum);
 
   return sum;
 }
 
 
 /* Check the data byte counter CTR (ctr[0] = low word, ctr[1] = high
  * word) against the GCM plaintext limit.
  *
  * len(plaintext) <= 2^39-256 bits == 2^36-32 bytes == 2^32-2 blocks,
  * i.e. the counter must not exceed 0xf:ffffffe0.
  *
  * Returns 1 when the length is allowed, 0 when it is over the limit. */
 static inline int
 gcm_check_datalen (u32 ctr[2])
 {
   const u32 hi = ctr[1];
   const u32 lo = ctr[0];
 
   /* The high word alone decides unless it sits exactly on the limit. */
   if (hi != 0xfU)
     return hi < 0xfU;
 
   return lo <= 0xffffffe0U;
 }
 
 
 /* Check the AAD or IV byte counter CTR (ctr[0] = low word, ctr[1] =
  * high word) against the GCM limit.
  *
  * len(aad/iv) <= 2^64-1 bits ~= 2^61-1 bytes, i.e. the counter must not
  * exceed 0x1fffffff:ffffffff.
  *
  * Returns 1 when the length is allowed, 0 when it is over the limit. */
 static inline int
 gcm_check_aadlen_or_ivlen (u32 ctr[2])
 {
   const u32 hi = ctr[1];
   const u32 lo = ctr[0];
 
   /* The high word alone decides unless it sits exactly on the limit. */
   if (hi != 0x1fffffffU)
     return hi < 0x1fffffffU;
 
   return lo <= 0xffffffffU;
 }
 
 
 /* Feed BUFLEN bytes from BUF into the GHASH value HASH, buffering
  * partial blocks in c->u_mode.gcm.macbuf between calls.
  *
  *   c          - cipher handle; supplies ghash_fn, macbuf and mac_unused.
  *   hash       - running GHASH value, updated in place by ghash_fn.
  *   buf        - input bytes (may be NULL when BUFLEN is 0).
  *   buflen     - number of input bytes.
  *   do_padding - when non-zero, a trailing partial block (including one
  *                left over from earlier calls) is zero-padded and
  *                hashed; when zero, it stays cached in macbuf.
  *
  * On return mac_unused holds the number of bytes still cached.  The
  * burn depth reported by the last ghash_fn call, if non-zero, is passed
  * to _gcry_burn_stack. */
 static void
 do_ghash_buf(gcry_cipher_hd_t c, byte *hash, const byte *buf,
              size_t buflen, int do_padding)
 {
   unsigned int blocksize = GCRY_GCM_BLOCK_LEN;
   unsigned int unused = c->u_mode.gcm.mac_unused;
   ghash_fn_t ghash_fn = c->u_mode.gcm.ghash_fn;
   size_t nblocks, n;
   unsigned int burn = 0;
 
   /* Nothing to add, and either nothing cached or no flush requested. */
   if (buflen == 0 && (unused == 0 || !do_padding))
     return;
 
   do
     {
       /* Top up macbuf when it already holds data, or when the input is
        * too short to make a full block on its own. */
       if (buflen > 0 && (buflen + unused < blocksize || unused > 0))
         {
           n = blocksize - unused;
           n = n < buflen ? n : buflen;
 
           buf_cpy (&c->u_mode.gcm.macbuf[unused], buf, n);
 
           unused += n;
           buf += n;
           buflen -= n;
         }
       if (!buflen)
         {
           /* Input exhausted with an incomplete cached block: keep it
            * for the next call unless padding was requested. */
           if (!do_padding && unused < blocksize)
 	    {
 	      break;
 	    }
 
 	  /* Zero-fill the rest of the cached block. */
 	  n = blocksize - unused;
 	  if (n > 0)
 	    {
 	      memset (&c->u_mode.gcm.macbuf[unused], 0, n);
 	      unused = blocksize;
 	    }
         }
 
       if (unused > 0)
         {
           /* Reaching this point with cached data means the cache must
            * hold exactly one full block. */
           gcry_assert (unused == blocksize);
 
           /* Process one block from macbuf.  */
           burn = ghash_fn (c, hash, c->u_mode.gcm.macbuf, 1);
           unused = 0;
         }
 
       /* Hash all remaining whole blocks straight from the input. */
       nblocks = buflen / blocksize;
 
       if (nblocks)
         {
           burn = ghash_fn (c, hash, buf, nblocks);
           buf += blocksize * nblocks;
           buflen -= blocksize * nblocks;
         }
     }
   while (buflen > 0); /* a tail shorter than one block loops once more */
 
   c->u_mode.gcm.mac_unused = unused;
 
   if (burn)
     _gcry_burn_stack (burn);
 }
 
 
 /* CTR-mode en/decryption for GCM on top of the generic
  * _gcry_cipher_ctr_encrypt, with the counter restricted to its low 32
  * bits as explained in the comment inside the loop.
  *
  *   c         - cipher handle; c->u_ctr.ctr is the counter block and
  *               c->unused the number of bytes left over from the last
  *               generated counter block.
  *   outbuf    - output buffer of OUTBUFLEN bytes.
  *   inbuf     - input buffer of INBUFLEN bytes.
  *
  * Returns 0 on success or the error reported by the generic CTR code.
  *
  * NOTE(review): num_ctr_increments is a u32 and the product
  * 'nblocks_to_overflow * GCRY_GCM_BLOCK_LEN' looks like 32-bit
  * arithmetic; the only caller visible here (gcm_crypt_inner) passes at
  * most 24 KiB per call, which keeps both in range -- confirm before
  * calling this with larger lengths. */
 static gcry_err_code_t
 gcm_ctr_encrypt (gcry_cipher_hd_t c, byte *outbuf, size_t outbuflen,
                  const byte *inbuf, size_t inbuflen)
 {
   gcry_err_code_t err = 0;
 
   while (inbuflen)
     {
       u32 nblocks_to_overflow;
       u32 num_ctr_increments;
       u32 curr_ctr_low;
       size_t currlen = inbuflen;
       byte ctr_copy[GCRY_GCM_BLOCK_LEN];
       int fix_ctr = 0;
 
       /* GCM CTR increments only least significant 32-bits, without carry
        * to upper 96-bits of counter.  Using generic CTR implementation
        * directly would carry 32-bit overflow to upper 96-bit.  Detect
        * if input length is long enough to cause overflow, and limit
        * input length so that CTR overflow happen but updated CTR value is
        * not used to encrypt further input.  After overflow, upper 96 bits
        * of CTR are restored to cancel out modification done by generic CTR
        * encryption. */
 
       /* Only input beyond the cached keystream bytes advances the
        * counter. */
       if (inbuflen > c->unused)
         {
           /* Adding zero just reads the low counter word. */
           curr_ctr_low = gcm_add32_be128 (c->u_ctr.ctr, 0);
 
           /* Number of CTR increments this inbuflen would cause. */
           num_ctr_increments = (inbuflen - c->unused) / GCRY_GCM_BLOCK_LEN +
                                !!((inbuflen - c->unused) % GCRY_GCM_BLOCK_LEN);
 
           /* Would the low 32 bits wrap while processing this input?  */
           if ((u32)(num_ctr_increments + curr_ctr_low) < curr_ctr_low)
             {
               /* Process only up to the block that makes the low word
                * reach zero, plus the cached bytes. */
               nblocks_to_overflow = 0xffffffffU - curr_ctr_low + 1;
               currlen = nblocks_to_overflow * GCRY_GCM_BLOCK_LEN + c->unused;
               if (currlen > inbuflen)
                 {
                   currlen = inbuflen;
                 }
 
               /* Remember the counter so its upper part can be restored. */
               fix_ctr = 1;
               cipher_block_cpy(ctr_copy, c->u_ctr.ctr, GCRY_GCM_BLOCK_LEN);
             }
         }
 
       err = _gcry_cipher_ctr_encrypt(c, outbuf, outbuflen, inbuf, currlen);
       if (err != 0)
         return err;
 
       if (fix_ctr)
         {
           /* Lower 32-bits of CTR should now be zero. */
           gcry_assert(gcm_add32_be128 (c->u_ctr.ctr, 0) == 0);
 
           /* Restore upper part of CTR. */
           buf_cpy(c->u_ctr.ctr, ctr_copy, GCRY_GCM_BLOCK_LEN - sizeof(u32));
 
           wipememory(ctr_copy, sizeof(ctr_copy));
         }
 
       inbuflen -= currlen;
       inbuf += currlen;
       outbuflen -= currlen;
       outbuf += currlen;
     }
 
   return err;
 }
 
 
 /* Common worker for GCM encryption and decryption: runs CTR over the
  * data and feeds the ciphertext into the GHASH tag.
  *
  *   c        - cipher handle.
  *   outbuf   - output buffer of OUTBUFLEN bytes.
  *   inbuf    - input buffer of INBUFLEN bytes.
  *   encrypt  - non-zero to encrypt (hash the output afterwards), zero to
  *              decrypt (hash the input first).
  *
  * When the handle provides c->bulk.gcm_crypt and no partial block is
  * cached in macbuf, whole blocks go through that bulk function;
  * everything else takes the generic path below.
  *
  * Returns 0 on success or the error from gcm_ctr_encrypt. */
 static gcry_err_code_t
 gcm_crypt_inner (gcry_cipher_hd_t c, byte *outbuf, size_t outbuflen,
 		 const byte *inbuf, size_t inbuflen, int encrypt)
 {
   gcry_err_code_t err;
 
   while (inbuflen)
     {
       size_t currlen = inbuflen;
 
       /* Use a bulk method if available.  */
       if (c->bulk.gcm_crypt)
 	{
 	  /* Bulk method requires that there is no cached data. */
 	  if (inbuflen >= GCRY_GCM_BLOCK_LEN && c->u_mode.gcm.mac_unused == 0)
 	    {
 	      size_t nblks = inbuflen / GCRY_GCM_BLOCK_LEN;
 	      size_t nleft;
 	      size_t ndone;
 
 	      /* The bulk function returns the number of blocks it left
 	       * unprocessed. */
 	      nleft = c->bulk.gcm_crypt (c, outbuf, inbuf, nblks, encrypt);
 	      ndone = nblks - nleft;
 
 	      inbuf += ndone * GCRY_GCM_BLOCK_LEN;
 	      outbuf += ndone * GCRY_GCM_BLOCK_LEN;
 	      inbuflen -= ndone * GCRY_GCM_BLOCK_LEN;
 	      outbuflen -= ndone * GCRY_GCM_BLOCK_LEN;
 
 	      if (inbuflen == 0)
 		break;
 
 	      /* Whatever is left goes through the generic path. */
 	      currlen = inbuflen;
 	    }
 	  else if (c->u_mode.gcm.mac_unused > 0
 	           && inbuflen >= GCRY_GCM_BLOCK_LEN
 			  + (16 - c->u_mode.gcm.mac_unused))
 	    {
 	      /* Handle just enough data so that cache is depleted, and on
 	       * next loop iteration use bulk method. */
 	      currlen = 16 - c->u_mode.gcm.mac_unused;
 
 	      gcry_assert(currlen);
 	    }
 	}
 
       /* Since checksumming is done after/before encryption/decryption,
        * process input in 24KiB chunks to keep data loaded in L1 cache for
        * checksumming/decryption. */
       if (currlen > 24 * 1024)
 	currlen = 24 * 1024;
 
       /* Decryption: hash the ciphertext (input) before it is decrypted. */
       if (!encrypt)
 	do_ghash_buf(c, c->u_mode.gcm.u_tag.tag, inbuf, currlen, 0);
 
       err = gcm_ctr_encrypt(c, outbuf, outbuflen, inbuf, currlen);
       if (err != 0)
 	return err;
 
       /* Encryption: hash the ciphertext (output) just produced. */
       if (encrypt)
 	do_ghash_buf(c, c->u_mode.gcm.u_tag.tag, outbuf, currlen, 0);
 
       outbuf += currlen;
       inbuf += currlen;
       outbuflen -= currlen;
       inbuflen -= currlen;
     }
 
   return 0;
 }
 
 
 /* GCM encryption entry point: encrypt INBUFLEN bytes from INBUF into
  * OUTBUF and fold the ciphertext into the running tag.
  *
  * Returns
  *   GPG_ERR_CIPHER_ALGO       if the cipher's block size is not
  *                             GCRY_GCM_BLOCK_LEN,
  *   GPG_ERR_BUFFER_TOO_SHORT  if OUTBUFLEN is smaller than INBUFLEN,
  *   GPG_ERR_INV_LENGTH        if the data length limit was or becomes
  *                             exceeded,
  *   GPG_ERR_INV_STATE         if the tag or data stream is already
  *                             finalized, no GHASH function is set, or
  *                             encryption is disallowed by the FIPS
  *                             setiv flag,
  *   otherwise the result of gcm_crypt_inner. */
 gcry_err_code_t
 _gcry_cipher_gcm_encrypt (gcry_cipher_hd_t c,
                           byte *outbuf, size_t outbuflen,
                           const byte *inbuf, size_t inbuflen)
 {
   static const unsigned char zerobuf[MAX_BLOCKSIZE];
 
   if (c->spec->blocksize != GCRY_GCM_BLOCK_LEN)
     return GPG_ERR_CIPHER_ALGO;
 
   if (inbuflen > outbuflen)
     return GPG_ERR_BUFFER_TOO_SHORT;
 
   if (c->u_mode.gcm.datalen_over_limits)
     return GPG_ERR_INV_LENGTH;
 
   /* No more data once a tag exists or the data stream was closed, and
    * none at all before a GHASH implementation has been selected. */
   if (c->marks.tag)
     return GPG_ERR_INV_STATE;
   if (c->u_mode.gcm.ghash_data_finalized)
     return GPG_ERR_INV_STATE;
   if (!c->u_mode.gcm.ghash_fn)
     return GPG_ERR_INV_STATE;
 
   /* Fall back to an all-zero IV when the caller never set one. */
   if (!c->marks.iv)
     _gcry_cipher_gcm_setiv (c, zerobuf, GCRY_GCM_BLOCK_LEN);
 
   if (c->u_mode.gcm.disallow_encryption_because_of_setiv_in_fips_mode)
     return GPG_ERR_INV_STATE;
 
   /* Start of encryption marks end of AAD stream: flush a cached partial
    * AAD block (zero-padded) into the tag. */
   if (!c->u_mode.gcm.ghash_aad_finalized)
     {
       do_ghash_buf(c, c->u_mode.gcm.u_tag.tag, NULL, 0, 1);
       c->u_mode.gcm.ghash_aad_finalized = 1;
     }
 
   /* Account for the new data before touching it. */
   gcm_bytecounter_add(c->u_mode.gcm.datalen, inbuflen);
   if (gcm_check_datalen(c->u_mode.gcm.datalen))
     return gcm_crypt_inner (c, outbuf, outbuflen, inbuf, inbuflen, 1);
 
   c->u_mode.gcm.datalen_over_limits = 1;
   return GPG_ERR_INV_LENGTH;
 }
 
 
 /* GCM decryption entry point: fold INBUFLEN bytes of ciphertext from
  * INBUF into the running tag and decrypt them into OUTBUF.
  *
  * Returns
  *   GPG_ERR_CIPHER_ALGO       if the cipher's block size is not
  *                             GCRY_GCM_BLOCK_LEN,
  *   GPG_ERR_BUFFER_TOO_SHORT  if OUTBUFLEN is smaller than INBUFLEN,
  *   GPG_ERR_INV_LENGTH        if the data length limit was or becomes
  *                             exceeded,
  *   GPG_ERR_INV_STATE         if the tag or data stream is already
  *                             finalized or no GHASH function is set,
  *   otherwise the result of gcm_crypt_inner. */
 gcry_err_code_t
 _gcry_cipher_gcm_decrypt (gcry_cipher_hd_t c,
                           byte *outbuf, size_t outbuflen,
                           const byte *inbuf, size_t inbuflen)
 {
   static const unsigned char zerobuf[MAX_BLOCKSIZE];
 
   if (c->spec->blocksize != GCRY_GCM_BLOCK_LEN)
     return GPG_ERR_CIPHER_ALGO;
 
   if (inbuflen > outbuflen)
     return GPG_ERR_BUFFER_TOO_SHORT;
 
   if (c->u_mode.gcm.datalen_over_limits)
     return GPG_ERR_INV_LENGTH;
 
   /* No more data once a tag exists or the data stream was closed, and
    * none at all before a GHASH implementation has been selected. */
   if (c->marks.tag)
     return GPG_ERR_INV_STATE;
   if (c->u_mode.gcm.ghash_data_finalized)
     return GPG_ERR_INV_STATE;
   if (!c->u_mode.gcm.ghash_fn)
     return GPG_ERR_INV_STATE;
 
   /* Fall back to an all-zero IV when the caller never set one. */
   if (!c->marks.iv)
     _gcry_cipher_gcm_setiv (c, zerobuf, GCRY_GCM_BLOCK_LEN);
 
   /* Start of decryption marks end of AAD stream: flush a cached partial
    * AAD block (zero-padded) into the tag. */
   if (!c->u_mode.gcm.ghash_aad_finalized)
     {
       do_ghash_buf(c, c->u_mode.gcm.u_tag.tag, NULL, 0, 1);
       c->u_mode.gcm.ghash_aad_finalized = 1;
     }
 
   /* Account for the new data before touching it. */
   gcm_bytecounter_add(c->u_mode.gcm.datalen, inbuflen);
   if (gcm_check_datalen(c->u_mode.gcm.datalen))
     return gcm_crypt_inner (c, outbuf, outbuflen, inbuf, inbuflen, 0);
 
   c->u_mode.gcm.datalen_over_limits = 1;
   return GPG_ERR_INV_LENGTH;
 }
 
 
 /* Feed AADBUFLEN bytes of additional authenticated data from AADBUF
  * into the running tag.  May be called repeatedly until the first
  * encrypt/decrypt or tag operation closes the AAD stream.
  *
  * Returns
  *   GPG_ERR_CIPHER_ALGO  if the cipher's block size is not
  *                        GCRY_GCM_BLOCK_LEN,
  *   GPG_ERR_INV_LENGTH   if a length limit was or becomes exceeded,
  *   GPG_ERR_INV_STATE    if the tag, AAD or data stream is already
  *                        finalized or no GHASH function is set,
  *   0                    on success. */
 gcry_err_code_t
 _gcry_cipher_gcm_authenticate (gcry_cipher_hd_t c,
                                const byte * aadbuf, size_t aadbuflen)
 {
   static const unsigned char zerobuf[MAX_BLOCKSIZE];
 
   if (c->spec->blocksize != GCRY_GCM_BLOCK_LEN)
     return GPG_ERR_CIPHER_ALGO;
 
   if (c->u_mode.gcm.datalen_over_limits)
     return GPG_ERR_INV_LENGTH;
 
   if (c->marks.tag)
     return GPG_ERR_INV_STATE;
   if (c->u_mode.gcm.ghash_aad_finalized)
     return GPG_ERR_INV_STATE;
   if (c->u_mode.gcm.ghash_data_finalized)
     return GPG_ERR_INV_STATE;
   if (!c->u_mode.gcm.ghash_fn)
     return GPG_ERR_INV_STATE;
 
   /* Fall back to an all-zero IV when the caller never set one. */
   if (!c->marks.iv)
     _gcry_cipher_gcm_setiv (c, zerobuf, GCRY_GCM_BLOCK_LEN);
 
   /* Account for the new AAD before hashing it. */
   gcm_bytecounter_add(c->u_mode.gcm.aadlen, aadbuflen);
   if (gcm_check_aadlen_or_ivlen(c->u_mode.gcm.aadlen))
     {
       do_ghash_buf(c, c->u_mode.gcm.u_tag.tag, aadbuf, aadbuflen, 0);
       return 0;
     }
 
   c->u_mode.gcm.datalen_over_limits = 1;
   return GPG_ERR_INV_LENGTH;
 }
 
 
 /* Derive the GHASH key for handle C from the already configured cipher
  * key and select the GHASH implementation.  The GHASH key is the
  * encryption of an all-zero block, computed in place. */
 void
 _gcry_cipher_gcm_setkey (gcry_cipher_hd_t c)
 {
   unsigned char *hkey = c->u_mode.gcm.u_ghash_key.key;
 
   memset (hkey, 0, GCRY_GCM_BLOCK_LEN);
   c->spec->encrypt (&c->context.c, hkey, hkey);
 
   setupM (c);
 }
 
 
 /* Reset the per-message GCM state of handle C and derive the initial
  * counter block from the IV of IVLEN bytes.
  *
  * An IV of GCRY_GCM_BLOCK_LEN - 4 bytes is copied into the counter
  * block and followed by the 32-bit value 1.  Any other non-empty IV is
  * run through GHASH (zero-padded, then followed by a block holding 64
  * zero bits and the 64-bit IV bit length) to produce the counter block.
  * The encrypted initial counter is stored in tagiv for the final tag,
  * and the counter is then advanced by one for the data.
  *
  * Returns
  *   GPG_ERR_INV_LENGTH  for an empty IV or one over the length limit,
  *   GPG_ERR_INV_STATE   if the GHASH path is needed but no GHASH
  *                       function is set,
  *   0                   on success (c->marks.iv is set only then). */
 static gcry_err_code_t
 _gcry_cipher_gcm_initiv (gcry_cipher_hd_t c, const byte *iv, size_t ivlen)
 {
   /* Start a fresh message: clear counters, tag and state flags. */
   memset (c->u_mode.gcm.aadlen, 0, sizeof(c->u_mode.gcm.aadlen));
   memset (c->u_mode.gcm.datalen, 0, sizeof(c->u_mode.gcm.datalen));
   memset (c->u_mode.gcm.u_tag.tag, 0, GCRY_GCM_BLOCK_LEN);
   c->u_mode.gcm.datalen_over_limits = 0;
   c->u_mode.gcm.ghash_data_finalized = 0;
   c->u_mode.gcm.ghash_aad_finalized = 0;
 
   if (ivlen == 0)
     return GPG_ERR_INV_LENGTH;
 
   if (ivlen != GCRY_GCM_BLOCK_LEN - 4)
     {
       u32 iv_bytes[2] = {0, 0};
       u32 bitlengths[2][2];
 
       if (!c->u_mode.gcm.ghash_fn)
         return GPG_ERR_INV_STATE;
 
       /* The counter block doubles as the GHASH accumulator here. */
       memset(c->u_ctr.ctr, 0, GCRY_GCM_BLOCK_LEN);
 
       gcm_bytecounter_add(iv_bytes, ivlen);
       if (!gcm_check_aadlen_or_ivlen(iv_bytes))
         {
           c->u_mode.gcm.datalen_over_limits = 1;
           return GPG_ERR_INV_LENGTH;
         }
 
       /* Hash the IV, zero-padding its last block. */
       do_ghash_buf(c, c->u_ctr.ctr, iv, ivlen, 1);
 
       /* iv length, 64-bit */
       /* Bytes are converted to bits by shifting left three; the bits
        * leaving the low word are carried into the high word. */
       bitlengths[1][1] = be_bswap32(iv_bytes[0] << 3);
       bitlengths[1][0] = be_bswap32((iv_bytes[0] >> 29) |
                                     (iv_bytes[1] << 3));
       /* zeros, 64-bit */
       bitlengths[0][1] = 0;
       bitlengths[0][0] = 0;
 
       do_ghash_buf(c, c->u_ctr.ctr, (byte*)bitlengths, GCRY_GCM_BLOCK_LEN, 1);
 
       wipememory (iv_bytes, sizeof iv_bytes);
       wipememory (bitlengths, sizeof bitlengths);
     }
   else
     {
       /* 96-bit IV is handled differently. */
       memcpy (c->u_ctr.ctr, iv, ivlen);
       c->u_ctr.ctr[12] = c->u_ctr.ctr[13] = c->u_ctr.ctr[14] = 0;
       c->u_ctr.ctr[15] = 1;
     }
 
   /* Keep the encrypted initial counter; it is xored into the tag at
    * finalization. */
   c->spec->encrypt (&c->context.c, c->u_mode.gcm.tagiv, c->u_ctr.ctr);
 
   /* Data encryption starts with the next counter value. */
   gcm_add32_be128 (c->u_ctr.ctr, 1);
 
   c->unused = 0;
   c->marks.iv = 1;
   c->marks.tag = 0;
 
   return 0;
 }
 
 
 /* Set the IV of IVLEN bytes on handle C for a new GCM message.
  *
  * The IV and tag marks are cleared first, so they stay cleared when the
  * IV is rejected.  Returns whatever _gcry_cipher_gcm_initiv returns. */
 gcry_err_code_t
 _gcry_cipher_gcm_setiv (gcry_cipher_hd_t c, const byte *iv, size_t ivlen)
 {
   c->marks.iv = 0;
   c->marks.tag = 0;
 
   /* Direct invocation of GCM setiv in FIPS mode disables encryption. */
   c->u_mode.gcm.disallow_encryption_because_of_setiv_in_fips_mode =
     fips_mode () ? 1 : 0;
 
   return _gcry_cipher_gcm_initiv (c, iv, ivlen);
 }
 
 
 /* Disabled sketch ('#if 0') of an IV generation interface: it would
  * build an IV from an optional caller nonce, initialize GCM with it via
  * _gcry_cipher_gcm_initiv (without setting the FIPS setiv flag) and
  * hand the IV back to the caller.
  *
  * NOTE(review): as written this cannot be enabled unchanged -- the
  * function is declared 'void' but returns error codes, and IVLEN,
  * is_nonce_ok_len and gcm_generate_iv are not defined in the visible
  * part of this file. */
 #if 0 && TODO
 void
 _gcry_cipher_gcm_geniv (gcry_cipher_hd_t c,
                         byte *ivout, size_t ivoutlen, const byte *nonce,
                         size_t noncelen)
 {
   /* nonce:    user provided part (might be null) */
   /* noncelen: check if proper length (if nonce not null) */
   /* ivout:    iv used to initialize gcm, output to user */
   /* ivoutlen: check correct size */
   byte iv[IVLEN];
 
   if (!ivout)
     return GPG_ERR_INV_ARG;
   if (ivoutlen != IVLEN)
     return GPG_ERR_INV_LENGTH;
   if (nonce != NULL && !is_nonce_ok_len(noncelen))
     return GPG_ERR_INV_ARG;
 
   gcm_generate_iv(iv, nonce, noncelen);
 
   c->marks.iv = 0;
   c->marks.tag = 0;
   c->u_mode.gcm.disallow_encryption_because_of_setiv_in_fips_mode = 0;
 
   _gcry_cipher_gcm_initiv (c, iv, IVLEN);
 
   /* Return the IV to the caller, then wipe the local copy. */
   buf_cpy(ivout, iv, IVLEN);
   wipememory(iv, sizeof(iv));
 }
 #endif
 
 
 /* Return 1 if TAGLEN (in bytes) is one of the tag lengths allowed by
  * NIST SP 800-38D, 0 otherwise. */
 static int
 is_tag_length_valid(size_t taglen)
 {
   /* Allowed tag lengths in bits; the first entry is the full block
    * (GCRY_GCM_BLOCK_LEN bytes). */
   static const unsigned short allowed_bits[] =
     { 128, 120, 112, 104, 96, 64, 32 };
   size_t k;
 
   for (k = 0; k < sizeof (allowed_bits) / sizeof (allowed_bits[0]); k++)
     {
       if (taglen == (size_t)(allowed_bits[k] / 8))
         return 1;
     }
 
   return 0;
 }
 
 /* Finalize the GCM tag (on first use) and either output it or compare
  * it against a caller-supplied tag.
  *
  *   c         - cipher handle.
  *   outbuf    - CHECK == 0: receives the tag; CHECK != 0: holds the tag
  *               to verify (only compared, not written).
  *   outbuflen - length of OUTBUF in bytes.
  *   check     - zero to output the tag, non-zero to verify.
  *
  * Returns
  *   GPG_ERR_INV_LENGTH  if OUTBUFLEN is neither an allowed tag length
  *                       nor at least GCRY_GCM_BLOCK_LEN, or a length
  *                       limit was exceeded earlier,
  *   GPG_ERR_INV_STATE   if the tag still has to be computed but no
  *                       GHASH function is set,
  *   GPG_ERR_CHECKSUM    in check mode if OUTBUFLEN is not an allowed tag
  *                       length or the tags differ,
  *   0                   on success. */
 static gcry_err_code_t
 _gcry_cipher_gcm_tag (gcry_cipher_hd_t c,
                       byte * outbuf, size_t outbuflen, int check)
 {
   if (!(is_tag_length_valid (outbuflen) || outbuflen >= GCRY_GCM_BLOCK_LEN))
     return GPG_ERR_INV_LENGTH;
   if (c->u_mode.gcm.datalen_over_limits)
     return GPG_ERR_INV_LENGTH;
 
   /* The tag is computed once; later calls reuse the stored value. */
   if (!c->marks.tag)
     {
       u32 bitlengths[2][2];
 
       if (!c->u_mode.gcm.ghash_fn)
         return GPG_ERR_INV_STATE;
 
       /* Byte counters are turned into 64-bit big-endian bit counts:
        * shift left by three, carrying the top bits of the low word into
        * the high word. */
 
       /* aad length */
       bitlengths[0][1] = be_bswap32(c->u_mode.gcm.aadlen[0] << 3);
       bitlengths[0][0] = be_bswap32((c->u_mode.gcm.aadlen[0] >> 29) |
                                     (c->u_mode.gcm.aadlen[1] << 3));
       /* data length */
       bitlengths[1][1] = be_bswap32(c->u_mode.gcm.datalen[0] << 3);
       bitlengths[1][0] = be_bswap32((c->u_mode.gcm.datalen[0] >> 29) |
                                     (c->u_mode.gcm.datalen[1] << 3));
 
       /* Finalize data-stream. */
       do_ghash_buf(c, c->u_mode.gcm.u_tag.tag, NULL, 0, 1);
       c->u_mode.gcm.ghash_aad_finalized = 1;
       c->u_mode.gcm.ghash_data_finalized = 1;
 
       /* Add bitlengths to tag. */
       do_ghash_buf(c, c->u_mode.gcm.u_tag.tag, (byte*)bitlengths,
                    GCRY_GCM_BLOCK_LEN, 1);
       /* Mask the GHASH value with the encrypted initial counter. */
       cipher_block_xor (c->u_mode.gcm.u_tag.tag, c->u_mode.gcm.tagiv,
                         c->u_mode.gcm.u_tag.tag, GCRY_GCM_BLOCK_LEN);
       c->marks.tag = 1;
 
       /* Intermediate per-message state is no longer needed. */
       wipememory (bitlengths, sizeof (bitlengths));
       wipememory (c->u_mode.gcm.macbuf, GCRY_GCM_BLOCK_LEN);
       wipememory (c->u_mode.gcm.tagiv, GCRY_GCM_BLOCK_LEN);
       wipememory (c->u_mode.gcm.aadlen, sizeof (c->u_mode.gcm.aadlen));
       wipememory (c->u_mode.gcm.datalen, sizeof (c->u_mode.gcm.datalen));
     }
 
   if (!check)
     {
       /* Never copy more than one block even if the buffer is larger. */
       if (outbuflen > GCRY_GCM_BLOCK_LEN)
         outbuflen = GCRY_GCM_BLOCK_LEN;
 
       /* NB: We already checked that OUTBUF is large enough to hold
        * the result or has valid truncated length.  */
       memcpy (outbuf, c->u_mode.gcm.u_tag.tag, outbuflen);
     }
   else
     {
       /* OUTBUFLEN gives the length of the user supplied tag in OUTBUF
        * and thus we need to compare its length first.  */
       if (!is_tag_length_valid (outbuflen)
           || !buf_eq_const (outbuf, c->u_mode.gcm.u_tag.tag, outbuflen))
         return GPG_ERR_CHECKSUM;
     }
 
   return 0;
 }
 
 
 /* Finalize the message on handle C and write up to TAGLEN bytes of the
  * authentication tag to OUTTAG.
  *
  * Returns GPG_ERR_INV_STATE while encryption is disallowed by the FIPS
  * setiv flag; otherwise the result of _gcry_cipher_gcm_tag. */
 gcry_err_code_t
 _gcry_cipher_gcm_get_tag (gcry_cipher_hd_t c, unsigned char *outtag,
                           size_t taglen)
 {
   /* Outputting authentication tag is part of encryption. */
   const int encryption_blocked =
     c->u_mode.gcm.disallow_encryption_because_of_setiv_in_fips_mode;
 
   return encryption_blocked ? GPG_ERR_INV_STATE
                             : _gcry_cipher_gcm_tag (c, outtag, taglen, 0);
 }
 
 /* Finalize the message on handle C and compare its tag against the
  * TAGLEN bytes at INTAG.  Returns the result of _gcry_cipher_gcm_tag
  * called in check mode. */
 gcry_err_code_t
 _gcry_cipher_gcm_check_tag (gcry_cipher_hd_t c, const unsigned char *intag,
                             size_t taglen)
 {
   /* The shared helper takes a non-const buffer because the same
    * parameter is the output in get mode; in check mode it is only
    * compared against the computed tag. */
   unsigned char *expected = (unsigned char *) intag;
   const int check_mode = 1;
 
   return _gcry_cipher_gcm_tag (c, expected, taglen, check_mode);
 }
diff --git a/cipher/cipher-internal.h b/cipher/cipher-internal.h
index 1d62b11e..0e4a90fc 100644
--- a/cipher/cipher-internal.h
+++ b/cipher/cipher-internal.h
@@ -1,821 +1,821 @@
 /* cipher-internal.h  - Internal defs for cipher.c
  * Copyright (C) 2011 Free Software Foundation, Inc.
  *
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser general Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
  * Libgcrypt is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
  * License along with this program; if not, see <http://www.gnu.org/licenses/>.
  */
 
 #ifndef G10_CIPHER_INTERNAL_H
 #define G10_CIPHER_INTERNAL_H
 
 #include "./poly1305-internal.h"
 
 
 /* The maximum supported size of a block in bytes.  This sizes the
    per-handle block buffers declared below (IV, counter, LASTIV, GCM
    tag, ...).  */
 #define MAX_BLOCKSIZE 16
 
 /* The length for an OCB block.  Although OCB supports any block
    length it does not make sense to use a 64 bit blocklen (and cipher)
    because this reduces the security margin to an unacceptable state.
    Thus we require a cipher with 128 bit blocklength.  */
 #define OCB_BLOCK_LEN  (128/8)
 
 /* The size of the pre-computed L table for OCB, counted in entries of
    OCB_BLOCK_LEN bytes each.  This takes the same
    size as the table used for GCM and thus we don't save anything by
    not using such a table.
    NOTE(review): 16 entries of 16 bytes are 256 bytes, while gcm_table
    below is declared with 4*16 u64 or 8*16 u32 entries (512 bytes), so
    the "same size" remark looks outdated -- confirm.  */
 #define OCB_L_TABLE_SIZE 16
 
 
 /* Check the above constants: an OCB block must fit into the generic
    MAX_BLOCKSIZE buffers.  */
 #if OCB_BLOCK_LEN > MAX_BLOCKSIZE
 # error OCB_BLOCKLEN > MAX_BLOCKSIZE
 #endif
 
 
 
 /* Magic values for the context structure (presumably stored in the
    `magic' member of the handle to tell normal and secure-memory
    handles apart -- the code using them is not in this header).  */
 #define CTX_MAGIC_NORMAL 0x24091964
 #define CTX_MAGIC_SECURE 0x46919042
 
 /* Try to use 16 byte aligned cipher context for better performance.
    We use the aligned attribute, thus it is only possible to implement
    this with gcc.  When defined, cipher_context_alignment_t below gains
    a 16 byte aligned member.  The GCM_USE_PPC_VPMSUM section further
    down also defines this symbol.  */
 #undef NEED_16BYTE_ALIGNED_CONTEXT
 #ifdef HAVE_GCC_ATTRIBUTE_ALIGNED
 # define NEED_16BYTE_ALIGNED_CONTEXT 1
 #endif
 
 /* Undef this symbol to trade GCM speed for the memory of the
    pre-calculated per-context gcm_table (declared below with 4*16 u64
    or 8*16 u32 entries, i.e. 512 bytes).  All accelerated GHASH
    variants selected below except GCM_USE_S390X_CRYPTO additionally
    require this symbol.  */
 #define GCM_USE_TABLES 1
 
 
 /* GCM_USE_INTEL_PCLMUL indicates whether to compile GCM with Intel PCLMUL
    code.  Requires ENABLE_PCLMUL_SUPPORT, an x86 target (i386 with
    4 byte long, or x86-64) and GCC 4 or newer.  */
 #undef GCM_USE_INTEL_PCLMUL
 #if defined(ENABLE_PCLMUL_SUPPORT) && defined(GCM_USE_TABLES)
 # if ((defined(__i386__) && SIZEOF_UNSIGNED_LONG == 4) || defined(__x86_64__))
 #  if __GNUC__ >= 4
 #   define GCM_USE_INTEL_PCLMUL 1
 #  endif
 # endif
 #endif /* GCM_USE_INTEL_PCLMUL */
 
 /* GCM_USE_ARM_PMULL indicates whether to compile GCM with ARMv8 PMULL code.
    Requires ENABLE_ARM_CRYPTO_SUPPORT and a little-endian ARM (AArch32
    or AArch64) toolchain with crypto inline assembly support.  */
 #undef GCM_USE_ARM_PMULL
 #if defined(ENABLE_ARM_CRYPTO_SUPPORT) && defined(GCM_USE_TABLES)
 # if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) \
      && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) \
      && defined(HAVE_GCC_INLINE_ASM_AARCH32_CRYPTO)
 #  define GCM_USE_ARM_PMULL 1
 # elif defined(__AARCH64EL__) && \
     defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \
     defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO)
 #  define GCM_USE_ARM_PMULL 1
 # endif
 #endif /* GCM_USE_ARM_PMULL */
 
 /* GCM_USE_ARM_NEON indicates whether to compile GCM with ARMv7 NEON code.
    Note that this does not depend on ENABLE_ARM_CRYPTO_SUPPORT, so it
    can be defined while GCM_USE_ARM_PMULL is not.  */
 #undef GCM_USE_ARM_NEON
 #if defined(GCM_USE_TABLES)
 #if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) && \
     defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \
     defined(HAVE_GCC_INLINE_ASM_NEON)
 #  define GCM_USE_ARM_NEON 1
 #endif
 #endif /* GCM_USE_ARM_NEON */
 
 /* GCM_USE_S390X_CRYPTO indicates whether to enable zSeries code.  Unlike
    the other variants this one does not depend on GCM_USE_TABLES.  */
 #undef GCM_USE_S390X_CRYPTO
 #if defined(HAVE_GCC_INLINE_ASM_S390X)
 # define GCM_USE_S390X_CRYPTO 1
 #endif /* GCM_USE_S390X_CRYPTO */
 
 /* GCM_USE_PPC_VPMSUM indicates whether to compile GCM with PPC Power 8
  * polynomial multiplication instruction.  Requires
  * ENABLE_PPC_CRYPTO_SUPPORT, a 64-bit PowerPC target, AltiVec compiler
  * and inline assembly support and GCC 4 or newer.  Also forces a
  * 16 byte aligned context. */
 #undef GCM_USE_PPC_VPMSUM
 #if defined(GCM_USE_TABLES)
 #if defined(ENABLE_PPC_CRYPTO_SUPPORT) && defined(__powerpc64__) && \
-    !defined(WORDS_BIGENDIAN) && defined(HAVE_COMPATIBLE_CC_PPC_ALTIVEC) && \
+    defined(HAVE_COMPATIBLE_CC_PPC_ALTIVEC) && \
     defined(HAVE_GCC_INLINE_ASM_PPC_ALTIVEC) && __GNUC__ >= 4
 #  define GCM_USE_PPC_VPMSUM 1
 #  define NEED_16BYTE_ALIGNED_CONTEXT 1 /* this also aligns gcm_table */
 #endif
 #endif /* GCM_USE_PPC_VPMSUM */
 
 /* Signature shared by all GHASH implementations: fold NBLOCKS blocks
    starting at BUF into the running hash RESULT for handle C.  The
    return value is used by the GCM code as a stack burn depth (passed
    to _gcry_burn_stack when non-zero).  */
 typedef unsigned int (*ghash_fn_t) (gcry_cipher_hd_t c, byte *result,
                                     const byte *buf, size_t nblocks);
 
 
 /* A structure with function pointers for mode operations.  Every
    member takes the cipher handle as first argument and returns a
    gcry_err_code_t.  The GCM functions _gcry_cipher_gcm_encrypt,
    _gcry_cipher_gcm_decrypt, _gcry_cipher_gcm_setiv,
    _gcry_cipher_gcm_authenticate, _gcry_cipher_gcm_get_tag and
    _gcry_cipher_gcm_check_tag appear to match these signatures; where
    the table is filled in is not visible from here.  */
 typedef struct cipher_mode_ops
 {
   /* Encrypt INBUFLEN bytes from INBUF into OUTBUF (OUTBUFLEN bytes).  */
   gcry_err_code_t (*encrypt)(gcry_cipher_hd_t c, unsigned char *outbuf,
 			     size_t outbuflen, const unsigned char *inbuf,
 			     size_t inbuflen);
   /* Decrypt INBUFLEN bytes from INBUF into OUTBUF (OUTBUFLEN bytes).  */
   gcry_err_code_t (*decrypt)(gcry_cipher_hd_t c, unsigned char *outbuf,
 			     size_t outbuflen, const unsigned char *inbuf,
 			     size_t inbuflen);
   /* Set the IV of IVLEN bytes.  */
   gcry_err_code_t (*setiv)(gcry_cipher_hd_t c, const unsigned char *iv,
 			   size_t ivlen);
 
   /* Process ABUFLEN bytes of additional authenticated data.  */
   gcry_err_code_t (*authenticate)(gcry_cipher_hd_t c,
 				  const unsigned char *abuf, size_t abuflen);
   /* Write the authentication tag (TAGLEN bytes) to OUTTAG.  */
   gcry_err_code_t (*get_tag)(gcry_cipher_hd_t c, unsigned char *outtag,
 			     size_t taglen);
   /* Compare the authentication tag against INTAG (TAGLEN bytes).  */
   gcry_err_code_t (*check_tag)(gcry_cipher_hd_t c, const unsigned char *intag,
 			       size_t taglen);
 } cipher_mode_ops_t;
 
 
 /* A structure with function pointers for bulk operations.  The cipher
    algorithm setkey function initializes them when bulk operations are
    available and the actual encryption routines use them if they are
    not NULL.  All of them work on NBLOCKS whole blocks.  The members
    taking a CONTEXT pointer return nothing; the members taking the
    cipher handle return a size_t.  */
 typedef struct cipher_bulk_ops
 {
   void (*cfb_enc)(void *context, unsigned char *iv, void *outbuf_arg,
 		  const void *inbuf_arg, size_t nblocks);
   void (*cfb_dec)(void *context, unsigned char *iv, void *outbuf_arg,
 		  const void *inbuf_arg, size_t nblocks);
   void (*cbc_enc)(void *context, unsigned char *iv, void *outbuf_arg,
 		  const void *inbuf_arg, size_t nblocks, int cbc_mac);
   void (*cbc_dec)(void *context, unsigned char *iv, void *outbuf_arg,
 		  const void *inbuf_arg, size_t nblocks);
   void (*ofb_enc)(void *context, unsigned char *iv, void *outbuf_arg,
 		  const void *inbuf_arg, size_t nblocks);
   void (*ctr_enc)(void *context, unsigned char *iv, void *outbuf_arg,
 		  const void *inbuf_arg, size_t nblocks);
   size_t (*ocb_crypt)(gcry_cipher_hd_t c, void *outbuf_arg,
 		      const void *inbuf_arg, size_t nblocks, int encrypt);
   size_t (*ocb_auth)(gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks);
   void (*xts_crypt)(void *context, unsigned char *tweak, void *outbuf_arg,
 		    const void *inbuf_arg, size_t nblocks, int encrypt);
   /* The GCM code treats the return value as the number of blocks that
      were NOT processed and handles those itself.  */
   size_t (*gcm_crypt)(gcry_cipher_hd_t c, void *outbuf_arg,
 		      const void *inbuf_arg, size_t nblocks, int encrypt);
 } cipher_bulk_ops_t;
 
 
 /* A VIA processor with the Padlock engine as well as the Intel AES_NI
    instructions require an alignment of most data on a 16 byte
    boundary.  Because we trick out the compiler while allocating the
    context, the align attribute as used in rijndael.c does not work on
    its own.  Thus we need to make sure that the entire context
    structure is aligned on that boundary.  We achieve this by
    defining a new type and use that instead of our usual alignment
    type.  The type is only used for its alignment: it appears as a
    union member next to the byte arrays that hold the actual data.  */
 typedef union
 {
   PROPERLY_ALIGNED_TYPE foo;  /* The usual alignment type.  */
 #ifdef NEED_16BYTE_ALIGNED_CONTEXT
   char bar[16] __attribute__ ((aligned (16)));  /* Forces 16 bytes.  */
 #endif
   char c[1];
 } cipher_context_alignment_t;
 
 
 /* State of one CMAC computation.  Used directly for CMAC mode and
    twice (header and ciphertext) for EAX mode.  */
 typedef struct {
   /* The initialization vector. Also contains tag after finalization. */
   union {
     cipher_context_alignment_t iv_align;  /* Alignment only.  */
     unsigned char iv[MAX_BLOCKSIZE];
   } u_iv;
 
   /* Subkeys for tag creation, not cleared by gcry_cipher_reset. */
   unsigned char subkeys[2][MAX_BLOCKSIZE];
 
   /* Space to save partial input lengths for MAC. */
   unsigned char macbuf[MAX_BLOCKSIZE];
 
   int mac_unused;  /* Number of unprocessed bytes in MACBUF. */
   unsigned int tag:1; /* Set to 1 if tag has been finalized.  */
 } gcry_cmac_context_t;
 
 
 /* The handle structure.  */
 struct gcry_cipher_handle
 {
   int magic;
   size_t actual_handle_size;     /* Allocated size of this handle. */
   size_t handle_offset;          /* Offset to the malloced block.  */
   gcry_cipher_spec_t *spec;
 
   /* The algorithm id.  This is a hack required because the module
      interface does not easily allow to retrieve this value. */
   int algo;
 
   /* A structure with function pointers for mode operations. */
   cipher_mode_ops_t mode_ops;
 
   /* A structure with function pointers for bulk operations.  Due to
      limitations of the module system (we don't want to change the
      API) we need to keep these function pointers here.  */
   cipher_bulk_ops_t bulk;
 
   int mode;
   unsigned int flags;
 
   struct {
     unsigned int key:1; /* Set to 1 if a key has been set.  */
     unsigned int iv:1;  /* Set to 1 if a IV has been set.  */
     unsigned int tag:1; /* Set to 1 if a tag is finalized. */
     unsigned int finalize:1; /* Next encrypt/decrypt has the final data.  */
     unsigned int allow_weak_key:1; /* Set to 1 if weak keys are allowed. */
   } marks;
 
   /* The initialization vector.  For best performance we make sure
      that it is properly aligned.  In particular some implementations
      of bulk operations expect an 16 byte aligned IV.  IV is also used
      to store CBC-MAC in CCM mode; counter IV is stored in U_CTR.  For
      OCB mode it is used for the offset value.  */
   union {
     cipher_context_alignment_t iv_align;
     unsigned char iv[MAX_BLOCKSIZE];
   } u_iv;
 
   /* The counter for CTR mode.  This field is also used by AESWRAP and
      thus we can't use the U_IV union.  For OCB mode it is used for
      the checksum.  */
   union {
     cipher_context_alignment_t iv_align;
     unsigned char ctr[MAX_BLOCKSIZE];
   } u_ctr;
 
   /* Space to save an IV or CTR for chaining operations.  */
   unsigned char lastiv[MAX_BLOCKSIZE];
   int unused;  /* Number of unused bytes in LASTIV. */
 
   union {
     /* Mode specific storage for CCM mode. */
     struct {
       u64 encryptlen;
       u64 aadlen;
       unsigned int authlen;
 
       /* Space to save partial input lengths for MAC. */
       unsigned char macbuf[GCRY_CCM_BLOCK_LEN];
       int mac_unused;  /* Number of unprocessed bytes in MACBUF. */
 
       unsigned char s0[GCRY_CCM_BLOCK_LEN];
 
       unsigned int nonce:1; /* Set to 1 if nonce has been set.  */
       unsigned int lengths:1; /* Set to 1 if CCM length parameters has been
                                  processed.  */
     } ccm;
 
     /* Mode specific storage for Poly1305 mode. */
     struct {
       /* byte counter for AAD. */
       u32 aadcount[2];
 
       /* byte counter for data. */
       u32 datacount[2];
 
       unsigned int aad_finalized:1;
       unsigned int bytecount_over_limits:1;
 
       poly1305_context_t ctx;
     } poly1305;
 
     /* Mode specific storage for CMAC mode. */
     gcry_cmac_context_t cmac;
 
     /* Mode specific storage for EAX mode. */
     struct {
       /* CMAC for header (AAD). */
       gcry_cmac_context_t cmac_header;
 
       /* CMAC for ciphertext. */
       gcry_cmac_context_t cmac_ciphertext;
     } eax;
 
     /* Mode specific storage for GCM mode. */
     struct {
       /* The interim tag for GCM mode.  */
       union {
         cipher_context_alignment_t iv_align;
         unsigned char tag[MAX_BLOCKSIZE];
       } u_tag;
 
       /* Space to save partial input lengths for MAC. */
       unsigned char macbuf[GCRY_CCM_BLOCK_LEN];
       int mac_unused;  /* Number of unprocessed bytes in MACBUF. */
 
       /* byte counters for GCM */
       u32 aadlen[2];
       u32 datalen[2];
 
       /* encrypted tag counter */
       unsigned char tagiv[MAX_BLOCKSIZE];
 
       unsigned int ghash_data_finalized:1;
       unsigned int ghash_aad_finalized:1;
 
       unsigned int datalen_over_limits:1;
       unsigned int disallow_encryption_because_of_setiv_in_fips_mode:1;
 
       /* --- Following members are not cleared in gcry_cipher_reset --- */
 
       /* GHASH multiplier from key.  */
       union {
         cipher_context_alignment_t iv_align;
         unsigned char key[MAX_BLOCKSIZE];
       } u_ghash_key;
 
       /* Pre-calculated table for GCM. */
 #ifdef GCM_USE_TABLES
  #if (SIZEOF_UNSIGNED_LONG == 8 || defined(__x86_64__))
       #define GCM_TABLES_USE_U64 1
       u64 gcm_table[4 * 16];
  #else
       #undef GCM_TABLES_USE_U64
       u32 gcm_table[8 * 16];
  #endif
 #endif
 
       /* GHASH implementation in use. */
       ghash_fn_t ghash_fn;
     } gcm;
 
     /* Mode specific storage for OCB mode. */
     struct {
       /* --- Following members are not cleared in gcry_cipher_reset --- */
 
       /* Helper variables and pre-computed table of L values.  */
       unsigned char L_star[OCB_BLOCK_LEN];
       unsigned char L_dollar[OCB_BLOCK_LEN];
       unsigned char L0L1[OCB_BLOCK_LEN];
       unsigned char L[OCB_L_TABLE_SIZE][OCB_BLOCK_LEN];
 
       /* --- Following members are cleared in gcry_cipher_reset --- */
 
       /* The tag is valid if marks.tag has been set.  */
       unsigned char tag[OCB_BLOCK_LEN];
 
       /* A buffer to hold the offset for the AAD processing.  */
       unsigned char aad_offset[OCB_BLOCK_LEN];
 
       /* A buffer to hold the current sum of AAD processing.  We can't
          use tag here because tag may already hold the preprocessed
          checksum of the data.  */
       unsigned char aad_sum[OCB_BLOCK_LEN];
 
       /* A buffer to store AAD data not yet processed.  */
       unsigned char aad_leftover[OCB_BLOCK_LEN];
 
       /* Number of data/aad blocks processed so far.  */
       u64 data_nblocks;
       u64 aad_nblocks;
 
       /* Number of valid bytes in AAD_LEFTOVER.  */
       unsigned char aad_nleftover;
 
       /* Length of the tag.  Fixed for now but may eventually be
          specified using a set of gcry_cipher_flags.  */
       unsigned char taglen;
 
       /* Flags indicating that the final data/aad block has been
          processed.  */
       unsigned int data_finalized:1;
       unsigned int aad_finalized:1;
     } ocb;
 
     /* Mode specific storage for XTS mode. */
     struct {
       /* Pointer to tweak cipher context, allocated after actual
        * cipher context. */
       char *tweak_context;
     } xts;
   } u_mode;
 
   /* What follows are two contexts of the cipher in use.  The first
      one needs to be aligned well enough for the cipher operation
      whereas the second one is a copy created by cipher_setkey and
      used by cipher_reset.  That second copy has no need for proper
      alignment because it is only accessed by memcpy.  */
   cipher_context_alignment_t context;
 };
 
 
 /*-- cipher-cbc.c --*/
 /* CBC mode entry points: encrypt and decrypt for the plain "cbc" and
    the "cbc_cts" variants.  Each function takes the cipher handle C, an
    output buffer with its length and an input buffer with its length,
    and returns a gcry_err_code_t.  */
 gcry_err_code_t _gcry_cipher_cbc_encrypt
 /*           */ (gcry_cipher_hd_t c,
                  unsigned char *outbuf, size_t outbuflen,
                  const unsigned char *inbuf, size_t inbuflen);
 gcry_err_code_t _gcry_cipher_cbc_decrypt
 /*           */ (gcry_cipher_hd_t c,
                  unsigned char *outbuf, size_t outbuflen,
                  const unsigned char *inbuf, size_t inbuflen);
 gcry_err_code_t _gcry_cipher_cbc_cts_encrypt
 /*           */ (gcry_cipher_hd_t c,
                  unsigned char *outbuf, size_t outbuflen,
                  const unsigned char *inbuf, size_t inbuflen);
 gcry_err_code_t _gcry_cipher_cbc_cts_decrypt
 /*           */ (gcry_cipher_hd_t c,
                  unsigned char *outbuf, size_t outbuflen,
                  const unsigned char *inbuf, size_t inbuflen);
 
 /*-- cipher-cfb.c --*/
 /* CFB mode entry points: encrypt and decrypt for the "cfb" and the
    "cfb8" variants.  Each function takes the cipher handle C, an output
    buffer with its length and an input buffer with its length, and
    returns a gcry_err_code_t.  */
 gcry_err_code_t _gcry_cipher_cfb_encrypt
 /*           */ (gcry_cipher_hd_t c,
                  unsigned char *outbuf, size_t outbuflen,
                  const unsigned char *inbuf, size_t inbuflen);
 gcry_err_code_t _gcry_cipher_cfb_decrypt
 /*           */ (gcry_cipher_hd_t c,
                  unsigned char *outbuf, size_t outbuflen,
                  const unsigned char *inbuf, size_t inbuflen);
 gcry_err_code_t _gcry_cipher_cfb8_encrypt
 /*           */ (gcry_cipher_hd_t c,
                  unsigned char *outbuf, size_t outbuflen,
                  const unsigned char *inbuf, size_t inbuflen);
 gcry_err_code_t _gcry_cipher_cfb8_decrypt
 /*           */ (gcry_cipher_hd_t c,
                  unsigned char *outbuf, size_t outbuflen,
                  const unsigned char *inbuf, size_t inbuflen);
 
 
 /*-- cipher-ofb.c --*/
 /* OFB mode entry point.  Only an "encrypt" function is declared here;
    it takes the cipher handle C, an output buffer with its length and
    an input buffer with its length, and returns a gcry_err_code_t.  */
 gcry_err_code_t _gcry_cipher_ofb_encrypt
 /*           */ (gcry_cipher_hd_t c,
                  unsigned char *outbuf, size_t outbuflen,
                  const unsigned char *inbuf, size_t inbuflen);
 
 /*-- cipher-ctr.c --*/
 /* CTR mode entry point.  Only an "encrypt" function is declared here;
    it takes the cipher handle C, an output buffer with its length and
    an input buffer with its length, and returns a gcry_err_code_t.  */
 gcry_err_code_t _gcry_cipher_ctr_encrypt
 /*           */ (gcry_cipher_hd_t c,
                  unsigned char *outbuf, size_t outbuflen,
                  const unsigned char *inbuf, size_t inbuflen);
 
 
 /*-- cipher-aeswrap.c --*/
 /* AES-wrap entry points: one function for encryption and one for
    decryption.  Both take the cipher handle C, an output buffer with its
    length and an input buffer with its length (buffers are typed as
    'byte' here), and return a gcry_err_code_t.  */
 gcry_err_code_t _gcry_cipher_aeswrap_encrypt
 /*           */   (gcry_cipher_hd_t c,
                    byte *outbuf, size_t outbuflen,
                    const byte *inbuf, size_t inbuflen);
 gcry_err_code_t _gcry_cipher_aeswrap_decrypt
 /*           */   (gcry_cipher_hd_t c,
                    byte *outbuf, size_t outbuflen,
                    const byte *inbuf, size_t inbuflen);
 
 
 /*-- cipher-ccm.c --*/
 /* CCM mode entry points.  All functions take the cipher handle C and
    return a gcry_err_code_t:
      - encrypt/decrypt: output buffer + length, input buffer + length;
      - set_nonce:       nonce buffer + length;
      - authenticate:    buffer ABUF + length;
      - set_lengths:     three u64 values (ENCRYPTEDLEN, AADLEN, TAGLEN);
      - get_tag:         writable tag buffer + length;
      - check_tag:       read-only tag buffer + length.  */
 gcry_err_code_t _gcry_cipher_ccm_encrypt
 /*           */ (gcry_cipher_hd_t c,
                  unsigned char *outbuf, size_t outbuflen,
                  const unsigned char *inbuf, size_t inbuflen);
 gcry_err_code_t _gcry_cipher_ccm_decrypt
 /*           */ (gcry_cipher_hd_t c,
                  unsigned char *outbuf, size_t outbuflen,
                  const unsigned char *inbuf, size_t inbuflen);
 gcry_err_code_t _gcry_cipher_ccm_set_nonce
 /*           */ (gcry_cipher_hd_t c, const unsigned char *nonce,
                  size_t noncelen);
 gcry_err_code_t _gcry_cipher_ccm_authenticate
 /*           */ (gcry_cipher_hd_t c, const unsigned char *abuf, size_t abuflen);
 gcry_err_code_t _gcry_cipher_ccm_set_lengths
 /*           */ (gcry_cipher_hd_t c, u64 encryptedlen, u64 aadlen, u64 taglen);
 gcry_err_code_t _gcry_cipher_ccm_get_tag
 /*           */ (gcry_cipher_hd_t c,
                  unsigned char *outtag, size_t taglen);
 gcry_err_code_t _gcry_cipher_ccm_check_tag
 /*           */ (gcry_cipher_hd_t c,
                  const unsigned char *intag, size_t taglen);
 
 
 /*-- cipher-cmac.c --*/
 /* CMAC helpers operating on a caller-supplied gcry_cmac_context_t.
    generate_subkeys, write and final take both the cipher handle C and
    the context CTX and return a gcry_err_code_t; write additionally
    takes an input buffer with its length.  reset takes only the context
    and returns nothing.  */
 gcry_err_code_t _gcry_cmac_generate_subkeys
 /*           */ (gcry_cipher_hd_t c, gcry_cmac_context_t *ctx);
 gcry_err_code_t _gcry_cmac_write
 /*           */ (gcry_cipher_hd_t c, gcry_cmac_context_t *ctx,
 		 const byte * inbuf, size_t inlen);
 gcry_err_code_t _gcry_cmac_final
 /*           */ (gcry_cipher_hd_t c, gcry_cmac_context_t *ctx);
 void _gcry_cmac_reset (gcry_cmac_context_t *ctx);
 
 
 /*-- cipher-eax.c --*/
 /* EAX mode entry points.  All functions take the cipher handle C and
    return a gcry_err_code_t:
      - encrypt/decrypt: output buffer + length, input buffer + length;
      - set_nonce:       nonce buffer + length;
      - authenticate:    buffer AADBUF + length;
      - get_tag:         writable tag buffer + length;
      - check_tag:       read-only tag buffer + length;
      - setkey:          no further arguments.  */
 gcry_err_code_t _gcry_cipher_eax_encrypt
 /*           */   (gcry_cipher_hd_t c,
                    unsigned char *outbuf, size_t outbuflen,
                    const unsigned char *inbuf, size_t inbuflen);
 gcry_err_code_t _gcry_cipher_eax_decrypt
 /*           */   (gcry_cipher_hd_t c,
                    unsigned char *outbuf, size_t outbuflen,
                    const unsigned char *inbuf, size_t inbuflen);
 gcry_err_code_t _gcry_cipher_eax_set_nonce
 /*           */   (gcry_cipher_hd_t c,
                    const unsigned char *nonce, size_t noncelen);
 gcry_err_code_t _gcry_cipher_eax_authenticate
 /*           */   (gcry_cipher_hd_t c,
                    const unsigned char *aadbuf, size_t aadbuflen);
 gcry_err_code_t _gcry_cipher_eax_get_tag
 /*           */   (gcry_cipher_hd_t c,
                    unsigned char *outtag, size_t taglen);
 gcry_err_code_t _gcry_cipher_eax_check_tag
 /*           */   (gcry_cipher_hd_t c,
                    const unsigned char *intag, size_t taglen);
 gcry_err_code_t _gcry_cipher_eax_setkey
 /*           */   (gcry_cipher_hd_t c);
 
 
 /*-- cipher-gcm.c --*/
 /* GCM mode entry points.  All functions take the cipher handle C.
    Every function except setkey returns a gcry_err_code_t:
      - encrypt/decrypt: output buffer + length, input buffer + length;
      - setiv:           IV buffer + length;
      - authenticate:    buffer AADBUF + length;
      - get_tag:         writable tag buffer + length;
      - check_tag:       read-only tag buffer + length.
    setkey takes no further arguments and returns nothing.  */
 gcry_err_code_t _gcry_cipher_gcm_encrypt
 /*           */   (gcry_cipher_hd_t c,
                    unsigned char *outbuf, size_t outbuflen,
                    const unsigned char *inbuf, size_t inbuflen);
 gcry_err_code_t _gcry_cipher_gcm_decrypt
 /*           */   (gcry_cipher_hd_t c,
                    unsigned char *outbuf, size_t outbuflen,
                    const unsigned char *inbuf, size_t inbuflen);
 gcry_err_code_t _gcry_cipher_gcm_setiv
 /*           */   (gcry_cipher_hd_t c,
                    const unsigned char *iv, size_t ivlen);
 gcry_err_code_t _gcry_cipher_gcm_authenticate
 /*           */   (gcry_cipher_hd_t c,
                    const unsigned char *aadbuf, size_t aadbuflen);
 gcry_err_code_t _gcry_cipher_gcm_get_tag
 /*           */   (gcry_cipher_hd_t c,
                    unsigned char *outtag, size_t taglen);
 gcry_err_code_t _gcry_cipher_gcm_check_tag
 /*           */   (gcry_cipher_hd_t c,
                    const unsigned char *intag, size_t taglen);
 void _gcry_cipher_gcm_setkey
 /*           */   (gcry_cipher_hd_t c);
 
 
 /*-- cipher-poly1305.c --*/
 /* Poly1305 AEAD mode entry points.  All functions take the cipher
    handle C.  Every function except setkey returns a gcry_err_code_t:
      - encrypt/decrypt: output buffer + length, input buffer + length;
      - setiv:           IV buffer + length;
      - authenticate:    buffer AADBUF + length;
      - get_tag:         writable tag buffer + length;
      - check_tag:       read-only tag buffer + length.
    setkey takes no further arguments and returns nothing.  */
 gcry_err_code_t _gcry_cipher_poly1305_encrypt
 /*           */   (gcry_cipher_hd_t c,
                    unsigned char *outbuf, size_t outbuflen,
                    const unsigned char *inbuf, size_t inbuflen);
 gcry_err_code_t _gcry_cipher_poly1305_decrypt
 /*           */   (gcry_cipher_hd_t c,
                    unsigned char *outbuf, size_t outbuflen,
                    const unsigned char *inbuf, size_t inbuflen);
 gcry_err_code_t _gcry_cipher_poly1305_setiv
 /*           */   (gcry_cipher_hd_t c,
                    const unsigned char *iv, size_t ivlen);
 gcry_err_code_t _gcry_cipher_poly1305_authenticate
 /*           */   (gcry_cipher_hd_t c,
                    const unsigned char *aadbuf, size_t aadbuflen);
 gcry_err_code_t _gcry_cipher_poly1305_get_tag
 /*           */   (gcry_cipher_hd_t c,
                    unsigned char *outtag, size_t taglen);
 gcry_err_code_t _gcry_cipher_poly1305_check_tag
 /*           */   (gcry_cipher_hd_t c,
                    const unsigned char *intag, size_t taglen);
 void _gcry_cipher_poly1305_setkey
 /*           */   (gcry_cipher_hd_t c);
 
 
 /*-- chacha20.c --*/
 /* ChaCha20-Poly1305 encrypt/decrypt entry points.  Unlike the generic
    mode functions in this header they take a single LENGTH argument
    alongside the output and input buffers rather than separate
    output/input lengths.  Both return a gcry_err_code_t.  */
 gcry_err_code_t _gcry_chacha20_poly1305_encrypt
 /*           */   (gcry_cipher_hd_t c, byte *outbuf, const byte *inbuf,
 		   size_t length);
 gcry_err_code_t _gcry_chacha20_poly1305_decrypt
 /*           */   (gcry_cipher_hd_t c, byte *outbuf, const byte *inbuf,
 		   size_t length);
 
 
 /*-- cipher-ocb.c --*/
 /* OCB mode entry points.  All functions take the cipher handle C.
    Every function except setkey returns a gcry_err_code_t:
      - encrypt/decrypt: output buffer + length, input buffer + length;
      - set_nonce:       nonce buffer + length;
      - authenticate:    buffer ABUF + length;
      - get_tag:         writable tag buffer + length;
      - check_tag:       read-only tag buffer + length.
    setkey takes no further arguments and returns nothing.  */
 gcry_err_code_t _gcry_cipher_ocb_encrypt
 /*           */ (gcry_cipher_hd_t c,
                  unsigned char *outbuf, size_t outbuflen,
                  const unsigned char *inbuf, size_t inbuflen);
 gcry_err_code_t _gcry_cipher_ocb_decrypt
 /*           */ (gcry_cipher_hd_t c,
                  unsigned char *outbuf, size_t outbuflen,
                  const unsigned char *inbuf, size_t inbuflen);
 gcry_err_code_t _gcry_cipher_ocb_set_nonce
 /*           */ (gcry_cipher_hd_t c, const unsigned char *nonce,
                  size_t noncelen);
 gcry_err_code_t _gcry_cipher_ocb_authenticate
 /*           */ (gcry_cipher_hd_t c, const unsigned char *abuf, size_t abuflen);
 gcry_err_code_t _gcry_cipher_ocb_get_tag
 /*           */ (gcry_cipher_hd_t c,
                  unsigned char *outtag, size_t taglen);
 gcry_err_code_t _gcry_cipher_ocb_check_tag
 /*           */ (gcry_cipher_hd_t c,
                  const unsigned char *intag, size_t taglen);
 void _gcry_cipher_ocb_setkey
 /*           */ (gcry_cipher_hd_t c);
 
 
 /*-- cipher-xts.c --*/
 /* XTS mode entry points: one function for encryption and one for
    decryption.  Both take the cipher handle C, an output buffer with its
    length and an input buffer with its length, and return a
    gcry_err_code_t.  */
 gcry_err_code_t _gcry_cipher_xts_encrypt
 /*           */ (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen,
 		 const unsigned char *inbuf, size_t inbuflen);
 gcry_err_code_t _gcry_cipher_xts_decrypt
 /*           */ (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen,
 		 const unsigned char *inbuf, size_t inbuflen);
 
 
 /* Return the L-value for block N.  Note: 'cipher-ocb.c' ensures that N
  * will never be multiple of 65536 (1 << OCB_L_TABLE_SIZE), thus N can
  * be directly passed to _gcry_ctz() function and resulting index will
  * never overflow the table.
  *
  * The returned pointer refers to the entry of C->u_mode.ocb.L that is
  * selected by the index computed from N; the caller must not modify
  * it (it is returned as pointer to const).  */
 static inline const unsigned char *
 ocb_get_l (gcry_cipher_hd_t c, u64 n)
 {
   unsigned long ntz;  /* Index into the L table derived from N.  */
 
 #if ((defined(__i386__) || defined(__x86_64__)) && __GNUC__ >= 4)
   /* Assumes that N != 0.  The 'k' operand modifier selects the 32-bit
      register names, so only the low 32 bits of N are scanned by the
      bit-scan instruction.  "cc" is listed as clobbered because the
      instruction modifies the flags register.  */
   asm ("rep;bsfl %k[low], %k[ntz]\n\t"
         : [ntz] "=r" (ntz)
         : [low] "r" ((unsigned long)n)
         : "cc");
 #else
   /* Portable fallback for all other targets/compilers.  */
   ntz = _gcry_ctz (n);
 #endif
 
   return c->u_mode.ocb.L[ntz];
 }
 
 
 /* Return the bit-shift (log2) matching the block size of the cipher
    behind handle C: 3 for 8-byte blocks, 4 for everything else.  */
 static inline unsigned int _gcry_blocksize_shift(gcry_cipher_hd_t c)
 {
   /* Block sizes other than 8 and 16 are not in use, so any size that
    * is not 8 is treated as 16.  Returning one of two constants lets
    * the compiler optimize calling functions based on the result.  */
   if (c->spec->blocksize == 8)
     return 3;
 
   return 4;
 }
 
 
 /* Add the unsigned value ADD to the cipher block at _DSTSRC, in place.
    The block is handled through the be64 accessors: as one 64-bit
    quantity when BLOCKSIZE is 8, otherwise as a 128-bit quantity whose
    upper half lives at offset 0 and lower half at offset 8.  Overflow
    out of the topmost word wraps around silently.  */
 static inline void
 cipher_block_add(void *_dstsrc, unsigned int add, size_t blocksize)
 {
   byte *block = _dstsrc;
   u64 lower;
   u64 upper;
 
   if (blocksize == 8)
     {
       lower = buf_get_be64(block + 0);
       buf_put_be64(block + 0, lower + add);
       return;
     }
 
   /* blocksize == 16 */
   lower = buf_get_be64(block + 8);
   upper = buf_get_be64(block + 0);
 
   lower += add;
   /* The lower half wrapped around iff the sum ended up smaller than the
      addend; in that case propagate the carry into the upper half.  */
   if (lower < add)
     upper += 1;
 
   buf_put_be64(block + 8, lower);
   buf_put_be64(block + 0, upper);
 }
 
 
 /* Copy one cipher block of BLOCKSIZE bytes (8, otherwise 16 is assumed)
    from _SRC to _DST using 64-bit wide accesses.  For 16-byte blocks
    both halves are read before either half is written.  */
 static inline void
 cipher_block_cpy(void *_dst, const void *_src, size_t blocksize)
 {
   byte *out = _dst;
   const byte *in = _src;
   u64 first;
   u64 second;
 
   first = buf_get_he64(in + 0);
 
   if (blocksize == 8)
     {
       buf_put_he64(out + 0, first);
       return;
     }
 
   /* blocksize == 16 */
   second = buf_get_he64(in + 8);
   buf_put_he64(out + 0, first);
   buf_put_he64(out + 8, second);
 }
 
 
 /* XOR the cipher blocks at _SRC1 and _SRC2 and store the result at
    _DST.  BLOCKSIZE selects an 8-byte block; any other value is handled
    as a 16-byte block.  All source words are read before the first
    word of the result is written.  */
 static inline void
 cipher_block_xor(void *_dst, const void *_src1, const void *_src2,
                  size_t blocksize)
 {
   byte *out = _dst;
   const byte *a = _src1;
   const byte *b = _src2;
   u64 a0, a1;
   u64 b0, b1;
 
   if (blocksize != 8)
     {
       /* blocksize == 16 */
       a0 = buf_get_he64(a + 0);
       a1 = buf_get_he64(a + 8);
       b0 = buf_get_he64(b + 0);
       b1 = buf_get_he64(b + 8);
       buf_put_he64(out + 0, a0 ^ b0);
       buf_put_he64(out + 8, a1 ^ b1);
       return;
     }
 
   /* blocksize == 8 */
   buf_put_he64(out + 0, buf_get_he64(a + 0) ^ buf_get_he64(b + 0));
 }
 
 
 /* Optimized function for in-place cipher block xoring: XORs the
    BLOCKSIZE-byte block at _SRC into the block at _DST by calling
    cipher_block_xor() with _DST as both the destination and the first
    source operand.  */
 static inline void
 cipher_block_xor_1(void *_dst, const void *_src, size_t blocksize)
 {
   cipher_block_xor (_dst, _dst, _src, blocksize);
 }
 
 
 /* XOR the cipher block at _SRC into the block at _DST2 and store the
    very same result at _DST1 as well.  Used mainly by CFB mode
    encryption.  BLOCKSIZE selects an 8-byte block; any other value is
    handled as a 16-byte block.  _DST2 is written before _DST1, and all
    reads are done before the first write.  */
 static inline void
 cipher_block_xor_2dst(void *_dst1, void *_dst2, const void *_src,
                       size_t blocksize)
 {
   byte *copy_out = _dst1;
   byte *accum = _dst2;
   const byte *in = _src;
   u64 in0, in1;
   u64 res0, res1;
 
   if (blocksize == 8)
     {
       res0 = buf_get_he64(accum + 0) ^ buf_get_he64(in + 0);
       buf_put_he64(accum + 0, res0);
       buf_put_he64(copy_out + 0, res0);
       return;
     }
 
   /* blocksize == 16 */
   in0 = buf_get_he64(in + 0);
   in1 = buf_get_he64(in + 8);
   res0 = buf_get_he64(accum + 0);
   res1 = buf_get_he64(accum + 8);
 
   res0 ^= in0;
   res1 ^= in1;
 
   buf_put_he64(accum + 0, res0);
   buf_put_he64(accum + 8, res1);
   buf_put_he64(copy_out + 0, res0);
   buf_put_he64(copy_out + 8, res1);
 }
 
 
 /* Combined XOR and copy of cipher blocks, used mainly by CBC mode
    decryption.  In one pass this function
      - stores (_SRCDST_CPY xor _SRC_XOR) at _DST_XOR, and
      - overwrites _SRCDST_CPY with the block at _SRC_CPY.
    BLOCKSIZE selects an 8-byte block; any other value is handled as a
    16-byte block.  Every input word is read before the first output
    word is written, and _DST_XOR is written before _SRCDST_CPY.  */
 static inline void
 cipher_block_xor_n_copy_2(void *_dst_xor, const void *_src_xor,
                           void *_srcdst_cpy, const void *_src_cpy,
                           size_t blocksize)
 {
   byte *xor_out = _dst_xor;
   byte *chain = _srcdst_cpy;
   const byte *xor_in = _src_xor;
   const byte *cpy_in = _src_cpy;
   u64 next0, next1;   /* New contents for the chaining block.  */
   u64 mix0, mix1;     /* XOR operand, then XOR result.  */
   u64 prev0, prev1;   /* Old contents of the chaining block.  */
 
   if (blocksize == 8)
     {
       next0 = buf_get_he64(cpy_in + 0);
       mix0 = buf_get_he64(chain + 0) ^ buf_get_he64(xor_in + 0);
       buf_put_he64(xor_out + 0, mix0);
       buf_put_he64(chain + 0, next0);
       return;
     }
 
   /* blocksize == 16 */
   next0 = buf_get_he64(cpy_in + 0);
   next1 = buf_get_he64(cpy_in + 8);
   mix0 = buf_get_he64(xor_in + 0);
   mix1 = buf_get_he64(xor_in + 8);
   prev0 = buf_get_he64(chain + 0);
   prev1 = buf_get_he64(chain + 8);
 
   mix0 ^= prev0;
   mix1 ^= prev1;
 
   buf_put_he64(xor_out + 0, mix0);
   buf_put_he64(xor_out + 8, mix1);
   buf_put_he64(chain + 0, next0);
   buf_put_he64(chain + 8, next1);
 }
 
 
 /* Optimized function for combined cipher block xoring and copying.
    Used mainly by CFB mode decryption.  Forwards to
    cipher_block_xor_n_copy_2() with _SRC serving as both the XOR source
    and the copy source: (_SRCDST_CPY xor _SRC) is stored at _DST_XOR and
    _SRCDST_CPY is then replaced by the block at _SRC.  */
 static inline void
 cipher_block_xor_n_copy(void *_dst_xor, void *_srcdst_cpy, const void *_src,
                         size_t blocksize)
 {
   cipher_block_xor_n_copy_2(_dst_xor, _src, _srcdst_cpy, _src, blocksize);
 }
 
 
 #endif /*G10_CIPHER_INTERNAL_H*/