diff --git a/cipher/asm-inline-s390x.h b/cipher/asm-inline-s390x.h index 82007531..9a7bf453 100644 --- a/cipher/asm-inline-s390x.h +++ b/cipher/asm-inline-s390x.h @@ -1,114 +1,139 @@ /* asm-inline-s390x.h - Common macros for zSeries inline assembly * * Copyright (C) 2020 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . */ #ifndef GCRY_ASM_INLINE_S390X_H #define GCRY_ASM_INLINE_S390X_H #include +#define ALWAYS_INLINE inline __attribute__((always_inline)) + typedef unsigned int u128_t __attribute__ ((mode (TI))); enum kmxx_functions_e { KM_FUNCTION_AES_128 = 18, KM_FUNCTION_AES_192 = 19, KM_FUNCTION_AES_256 = 20, KM_FUNCTION_XTS_AES_128 = 50, KM_FUNCTION_XTS_AES_256 = 52, KMID_FUNCTION_SHA1 = 1, KMID_FUNCTION_SHA256 = 2, KMID_FUNCTION_SHA512 = 3, KMID_FUNCTION_SHA3_224 = 32, KMID_FUNCTION_SHA3_256 = 33, KMID_FUNCTION_SHA3_384 = 34, KMID_FUNCTION_SHA3_512 = 35, KMID_FUNCTION_SHAKE128 = 36, KMID_FUNCTION_SHAKE256 = 37, KMID_FUNCTION_GHASH = 65, }; enum kmxx_function_flags_e { KM_ENCRYPT = 0 << 7, KM_DECRYPT = 1 << 7, KMF_LCFB_16 = 16 << 24, KMA_LPC = 1 << 8, KMA_LAAD = 1 << 9, KMA_HS = 1 << 10, }; -static inline u128_t km_function_to_mask(enum kmxx_functions_e func) +static ALWAYS_INLINE u128_t km_function_to_mask(enum kmxx_functions_e func) { return (u128_t)1 << (127 - func); } static inline u128_t kimd_query(void) { static u128_t function_codes = 0; static int initialized = 0; register unsigned long reg0 asm("0") = 0; register void *reg1 asm("1") = &function_codes; u128_t r1; if (initialized) return function_codes; asm volatile ("0: .insn rre,0xb93e << 16, 0, %[r1]\n\t" " brc 1,0b\n\t" : [r1] "=a" (r1) : [reg0] "r" (reg0), [reg1] "r" (reg1) : "cc", "memory"); initialized = 1; return function_codes; } -static inline void kimd_execute(unsigned int func, void *param_block, - const void *src, size_t src_len) +static inline u128_t klmd_query(void) +{ + static u128_t function_codes = 0; + static int initialized = 0; + register unsigned long reg0 asm("0") = 0; + register void *reg1 asm("1") = &function_codes; + u128_t r1; + + if (initialized) + return function_codes; + + asm volatile ("0: .insn rre,0xb93f << 16, 0, %[r1]\n\t" + " brc 1,0b\n\t" + : [r1] "=a" (r1) + : [reg0] "r" (reg0), [reg1] "r" (reg1) + : "cc", "memory"); + + initialized = 1; + return function_codes; +} + +static ALWAYS_INLINE void +kimd_execute(unsigned int func, void *param_block, const void *src, + size_t src_len) { register unsigned long reg0 asm("0") = func; register byte *reg1 asm("1") = param_block; u128_t r1 = ((u128_t)(uintptr_t)src << 64) | (u64)src_len; asm volatile ("0: .insn rre,0xb93e << 16, 0, %[r1]\n\t" " brc 1,0b\n\t" : [r1] "+a" (r1) : [func] "r" (reg0), [param_ptr] "r" (reg1) : "cc", "memory"); } -static inline void klmd_execute(unsigned int func, void *param_block, - const void *src, size_t src_len) +static ALWAYS_INLINE void +klmd_execute(unsigned int func, void *param_block, const void *src, + 
size_t src_len) { register unsigned long reg0 asm("0") = func; register byte *reg1 asm("1") = param_block; u128_t r1 = ((u128_t)(uintptr_t)src << 64) | (u64)src_len; asm volatile ("0: .insn rre,0xb93f << 16, 0, %[r1]\n\t" " brc 1,0b\n\t" : [r1] "+a" (r1) : [func] "r" (reg0), [param_ptr] "r" (reg1) : "cc", "memory"); } #endif /* GCRY_ASM_INLINE_S390X_H */ diff --git a/cipher/rijndael-s390x.c b/cipher/rijndael-s390x.c index 5ab019f9..aea65c5a 100644 --- a/cipher/rijndael-s390x.c +++ b/cipher/rijndael-s390x.c @@ -1,1156 +1,1155 @@ /* Rijndael (AES) for GnuPG - s390x/zSeries AES implementation * Copyright (C) 2020 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . */ #include #include "rijndael-internal.h" #include "cipher-internal.h" #include "bufhelp.h" #ifdef USE_S390X_CRYPTO #include "asm-inline-s390x.h" -#define ALWAYS_INLINE inline __attribute__((always_inline)) #define NO_INLINE __attribute__((noinline)) struct aes_s390x_gcm_params_s { u32 reserved[3]; u32 counter_value; u64 tag[2]; u64 hash_subkey[2]; u64 total_aad_length; u64 total_cipher_length; u32 initial_counter_value[4]; u64 key[4]; }; #define DECL_QUERY_FUNC(instruction, opcode) \ static u128_t instruction ##_query(void) \ { \ static u128_t function_codes = 0; \ static int initialized = 0; \ register unsigned long reg0 asm("0") = 0; \ register void *reg1 asm("1") = &function_codes; \ u128_t r1, r2; \ \ if (initialized) \ return function_codes; \ \ asm volatile ("0: .insn rre," #opcode " << 16, %[r1], %[r2]\n\t" \ " brc 1,0b\n\t" \ : [r1] "=a" (r1), [r2] "=a" (r2) \ : [reg0] "r" (reg0), [reg1] "r" (reg1) \ : "cc", "memory"); \ \ initialized = 1; \ return function_codes; \ } #define DECL_EXECUTE_FUNC(instruction, opcode, param_const) \ static ALWAYS_INLINE size_t \ instruction ##_execute(unsigned int func, param_const void *param_block, \ void *dst, const void *src, size_t src_len) \ { \ register unsigned long reg0 asm("0") = func; \ register param_const byte *reg1 asm("1") = param_block; \ u128_t r1 = ((u128_t)(uintptr_t)dst << 64); \ u128_t r2 = ((u128_t)(uintptr_t)src << 64) | (u64)src_len; \ \ asm volatile ("0: .insn rre," #opcode " << 16, %[r1], %[r2]\n\t" \ " brc 1,0b\n\t" \ : [r1] "+a" (r1), [r2] "+a" (r2) \ : [func] "r" (reg0), [param_ptr] "r" (reg1) \ : "cc", "memory"); \ \ return (u64)r2; \ } DECL_QUERY_FUNC(km, 0xb92e); DECL_QUERY_FUNC(kmc, 0xb92f); DECL_QUERY_FUNC(kmac, 0xb91e); DECL_QUERY_FUNC(kmf, 0xb92a); DECL_QUERY_FUNC(kmo, 0xb92b); DECL_EXECUTE_FUNC(km, 0xb92e, const); DECL_EXECUTE_FUNC(kmc, 0xb92f, ); DECL_EXECUTE_FUNC(kmac, 0xb91e, ); DECL_EXECUTE_FUNC(kmf, 0xb92a, ); DECL_EXECUTE_FUNC(kmo, 0xb92b, ); static u128_t kma_query(void) { static u128_t function_codes = 0; static int initialized = 0; register unsigned long reg0 asm("0") = 0; register void *reg1 asm("1") = &function_codes; u128_t r1, r2, r3; if (initialized) return function_codes; asm volatile ("0: .insn rrf,0xb929 << 16, %[r1], %[r2], %[r3], 
0\n\t" " brc 1,0b\n\t" : [r1] "=a" (r1), [r2] "=a" (r2), [r3] "=a" (r3) : [reg0] "r" (reg0), [reg1] "r" (reg1) : "cc", "memory"); initialized = 1; return function_codes; } static ALWAYS_INLINE void kma_execute(unsigned int func, void *param_block, byte *dst, const byte *src, size_t src_len, const byte *aad, size_t aad_len) { register unsigned long reg0 asm("0") = func; register byte *reg1 asm("1") = param_block; u128_t r1 = ((u128_t)(uintptr_t)dst << 64); u128_t r2 = ((u128_t)(uintptr_t)src << 64) | (u64)src_len; u128_t r3 = ((u128_t)(uintptr_t)aad << 64) | (u64)aad_len; asm volatile ("0: .insn rrf,0xb929 << 16, %[r1], %[r2], %[r3], 0\n\t" " brc 1,0b\n\t" : [r1] "+a" (r1), [r2] "+a" (r2), [r3] "+a" (r3), [func] "+r" (reg0) : [param_ptr] "r" (reg1) : "cc", "memory"); } unsigned int _gcry_aes_s390x_encrypt(const RIJNDAEL_context *ctx, unsigned char *dst, const unsigned char *src) { km_execute (ctx->km_func | KM_ENCRYPT, ctx->keyschenc, dst, src, BLOCKSIZE); return 0; } unsigned int _gcry_aes_s390x_decrypt(const RIJNDAEL_context *ctx, unsigned char *dst, const unsigned char *src) { km_execute (ctx->km_func | KM_DECRYPT, ctx->keyschenc, dst, src, BLOCKSIZE); return 0; } static void aes_s390x_cbc_enc(void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int cbc_mac) { RIJNDAEL_context *ctx = context; byte *out = outbuf_arg; const byte *in = inbuf_arg; u128_t params[3]; /* Prepare parameter block. */ memcpy (¶ms[0], iv, BLOCKSIZE); memcpy (¶ms[1], ctx->keyschenc, 32); if (cbc_mac) { kmac_execute (ctx->kmac_func | KM_ENCRYPT, ¶ms, NULL, in, nblocks * BLOCKSIZE); memcpy (out, ¶ms[0], BLOCKSIZE); } else { kmc_execute (ctx->kmc_func | KM_ENCRYPT, ¶ms, out, in, nblocks * BLOCKSIZE); } /* Update IV with OCV. */ memcpy (iv, ¶ms[0], BLOCKSIZE); wipememory (¶ms, sizeof(params)); } static void aes_s390x_cbc_dec(void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks) { RIJNDAEL_context *ctx = context; byte *out = outbuf_arg; const byte *in = inbuf_arg; u128_t params[3]; /* Prepare parameter block (ICV & key). */ memcpy (¶ms[0], iv, BLOCKSIZE); memcpy (¶ms[1], ctx->keyschenc, 32); kmc_execute (ctx->kmc_func | KM_DECRYPT, ¶ms, out, in, nblocks * BLOCKSIZE); /* Update IV with OCV. */ memcpy (iv, ¶ms[0], BLOCKSIZE); wipememory (¶ms, sizeof(params)); } static void aes_s390x_cfb128_enc(void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks) { RIJNDAEL_context *ctx = context; byte *out = outbuf_arg; const byte *in = inbuf_arg; unsigned int function; u128_t params[3]; /* Prepare parameter block. */ memcpy (¶ms[0], iv, BLOCKSIZE); memcpy (¶ms[1], ctx->keyschenc, 32); function = ctx->kmf_func | KM_ENCRYPT | KMF_LCFB_16; kmf_execute (function, ¶ms, out, in, nblocks * BLOCKSIZE); /* Update IV with OCV. */ memcpy (iv, ¶ms[0], BLOCKSIZE); wipememory (¶ms, sizeof(params)); } static void aes_s390x_cfb128_dec(void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks) { RIJNDAEL_context *ctx = context; u128_t blocks[64]; byte *out = outbuf_arg; const byte *in = inbuf_arg; size_t max_blocks_used = 0; /* AES128-CFB128 decryption speed using KMF was observed to be the same as * the KMF encryption, ~1.03 cpb. Expection was to see similar performance * as for AES128-CBC decryption as decryption for both modes should be * parallalizeble (CBC shows ~0.22 cpb). 
Therefore there is quite a bit * of room for improvement and implementation below using KM instruction * shows ~0.70 cpb speed, ~30% improvement over KMF instruction. */ while (nblocks >= 64) { /* Copy IV to encrypt buffer, copy (nblocks - 1) input blocks to * encrypt buffer and update IV. */ asm volatile ("mvc 0(16, %[blocks]), 0(%[iv])\n\t" "mvc 16(240, %[blocks]), 0(%[in])\n\t" "mvc 256(256, %[blocks]), 240(%[in])\n\t" "mvc 512(256, %[blocks]), 496(%[in])\n\t" "mvc 768(256, %[blocks]), 752(%[in])\n\t" "mvc 0(16, %[iv]), 1008(%[in])\n\t" : : [in] "a" (in), [out] "a" (out), [blocks] "a" (blocks), [iv] "a" (iv) : "memory"); /* Perform encryption of temporary buffer. */ km_execute (ctx->km_func | KM_ENCRYPT, ctx->keyschenc, blocks, blocks, 64 * BLOCKSIZE); /* Xor encrypt buffer with input blocks and store to output blocks. */ asm volatile ("xc 0(256, %[blocks]), 0(%[in])\n\t" "xc 256(256, %[blocks]), 256(%[in])\n\t" "xc 512(256, %[blocks]), 512(%[in])\n\t" "xc 768(256, %[blocks]), 768(%[in])\n\t" "mvc 0(256, %[out]), 0(%[blocks])\n\t" "mvc 256(256, %[out]), 256(%[blocks])\n\t" "mvc 512(256, %[out]), 512(%[blocks])\n\t" "mvc 768(256, %[out]), 768(%[blocks])\n\t" : : [in] "a" (in), [out] "a" (out), [blocks] "a" (blocks) : "memory"); max_blocks_used = 64; in += 64 * BLOCKSIZE; out += 64 * BLOCKSIZE; nblocks -= 64; } if (nblocks) { unsigned int pos = 0; size_t in_nblocks = nblocks; size_t num_in = 0; max_blocks_used = max_blocks_used < nblocks ? nblocks : max_blocks_used; /* Copy IV to encrypt buffer. */ asm volatile ("mvc 0(16, %[blocks]), 0(%[iv])\n\t" : : [blocks] "a" (blocks), [iv] "a" (iv) : "memory"); pos += 1; #define CFB_MOVE_BLOCKS(block_oper, move_nbytes) \ block_oper (in_nblocks - 1 >= move_nbytes / BLOCKSIZE) \ { \ unsigned int move_nblocks = move_nbytes / BLOCKSIZE; \ asm volatile ("mvc 0(" #move_nbytes ", %[blocks_x]), 0(%[in])\n\t" \ : \ : [blocks_x] "a" (&blocks[pos]), [in] "a" (in) \ : "memory"); \ num_in += move_nblocks; \ in += move_nblocks * BLOCKSIZE; \ pos += move_nblocks; \ in_nblocks -= move_nblocks; \ } /* Copy (nblocks - 1) input blocks to encrypt buffer. */ CFB_MOVE_BLOCKS(while, 256); CFB_MOVE_BLOCKS(if, 128); CFB_MOVE_BLOCKS(if, 64); CFB_MOVE_BLOCKS(if, 32); CFB_MOVE_BLOCKS(if, 16); #undef CFB_MOVE_BLOCKS /* Update IV. */ asm volatile ("mvc 0(16, %[iv]), 0(%[in])\n\t" : : [iv] "a" (iv), [in] "a" (in) : "memory"); num_in += 1; in += BLOCKSIZE; /* Perform encryption of temporary buffer. */ km_execute (ctx->km_func | KM_ENCRYPT, ctx->keyschenc, blocks, blocks, nblocks * BLOCKSIZE); /* Xor encrypt buffer with input blocks and store to output blocks. 
*/ pos = 0; in -= nblocks * BLOCKSIZE; #define CFB_XOR_BLOCKS(block_oper, xor_nbytes) \ block_oper (nblocks >= xor_nbytes / BLOCKSIZE) \ { \ unsigned int xor_nblocks = xor_nbytes / BLOCKSIZE; \ asm volatile ("xc 0(" #xor_nbytes ", %[blocks_x]), 0(%[in])\n\t" \ "mvc 0(" #xor_nbytes ", %[out]), 0(%[blocks_x])\n\t" \ : \ : [blocks_x] "a" (&blocks[pos]), [out] "a" (out), \ [in] "a" (in) \ : "memory"); \ out += xor_nblocks * BLOCKSIZE; \ in += xor_nblocks * BLOCKSIZE; \ nblocks -= xor_nblocks; \ pos += xor_nblocks; \ } CFB_XOR_BLOCKS(while, 256); CFB_XOR_BLOCKS(if, 128); CFB_XOR_BLOCKS(if, 64); CFB_XOR_BLOCKS(if, 32); CFB_XOR_BLOCKS(if, 16); #undef CFB_XOR_BLOCKS } if (max_blocks_used) wipememory (&blocks, max_blocks_used * BLOCKSIZE); } static void aes_s390x_ofb_enc(void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks) { RIJNDAEL_context *ctx = context; byte *out = outbuf_arg; const byte *in = inbuf_arg; unsigned int function; u128_t params[3]; /* Prepare parameter block. */ memcpy (¶ms[0], iv, BLOCKSIZE); memcpy (¶ms[1], ctx->keyschenc, 32); function = ctx->kmo_func | KM_ENCRYPT; kmo_execute (function, ¶ms, out, in, nblocks * BLOCKSIZE); /* Update IV with OCV. */ memcpy (iv, ¶ms[0], BLOCKSIZE); wipememory (¶ms, sizeof(params)); } static void aes_s390x_ctr128_enc(void *context, unsigned char *ctr, void *outbuf_arg, const void *inbuf_arg, size_t nblocks) { RIJNDAEL_context *ctx = context; byte *out = outbuf_arg; const byte *in = inbuf_arg; unsigned int function; struct aes_s390x_gcm_params_s params; memset (¶ms.hash_subkey, 0, sizeof(params.hash_subkey)); memcpy (¶ms.key, ctx->keyschenc, 32); function = ctx->kma_func | KM_DECRYPT | KMA_HS | KMA_LAAD; while (nblocks) { u64 to_overflow = (u64)0xFFFFFFFFU + 1 - buf_get_be32 (ctr + 12); u64 ncurr = nblocks > to_overflow ? to_overflow : nblocks; /* Prepare parameter block. */ memset (¶ms.reserved, 0, sizeof(params.reserved)); buf_put_be32 (¶ms.counter_value, buf_get_be32(ctr + 12) - 1); memcpy (¶ms.initial_counter_value, ctr, 16); params.initial_counter_value[3] = params.counter_value; memset (¶ms.tag, 0, sizeof(params.tag)); params.total_aad_length = 0; params.total_cipher_length = 0; /* Update counter. */ cipher_block_add (ctr, ncurr, BLOCKSIZE); if (ncurr == (u64)0xFFFFFFFFU + 1) cipher_block_add (ctr, 1, BLOCKSIZE); /* Perform CTR using KMA-GCM. */ kma_execute (function, ¶ms, out, in, ncurr * BLOCKSIZE, NULL, 0); out += ncurr * BLOCKSIZE; in += ncurr * BLOCKSIZE; nblocks -= ncurr; } wipememory (¶ms, sizeof(params)); } static size_t aes_s390x_gcm_crypt(gcry_cipher_hd_t c, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int encrypt) { RIJNDAEL_context *ctx = (void *)&c->context.c; byte *out = outbuf_arg; const byte *in = inbuf_arg; byte *ctr = c->u_ctr.ctr; unsigned int function; struct aes_s390x_gcm_params_s params; function = ctx->kma_func | (encrypt ? KM_ENCRYPT : KM_DECRYPT) | KMA_HS | KMA_LAAD; /* Prepare parameter block. */ memset (¶ms.reserved, 0, sizeof(params.reserved)); buf_put_be32 (¶ms.counter_value, buf_get_be32(ctr + 12) - 1); memcpy (¶ms.tag, c->u_mode.gcm.u_tag.tag, 16); memcpy (¶ms.hash_subkey, c->u_mode.gcm.u_ghash_key.key, 16); params.total_aad_length = 0; params.total_cipher_length = 0; memcpy (¶ms.initial_counter_value, ctr, 12); params.initial_counter_value[3] = params.counter_value; memcpy (¶ms.key, ctx->keyschenc, 32); /* Update counter (CTR32). */ buf_put_be32(ctr + 12, buf_get_be32(ctr + 12) + nblocks); /* Perform KMA-GCM. 
*/ kma_execute (function, ¶ms, out, in, nblocks * BLOCKSIZE, NULL, 0); /* Update tag. */ memcpy (c->u_mode.gcm.u_tag.tag, ¶ms.tag, 16); wipememory (¶ms, sizeof(params)); return 0; } static void aes_s390x_xts_crypt(void *context, unsigned char *tweak, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int encrypt) { RIJNDAEL_context *ctx = context; byte *out = outbuf_arg; const byte *in = inbuf_arg; unsigned int function; u128_t params[3]; u128_t *params_tweak; if (ctx->rounds < 12) { memcpy (¶ms[0], ctx->keyschenc, 16); params_tweak = ¶ms[1]; memcpy (params_tweak, tweak, BLOCKSIZE); } else if (ctx->rounds == 12) { BUG(); /* KM-XTS-AES-192 not defined. */ } else { memcpy (¶ms[0], ctx->keyschenc, 32); params_tweak = ¶ms[2]; memcpy (params_tweak, tweak, BLOCKSIZE); } function = ctx->km_func_xts | (encrypt ? KM_ENCRYPT : KM_DECRYPT); km_execute (function, ¶ms, out, in, nblocks * BLOCKSIZE); /* Update tweak with XTSP. */ memcpy (tweak, params_tweak, BLOCKSIZE); wipememory (¶ms, sizeof(params)); } static NO_INLINE void aes_s390x_ocb_prepare_Ls (gcry_cipher_hd_t c, u64 blkn, const void *Ls[64], const void ***pl) { unsigned int n = 64 - (blkn % 64); int i; /* Prepare L pointers. */ *pl = &Ls[(63 + n) % 64]; for (i = 0; i < 64; i += 8, n = (n + 8) % 64) { static const int lastL[8] = { 3, 4, 3, 5, 3, 4, 3, 0 }; Ls[(0 + n) % 64] = c->u_mode.ocb.L[0]; Ls[(1 + n) % 64] = c->u_mode.ocb.L[1]; Ls[(2 + n) % 64] = c->u_mode.ocb.L[0]; Ls[(3 + n) % 64] = c->u_mode.ocb.L[2]; Ls[(4 + n) % 64] = c->u_mode.ocb.L[0]; Ls[(5 + n) % 64] = c->u_mode.ocb.L[1]; Ls[(6 + n) % 64] = c->u_mode.ocb.L[0]; Ls[(7 + n) % 64] = c->u_mode.ocb.L[lastL[i / 8]]; } } static NO_INLINE void aes_s390x_ocb_checksum (unsigned char *checksum, const void *plainbuf_arg, size_t nblks) { const char *plainbuf = plainbuf_arg; u64 tmp0[2]; u64 tmp1[2] = { 0, 0 }; u64 tmp2[2] = { 0, 0 }; u64 tmp3[2] = { 0, 0 }; cipher_block_cpy (tmp0, checksum, BLOCKSIZE); if (nblks >= 4) { while (nblks >= 4) { /* Checksum_i = Checksum_{i-1} xor P_i */ cipher_block_xor_1 (tmp0, plainbuf + 0 * BLOCKSIZE, BLOCKSIZE); cipher_block_xor_1 (tmp1, plainbuf + 1 * BLOCKSIZE, BLOCKSIZE); cipher_block_xor_1 (tmp2, plainbuf + 2 * BLOCKSIZE, BLOCKSIZE); cipher_block_xor_1 (tmp3, plainbuf + 3 * BLOCKSIZE, BLOCKSIZE); plainbuf += 4 * BLOCKSIZE; nblks -= 4; } cipher_block_xor_1 (tmp0, tmp1, BLOCKSIZE); cipher_block_xor_1 (tmp2, tmp3, BLOCKSIZE); cipher_block_xor_1 (tmp0, tmp2, BLOCKSIZE); wipememory (tmp1, sizeof(tmp1)); wipememory (tmp2, sizeof(tmp2)); wipememory (tmp3, sizeof(tmp3)); } while (nblks > 0) { /* Checksum_i = Checksum_{i-1} xor P_i */ cipher_block_xor_1 (tmp0, plainbuf, BLOCKSIZE); plainbuf += BLOCKSIZE; nblks--; } cipher_block_cpy (checksum, tmp0, BLOCKSIZE); wipememory (tmp0, sizeof(tmp0)); } static NO_INLINE size_t aes_s390x_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg, const void *inbuf_arg, size_t nblocks_arg) { RIJNDAEL_context *ctx = (void *)&c->context.c; unsigned char *outbuf = outbuf_arg; const unsigned char *inbuf = inbuf_arg; size_t nblocks = nblocks_arg; u128_t blocks[64]; u128_t offset; size_t max_blocks_used = 0; u64 blkn = c->u_mode.ocb.data_nblocks; unsigned int function = ctx->km_func | KM_ENCRYPT; const void *Ls[64]; const void **pl; aes_s390x_ocb_prepare_Ls (c, blkn, Ls, &pl); /* Checksumming could be done inline in OCB_INPUT macros, but register * pressure becomes too heavy and performance would end up being worse. 
* For decryption, checksumming is part of OCB_OUTPUT macros as * output handling is less demanding and can handle the additional * computation. */ aes_s390x_ocb_checksum (c->u_ctr.ctr, inbuf_arg, nblocks_arg); cipher_block_cpy (&offset, &c->u_iv.iv, BLOCKSIZE); #define OCB_INPUT(n) \ cipher_block_xor_2dst (&blocks[n], &offset, Ls[n], BLOCKSIZE); \ cipher_block_xor (outbuf + (n) * BLOCKSIZE, inbuf + (n) * BLOCKSIZE, \ &offset, BLOCKSIZE) #define OCB_INPUT_4(n) \ OCB_INPUT((n) + 0); OCB_INPUT((n) + 1); OCB_INPUT((n) + 2); \ OCB_INPUT((n) + 3) #define OCB_INPUT_16(n) \ OCB_INPUT_4((n) + 0); OCB_INPUT_4((n) + 4); OCB_INPUT_4((n) + 8); \ OCB_INPUT_4((n) + 12); #define OCB_OUTPUT(n) \ cipher_block_xor_1 (outbuf + (n) * BLOCKSIZE, &blocks[n], BLOCKSIZE) #define OCB_OUTPUT_4(n) \ OCB_OUTPUT((n) + 0); OCB_OUTPUT((n) + 1); OCB_OUTPUT((n) + 2); \ OCB_OUTPUT((n) + 3) #define OCB_OUTPUT_16(n) \ OCB_OUTPUT_4((n) + 0); OCB_OUTPUT_4((n) + 4); OCB_OUTPUT_4((n) + 8); \ OCB_OUTPUT_4((n) + 12); while (nblocks >= 64) { blkn += 64; *pl = ocb_get_l(c, blkn - blkn % 64); OCB_INPUT_16(0); OCB_INPUT_16(16); OCB_INPUT_16(32); OCB_INPUT_16(48); km_execute (function, ctx->keyschenc, outbuf, outbuf, 64 * BLOCKSIZE); asm volatile ("xc 0(256, %[out]), 0(%[blocks])\n\t" "xc 256(256, %[out]), 256(%[blocks])\n\t" "xc 512(256, %[out]), 512(%[blocks])\n\t" "xc 768(256, %[out]), 768(%[blocks])\n\t" : : [out] "a" (outbuf), [blocks] "a" (blocks) : "memory"); max_blocks_used = 64; inbuf += 64 * BLOCKSIZE; outbuf += 64 * BLOCKSIZE; nblocks -= 64; } if (nblocks) { unsigned int pos = 0; max_blocks_used = max_blocks_used < nblocks ? nblocks : max_blocks_used; blkn += nblocks; *pl = ocb_get_l(c, blkn - blkn % 64); while (nblocks >= 16) { OCB_INPUT_16(pos + 0); pos += 16; nblocks -= 16; } while (nblocks >= 4) { OCB_INPUT_4(pos + 0); pos += 4; nblocks -= 4; } if (nblocks >= 2) { OCB_INPUT(pos + 0); OCB_INPUT(pos + 1); pos += 2; nblocks -= 2; } if (nblocks >= 1) { OCB_INPUT(pos + 0); pos += 1; nblocks -= 1; } nblocks = pos; pos = 0; km_execute (function, ctx->keyschenc, outbuf, outbuf, nblocks * BLOCKSIZE); while (nblocks >= 16) { OCB_OUTPUT_16(pos + 0); pos += 16; nblocks -= 16; } while (nblocks >= 4) { OCB_OUTPUT_4(pos + 0); pos += 4; nblocks -= 4; } if (nblocks >= 2) { OCB_OUTPUT(pos + 0); OCB_OUTPUT(pos + 1); pos += 2; nblocks -= 2; } if (nblocks >= 1) { OCB_OUTPUT(pos + 0); pos += 1; nblocks -= 1; } } #undef OCB_INPUT #undef OCB_INPUT_4 #undef OCB_INPUT_16 #undef OCB_OUTPUT #undef OCB_OUTPUT_4 #undef OCB_OUTPUT_16 c->u_mode.ocb.data_nblocks = blkn; cipher_block_cpy (&c->u_iv.iv, &offset, BLOCKSIZE); if (max_blocks_used) wipememory (&blocks, max_blocks_used * BLOCKSIZE); return 0; } static NO_INLINE size_t aes_s390x_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg, const void *inbuf_arg, size_t nblocks_arg) { RIJNDAEL_context *ctx = (void *)&c->context.c; unsigned char *outbuf = outbuf_arg; const unsigned char *inbuf = inbuf_arg; size_t nblocks = nblocks_arg; u128_t blocks[64]; u128_t offset; size_t max_blocks_used = 0; u64 blkn = c->u_mode.ocb.data_nblocks; unsigned int function = ctx->km_func | KM_DECRYPT; const void *Ls[64]; const void **pl; aes_s390x_ocb_prepare_Ls (c, blkn, Ls, &pl); cipher_block_cpy (&offset, &c->u_iv.iv, BLOCKSIZE); #define OCB_INPUT(n) \ cipher_block_xor_2dst (&blocks[n], &offset, Ls[n], BLOCKSIZE); \ cipher_block_xor (outbuf + (n) * BLOCKSIZE, inbuf + (n) * BLOCKSIZE, \ &offset, BLOCKSIZE) #define OCB_INPUT_4(n) \ OCB_INPUT((n) + 0); OCB_INPUT((n) + 1); OCB_INPUT((n) + 2); \ OCB_INPUT((n) + 3) #define 
OCB_INPUT_16(n) \ OCB_INPUT_4((n) + 0); OCB_INPUT_4((n) + 4); OCB_INPUT_4((n) + 8); \ OCB_INPUT_4((n) + 12); #define OCB_OUTPUT(n) \ cipher_block_xor_1 (&blocks[n], outbuf + (n) * BLOCKSIZE, BLOCKSIZE); \ cipher_block_xor_1 (c->u_ctr.ctr, &blocks[n], BLOCKSIZE); \ cipher_block_cpy (outbuf + (n) * BLOCKSIZE, &blocks[n], BLOCKSIZE); #define OCB_OUTPUT_4(n) \ OCB_OUTPUT((n) + 0); OCB_OUTPUT((n) + 1); OCB_OUTPUT((n) + 2); \ OCB_OUTPUT((n) + 3) #define OCB_OUTPUT_16(n) \ OCB_OUTPUT_4((n) + 0); OCB_OUTPUT_4((n) + 4); OCB_OUTPUT_4((n) + 8); \ OCB_OUTPUT_4((n) + 12); while (nblocks >= 64) { blkn += 64; *pl = ocb_get_l(c, blkn - blkn % 64); OCB_INPUT_16(0); OCB_INPUT_16(16); OCB_INPUT_16(32); OCB_INPUT_16(48); km_execute (function, ctx->keyschenc, outbuf, outbuf, 64 * BLOCKSIZE); asm volatile ("xc 0(256, %[out]), 0(%[blocks])\n\t" "xc 256(256, %[out]), 256(%[blocks])\n\t" "xc 512(256, %[out]), 512(%[blocks])\n\t" "xc 768(256, %[out]), 768(%[blocks])\n\t" : : [out] "a" (outbuf), [blocks] "a" (blocks) : "memory"); max_blocks_used = 64; inbuf += 64 * BLOCKSIZE; outbuf += 64 * BLOCKSIZE; nblocks -= 64; } if (nblocks) { unsigned int pos = 0; max_blocks_used = max_blocks_used < nblocks ? nblocks : max_blocks_used; blkn += nblocks; *pl = ocb_get_l(c, blkn - blkn % 64); while (nblocks >= 16) { OCB_INPUT_16(pos + 0); pos += 16; nblocks -= 16; } while (nblocks >= 4) { OCB_INPUT_4(pos + 0); pos += 4; nblocks -= 4; } if (nblocks >= 2) { OCB_INPUT(pos + 0); OCB_INPUT(pos + 1); pos += 2; nblocks -= 2; } if (nblocks >= 1) { OCB_INPUT(pos + 0); pos += 1; nblocks -= 1; } nblocks = pos; pos = 0; km_execute (function, ctx->keyschenc, outbuf, outbuf, nblocks * BLOCKSIZE); while (nblocks >= 16) { OCB_OUTPUT_16(pos + 0); pos += 16; nblocks -= 16; } while (nblocks >= 4) { OCB_OUTPUT_4(pos + 0); pos += 4; nblocks -= 4; } if (nblocks >= 2) { OCB_OUTPUT(pos + 0); OCB_OUTPUT(pos + 1); pos += 2; nblocks -= 2; } if (nblocks >= 1) { OCB_OUTPUT(pos + 0); pos += 1; nblocks -= 1; } } #undef OCB_INPUT #undef OCB_INPUT_4 #undef OCB_INPUT_16 #undef OCB_OUTPUT #undef OCB_OUTPUT_4 #undef OCB_OUTPUT_16 c->u_mode.ocb.data_nblocks = blkn; cipher_block_cpy (&c->u_iv.iv, &offset, BLOCKSIZE); if (max_blocks_used) wipememory (&blocks, max_blocks_used * BLOCKSIZE); return 0; } static size_t aes_s390x_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, const void *inbuf_arg, size_t nblocks_arg, int encrypt) { if (encrypt) return aes_s390x_ocb_enc (c, outbuf_arg, inbuf_arg, nblocks_arg); else return aes_s390x_ocb_dec (c, outbuf_arg, inbuf_arg, nblocks_arg); } static size_t aes_s390x_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks_arg) { RIJNDAEL_context *ctx = (void *)&c->context.c; const unsigned char *abuf = abuf_arg; u128_t blocks[64]; u128_t offset; size_t max_blocks_used = 0; u64 blkn = c->u_mode.ocb.aad_nblocks; unsigned int function = ctx->km_func | KM_ENCRYPT; const void *Ls[64]; const void **pl; aes_s390x_ocb_prepare_Ls (c, blkn, Ls, &pl); cipher_block_cpy (&offset, c->u_mode.ocb.aad_offset, BLOCKSIZE); #define OCB_INPUT(n) \ cipher_block_xor_2dst (&blocks[n], &offset, Ls[n], BLOCKSIZE); \ cipher_block_xor_1 (&blocks[n], abuf + (n) * BLOCKSIZE, BLOCKSIZE) #define OCB_INPUT_4(n) \ OCB_INPUT((n) + 0); OCB_INPUT((n) + 1); OCB_INPUT((n) + 2); \ OCB_INPUT((n) + 3) #define OCB_INPUT_16(n) \ OCB_INPUT_4((n) + 0); OCB_INPUT_4((n) + 4); OCB_INPUT_4((n) + 8); \ OCB_INPUT_4((n) + 12); while (nblocks_arg >= 64) { blkn += 64; *pl = ocb_get_l(c, blkn - blkn % 64); OCB_INPUT_16(0); OCB_INPUT_16(16); OCB_INPUT_16(32); OCB_INPUT_16(48); 
km_execute (function, ctx->keyschenc, blocks, blocks, 64 * BLOCKSIZE); aes_s390x_ocb_checksum (c->u_mode.ocb.aad_sum, blocks, 64); max_blocks_used = 64; abuf += 64 * BLOCKSIZE; nblocks_arg -= 64; } if (nblocks_arg > 0) { size_t nblocks = nblocks_arg; unsigned int pos = 0; max_blocks_used = max_blocks_used < nblocks ? nblocks : max_blocks_used; blkn += nblocks; *pl = ocb_get_l(c, blkn - blkn % 64); while (nblocks >= 16) { OCB_INPUT_16(pos + 0); pos += 16; nblocks -= 16; } while (nblocks >= 4) { OCB_INPUT_4(pos + 0); pos += 4; nblocks -= 4; } if (nblocks >= 2) { OCB_INPUT(pos + 0); OCB_INPUT(pos + 1); pos += 2; nblocks -= 2; } if (nblocks >= 1) { OCB_INPUT(pos + 0); pos += 1; nblocks -= 1; } nblocks = pos; nblocks_arg -= pos; pos = 0; km_execute (function, ctx->keyschenc, blocks, blocks, nblocks * BLOCKSIZE); aes_s390x_ocb_checksum (c->u_mode.ocb.aad_sum, blocks, nblocks); } #undef OCB_INPUT #undef OCB_INPUT_4 #undef OCB_INPUT_16 c->u_mode.ocb.aad_nblocks = blkn; cipher_block_cpy (c->u_mode.ocb.aad_offset, &offset, BLOCKSIZE); if (max_blocks_used) wipememory (&blocks, max_blocks_used * BLOCKSIZE); return 0; } int _gcry_aes_s390x_setup_acceleration(RIJNDAEL_context *ctx, unsigned int keylen, unsigned int hwfeatures, cipher_bulk_ops_t *bulk_ops) { unsigned int func; unsigned int func_xts; u128_t func_mask; u128_t func_xts_mask; if (!(hwfeatures & HWF_S390X_MSA)) return 0; switch (keylen) { default: case 16: func = KM_FUNCTION_AES_128; func_xts = KM_FUNCTION_XTS_AES_128; func_mask = km_function_to_mask(KM_FUNCTION_AES_128); func_xts_mask = km_function_to_mask(KM_FUNCTION_XTS_AES_128); break; case 24: func = KM_FUNCTION_AES_192; func_xts = 0; func_mask = km_function_to_mask(KM_FUNCTION_AES_192); func_xts_mask = 0; /* XTS-AES192 not available. */ break; case 32: func = KM_FUNCTION_AES_256; func_xts = KM_FUNCTION_XTS_AES_256; func_mask = km_function_to_mask(KM_FUNCTION_AES_256); func_xts_mask = km_function_to_mask(KM_FUNCTION_AES_256); break; } /* Query KM for supported algorithms and check if acceleration for * requested key-length is available. */ if (!(km_query () & func_mask)) return 0; ctx->km_func = func; /* Query KM for supported XTS algorithms. */ if (km_query () & func_xts_mask) ctx->km_func_xts = func_xts; /* Query KMC for supported algorithms. */ if (kmc_query () & func_mask) ctx->kmc_func = func; /* Query KMAC for supported algorithms. */ if (kmac_query () & func_mask) ctx->kmac_func = func; if (hwfeatures & HWF_S390X_MSA_4) { /* Query KMF for supported algorithms. */ if (kmf_query () & func_mask) ctx->kmf_func = func; /* Query KMO for supported algorithms. */ if (kmo_query () & func_mask) ctx->kmo_func = func; } if (hwfeatures & HWF_S390X_MSA_8) { /* Query KMA for supported algorithms. */ if (kma_query () & func_mask) ctx->kma_func = func; } /* Setup zSeries bulk encryption/decryption routines. */ if (ctx->km_func) { bulk_ops->ocb_crypt = aes_s390x_ocb_crypt; bulk_ops->ocb_auth = aes_s390x_ocb_auth; /* CFB128 decryption uses KM instruction, instead of KMF. */ bulk_ops->cfb_dec = aes_s390x_cfb128_dec; } if (ctx->km_func_xts) { bulk_ops->xts_crypt = aes_s390x_xts_crypt; } if (ctx->kmc_func) { if(ctx->kmac_func) { /* Either KMC or KMAC used depending on 'cbc_mac' parameter. 
*/ bulk_ops->cbc_enc = aes_s390x_cbc_enc; } bulk_ops->cbc_dec = aes_s390x_cbc_dec; } if (ctx->kmf_func) { bulk_ops->cfb_enc = aes_s390x_cfb128_enc; } if (ctx->kmo_func) { bulk_ops->ofb_enc = aes_s390x_ofb_enc; } if (ctx->kma_func) { bulk_ops->ctr_enc = aes_s390x_ctr128_enc; if (kimd_query () & km_function_to_mask (KMID_FUNCTION_GHASH)) { /* KIMD based GHASH implementation is required with AES-GCM * acceleration. */ bulk_ops->gcm_crypt = aes_s390x_gcm_crypt; } } return 1; } void _gcry_aes_s390x_setkey(RIJNDAEL_context *ctx, const byte *key) { unsigned int keylen = 16 + (ctx->rounds - 10) * 4; memcpy (ctx->keyschenc, key, keylen); } void _gcry_aes_s390x_prepare_decryption(RIJNDAEL_context *ctx) { /* Do nothing. */ (void)ctx; } #endif /* USE_S390X_CRYPTO */ diff --git a/cipher/sha1.c b/cipher/sha1.c index d3ee982b..287bd826 100644 --- a/cipher/sha1.c +++ b/cipher/sha1.c @@ -1,709 +1,765 @@ /* sha1.c - SHA1 hash function * Copyright (C) 1998, 2001, 2002, 2003, 2008 Free Software Foundation, Inc. * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . */ /* Test vectors: * * "abc" * A999 3E36 4706 816A BA3E 2571 7850 C26C 9CD0 D89D * * "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq" * 8498 3E44 1C3B D26E BAAE 4AA1 F951 29E5 E546 70F1 */ #include #include #include #include #ifdef HAVE_STDINT_H # include #endif #include "g10lib.h" #include "bithelp.h" #include "bufhelp.h" #include "cipher.h" #include "sha1.h" /* USE_SSSE3 indicates whether to compile with Intel SSSE3 code. */ #undef USE_SSSE3 #if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_SSSE3) && \ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) # define USE_SSSE3 1 #endif /* USE_AVX indicates whether to compile with Intel AVX code. */ #undef USE_AVX #if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_AVX) && \ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) # define USE_AVX 1 #endif /* USE_BMI2 indicates whether to compile with Intel AVX/BMI2 code. */ #undef USE_BMI2 #if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_AVX) && \ defined(HAVE_GCC_INLINE_ASM_BMI2) && \ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) # define USE_BMI2 1 #endif /* USE_AVX2 indicates whether to compile with Intel AVX2/BMI2 code. */ #undef USE_AVX2 #if defined(USE_BMI2) && defined(HAVE_GCC_INLINE_ASM_AVX2) # define USE_AVX2 1 #endif /* USE_SHAEXT indicates whether to compile with Intel SHA Extension code. */ #undef USE_SHAEXT #if defined(HAVE_GCC_INLINE_ASM_SHAEXT) && \ defined(HAVE_GCC_INLINE_ASM_SSE41) && \ defined(ENABLE_SHAEXT_SUPPORT) # define USE_SHAEXT 1 #endif /* USE_NEON indicates whether to enable ARM NEON assembly code. 
*/ #undef USE_NEON #ifdef ENABLE_NEON_SUPPORT # if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) \ && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) \ && defined(HAVE_GCC_INLINE_ASM_NEON) # define USE_NEON 1 # endif #endif /* USE_ARM_CE indicates whether to enable ARMv8 Crypto Extension assembly * code. */ #undef USE_ARM_CE #ifdef ENABLE_ARM_CRYPTO_SUPPORT # if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) \ && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) \ && defined(HAVE_GCC_INLINE_ASM_AARCH32_CRYPTO) # define USE_ARM_CE 1 # elif defined(__AARCH64EL__) \ && defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) \ && defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO) # define USE_ARM_CE 1 # endif #endif + /* A macro to test whether P is properly aligned for an u32 type. Note that config.h provides a suitable replacement for uintptr_t if it does not exist in stdint.h. */ /* #if __GNUC__ >= 2 */ /* # define U32_ALIGNED_P(p) (!(((uintptr_t)p) % __alignof__ (u32))) */ /* #else */ /* # define U32_ALIGNED_P(p) (!(((uintptr_t)p) % sizeof (u32))) */ /* #endif */ /* Assembly implementations use SystemV ABI, ABI conversion and additional * stack to store XMM6-XMM15 needed on Win64. */ #undef ASM_FUNC_ABI #undef ASM_EXTRA_STACK #if defined(USE_SSSE3) || defined(USE_AVX) || defined(USE_BMI2) || \ defined(USE_SHAEXT) # ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS # define ASM_FUNC_ABI __attribute__((sysv_abi)) # define ASM_EXTRA_STACK (10 * 16 + sizeof(void *) * 4) # else # define ASM_FUNC_ABI # define ASM_EXTRA_STACK 0 # endif #endif #ifdef USE_SSSE3 unsigned int _gcry_sha1_transform_amd64_ssse3 (void *state, const unsigned char *data, size_t nblks) ASM_FUNC_ABI; static unsigned int do_sha1_transform_amd64_ssse3 (void *ctx, const unsigned char *data, size_t nblks) { SHA1_CONTEXT *hd = ctx; return _gcry_sha1_transform_amd64_ssse3 (&hd->h0, data, nblks) + ASM_EXTRA_STACK; } #endif #ifdef USE_AVX unsigned int _gcry_sha1_transform_amd64_avx (void *state, const unsigned char *data, size_t nblks) ASM_FUNC_ABI; static unsigned int do_sha1_transform_amd64_avx (void *ctx, const unsigned char *data, size_t nblks) { SHA1_CONTEXT *hd = ctx; return _gcry_sha1_transform_amd64_avx (&hd->h0, data, nblks) + ASM_EXTRA_STACK; } #endif #ifdef USE_BMI2 unsigned int _gcry_sha1_transform_amd64_avx_bmi2 (void *state, const unsigned char *data, size_t nblks) ASM_FUNC_ABI; static unsigned int do_sha1_transform_amd64_avx_bmi2 (void *ctx, const unsigned char *data, size_t nblks) { SHA1_CONTEXT *hd = ctx; return _gcry_sha1_transform_amd64_avx_bmi2 (&hd->h0, data, nblks) + ASM_EXTRA_STACK; } #ifdef USE_AVX2 unsigned int _gcry_sha1_transform_amd64_avx2_bmi2 (void *state, const unsigned char *data, size_t nblks) ASM_FUNC_ABI; static unsigned int do_sha1_transform_amd64_avx2_bmi2 (void *ctx, const unsigned char *data, size_t nblks) { SHA1_CONTEXT *hd = ctx; /* AVX2/BMI2 function only handles pair of blocks so nblks needs to be * multiple of 2 and function does not handle zero nblks. Use AVX/BMI2 * code to handle these cases. 
*/ if (nblks <= 1) return do_sha1_transform_amd64_avx_bmi2 (ctx, data, nblks); if (nblks & 1) { (void)_gcry_sha1_transform_amd64_avx_bmi2 (&hd->h0, data, 1); nblks--; data += 64; } return _gcry_sha1_transform_amd64_avx2_bmi2 (&hd->h0, data, nblks) + ASM_EXTRA_STACK; } #endif /* USE_AVX2 */ #endif /* USE_BMI2 */ #ifdef USE_SHAEXT /* Does not need ASM_FUNC_ABI */ unsigned int _gcry_sha1_transform_intel_shaext (void *state, const unsigned char *data, size_t nblks); static unsigned int do_sha1_transform_intel_shaext (void *ctx, const unsigned char *data, size_t nblks) { SHA1_CONTEXT *hd = ctx; return _gcry_sha1_transform_intel_shaext (&hd->h0, data, nblks); } #endif #ifdef USE_NEON unsigned int _gcry_sha1_transform_armv7_neon (void *state, const unsigned char *data, size_t nblks); static unsigned int do_sha1_transform_armv7_neon (void *ctx, const unsigned char *data, size_t nblks) { SHA1_CONTEXT *hd = ctx; return _gcry_sha1_transform_armv7_neon (&hd->h0, data, nblks); } #endif #ifdef USE_ARM_CE unsigned int _gcry_sha1_transform_armv8_ce (void *state, const unsigned char *data, size_t nblks); static unsigned int do_sha1_transform_armv8_ce (void *ctx, const unsigned char *data, size_t nblks) { SHA1_CONTEXT *hd = ctx; return _gcry_sha1_transform_armv8_ce (&hd->h0, data, nblks); } #endif +#ifdef SHA1_USE_S390X_CRYPTO +#include "asm-inline-s390x.h" + +static unsigned int +do_sha1_transform_s390x (void *ctx, const unsigned char *data, size_t nblks) +{ + SHA1_CONTEXT *hd = ctx; + + kimd_execute (KMID_FUNCTION_SHA1, &hd->h0, data, nblks * 64); + return 0; +} + +static unsigned int +do_sha1_final_s390x (void *ctx, const unsigned char *data, size_t datalen, + u32 len_msb, u32 len_lsb) +{ + SHA1_CONTEXT *hd = ctx; + + /* Make sure that 'final_len' is positioned at correct offset relative + * to 'h0'. This is because we are passing 'h0' pointer as start of + * parameter block to 'klmd' instruction. */ + + gcry_assert (offsetof (SHA1_CONTEXT, final_len_msb) + - offsetof (SHA1_CONTEXT, h0) == 5 * sizeof(u32)); + gcry_assert (offsetof (SHA1_CONTEXT, final_len_lsb) + - offsetof (SHA1_CONTEXT, final_len_msb) == 1 * sizeof(u32)); + + hd->final_len_msb = len_msb; + hd->final_len_lsb = len_lsb; + + klmd_execute (KMID_FUNCTION_SHA1, &hd->h0, data, datalen); + return 0; +} +#endif + static unsigned int do_transform_generic (void *c, const unsigned char *data, size_t nblks); static void sha1_init (void *context, unsigned int flags) { SHA1_CONTEXT *hd = context; unsigned int features = _gcry_get_hw_features (); (void)flags; hd->h0 = 0x67452301; hd->h1 = 0xefcdab89; hd->h2 = 0x98badcfe; hd->h3 = 0x10325476; hd->h4 = 0xc3d2e1f0; hd->bctx.nblocks = 0; hd->bctx.nblocks_high = 0; hd->bctx.count = 0; hd->bctx.blocksize_shift = _gcry_ctz(64); /* Order of feature checks is important here; last match will be * selected. Keep slower implementations at the top and faster at * the bottom. */ hd->bctx.bwrite = do_transform_generic; #ifdef USE_SSSE3 if ((features & HWF_INTEL_SSSE3) != 0) hd->bctx.bwrite = do_sha1_transform_amd64_ssse3; #endif #ifdef USE_AVX /* AVX implementation uses SHLD which is known to be slow on non-Intel CPUs. * Therefore use this implementation on Intel CPUs only. 
*/ if ((features & HWF_INTEL_AVX) && (features & HWF_INTEL_FAST_SHLD)) hd->bctx.bwrite = do_sha1_transform_amd64_avx; #endif #ifdef USE_BMI2 if ((features & HWF_INTEL_AVX) && (features & HWF_INTEL_BMI2)) hd->bctx.bwrite = do_sha1_transform_amd64_avx_bmi2; #endif #ifdef USE_AVX2 if ((features & HWF_INTEL_AVX2) && (features & HWF_INTEL_AVX) && (features & HWF_INTEL_BMI2)) hd->bctx.bwrite = do_sha1_transform_amd64_avx2_bmi2; #endif #ifdef USE_SHAEXT if ((features & HWF_INTEL_SHAEXT) && (features & HWF_INTEL_SSE4_1)) hd->bctx.bwrite = do_sha1_transform_intel_shaext; #endif #ifdef USE_NEON if ((features & HWF_ARM_NEON) != 0) hd->bctx.bwrite = do_sha1_transform_armv7_neon; #endif #ifdef USE_ARM_CE if ((features & HWF_ARM_SHA1) != 0) hd->bctx.bwrite = do_sha1_transform_armv8_ce; #endif +#ifdef SHA1_USE_S390X_CRYPTO + hd->use_s390x_crypto = 0; + if ((features & HWF_S390X_MSA) != 0) + { + if ((kimd_query () & km_function_to_mask (KMID_FUNCTION_SHA1)) && + (klmd_query () & km_function_to_mask (KMID_FUNCTION_SHA1))) + { + hd->bctx.bwrite = do_sha1_transform_s390x; + hd->use_s390x_crypto = 1; + } + } +#endif (void)features; } /* * Initialize the context HD. This is used to prepare the use of * _gcry_sha1_mixblock. WARNING: This is a special purpose function * for exclusive use by random-csprng.c. */ void _gcry_sha1_mixblock_init (SHA1_CONTEXT *hd) { sha1_init (hd, 0); } /* Round function macros. */ #define K1 0x5A827999L #define K2 0x6ED9EBA1L #define K3 0x8F1BBCDCL #define K4 0xCA62C1D6L #define F1(x,y,z) ( z ^ ( x & ( y ^ z ) ) ) #define F2(x,y,z) ( x ^ y ^ z ) #define F3(x,y,z) ( ( x & y ) | ( z & ( x | y ) ) ) #define F4(x,y,z) ( x ^ y ^ z ) #define M(i) ( tm = x[ i &0x0f] \ ^ x[(i-14)&0x0f] \ ^ x[(i-8) &0x0f] \ ^ x[(i-3) &0x0f], \ (x[i&0x0f] = rol(tm, 1))) #define R(a,b,c,d,e,f,k,m) do { e += rol( a, 5 ) \ + f( b, c, d ) \ + k \ + m; \ b = rol( b, 30 ); \ } while(0) /* * Transform NBLOCKS of each 64 bytes (16 32-bit words) at DATA. */ static unsigned int do_transform_generic (void *ctx, const unsigned char *data, size_t nblks) { SHA1_CONTEXT *hd = ctx; do { const u32 *idata = (const void *)data; u32 a, b, c, d, e; /* Local copies of the chaining variables. */ u32 tm; /* Helper. */ u32 x[16]; /* The array we work on. */ #define I(i) (x[i] = buf_get_be32(idata + i)) /* Get the values of the chaining variables. */ a = hd->h0; b = hd->h1; c = hd->h2; d = hd->h3; e = hd->h4; /* Transform. 
*/ R( a, b, c, d, e, F1, K1, I( 0) ); R( e, a, b, c, d, F1, K1, I( 1) ); R( d, e, a, b, c, F1, K1, I( 2) ); R( c, d, e, a, b, F1, K1, I( 3) ); R( b, c, d, e, a, F1, K1, I( 4) ); R( a, b, c, d, e, F1, K1, I( 5) ); R( e, a, b, c, d, F1, K1, I( 6) ); R( d, e, a, b, c, F1, K1, I( 7) ); R( c, d, e, a, b, F1, K1, I( 8) ); R( b, c, d, e, a, F1, K1, I( 9) ); R( a, b, c, d, e, F1, K1, I(10) ); R( e, a, b, c, d, F1, K1, I(11) ); R( d, e, a, b, c, F1, K1, I(12) ); R( c, d, e, a, b, F1, K1, I(13) ); R( b, c, d, e, a, F1, K1, I(14) ); R( a, b, c, d, e, F1, K1, I(15) ); R( e, a, b, c, d, F1, K1, M(16) ); R( d, e, a, b, c, F1, K1, M(17) ); R( c, d, e, a, b, F1, K1, M(18) ); R( b, c, d, e, a, F1, K1, M(19) ); R( a, b, c, d, e, F2, K2, M(20) ); R( e, a, b, c, d, F2, K2, M(21) ); R( d, e, a, b, c, F2, K2, M(22) ); R( c, d, e, a, b, F2, K2, M(23) ); R( b, c, d, e, a, F2, K2, M(24) ); R( a, b, c, d, e, F2, K2, M(25) ); R( e, a, b, c, d, F2, K2, M(26) ); R( d, e, a, b, c, F2, K2, M(27) ); R( c, d, e, a, b, F2, K2, M(28) ); R( b, c, d, e, a, F2, K2, M(29) ); R( a, b, c, d, e, F2, K2, M(30) ); R( e, a, b, c, d, F2, K2, M(31) ); R( d, e, a, b, c, F2, K2, M(32) ); R( c, d, e, a, b, F2, K2, M(33) ); R( b, c, d, e, a, F2, K2, M(34) ); R( a, b, c, d, e, F2, K2, M(35) ); R( e, a, b, c, d, F2, K2, M(36) ); R( d, e, a, b, c, F2, K2, M(37) ); R( c, d, e, a, b, F2, K2, M(38) ); R( b, c, d, e, a, F2, K2, M(39) ); R( a, b, c, d, e, F3, K3, M(40) ); R( e, a, b, c, d, F3, K3, M(41) ); R( d, e, a, b, c, F3, K3, M(42) ); R( c, d, e, a, b, F3, K3, M(43) ); R( b, c, d, e, a, F3, K3, M(44) ); R( a, b, c, d, e, F3, K3, M(45) ); R( e, a, b, c, d, F3, K3, M(46) ); R( d, e, a, b, c, F3, K3, M(47) ); R( c, d, e, a, b, F3, K3, M(48) ); R( b, c, d, e, a, F3, K3, M(49) ); R( a, b, c, d, e, F3, K3, M(50) ); R( e, a, b, c, d, F3, K3, M(51) ); R( d, e, a, b, c, F3, K3, M(52) ); R( c, d, e, a, b, F3, K3, M(53) ); R( b, c, d, e, a, F3, K3, M(54) ); R( a, b, c, d, e, F3, K3, M(55) ); R( e, a, b, c, d, F3, K3, M(56) ); R( d, e, a, b, c, F3, K3, M(57) ); R( c, d, e, a, b, F3, K3, M(58) ); R( b, c, d, e, a, F3, K3, M(59) ); R( a, b, c, d, e, F4, K4, M(60) ); R( e, a, b, c, d, F4, K4, M(61) ); R( d, e, a, b, c, F4, K4, M(62) ); R( c, d, e, a, b, F4, K4, M(63) ); R( b, c, d, e, a, F4, K4, M(64) ); R( a, b, c, d, e, F4, K4, M(65) ); R( e, a, b, c, d, F4, K4, M(66) ); R( d, e, a, b, c, F4, K4, M(67) ); R( c, d, e, a, b, F4, K4, M(68) ); R( b, c, d, e, a, F4, K4, M(69) ); R( a, b, c, d, e, F4, K4, M(70) ); R( e, a, b, c, d, F4, K4, M(71) ); R( d, e, a, b, c, F4, K4, M(72) ); R( c, d, e, a, b, F4, K4, M(73) ); R( b, c, d, e, a, F4, K4, M(74) ); R( a, b, c, d, e, F4, K4, M(75) ); R( e, a, b, c, d, F4, K4, M(76) ); R( d, e, a, b, c, F4, K4, M(77) ); R( c, d, e, a, b, F4, K4, M(78) ); R( b, c, d, e, a, F4, K4, M(79) ); /* Update the chaining variables. */ hd->h0 += a; hd->h1 += b; hd->h2 += c; hd->h3 += d; hd->h4 += e; data += 64; } while (--nblks); return 88+4*sizeof(void*); } /* * Apply the SHA-1 transform function on the buffer BLOCKOF64BYTE * which must have a length 64 bytes. BLOCKOF64BYTE must be 32-bit * aligned. Updates the 20 bytes in BLOCKOF64BYTE with its mixed * content. Returns the number of bytes which should be burned on the * stack. You need to use _gcry_sha1_mixblock_init to initialize the * context. * WARNING: This is a special purpose function for exclusive use by * random-csprng.c. 
*/ unsigned int _gcry_sha1_mixblock (SHA1_CONTEXT *hd, void *blockof64byte) { u32 *p = blockof64byte; unsigned int nburn; nburn = (*hd->bctx.bwrite) (hd, blockof64byte, 1); p[0] = hd->h0; p[1] = hd->h1; p[2] = hd->h2; p[3] = hd->h3; p[4] = hd->h4; return nburn; } /* The routine final terminates the computation and * returns the digest. * The handle is prepared for a new cycle, but adding bytes to the * handle will the destroy the returned buffer. * Returns: 20 bytes representing the digest. */ static void sha1_final(void *context) { SHA1_CONTEXT *hd = context; u32 t, th, msb, lsb; unsigned char *p; unsigned int burn; t = hd->bctx.nblocks; if (sizeof t == sizeof hd->bctx.nblocks) th = hd->bctx.nblocks_high; else th = hd->bctx.nblocks >> 32; /* multiply by 64 to make a byte count */ lsb = t << 6; msb = (th << 6) | (t >> 26); /* add the count */ t = lsb; if( (lsb += hd->bctx.count) < t ) msb++; /* multiply by 8 to make a bit count */ t = lsb; lsb <<= 3; msb <<= 3; msb |= t >> 29; - if (hd->bctx.count < 56) /* enough room */ + if (0) + { } +#ifdef SHA1_USE_S390X_CRYPTO + else if (hd->use_s390x_crypto) + { + burn = do_sha1_final_s390x (hd, hd->bctx.buf, hd->bctx.count, msb, lsb); + } +#endif + else if (hd->bctx.count < 56) /* enough room */ { hd->bctx.buf[hd->bctx.count++] = 0x80; /* pad */ if (hd->bctx.count < 56) memset (&hd->bctx.buf[hd->bctx.count], 0, 56 - hd->bctx.count); hd->bctx.count = 56; /* append the 64 bit count */ buf_put_be32(hd->bctx.buf + 56, msb); buf_put_be32(hd->bctx.buf + 60, lsb); burn = (*hd->bctx.bwrite) ( hd, hd->bctx.buf, 1 ); } else /* need one extra block */ { hd->bctx.buf[hd->bctx.count++] = 0x80; /* pad character */ /* fill pad and next block with zeroes */ memset (&hd->bctx.buf[hd->bctx.count], 0, 64 - hd->bctx.count + 56); hd->bctx.count = 64 + 56; /* append the 64 bit count */ buf_put_be32(hd->bctx.buf + 64 + 56, msb); buf_put_be32(hd->bctx.buf + 64 + 60, lsb); burn = (*hd->bctx.bwrite) ( hd, hd->bctx.buf, 2 ); } p = hd->bctx.buf; #define X(a) do { buf_put_be32(p, hd->h##a); p += 4; } while(0) X(0); X(1); X(2); X(3); X(4); #undef X _gcry_burn_stack (burn); } static unsigned char * sha1_read( void *context ) { SHA1_CONTEXT *hd = context; return hd->bctx.buf; } /**************** * Shortcut functions which puts the hash value of the supplied buffer * into outbuf which must have a size of 20 bytes. */ void _gcry_sha1_hash_buffer (void *outbuf, const void *buffer, size_t length) { SHA1_CONTEXT hd; sha1_init (&hd, 0); _gcry_md_block_write (&hd, buffer, length); sha1_final (&hd); memcpy (outbuf, hd.bctx.buf, 20); } /* Variant of the above shortcut function using a multiple buffers. */ void _gcry_sha1_hash_buffers (void *outbuf, const gcry_buffer_t *iov, int iovcnt) { SHA1_CONTEXT hd; sha1_init (&hd, 0); for (;iovcnt > 0; iov++, iovcnt--) _gcry_md_block_write (&hd, (const char*)iov[0].data + iov[0].off, iov[0].len); sha1_final (&hd); memcpy (outbuf, hd.bctx.buf, 20); } /* Self-test section. 
*/ static gpg_err_code_t selftests_sha1 (int extended, selftest_report_func_t report) { const char *what; const char *errtxt; what = "short string"; errtxt = _gcry_hash_selftest_check_one (GCRY_MD_SHA1, 0, "abc", 3, "\xA9\x99\x3E\x36\x47\x06\x81\x6A\xBA\x3E" "\x25\x71\x78\x50\xC2\x6C\x9C\xD0\xD8\x9D", 20); if (errtxt) goto failed; if (extended) { what = "long string"; errtxt = _gcry_hash_selftest_check_one (GCRY_MD_SHA1, 0, "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq", 56, "\x84\x98\x3E\x44\x1C\x3B\xD2\x6E\xBA\xAE" "\x4A\xA1\xF9\x51\x29\xE5\xE5\x46\x70\xF1", 20); if (errtxt) goto failed; what = "one million \"a\""; errtxt = _gcry_hash_selftest_check_one (GCRY_MD_SHA1, 1, NULL, 0, "\x34\xAA\x97\x3C\xD4\xC4\xDA\xA4\xF6\x1E" "\xEB\x2B\xDB\xAD\x27\x31\x65\x34\x01\x6F", 20); if (errtxt) goto failed; } return 0; /* Succeeded. */ failed: if (report) report ("digest", GCRY_MD_SHA1, what, errtxt); return GPG_ERR_SELFTEST_FAILED; } /* Run a full self-test for ALGO and return 0 on success. */ static gpg_err_code_t run_selftests (int algo, int extended, selftest_report_func_t report) { gpg_err_code_t ec; switch (algo) { case GCRY_MD_SHA1: ec = selftests_sha1 (extended, report); break; default: ec = GPG_ERR_DIGEST_ALGO; break; } return ec; } static unsigned char asn[15] = /* Object ID is 1.3.14.3.2.26 */ { 0x30, 0x21, 0x30, 0x09, 0x06, 0x05, 0x2b, 0x0e, 0x03, 0x02, 0x1a, 0x05, 0x00, 0x04, 0x14 }; static gcry_md_oid_spec_t oid_spec_sha1[] = { /* iso.member-body.us.rsadsi.pkcs.pkcs-1.5 (sha1WithRSAEncryption) */ { "1.2.840.113549.1.1.5" }, /* iso.member-body.us.x9-57.x9cm.3 (dsaWithSha1)*/ { "1.2.840.10040.4.3" }, /* from NIST's OIW (sha1) */ { "1.3.14.3.2.26" }, /* from NIST OIW (sha-1WithRSAEncryption) */ { "1.3.14.3.2.29" }, /* iso.member-body.us.ansi-x9-62.signatures.ecdsa-with-sha1 */ { "1.2.840.10045.4.1" }, { NULL }, }; gcry_md_spec_t _gcry_digest_spec_sha1 = { GCRY_MD_SHA1, {0, 1}, "SHA1", asn, DIM (asn), oid_spec_sha1, 20, sha1_init, _gcry_md_block_write, sha1_final, sha1_read, NULL, _gcry_sha1_hash_buffer, _gcry_sha1_hash_buffers, sizeof (SHA1_CONTEXT), run_selftests }; diff --git a/cipher/sha1.h b/cipher/sha1.h index acf764ba..a3597658 100644 --- a/cipher/sha1.h +++ b/cipher/sha1.h @@ -1,35 +1,47 @@ /* sha1.h - SHA-1 context definition * Copyright (C) 1998, 2001, 2002, 2003, 2008 Free Software Foundation, Inc. * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . */ #ifndef GCRY_SHA1_H #define GCRY_SHA1_H #include "hash-common.h" + +/* SHA1_USE_S390X_CRYPTO indicates whether to enable zSeries code. */ +#undef SHA1_USE_S390X_CRYPTO +#if defined(HAVE_GCC_INLINE_ASM_S390X) +# define SHA1_USE_S390X_CRYPTO 1 +#endif /* SHA1_USE_S390X_CRYPTO */ + + /* We need this here for direct use by random-csprng.c. */ typedef struct { gcry_md_block_ctx_t bctx; u32 h0,h1,h2,h3,h4; +#ifdef SHA1_USE_S390X_CRYPTO + u32 final_len_msb, final_len_lsb; /* needs to be right after h4. 
*/
+  int use_s390x_crypto;
+#endif
} SHA1_CONTEXT;

void _gcry_sha1_mixblock_init (SHA1_CONTEXT *hd);
unsigned int _gcry_sha1_mixblock (SHA1_CONTEXT *hd, void *blockof64byte);

#endif /*GCRY_SHA1_H*/
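
Note on the KIMD/KLMD usage added by this patch: KIMD digests only complete 64-byte blocks, while KLMD pads the final partial block and mixes in the total message bit length that sha1.c stores in final_len_msb/final_len_lsb directly after h4. Below is a minimal sketch of driving the two wrappers for a one-shot SHA-1, assuming it is compiled inside the libgcrypt tree on an s390x target with the MSA facility; sha1_cpacf() and struct sha1_cpacf_param_s are illustrative names, not part of the patch.

#include <config.h>
#include <stddef.h>
#include <string.h>
#include "g10lib.h"
#include "asm-inline-s390x.h"

struct sha1_cpacf_param_s
{
  u32 h[5];       /* ICV H0..H4, updated in place by KIMD/KLMD.     */
  u32 mbl_msb;    /* Total message bit length; consumed by KLMD     */
  u32 mbl_lsb;    /* together with the implicit final padding.      */
};

static int
sha1_cpacf (byte *digest, const void *msg, size_t len)
{
  struct sha1_cpacf_param_s param =
    {
      { 0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476, 0xc3d2e1f0 }, 0, 0
    };
  const byte *p = msg;
  size_t full = len & ~(size_t)63;   /* Whole 64-byte blocks.  */

  /* Both instructions must advertise SHA-1 before they may be used.  */
  if (!(kimd_query () & km_function_to_mask (KMID_FUNCTION_SHA1))
      || !(klmd_query () & km_function_to_mask (KMID_FUNCTION_SHA1)))
    return -1;

  /* KIMD digests the complete blocks; KLMD pads the tail and folds in
   * the 64-bit message bit length stored right after H4, which is the
   * layout do_sha1_final_s390x() asserts for SHA1_CONTEXT.  */
  kimd_execute (KMID_FUNCTION_SHA1, &param, p, full);
  param.mbl_msb = (u32)(((u64)len * 8) >> 32);
  param.mbl_lsb = (u32)((u64)len * 8);
  klmd_execute (KMID_FUNCTION_SHA1, &param, p + full, len - full);

  /* z/Architecture is big-endian, so H0..H4 already lie in digest order.  */
  memcpy (digest, param.h, 20);
  return 0;
}

The gcry_assert checks in do_sha1_final_s390x exist precisely because KLMD reads the bit-length words at a fixed offset behind H0..H4 in this parameter block layout.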
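
aes_s390x_ctr128_enc in this patch drives CTR mode through the KMA (GCM) instruction, whose counter handling only advances the low 32 bits of the counter block, so the loop cuts each request at every 32-bit rollover and lets software carry into the upper 96 bits with cipher_block_add. A portable sketch of just that splitting rule follows; process_chunk() stands in for one kma_execute() call and is an illustrative assumption, not part of the patch.

#include <stddef.h>
#include <stdint.h>

/* Split 'nblocks' of CTR work so that no chunk crosses a 32-bit rollover
 * of the low counter word; process_chunk() stands in for one KMA call. */
static void
ctr32_split (uint32_t ctr_low32, size_t nblocks,
             void (*process_chunk) (uint32_t start_low32, size_t nblks))
{
  while (nblocks)
    {
      /* Blocks remaining until the low 32-bit word wraps back to zero. */
      uint64_t to_overflow = (uint64_t)0xFFFFFFFFu + 1 - ctr_low32;
      size_t ncurr = nblocks > to_overflow ? (size_t)to_overflow : nblocks;

      process_chunk (ctr_low32, ncurr);

      /* Software updates the full 128-bit counter (cipher_block_add in
       * the patch); on a wrap the carry reaches the upper 96 bits, which
       * the per-chunk worker never touches. */
      ctr_low32 += (uint32_t)ncurr;
      nblocks -= ncurr;
    }
}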
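
The comment in aes_s390x_cfb128_dec explains why decryption bypasses KMF: every CFB keystream block is E_k of the previous ciphertext block (or of the IV), so all keystream blocks can be produced in one parallel ECB pass with KM and then XORed into the ciphertext. A portable sketch of that transformation follows; block_encrypt_ecb() is a stand-in for the bulk KM call and is an assumption for illustration, not part of the patch.

#include <stddef.h>
#include <string.h>

#define BLK 16

/* Stand-in for the parallel ECB pass done with the KM instruction. */
extern void block_encrypt_ecb (const void *key, unsigned char *dst,
                               const unsigned char *src, size_t nbytes);

static void
cfb128_dec_via_ecb (const void *key, unsigned char iv[BLK],
                    unsigned char *out, const unsigned char *in,
                    size_t nblocks, unsigned char *tmp /* nblocks * BLK */)
{
  size_t i;

  if (!nblocks)
    return;

  /* Keystream inputs: tmp = [ IV, C_0, ..., C_(n-2) ]; the last
   * ciphertext block becomes the IV for the next call. */
  memcpy (tmp, iv, BLK);
  memcpy (tmp + BLK, in, (nblocks - 1) * BLK);
  memcpy (iv, in + (nblocks - 1) * BLK, BLK);

  /* All keystream blocks are independent, so one bulk ECB pass suffices;
   * this is what makes CFB decryption parallelizable. */
  block_encrypt_ecb (key, tmp, tmp, nblocks * BLK);

  /* P_i = C_i xor E_k(C_(i-1)), with C_(-1) = IV. */
  for (i = 0; i < nblocks * BLK; i++)
    out[i] = in[i] ^ tmp[i];
}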