diff --git a/cipher/sha1.c b/cipher/sha1.c index e50262ff..76c486c7 100644 --- a/cipher/sha1.c +++ b/cipher/sha1.c @@ -1,671 +1,664 @@ /* sha1.c - SHA1 hash function * Copyright (C) 1998, 2001, 2002, 2003, 2008 Free Software Foundation, Inc. * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . */ /* Test vectors: * * "abc" * A999 3E36 4706 816A BA3E 2571 7850 C26C 9CD0 D89D * * "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq" * 8498 3E44 1C3B D26E BAAE 4AA1 F951 29E5 E546 70F1 */ #include #include #include #include #ifdef HAVE_STDINT_H # include #endif #include "g10lib.h" #include "bithelp.h" #include "bufhelp.h" #include "cipher.h" #include "sha1.h" /* USE_SSSE3 indicates whether to compile with Intel SSSE3 code. */ #undef USE_SSSE3 #if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_SSSE3) && \ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) # define USE_SSSE3 1 #endif /* USE_AVX indicates whether to compile with Intel AVX code. */ #undef USE_AVX #if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_AVX) && \ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) # define USE_AVX 1 #endif /* USE_BMI2 indicates whether to compile with Intel AVX/BMI2 code. */ #undef USE_BMI2 #if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_AVX) && \ defined(HAVE_GCC_INLINE_ASM_BMI2) && \ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) # define USE_BMI2 1 #endif /* USE_SHAEXT indicates whether to compile with Intel SHA Extension code. */ #undef USE_SHAEXT #if defined(HAVE_GCC_INLINE_ASM_SHAEXT) && \ defined(HAVE_GCC_INLINE_ASM_SSE41) && \ defined(ENABLE_SHAEXT_SUPPORT) # define USE_SHAEXT 1 #endif /* USE_NEON indicates whether to enable ARM NEON assembly code. */ #undef USE_NEON #ifdef ENABLE_NEON_SUPPORT # if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) \ && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) \ && defined(HAVE_GCC_INLINE_ASM_NEON) # define USE_NEON 1 # endif #endif /* USE_ARM_CE indicates whether to enable ARMv8 Crypto Extension assembly * code. */ #undef USE_ARM_CE #ifdef ENABLE_ARM_CRYPTO_SUPPORT # if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) \ && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) \ && defined(HAVE_GCC_INLINE_ASM_AARCH32_CRYPTO) # define USE_ARM_CE 1 # elif defined(__AARCH64EL__) \ && defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) \ && defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO) # define USE_ARM_CE 1 # endif #endif /* A macro to test whether P is properly aligned for an u32 type. Note that config.h provides a suitable replacement for uintptr_t if it does not exist in stdint.h. 
*/ /* #if __GNUC__ >= 2 */ /* # define U32_ALIGNED_P(p) (!(((uintptr_t)p) % __alignof__ (u32))) */ /* #else */ /* # define U32_ALIGNED_P(p) (!(((uintptr_t)p) % sizeof (u32))) */ /* #endif */ + +/* Assembly implementations use SystemV ABI, ABI conversion and additional + * stack to store XMM6-XMM15 needed on Win64. */ +#undef ASM_FUNC_ABI +#undef ASM_EXTRA_STACK +#if defined(USE_SSSE3) || defined(USE_AVX) || defined(USE_BMI2) || \ + defined(USE_SHAEXT) +# ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS +# define ASM_FUNC_ABI __attribute__((sysv_abi)) +# define ASM_EXTRA_STACK (10 * 16 + sizeof(void *) * 4) +# else +# define ASM_FUNC_ABI +# define ASM_EXTRA_STACK 0 +# endif +#endif + + +#ifdef USE_SSSE3 +unsigned int +_gcry_sha1_transform_amd64_ssse3 (void *state, const unsigned char *data, + size_t nblks) ASM_FUNC_ABI; + +static unsigned int +do_sha1_transform_amd64_ssse3 (void *ctx, const unsigned char *data, + size_t nblks) +{ + SHA1_CONTEXT *hd = ctx; + return _gcry_sha1_transform_amd64_ssse3 (&hd->h0, data, nblks) + + ASM_EXTRA_STACK; +} +#endif + +#ifdef USE_AVX +unsigned int +_gcry_sha1_transform_amd64_avx (void *state, const unsigned char *data, + size_t nblks) ASM_FUNC_ABI; + +static unsigned int +do_sha1_transform_amd64_avx (void *ctx, const unsigned char *data, + size_t nblks) +{ + SHA1_CONTEXT *hd = ctx; + return _gcry_sha1_transform_amd64_avx (&hd->h0, data, nblks) + + ASM_EXTRA_STACK; +} +#endif + +#ifdef USE_BMI2 +unsigned int +_gcry_sha1_transform_amd64_avx_bmi2 (void *state, const unsigned char *data, + size_t nblks) ASM_FUNC_ABI; + +static unsigned int +do_sha1_transform_amd64_avx_bmi2 (void *ctx, const unsigned char *data, + size_t nblks) +{ + SHA1_CONTEXT *hd = ctx; + return _gcry_sha1_transform_amd64_avx_bmi2 (&hd->h0, data, nblks) + + ASM_EXTRA_STACK; +} +#endif + +#ifdef USE_SHAEXT +/* Does not need ASM_FUNC_ABI */ +unsigned int +_gcry_sha1_transform_intel_shaext (void *state, const unsigned char *data, + size_t nblks); + static unsigned int -transform (void *c, const unsigned char *data, size_t nblks); +do_sha1_transform_intel_shaext (void *ctx, const unsigned char *data, + size_t nblks) +{ + SHA1_CONTEXT *hd = ctx; + return _gcry_sha1_transform_intel_shaext (&hd->h0, data, nblks); +} +#endif + +#ifdef USE_NEON +unsigned int +_gcry_sha1_transform_armv7_neon (void *state, const unsigned char *data, + size_t nblks); + +static unsigned int +do_sha1_transform_armv7_neon (void *ctx, const unsigned char *data, + size_t nblks) +{ + SHA1_CONTEXT *hd = ctx; + return _gcry_sha1_transform_armv7_neon (&hd->h0, data, nblks); +} +#endif + +#ifdef USE_ARM_CE +unsigned int +_gcry_sha1_transform_armv8_ce (void *state, const unsigned char *data, + size_t nblks); + +static unsigned int +do_sha1_transform_armv8_ce (void *ctx, const unsigned char *data, + size_t nblks) +{ + SHA1_CONTEXT *hd = ctx; + return _gcry_sha1_transform_armv8_ce (&hd->h0, data, nblks); +} +#endif + + +static unsigned int +do_transform_generic (void *c, const unsigned char *data, size_t nblks); static void sha1_init (void *context, unsigned int flags) { SHA1_CONTEXT *hd = context; unsigned int features = _gcry_get_hw_features (); (void)flags; hd->h0 = 0x67452301; hd->h1 = 0xefcdab89; hd->h2 = 0x98badcfe; hd->h3 = 0x10325476; hd->h4 = 0xc3d2e1f0; hd->bctx.nblocks = 0; hd->bctx.nblocks_high = 0; hd->bctx.count = 0; hd->bctx.blocksize = 64; - hd->bctx.bwrite = transform; + /* Order of feature checks is important here; last match will be + * selected. Keep slower implementations at the top and faster at + * the bottom. 
*/ + hd->bctx.bwrite = do_transform_generic; #ifdef USE_SSSE3 - hd->use_ssse3 = (features & HWF_INTEL_SSSE3) != 0; + if ((features & HWF_INTEL_SSSE3) != 0) + hd->bctx.bwrite = do_sha1_transform_amd64_ssse3; #endif #ifdef USE_AVX /* AVX implementation uses SHLD which is known to be slow on non-Intel CPUs. * Therefore use this implementation on Intel CPUs only. */ - hd->use_avx = (features & HWF_INTEL_AVX) && (features & HWF_INTEL_FAST_SHLD); + if ((features & HWF_INTEL_AVX) && (features & HWF_INTEL_FAST_SHLD)) + hd->bctx.bwrite = do_sha1_transform_amd64_avx; #endif #ifdef USE_BMI2 - hd->use_bmi2 = (features & HWF_INTEL_AVX) && (features & HWF_INTEL_BMI2); + if ((features & HWF_INTEL_AVX) && (features & HWF_INTEL_BMI2)) + hd->bctx.bwrite = do_sha1_transform_amd64_avx_bmi2; #endif #ifdef USE_SHAEXT - hd->use_shaext = (features & HWF_INTEL_SHAEXT) - && (features & HWF_INTEL_SSE4_1); + if ((features & HWF_INTEL_SHAEXT) && (features & HWF_INTEL_SSE4_1)) + hd->bctx.bwrite = do_sha1_transform_intel_shaext; #endif #ifdef USE_NEON - hd->use_neon = (features & HWF_ARM_NEON) != 0; + if ((features & HWF_ARM_NEON) != 0) + hd->bctx.bwrite = do_sha1_transform_armv7_neon; #endif #ifdef USE_ARM_CE - hd->use_arm_ce = (features & HWF_ARM_SHA1) != 0; + if ((features & HWF_ARM_SHA1) != 0) + hd->bctx.bwrite = do_sha1_transform_armv8_ce; #endif + (void)features; } /* * Initialize the context HD. This is used to prepare the use of * _gcry_sha1_mixblock. WARNING: This is a special purpose function * for exclusive use by random-csprng.c. */ void _gcry_sha1_mixblock_init (SHA1_CONTEXT *hd) { sha1_init (hd, 0); } /* Round function macros. */ #define K1 0x5A827999L #define K2 0x6ED9EBA1L #define K3 0x8F1BBCDCL #define K4 0xCA62C1D6L #define F1(x,y,z) ( z ^ ( x & ( y ^ z ) ) ) #define F2(x,y,z) ( x ^ y ^ z ) #define F3(x,y,z) ( ( x & y ) | ( z & ( x | y ) ) ) #define F4(x,y,z) ( x ^ y ^ z ) #define M(i) ( tm = x[ i &0x0f] \ ^ x[(i-14)&0x0f] \ ^ x[(i-8) &0x0f] \ ^ x[(i-3) &0x0f], \ (x[i&0x0f] = rol(tm, 1))) #define R(a,b,c,d,e,f,k,m) do { e += rol( a, 5 ) \ + f( b, c, d ) \ + k \ + m; \ b = rol( b, 30 ); \ } while(0) - -#ifdef USE_NEON -unsigned int -_gcry_sha1_transform_armv7_neon (void *state, const unsigned char *data, - size_t nblks); -#endif - -#ifdef USE_ARM_CE -unsigned int -_gcry_sha1_transform_armv8_ce (void *state, const unsigned char *data, - size_t nblks); -#endif - /* * Transform NBLOCKS of each 64 bytes (16 32-bit words) at DATA. */ static unsigned int -transform_blk (void *ctx, const unsigned char *data) +do_transform_generic (void *ctx, const unsigned char *data, size_t nblks) { SHA1_CONTEXT *hd = ctx; - const u32 *idata = (const void *)data; - register u32 a, b, c, d, e; /* Local copies of the chaining variables. */ - register u32 tm; /* Helper. */ - u32 x[16]; /* The array we work on. */ + + do + { + const u32 *idata = (const void *)data; + u32 a, b, c, d, e; /* Local copies of the chaining variables. */ + u32 tm; /* Helper. */ + u32 x[16]; /* The array we work on. */ #define I(i) (x[i] = buf_get_be32(idata + i)) /* Get the values of the chaining variables. */ a = hd->h0; b = hd->h1; c = hd->h2; d = hd->h3; e = hd->h4; /* Transform. 
*/ R( a, b, c, d, e, F1, K1, I( 0) ); R( e, a, b, c, d, F1, K1, I( 1) ); R( d, e, a, b, c, F1, K1, I( 2) ); R( c, d, e, a, b, F1, K1, I( 3) ); R( b, c, d, e, a, F1, K1, I( 4) ); R( a, b, c, d, e, F1, K1, I( 5) ); R( e, a, b, c, d, F1, K1, I( 6) ); R( d, e, a, b, c, F1, K1, I( 7) ); R( c, d, e, a, b, F1, K1, I( 8) ); R( b, c, d, e, a, F1, K1, I( 9) ); R( a, b, c, d, e, F1, K1, I(10) ); R( e, a, b, c, d, F1, K1, I(11) ); R( d, e, a, b, c, F1, K1, I(12) ); R( c, d, e, a, b, F1, K1, I(13) ); R( b, c, d, e, a, F1, K1, I(14) ); R( a, b, c, d, e, F1, K1, I(15) ); R( e, a, b, c, d, F1, K1, M(16) ); R( d, e, a, b, c, F1, K1, M(17) ); R( c, d, e, a, b, F1, K1, M(18) ); R( b, c, d, e, a, F1, K1, M(19) ); R( a, b, c, d, e, F2, K2, M(20) ); R( e, a, b, c, d, F2, K2, M(21) ); R( d, e, a, b, c, F2, K2, M(22) ); R( c, d, e, a, b, F2, K2, M(23) ); R( b, c, d, e, a, F2, K2, M(24) ); R( a, b, c, d, e, F2, K2, M(25) ); R( e, a, b, c, d, F2, K2, M(26) ); R( d, e, a, b, c, F2, K2, M(27) ); R( c, d, e, a, b, F2, K2, M(28) ); R( b, c, d, e, a, F2, K2, M(29) ); R( a, b, c, d, e, F2, K2, M(30) ); R( e, a, b, c, d, F2, K2, M(31) ); R( d, e, a, b, c, F2, K2, M(32) ); R( c, d, e, a, b, F2, K2, M(33) ); R( b, c, d, e, a, F2, K2, M(34) ); R( a, b, c, d, e, F2, K2, M(35) ); R( e, a, b, c, d, F2, K2, M(36) ); R( d, e, a, b, c, F2, K2, M(37) ); R( c, d, e, a, b, F2, K2, M(38) ); R( b, c, d, e, a, F2, K2, M(39) ); R( a, b, c, d, e, F3, K3, M(40) ); R( e, a, b, c, d, F3, K3, M(41) ); R( d, e, a, b, c, F3, K3, M(42) ); R( c, d, e, a, b, F3, K3, M(43) ); R( b, c, d, e, a, F3, K3, M(44) ); R( a, b, c, d, e, F3, K3, M(45) ); R( e, a, b, c, d, F3, K3, M(46) ); R( d, e, a, b, c, F3, K3, M(47) ); R( c, d, e, a, b, F3, K3, M(48) ); R( b, c, d, e, a, F3, K3, M(49) ); R( a, b, c, d, e, F3, K3, M(50) ); R( e, a, b, c, d, F3, K3, M(51) ); R( d, e, a, b, c, F3, K3, M(52) ); R( c, d, e, a, b, F3, K3, M(53) ); R( b, c, d, e, a, F3, K3, M(54) ); R( a, b, c, d, e, F3, K3, M(55) ); R( e, a, b, c, d, F3, K3, M(56) ); R( d, e, a, b, c, F3, K3, M(57) ); R( c, d, e, a, b, F3, K3, M(58) ); R( b, c, d, e, a, F3, K3, M(59) ); R( a, b, c, d, e, F4, K4, M(60) ); R( e, a, b, c, d, F4, K4, M(61) ); R( d, e, a, b, c, F4, K4, M(62) ); R( c, d, e, a, b, F4, K4, M(63) ); R( b, c, d, e, a, F4, K4, M(64) ); R( a, b, c, d, e, F4, K4, M(65) ); R( e, a, b, c, d, F4, K4, M(66) ); R( d, e, a, b, c, F4, K4, M(67) ); R( c, d, e, a, b, F4, K4, M(68) ); R( b, c, d, e, a, F4, K4, M(69) ); R( a, b, c, d, e, F4, K4, M(70) ); R( e, a, b, c, d, F4, K4, M(71) ); R( d, e, a, b, c, F4, K4, M(72) ); R( c, d, e, a, b, F4, K4, M(73) ); R( b, c, d, e, a, F4, K4, M(74) ); R( a, b, c, d, e, F4, K4, M(75) ); R( e, a, b, c, d, F4, K4, M(76) ); R( d, e, a, b, c, F4, K4, M(77) ); R( c, d, e, a, b, F4, K4, M(78) ); R( b, c, d, e, a, F4, K4, M(79) ); /* Update the chaining variables. */ hd->h0 += a; hd->h1 += b; hd->h2 += c; hd->h3 += d; hd->h4 += e; - return /* burn_stack */ 88+4*sizeof(void*); -} - - -/* Assembly implementations use SystemV ABI, ABI conversion and additional - * stack to store XMM6-XMM15 needed on Win64. 
*/ -#undef ASM_FUNC_ABI -#undef ASM_EXTRA_STACK -#if defined(USE_SSSE3) || defined(USE_AVX) || defined(USE_BMI2) || \ - defined(USE_SHAEXT) -# ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS -# define ASM_FUNC_ABI __attribute__((sysv_abi)) -# define ASM_EXTRA_STACK (10 * 16) -# else -# define ASM_FUNC_ABI -# define ASM_EXTRA_STACK 0 -# endif -#endif - - -#ifdef USE_SSSE3 -unsigned int -_gcry_sha1_transform_amd64_ssse3 (void *state, const unsigned char *data, - size_t nblks) ASM_FUNC_ABI; -#endif - -#ifdef USE_AVX -unsigned int -_gcry_sha1_transform_amd64_avx (void *state, const unsigned char *data, - size_t nblks) ASM_FUNC_ABI; -#endif - -#ifdef USE_BMI2 -unsigned int -_gcry_sha1_transform_amd64_avx_bmi2 (void *state, const unsigned char *data, - size_t nblks) ASM_FUNC_ABI; -#endif - -#ifdef USE_SHAEXT -/* Does not need ASM_FUNC_ABI */ -unsigned int -_gcry_sha1_transform_intel_shaext (void *state, const unsigned char *data, - size_t nblks); -#endif - - -static unsigned int -transform (void *ctx, const unsigned char *data, size_t nblks) -{ - SHA1_CONTEXT *hd = ctx; - unsigned int burn; - -#ifdef USE_SHAEXT - if (hd->use_shaext) - { - burn = _gcry_sha1_transform_intel_shaext (&hd->h0, data, nblks); - burn += burn ? 4 * sizeof(void*) + ASM_EXTRA_STACK : 0; - return burn; - } -#endif -#ifdef USE_BMI2 - if (hd->use_bmi2) - { - burn = _gcry_sha1_transform_amd64_avx_bmi2 (&hd->h0, data, nblks); - burn += burn ? 4 * sizeof(void*) + ASM_EXTRA_STACK : 0; - return burn; - } -#endif -#ifdef USE_AVX - if (hd->use_avx) - { - burn = _gcry_sha1_transform_amd64_avx (&hd->h0, data, nblks); - burn += burn ? 4 * sizeof(void*) + ASM_EXTRA_STACK : 0; - return burn; - } -#endif -#ifdef USE_SSSE3 - if (hd->use_ssse3) - { - burn = _gcry_sha1_transform_amd64_ssse3 (&hd->h0, data, nblks); - burn += burn ? 4 * sizeof(void*) + ASM_EXTRA_STACK : 0; - return burn; - } -#endif -#ifdef USE_ARM_CE - if (hd->use_arm_ce) - { - burn = _gcry_sha1_transform_armv8_ce (&hd->h0, data, nblks); - burn += burn ? 4 * sizeof(void*) : 0; - return burn; - } -#endif -#ifdef USE_NEON - if (hd->use_neon) - { - burn = _gcry_sha1_transform_armv7_neon (&hd->h0, data, nblks); - burn += burn ? 4 * sizeof(void*) : 0; - return burn; - } -#endif - - do - { - burn = transform_blk (hd, data); data += 64; } while (--nblks); -#ifdef ASM_EXTRA_STACK - /* 'transform_blk' is typically inlined and XMM6-XMM15 are stored at - * the prologue of this function. Therefore need to add ASM_EXTRA_STACK to - * here too. - */ - burn += ASM_EXTRA_STACK; -#endif - - return burn; + return 88+4*sizeof(void*); } /* * Apply the SHA-1 transform function on the buffer BLOCKOF64BYTE * which must have a length 64 bytes. BLOCKOF64BYTE must be 32-bit * aligned. Updates the 20 bytes in BLOCKOF64BYTE with its mixed * content. Returns the number of bytes which should be burned on the * stack. You need to use _gcry_sha1_mixblock_init to initialize the * context. * WARNING: This is a special purpose function for exclusive use by * random-csprng.c. */ unsigned int _gcry_sha1_mixblock (SHA1_CONTEXT *hd, void *blockof64byte) { u32 *p = blockof64byte; unsigned int nburn; - nburn = transform (hd, blockof64byte, 1); + nburn = (*hd->bctx.bwrite) (hd, blockof64byte, 1); p[0] = hd->h0; p[1] = hd->h1; p[2] = hd->h2; p[3] = hd->h3; p[4] = hd->h4; return nburn; } /* The routine final terminates the computation and * returns the digest. * The handle is prepared for a new cycle, but adding bytes to the * handle will the destroy the returned buffer. * Returns: 20 bytes representing the digest. 
*/ static void sha1_final(void *context) { SHA1_CONTEXT *hd = context; u32 t, th, msb, lsb; unsigned char *p; unsigned int burn; _gcry_md_block_write (hd, NULL, 0); /* flush */; t = hd->bctx.nblocks; if (sizeof t == sizeof hd->bctx.nblocks) th = hd->bctx.nblocks_high; else th = hd->bctx.nblocks >> 32; /* multiply by 64 to make a byte count */ lsb = t << 6; msb = (th << 6) | (t >> 26); /* add the count */ t = lsb; if( (lsb += hd->bctx.count) < t ) msb++; /* multiply by 8 to make a bit count */ t = lsb; lsb <<= 3; msb <<= 3; msb |= t >> 29; if( hd->bctx.count < 56 ) /* enough room */ { hd->bctx.buf[hd->bctx.count++] = 0x80; /* pad */ while( hd->bctx.count < 56 ) hd->bctx.buf[hd->bctx.count++] = 0; /* pad */ } else /* need one extra block */ { hd->bctx.buf[hd->bctx.count++] = 0x80; /* pad character */ while( hd->bctx.count < 64 ) hd->bctx.buf[hd->bctx.count++] = 0; _gcry_md_block_write(hd, NULL, 0); /* flush */; memset(hd->bctx.buf, 0, 56 ); /* fill next block with zeroes */ } /* append the 64 bit count */ buf_put_be32(hd->bctx.buf + 56, msb); buf_put_be32(hd->bctx.buf + 60, lsb); - burn = transform( hd, hd->bctx.buf, 1 ); + burn = (*hd->bctx.bwrite) ( hd, hd->bctx.buf, 1 ); _gcry_burn_stack (burn); p = hd->bctx.buf; #define X(a) do { buf_put_be32(p, hd->h##a); p += 4; } while(0) X(0); X(1); X(2); X(3); X(4); #undef X } static unsigned char * sha1_read( void *context ) { SHA1_CONTEXT *hd = context; return hd->bctx.buf; } /**************** * Shortcut functions which puts the hash value of the supplied buffer * into outbuf which must have a size of 20 bytes. */ void _gcry_sha1_hash_buffer (void *outbuf, const void *buffer, size_t length) { SHA1_CONTEXT hd; sha1_init (&hd, 0); _gcry_md_block_write (&hd, buffer, length); sha1_final (&hd); memcpy (outbuf, hd.bctx.buf, 20); } /* Variant of the above shortcut function using a multiple buffers. */ void _gcry_sha1_hash_buffers (void *outbuf, const gcry_buffer_t *iov, int iovcnt) { SHA1_CONTEXT hd; sha1_init (&hd, 0); for (;iovcnt > 0; iov++, iovcnt--) _gcry_md_block_write (&hd, (const char*)iov[0].data + iov[0].off, iov[0].len); sha1_final (&hd); memcpy (outbuf, hd.bctx.buf, 20); } /* Self-test section. */ static gpg_err_code_t selftests_sha1 (int extended, selftest_report_func_t report) { const char *what; const char *errtxt; what = "short string"; errtxt = _gcry_hash_selftest_check_one (GCRY_MD_SHA1, 0, "abc", 3, "\xA9\x99\x3E\x36\x47\x06\x81\x6A\xBA\x3E" "\x25\x71\x78\x50\xC2\x6C\x9C\xD0\xD8\x9D", 20); if (errtxt) goto failed; if (extended) { what = "long string"; errtxt = _gcry_hash_selftest_check_one (GCRY_MD_SHA1, 0, "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq", 56, "\x84\x98\x3E\x44\x1C\x3B\xD2\x6E\xBA\xAE" "\x4A\xA1\xF9\x51\x29\xE5\xE5\x46\x70\xF1", 20); if (errtxt) goto failed; what = "one million \"a\""; errtxt = _gcry_hash_selftest_check_one (GCRY_MD_SHA1, 1, NULL, 0, "\x34\xAA\x97\x3C\xD4\xC4\xDA\xA4\xF6\x1E" "\xEB\x2B\xDB\xAD\x27\x31\x65\x34\x01\x6F", 20); if (errtxt) goto failed; } return 0; /* Succeeded. */ failed: if (report) report ("digest", GCRY_MD_SHA1, what, errtxt); return GPG_ERR_SELFTEST_FAILED; } /* Run a full self-test for ALGO and return 0 on success. 
*/ static gpg_err_code_t run_selftests (int algo, int extended, selftest_report_func_t report) { gpg_err_code_t ec; switch (algo) { case GCRY_MD_SHA1: ec = selftests_sha1 (extended, report); break; default: ec = GPG_ERR_DIGEST_ALGO; break; } return ec; } static unsigned char asn[15] = /* Object ID is 1.3.14.3.2.26 */ { 0x30, 0x21, 0x30, 0x09, 0x06, 0x05, 0x2b, 0x0e, 0x03, 0x02, 0x1a, 0x05, 0x00, 0x04, 0x14 }; static gcry_md_oid_spec_t oid_spec_sha1[] = { /* iso.member-body.us.rsadsi.pkcs.pkcs-1.5 (sha1WithRSAEncryption) */ { "1.2.840.113549.1.1.5" }, /* iso.member-body.us.x9-57.x9cm.3 (dsaWithSha1)*/ { "1.2.840.10040.4.3" }, /* from NIST's OIW (sha1) */ { "1.3.14.3.2.26" }, /* from NIST OIW (sha-1WithRSAEncryption) */ { "1.3.14.3.2.29" }, /* iso.member-body.us.ansi-x9-62.signatures.ecdsa-with-sha1 */ { "1.2.840.10045.4.1" }, { NULL }, }; gcry_md_spec_t _gcry_digest_spec_sha1 = { GCRY_MD_SHA1, {0, 1}, "SHA1", asn, DIM (asn), oid_spec_sha1, 20, sha1_init, _gcry_md_block_write, sha1_final, sha1_read, NULL, _gcry_sha1_hash_buffer, _gcry_sha1_hash_buffers, sizeof (SHA1_CONTEXT), run_selftests }; diff --git a/cipher/sha1.h b/cipher/sha1.h index 93ce79b5..acf764ba 100644 --- a/cipher/sha1.h +++ b/cipher/sha1.h @@ -1,41 +1,35 @@ /* sha1.h - SHA-1 context definition * Copyright (C) 1998, 2001, 2002, 2003, 2008 Free Software Foundation, Inc. * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . */ #ifndef GCRY_SHA1_H #define GCRY_SHA1_H #include "hash-common.h" /* We need this here for direct use by random-csprng.c. */ typedef struct { gcry_md_block_ctx_t bctx; u32 h0,h1,h2,h3,h4; - unsigned int use_ssse3:1; - unsigned int use_avx:1; - unsigned int use_bmi2:1; - unsigned int use_shaext:1; - unsigned int use_neon:1; - unsigned int use_arm_ce:1; } SHA1_CONTEXT; void _gcry_sha1_mixblock_init (SHA1_CONTEXT *hd); unsigned int _gcry_sha1_mixblock (SHA1_CONTEXT *hd, void *blockof64byte); #endif /*GCRY_SHA1_H*/ diff --git a/cipher/sha256.c b/cipher/sha256.c index 06959707..e82a9d90 100644 --- a/cipher/sha256.c +++ b/cipher/sha256.c @@ -1,788 +1,769 @@ /* sha256.c - SHA256 hash function * Copyright (C) 2003, 2006, 2008, 2009 Free Software Foundation, Inc. * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . 
*/ /* Test vectors: "abc" SHA224: 23097d22 3405d822 8642a477 bda255b3 2aadbce4 bda0b3f7 e36c9da7 SHA256: ba7816bf 8f01cfea 414140de 5dae2223 b00361a3 96177a9c b410ff61 f20015ad "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq" SHA224: 75388b16 512776cc 5dba5da1 fd890150 b0c6455c b4f58b19 52522525 SHA256: 248d6a61 d20638b8 e5c02693 0c3e6039 a33ce459 64ff2167 f6ecedd4 19db06c1 "a" one million times SHA224: 20794655 980c91d8 bbb4c1ea 97618a4b f03f4258 1948b2ee 4ee7ad67 SHA256: cdc76e5c 9914fb92 81a1c7e2 84d73e67 f1809a48 a497200e 046d39cc c7112cd0 */ #include #include #include #include #include "g10lib.h" #include "bithelp.h" #include "bufhelp.h" #include "cipher.h" #include "hash-common.h" /* USE_SSSE3 indicates whether to compile with Intel SSSE3 code. */ #undef USE_SSSE3 #if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_SSSE3) && \ defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) # define USE_SSSE3 1 #endif /* USE_AVX indicates whether to compile with Intel AVX code. */ #undef USE_AVX #if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_AVX) && \ defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) # define USE_AVX 1 #endif /* USE_AVX2 indicates whether to compile with Intel AVX2/BMI2 code. */ #undef USE_AVX2 #if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_AVX2) && \ defined(HAVE_GCC_INLINE_ASM_BMI2) && \ defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) # define USE_AVX2 1 #endif /* USE_SHAEXT indicates whether to compile with Intel SHA Extension code. */ #undef USE_SHAEXT #if defined(HAVE_GCC_INLINE_ASM_SHAEXT) && \ defined(HAVE_GCC_INLINE_ASM_SSE41) && \ defined(ENABLE_SHAEXT_SUPPORT) # define USE_SHAEXT 1 #endif /* USE_ARM_CE indicates whether to enable ARMv8 Crypto Extension assembly * code. */ #undef USE_ARM_CE #ifdef ENABLE_ARM_CRYPTO_SUPPORT # if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) \ && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) \ && defined(HAVE_GCC_INLINE_ASM_AARCH32_CRYPTO) # define USE_ARM_CE 1 # elif defined(__AARCH64EL__) \ && defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) \ && defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO) # define USE_ARM_CE 1 # endif #endif typedef struct { gcry_md_block_ctx_t bctx; u32 h0,h1,h2,h3,h4,h5,h6,h7; +} SHA256_CONTEXT; + + +/* Assembly implementations use SystemV ABI, ABI conversion and additional + * stack to store XMM6-XMM15 needed on Win64. 
*/ +#undef ASM_FUNC_ABI +#undef ASM_EXTRA_STACK +#if defined(USE_SSSE3) || defined(USE_AVX) || defined(USE_AVX2) || \ + defined(USE_SHAEXT) +# ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS +# define ASM_FUNC_ABI __attribute__((sysv_abi)) +# define ASM_EXTRA_STACK (10 * 16 + sizeof(void *) * 4) +# else +# define ASM_FUNC_ABI +# define ASM_EXTRA_STACK 0 +# endif +#endif + + #ifdef USE_SSSE3 - unsigned int use_ssse3:1; +unsigned int _gcry_sha256_transform_amd64_ssse3(const void *input_data, + u32 state[8], + size_t num_blks) ASM_FUNC_ABI; + +static unsigned int +do_sha256_transform_amd64_ssse3(void *ctx, const unsigned char *data, + size_t nblks) +{ + SHA256_CONTEXT *hd = ctx; + return _gcry_sha256_transform_amd64_ssse3 (data, &hd->h0, nblks) + + ASM_EXTRA_STACK; +} #endif + #ifdef USE_AVX - unsigned int use_avx:1; +unsigned int _gcry_sha256_transform_amd64_avx(const void *input_data, + u32 state[8], + size_t num_blks) ASM_FUNC_ABI; + +static unsigned int +do_sha256_transform_amd64_avx(void *ctx, const unsigned char *data, + size_t nblks) +{ + SHA256_CONTEXT *hd = ctx; + return _gcry_sha256_transform_amd64_avx (data, &hd->h0, nblks) + + ASM_EXTRA_STACK; +} #endif + #ifdef USE_AVX2 - unsigned int use_avx2:1; +unsigned int _gcry_sha256_transform_amd64_avx2(const void *input_data, + u32 state[8], + size_t num_blks) ASM_FUNC_ABI; + +static unsigned int +do_sha256_transform_amd64_avx2(void *ctx, const unsigned char *data, + size_t nblks) +{ + SHA256_CONTEXT *hd = ctx; + return _gcry_sha256_transform_amd64_avx2 (data, &hd->h0, nblks) + + ASM_EXTRA_STACK; +} #endif + #ifdef USE_SHAEXT - unsigned int use_shaext:1; +/* Does not need ASM_FUNC_ABI */ +unsigned int +_gcry_sha256_transform_intel_shaext(u32 state[8], + const unsigned char *input_data, + size_t num_blks); + +static unsigned int +do_sha256_transform_intel_shaext(void *ctx, const unsigned char *data, + size_t nblks) +{ + SHA256_CONTEXT *hd = ctx; + return _gcry_sha256_transform_intel_shaext (&hd->h0, data, nblks); +} #endif + #ifdef USE_ARM_CE - unsigned int use_arm_ce:1; +unsigned int _gcry_sha256_transform_armv8_ce(u32 state[8], + const void *input_data, + size_t num_blks); + +static unsigned int +do_sha256_transform_armv8_ce(void *ctx, const unsigned char *data, + size_t nblks) +{ + SHA256_CONTEXT *hd = ctx; + return _gcry_sha256_transform_armv8_ce (&hd->h0, data, nblks); +} #endif -} SHA256_CONTEXT; static unsigned int -transform (void *c, const unsigned char *data, size_t nblks); +do_transform_generic (void *ctx, const unsigned char *data, size_t nblks); static void sha256_init (void *context, unsigned int flags) { SHA256_CONTEXT *hd = context; unsigned int features = _gcry_get_hw_features (); (void)flags; hd->h0 = 0x6a09e667; hd->h1 = 0xbb67ae85; hd->h2 = 0x3c6ef372; hd->h3 = 0xa54ff53a; hd->h4 = 0x510e527f; hd->h5 = 0x9b05688c; hd->h6 = 0x1f83d9ab; hd->h7 = 0x5be0cd19; hd->bctx.nblocks = 0; hd->bctx.nblocks_high = 0; hd->bctx.count = 0; hd->bctx.blocksize = 64; - hd->bctx.bwrite = transform; + /* Order of feature checks is important here; last match will be + * selected. Keep slower implementations at the top and faster at + * the bottom. */ + hd->bctx.bwrite = do_transform_generic; #ifdef USE_SSSE3 - hd->use_ssse3 = (features & HWF_INTEL_SSSE3) != 0; + if ((features & HWF_INTEL_SSSE3) != 0) + hd->bctx.bwrite = do_sha256_transform_amd64_ssse3; #endif #ifdef USE_AVX /* AVX implementation uses SHLD which is known to be slow on non-Intel CPUs. * Therefore use this implementation on Intel CPUs only. 
*/ - hd->use_avx = (features & HWF_INTEL_AVX) && (features & HWF_INTEL_FAST_SHLD); + if ((features & HWF_INTEL_AVX) && (features & HWF_INTEL_FAST_SHLD)) + hd->bctx.bwrite = do_sha256_transform_amd64_avx; #endif #ifdef USE_AVX2 - hd->use_avx2 = (features & HWF_INTEL_AVX2) && (features & HWF_INTEL_BMI2); + if ((features & HWF_INTEL_AVX2) && (features & HWF_INTEL_BMI2)) + hd->bctx.bwrite = do_sha256_transform_amd64_avx2; #endif #ifdef USE_SHAEXT - hd->use_shaext = (features & HWF_INTEL_SHAEXT) - && (features & HWF_INTEL_SSE4_1); + if ((features & HWF_INTEL_SHAEXT) && (features & HWF_INTEL_SSE4_1)) + hd->bctx.bwrite = do_sha256_transform_intel_shaext; #endif #ifdef USE_ARM_CE - hd->use_arm_ce = (features & HWF_ARM_SHA2) != 0; + if ((features & HWF_ARM_SHA2) != 0) + hd->bctx.bwrite = do_sha256_transform_armv8_ce; #endif (void)features; } static void sha224_init (void *context, unsigned int flags) { SHA256_CONTEXT *hd = context; unsigned int features = _gcry_get_hw_features (); (void)flags; hd->h0 = 0xc1059ed8; hd->h1 = 0x367cd507; hd->h2 = 0x3070dd17; hd->h3 = 0xf70e5939; hd->h4 = 0xffc00b31; hd->h5 = 0x68581511; hd->h6 = 0x64f98fa7; hd->h7 = 0xbefa4fa4; hd->bctx.nblocks = 0; hd->bctx.nblocks_high = 0; hd->bctx.count = 0; hd->bctx.blocksize = 64; - hd->bctx.bwrite = transform; + /* Order of feature checks is important here; last match will be + * selected. Keep slower implementations at the top and faster at + * the bottom. */ + hd->bctx.bwrite = do_transform_generic; #ifdef USE_SSSE3 - hd->use_ssse3 = (features & HWF_INTEL_SSSE3) != 0; + if ((features & HWF_INTEL_SSSE3) != 0) + hd->bctx.bwrite = do_sha256_transform_amd64_ssse3; #endif #ifdef USE_AVX /* AVX implementation uses SHLD which is known to be slow on non-Intel CPUs. * Therefore use this implementation on Intel CPUs only. */ - hd->use_avx = (features & HWF_INTEL_AVX) && (features & HWF_INTEL_FAST_SHLD); + if ((features & HWF_INTEL_AVX) && (features & HWF_INTEL_FAST_SHLD)) + hd->bctx.bwrite = do_sha256_transform_amd64_avx; #endif #ifdef USE_AVX2 - hd->use_avx2 = (features & HWF_INTEL_AVX2) && (features & HWF_INTEL_BMI2); + if ((features & HWF_INTEL_AVX2) && (features & HWF_INTEL_BMI2)) + hd->bctx.bwrite = do_sha256_transform_amd64_avx2; #endif #ifdef USE_SHAEXT - hd->use_shaext = (features & HWF_INTEL_SHAEXT) - && (features & HWF_INTEL_SSE4_1); + if ((features & HWF_INTEL_SHAEXT) && (features & HWF_INTEL_SSE4_1)) + hd->bctx.bwrite = do_sha256_transform_intel_shaext; #endif #ifdef USE_ARM_CE - hd->use_arm_ce = (features & HWF_ARM_SHA2) != 0; + if ((features & HWF_ARM_SHA2) != 0) + hd->bctx.bwrite = do_sha256_transform_armv8_ce; #endif (void)features; } /* Transform the message X which consists of 16 32-bit-words. See FIPS 180-2 for details. */ #define R(a,b,c,d,e,f,g,h,k,w) do \ { \ t1 = (h) + Sum1((e)) + Cho((e),(f),(g)) + (k) + (w); \ t2 = Sum0((a)) + Maj((a),(b),(c)); \ d += t1; \ h = t1 + t2; \ } while (0) /* (4.2) same as SHA-1's F1. 
*/ #define Cho(x, y, z) (z ^ (x & (y ^ z))) /* (4.3) same as SHA-1's F3 */ #define Maj(x, y, z) ((x & y) + (z & (x ^ y))) /* (4.4) */ #define Sum0(x) (ror (x, 2) ^ ror (x, 13) ^ ror (x, 22)) /* (4.5) */ #define Sum1(x) (ror (x, 6) ^ ror (x, 11) ^ ror (x, 25)) /* Message expansion */ #define S0(x) (ror ((x), 7) ^ ror ((x), 18) ^ ((x) >> 3)) /* (4.6) */ #define S1(x) (ror ((x), 17) ^ ror ((x), 19) ^ ((x) >> 10)) /* (4.7) */ #define I(i) ( w[i] = buf_get_be32(data + i * 4) ) #define W(i) ( w[i&0x0f] = S1(w[(i-2) &0x0f]) \ + w[(i-7) &0x0f] \ + S0(w[(i-15)&0x0f]) \ + w[(i-16)&0x0f] ) static unsigned int -transform_blk (void *ctx, const unsigned char *data) +do_transform_generic (void *ctx, const unsigned char *data, size_t nblks) { SHA256_CONTEXT *hd = ctx; static const u32 K[64] = { 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 }; - u32 a,b,c,d,e,f,g,h,t1,t2; - u32 w[16]; - - a = hd->h0; - b = hd->h1; - c = hd->h2; - d = hd->h3; - e = hd->h4; - f = hd->h5; - g = hd->h6; - h = hd->h7; - - R(a, b, c, d, e, f, g, h, K[0], I(0)); - R(h, a, b, c, d, e, f, g, K[1], I(1)); - R(g, h, a, b, c, d, e, f, K[2], I(2)); - R(f, g, h, a, b, c, d, e, K[3], I(3)); - R(e, f, g, h, a, b, c, d, K[4], I(4)); - R(d, e, f, g, h, a, b, c, K[5], I(5)); - R(c, d, e, f, g, h, a, b, K[6], I(6)); - R(b, c, d, e, f, g, h, a, K[7], I(7)); - R(a, b, c, d, e, f, g, h, K[8], I(8)); - R(h, a, b, c, d, e, f, g, K[9], I(9)); - R(g, h, a, b, c, d, e, f, K[10], I(10)); - R(f, g, h, a, b, c, d, e, K[11], I(11)); - R(e, f, g, h, a, b, c, d, K[12], I(12)); - R(d, e, f, g, h, a, b, c, K[13], I(13)); - R(c, d, e, f, g, h, a, b, K[14], I(14)); - R(b, c, d, e, f, g, h, a, K[15], I(15)); - - R(a, b, c, d, e, f, g, h, K[16], W(16)); - R(h, a, b, c, d, e, f, g, K[17], W(17)); - R(g, h, a, b, c, d, e, f, K[18], W(18)); - R(f, g, h, a, b, c, d, e, K[19], W(19)); - R(e, f, g, h, a, b, c, d, K[20], W(20)); - R(d, e, f, g, h, a, b, c, K[21], W(21)); - R(c, d, e, f, g, h, a, b, K[22], W(22)); - R(b, c, d, e, f, g, h, a, K[23], W(23)); - R(a, b, c, d, e, f, g, h, K[24], W(24)); - R(h, a, b, c, d, e, f, g, K[25], W(25)); - R(g, h, a, b, c, d, e, f, K[26], W(26)); - R(f, g, h, a, b, c, d, e, K[27], W(27)); - R(e, f, g, h, a, b, c, d, K[28], W(28)); - R(d, e, f, g, h, a, b, c, K[29], W(29)); - R(c, d, e, f, g, h, a, b, K[30], W(30)); - R(b, c, d, e, f, g, h, a, K[31], W(31)); - - R(a, b, c, d, e, f, g, h, K[32], W(32)); - R(h, a, b, c, d, e, f, g, K[33], W(33)); - R(g, h, a, b, c, d, e, f, K[34], W(34)); - R(f, g, h, a, b, c, d, e, K[35], W(35)); - R(e, f, g, h, a, b, c, d, K[36], W(36)); - R(d, e, f, g, h, a, b, c, K[37], W(37)); - R(c, d, e, f, g, h, a, b, K[38], W(38)); - R(b, c, d, e, f, g, h, a, K[39], W(39)); - R(a, b, c, d, e, f, g, h, K[40], W(40)); - R(h, a, b, c, d, e, f, g, K[41], W(41)); - R(g, h, a, b, c, d, e, f, K[42], W(42)); - 
R(f, g, h, a, b, c, d, e, K[43], W(43)); - R(e, f, g, h, a, b, c, d, K[44], W(44)); - R(d, e, f, g, h, a, b, c, K[45], W(45)); - R(c, d, e, f, g, h, a, b, K[46], W(46)); - R(b, c, d, e, f, g, h, a, K[47], W(47)); - - R(a, b, c, d, e, f, g, h, K[48], W(48)); - R(h, a, b, c, d, e, f, g, K[49], W(49)); - R(g, h, a, b, c, d, e, f, K[50], W(50)); - R(f, g, h, a, b, c, d, e, K[51], W(51)); - R(e, f, g, h, a, b, c, d, K[52], W(52)); - R(d, e, f, g, h, a, b, c, K[53], W(53)); - R(c, d, e, f, g, h, a, b, K[54], W(54)); - R(b, c, d, e, f, g, h, a, K[55], W(55)); - R(a, b, c, d, e, f, g, h, K[56], W(56)); - R(h, a, b, c, d, e, f, g, K[57], W(57)); - R(g, h, a, b, c, d, e, f, K[58], W(58)); - R(f, g, h, a, b, c, d, e, K[59], W(59)); - R(e, f, g, h, a, b, c, d, K[60], W(60)); - R(d, e, f, g, h, a, b, c, K[61], W(61)); - R(c, d, e, f, g, h, a, b, K[62], W(62)); - R(b, c, d, e, f, g, h, a, K[63], W(63)); - - hd->h0 += a; - hd->h1 += b; - hd->h2 += c; - hd->h3 += d; - hd->h4 += e; - hd->h5 += f; - hd->h6 += g; - hd->h7 += h; - - return /*burn_stack*/ 26*4+32; -} -#undef S0 -#undef S1 -#undef R - - -/* Assembly implementations use SystemV ABI, ABI conversion and additional - * stack to store XMM6-XMM15 needed on Win64. */ -#undef ASM_FUNC_ABI -#undef ASM_EXTRA_STACK -#if defined(USE_SSSE3) || defined(USE_AVX) || defined(USE_AVX2) || \ - defined(USE_SHAEXT) -# ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS -# define ASM_FUNC_ABI __attribute__((sysv_abi)) -# define ASM_EXTRA_STACK (10 * 16) -# else -# define ASM_FUNC_ABI -# define ASM_EXTRA_STACK 0 -# endif -#endif - - -#ifdef USE_SSSE3 -unsigned int _gcry_sha256_transform_amd64_ssse3(const void *input_data, - u32 state[8], - size_t num_blks) ASM_FUNC_ABI; -#endif - -#ifdef USE_AVX -unsigned int _gcry_sha256_transform_amd64_avx(const void *input_data, - u32 state[8], - size_t num_blks) ASM_FUNC_ABI; -#endif - -#ifdef USE_AVX2 -unsigned int _gcry_sha256_transform_amd64_avx2(const void *input_data, - u32 state[8], - size_t num_blks) ASM_FUNC_ABI; -#endif - -#ifdef USE_SHAEXT -/* Does not need ASM_FUNC_ABI */ -unsigned int -_gcry_sha256_transform_intel_shaext(u32 state[8], - const unsigned char *input_data, - size_t num_blks); -#endif - -#ifdef USE_ARM_CE -unsigned int _gcry_sha256_transform_armv8_ce(u32 state[8], - const void *input_data, - size_t num_blks); -#endif - -static unsigned int -transform (void *ctx, const unsigned char *data, size_t nblks) -{ - SHA256_CONTEXT *hd = ctx; - unsigned int burn; - -#ifdef USE_SHAEXT - if (hd->use_shaext) - { - burn = _gcry_sha256_transform_intel_shaext (&hd->h0, data, nblks); - burn += burn ? 4 * sizeof(void*) + ASM_EXTRA_STACK : 0; - return burn; - } -#endif - -#ifdef USE_AVX2 - if (hd->use_avx2) - { - burn = _gcry_sha256_transform_amd64_avx2 (data, &hd->h0, nblks); - burn += burn ? 4 * sizeof(void*) + ASM_EXTRA_STACK : 0; - return burn; - } -#endif - -#ifdef USE_AVX - if (hd->use_avx) - { - burn = _gcry_sha256_transform_amd64_avx (data, &hd->h0, nblks); - burn += burn ? 4 * sizeof(void*) + ASM_EXTRA_STACK : 0; - return burn; - } -#endif - -#ifdef USE_SSSE3 - if (hd->use_ssse3) + do { - burn = _gcry_sha256_transform_amd64_ssse3 (data, &hd->h0, nblks); - burn += burn ? 4 * sizeof(void*) + ASM_EXTRA_STACK : 0; - return burn; - } -#endif -#ifdef USE_ARM_CE - if (hd->use_arm_ce) - { - burn = _gcry_sha256_transform_armv8_ce (&hd->h0, data, nblks); - burn += burn ? 
4 * sizeof(void*) : 0; - return burn; - } -#endif + u32 a,b,c,d,e,f,g,h,t1,t2; + u32 w[16]; + + a = hd->h0; + b = hd->h1; + c = hd->h2; + d = hd->h3; + e = hd->h4; + f = hd->h5; + g = hd->h6; + h = hd->h7; + + R(a, b, c, d, e, f, g, h, K[0], I(0)); + R(h, a, b, c, d, e, f, g, K[1], I(1)); + R(g, h, a, b, c, d, e, f, K[2], I(2)); + R(f, g, h, a, b, c, d, e, K[3], I(3)); + R(e, f, g, h, a, b, c, d, K[4], I(4)); + R(d, e, f, g, h, a, b, c, K[5], I(5)); + R(c, d, e, f, g, h, a, b, K[6], I(6)); + R(b, c, d, e, f, g, h, a, K[7], I(7)); + R(a, b, c, d, e, f, g, h, K[8], I(8)); + R(h, a, b, c, d, e, f, g, K[9], I(9)); + R(g, h, a, b, c, d, e, f, K[10], I(10)); + R(f, g, h, a, b, c, d, e, K[11], I(11)); + R(e, f, g, h, a, b, c, d, K[12], I(12)); + R(d, e, f, g, h, a, b, c, K[13], I(13)); + R(c, d, e, f, g, h, a, b, K[14], I(14)); + R(b, c, d, e, f, g, h, a, K[15], I(15)); + + R(a, b, c, d, e, f, g, h, K[16], W(16)); + R(h, a, b, c, d, e, f, g, K[17], W(17)); + R(g, h, a, b, c, d, e, f, K[18], W(18)); + R(f, g, h, a, b, c, d, e, K[19], W(19)); + R(e, f, g, h, a, b, c, d, K[20], W(20)); + R(d, e, f, g, h, a, b, c, K[21], W(21)); + R(c, d, e, f, g, h, a, b, K[22], W(22)); + R(b, c, d, e, f, g, h, a, K[23], W(23)); + R(a, b, c, d, e, f, g, h, K[24], W(24)); + R(h, a, b, c, d, e, f, g, K[25], W(25)); + R(g, h, a, b, c, d, e, f, K[26], W(26)); + R(f, g, h, a, b, c, d, e, K[27], W(27)); + R(e, f, g, h, a, b, c, d, K[28], W(28)); + R(d, e, f, g, h, a, b, c, K[29], W(29)); + R(c, d, e, f, g, h, a, b, K[30], W(30)); + R(b, c, d, e, f, g, h, a, K[31], W(31)); + + R(a, b, c, d, e, f, g, h, K[32], W(32)); + R(h, a, b, c, d, e, f, g, K[33], W(33)); + R(g, h, a, b, c, d, e, f, K[34], W(34)); + R(f, g, h, a, b, c, d, e, K[35], W(35)); + R(e, f, g, h, a, b, c, d, K[36], W(36)); + R(d, e, f, g, h, a, b, c, K[37], W(37)); + R(c, d, e, f, g, h, a, b, K[38], W(38)); + R(b, c, d, e, f, g, h, a, K[39], W(39)); + R(a, b, c, d, e, f, g, h, K[40], W(40)); + R(h, a, b, c, d, e, f, g, K[41], W(41)); + R(g, h, a, b, c, d, e, f, K[42], W(42)); + R(f, g, h, a, b, c, d, e, K[43], W(43)); + R(e, f, g, h, a, b, c, d, K[44], W(44)); + R(d, e, f, g, h, a, b, c, K[45], W(45)); + R(c, d, e, f, g, h, a, b, K[46], W(46)); + R(b, c, d, e, f, g, h, a, K[47], W(47)); + + R(a, b, c, d, e, f, g, h, K[48], W(48)); + R(h, a, b, c, d, e, f, g, K[49], W(49)); + R(g, h, a, b, c, d, e, f, K[50], W(50)); + R(f, g, h, a, b, c, d, e, K[51], W(51)); + R(e, f, g, h, a, b, c, d, K[52], W(52)); + R(d, e, f, g, h, a, b, c, K[53], W(53)); + R(c, d, e, f, g, h, a, b, K[54], W(54)); + R(b, c, d, e, f, g, h, a, K[55], W(55)); + R(a, b, c, d, e, f, g, h, K[56], W(56)); + R(h, a, b, c, d, e, f, g, K[57], W(57)); + R(g, h, a, b, c, d, e, f, K[58], W(58)); + R(f, g, h, a, b, c, d, e, K[59], W(59)); + R(e, f, g, h, a, b, c, d, K[60], W(60)); + R(d, e, f, g, h, a, b, c, K[61], W(61)); + R(c, d, e, f, g, h, a, b, K[62], W(62)); + R(b, c, d, e, f, g, h, a, K[63], W(63)); + + hd->h0 += a; + hd->h1 += b; + hd->h2 += c; + hd->h3 += d; + hd->h4 += e; + hd->h5 += f; + hd->h6 += g; + hd->h7 += h; - do - { - burn = transform_blk (hd, data); data += 64; } while (--nblks); -#ifdef ASM_EXTRA_STACK - /* 'transform_blk' is typically inlined and XMM6-XMM15 are stored at - * the prologue of this function. Therefore need to add ASM_EXTRA_STACK to - * here too. - */ - burn += ASM_EXTRA_STACK; -#endif - - return burn; + return 26*4 + 32 + 3 * sizeof(void*); } +#undef S0 +#undef S1 +#undef R + /* The routine finally terminates the computation and returns the digest. 
The handle is prepared for a new cycle, but adding bytes to the handle will the destroy the returned buffer. Returns: 32 bytes with the message the digest. */ static void sha256_final(void *context) { SHA256_CONTEXT *hd = context; u32 t, th, msb, lsb; byte *p; unsigned int burn; _gcry_md_block_write (hd, NULL, 0); /* flush */; t = hd->bctx.nblocks; if (sizeof t == sizeof hd->bctx.nblocks) th = hd->bctx.nblocks_high; else th = hd->bctx.nblocks >> 32; /* multiply by 64 to make a byte count */ lsb = t << 6; msb = (th << 6) | (t >> 26); /* add the count */ t = lsb; if ((lsb += hd->bctx.count) < t) msb++; /* multiply by 8 to make a bit count */ t = lsb; lsb <<= 3; msb <<= 3; msb |= t >> 29; if (hd->bctx.count < 56) { /* enough room */ hd->bctx.buf[hd->bctx.count++] = 0x80; /* pad */ while (hd->bctx.count < 56) hd->bctx.buf[hd->bctx.count++] = 0; /* pad */ } else { /* need one extra block */ hd->bctx.buf[hd->bctx.count++] = 0x80; /* pad character */ while (hd->bctx.count < 64) hd->bctx.buf[hd->bctx.count++] = 0; _gcry_md_block_write (hd, NULL, 0); /* flush */; memset (hd->bctx.buf, 0, 56 ); /* fill next block with zeroes */ } /* append the 64 bit count */ buf_put_be32(hd->bctx.buf + 56, msb); buf_put_be32(hd->bctx.buf + 60, lsb); - burn = transform (hd, hd->bctx.buf, 1); + burn = (*hd->bctx.bwrite) (hd, hd->bctx.buf, 1); _gcry_burn_stack (burn); p = hd->bctx.buf; #define X(a) do { buf_put_be32(p, hd->h##a); p += 4; } while(0) X(0); X(1); X(2); X(3); X(4); X(5); X(6); X(7); #undef X } static byte * sha256_read (void *context) { SHA256_CONTEXT *hd = context; return hd->bctx.buf; } /* Shortcut functions which puts the hash value of the supplied buffer * into outbuf which must have a size of 32 bytes. */ void _gcry_sha256_hash_buffer (void *outbuf, const void *buffer, size_t length) { SHA256_CONTEXT hd; sha256_init (&hd, 0); _gcry_md_block_write (&hd, buffer, length); sha256_final (&hd); memcpy (outbuf, hd.bctx.buf, 32); } /* Variant of the above shortcut function using multiple buffers. */ void _gcry_sha256_hash_buffers (void *outbuf, const gcry_buffer_t *iov, int iovcnt) { SHA256_CONTEXT hd; sha256_init (&hd, 0); for (;iovcnt > 0; iov++, iovcnt--) _gcry_md_block_write (&hd, (const char*)iov[0].data + iov[0].off, iov[0].len); sha256_final (&hd); memcpy (outbuf, hd.bctx.buf, 32); } /* Shortcut functions which puts the hash value of the supplied buffer * into outbuf which must have a size of 28 bytes. */ static void _gcry_sha224_hash_buffer (void *outbuf, const void *buffer, size_t length) { SHA256_CONTEXT hd; sha224_init (&hd, 0); _gcry_md_block_write (&hd, buffer, length); sha256_final (&hd); memcpy (outbuf, hd.bctx.buf, 28); } /* Variant of the above shortcut function using multiple buffers. */ static void _gcry_sha224_hash_buffers (void *outbuf, const gcry_buffer_t *iov, int iovcnt) { SHA256_CONTEXT hd; sha224_init (&hd, 0); for (;iovcnt > 0; iov++, iovcnt--) _gcry_md_block_write (&hd, (const char*)iov[0].data + iov[0].off, iov[0].len); sha256_final (&hd); memcpy (outbuf, hd.bctx.buf, 28); } /* Self-test section. 
*/ static gpg_err_code_t selftests_sha224 (int extended, selftest_report_func_t report) { const char *what; const char *errtxt; what = "short string"; errtxt = _gcry_hash_selftest_check_one (GCRY_MD_SHA224, 0, "abc", 3, "\x23\x09\x7d\x22\x34\x05\xd8\x22\x86\x42\xa4\x77\xbd\xa2\x55\xb3" "\x2a\xad\xbc\xe4\xbd\xa0\xb3\xf7\xe3\x6c\x9d\xa7", 28); if (errtxt) goto failed; if (extended) { what = "long string"; errtxt = _gcry_hash_selftest_check_one (GCRY_MD_SHA224, 0, "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq", 56, "\x75\x38\x8b\x16\x51\x27\x76\xcc\x5d\xba\x5d\xa1\xfd\x89\x01\x50" "\xb0\xc6\x45\x5c\xb4\xf5\x8b\x19\x52\x52\x25\x25", 28); if (errtxt) goto failed; what = "one million \"a\""; errtxt = _gcry_hash_selftest_check_one (GCRY_MD_SHA224, 1, NULL, 0, "\x20\x79\x46\x55\x98\x0c\x91\xd8\xbb\xb4\xc1\xea\x97\x61\x8a\x4b" "\xf0\x3f\x42\x58\x19\x48\xb2\xee\x4e\xe7\xad\x67", 28); if (errtxt) goto failed; } return 0; /* Succeeded. */ failed: if (report) report ("digest", GCRY_MD_SHA224, what, errtxt); return GPG_ERR_SELFTEST_FAILED; } static gpg_err_code_t selftests_sha256 (int extended, selftest_report_func_t report) { const char *what; const char *errtxt; what = "short string"; errtxt = _gcry_hash_selftest_check_one (GCRY_MD_SHA256, 0, "abc", 3, "\xba\x78\x16\xbf\x8f\x01\xcf\xea\x41\x41\x40\xde\x5d\xae\x22\x23" "\xb0\x03\x61\xa3\x96\x17\x7a\x9c\xb4\x10\xff\x61\xf2\x00\x15\xad", 32); if (errtxt) goto failed; if (extended) { what = "long string"; errtxt = _gcry_hash_selftest_check_one (GCRY_MD_SHA256, 0, "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq", 56, "\x24\x8d\x6a\x61\xd2\x06\x38\xb8\xe5\xc0\x26\x93\x0c\x3e\x60\x39" "\xa3\x3c\xe4\x59\x64\xff\x21\x67\xf6\xec\xed\xd4\x19\xdb\x06\xc1", 32); if (errtxt) goto failed; what = "one million \"a\""; errtxt = _gcry_hash_selftest_check_one (GCRY_MD_SHA256, 1, NULL, 0, "\xcd\xc7\x6e\x5c\x99\x14\xfb\x92\x81\xa1\xc7\xe2\x84\xd7\x3e\x67" "\xf1\x80\x9a\x48\xa4\x97\x20\x0e\x04\x6d\x39\xcc\xc7\x11\x2c\xd0", 32); if (errtxt) goto failed; } return 0; /* Succeeded. */ failed: if (report) report ("digest", GCRY_MD_SHA256, what, errtxt); return GPG_ERR_SELFTEST_FAILED; } /* Run a full self-test for ALGO and return 0 on success. 
*/ static gpg_err_code_t run_selftests (int algo, int extended, selftest_report_func_t report) { gpg_err_code_t ec; switch (algo) { case GCRY_MD_SHA224: ec = selftests_sha224 (extended, report); break; case GCRY_MD_SHA256: ec = selftests_sha256 (extended, report); break; default: ec = GPG_ERR_DIGEST_ALGO; break; } return ec; } static byte asn224[19] = /* Object ID is 2.16.840.1.101.3.4.2.4 */ { 0x30, 0x2D, 0x30, 0x0d, 0x06, 0x09, 0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x04, 0x05, 0x00, 0x04, 0x1C }; static gcry_md_oid_spec_t oid_spec_sha224[] = { /* From RFC3874, Section 4 */ { "2.16.840.1.101.3.4.2.4" }, { NULL }, }; static byte asn256[19] = /* Object ID is 2.16.840.1.101.3.4.2.1 */ { 0x30, 0x31, 0x30, 0x0d, 0x06, 0x09, 0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x01, 0x05, 0x00, 0x04, 0x20 }; static gcry_md_oid_spec_t oid_spec_sha256[] = { /* According to the OpenPGP draft rfc2440-bis06 */ { "2.16.840.1.101.3.4.2.1" }, /* PKCS#1 sha256WithRSAEncryption */ { "1.2.840.113549.1.1.11" }, { NULL }, }; gcry_md_spec_t _gcry_digest_spec_sha224 = { GCRY_MD_SHA224, {0, 1}, "SHA224", asn224, DIM (asn224), oid_spec_sha224, 28, sha224_init, _gcry_md_block_write, sha256_final, sha256_read, NULL, _gcry_sha224_hash_buffer, _gcry_sha224_hash_buffers, sizeof (SHA256_CONTEXT), run_selftests }; gcry_md_spec_t _gcry_digest_spec_sha256 = { GCRY_MD_SHA256, {0, 1}, "SHA256", asn256, DIM (asn256), oid_spec_sha256, 32, sha256_init, _gcry_md_block_write, sha256_final, sha256_read, NULL, _gcry_sha256_hash_buffer, _gcry_sha256_hash_buffers, sizeof (SHA256_CONTEXT), run_selftests }; diff --git a/cipher/sha512-armv7-neon.S b/cipher/sha512-armv7-neon.S index a9d12724..6596f2cd 100644 --- a/cipher/sha512-armv7-neon.S +++ b/cipher/sha512-armv7-neon.S @@ -1,449 +1,450 @@ /* sha512-armv7-neon.S - ARM/NEON assembly implementation of SHA-512 transform * * Copyright (C) 2013 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . 
*/ #include #if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) && \ defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \ defined(HAVE_GCC_INLINE_ASM_NEON) .text .syntax unified .fpu neon .arm /* structure of SHA512_CONTEXT */ #define hd_a 0 #define hd_b ((hd_a) + 8) #define hd_c ((hd_b) + 8) #define hd_d ((hd_c) + 8) #define hd_e ((hd_d) + 8) #define hd_f ((hd_e) + 8) #define hd_g ((hd_f) + 8) /* register macros */ #define RK %r2 #define RA d0 #define RB d1 #define RC d2 #define RD d3 #define RE d4 #define RF d5 #define RG d6 #define RH d7 #define RT0 d8 #define RT1 d9 #define RT2 d10 #define RT3 d11 #define RT4 d12 #define RT5 d13 #define RT6 d14 #define RT7 d15 #define RT01q q4 #define RT23q q5 #define RT45q q6 #define RT67q q7 #define RW0 d16 #define RW1 d17 #define RW2 d18 #define RW3 d19 #define RW4 d20 #define RW5 d21 #define RW6 d22 #define RW7 d23 #define RW8 d24 #define RW9 d25 #define RW10 d26 #define RW11 d27 #define RW12 d28 #define RW13 d29 #define RW14 d30 #define RW15 d31 #define RW01q q8 #define RW23q q9 #define RW45q q10 #define RW67q q11 #define RW89q q12 #define RW1011q q13 #define RW1213q q14 #define RW1415q q15 /*********************************************************************** * ARM assembly implementation of sha512 transform ***********************************************************************/ #define rounds2_0_63(ra, rb, rc, rd, re, rf, rg, rh, rw0, rw1, rw01q, rw2, rw23q, rw1415q, rw9, rw10, interleave_op, arg1) \ /* t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[t]; */ \ vshr.u64 RT2, re, #14; \ vshl.u64 RT3, re, #64 - 14; \ interleave_op(arg1); \ vshr.u64 RT4, re, #18; \ vshl.u64 RT5, re, #64 - 18; \ vld1.64 {RT0}, [RK]!; \ veor.64 RT23q, RT23q, RT45q; \ vshr.u64 RT4, re, #41; \ vshl.u64 RT5, re, #64 - 41; \ vadd.u64 RT0, RT0, rw0; \ veor.64 RT23q, RT23q, RT45q; \ vmov.64 RT7, re; \ veor.64 RT1, RT2, RT3; \ vbsl.64 RT7, rf, rg; \ \ vadd.u64 RT1, RT1, rh; \ vshr.u64 RT2, ra, #28; \ vshl.u64 RT3, ra, #64 - 28; \ vadd.u64 RT1, RT1, RT0; \ vshr.u64 RT4, ra, #34; \ vshl.u64 RT5, ra, #64 - 34; \ vadd.u64 RT1, RT1, RT7; \ \ /* h = Sum0 (a) + Maj (a, b, c); */ \ veor.64 RT23q, RT23q, RT45q; \ vshr.u64 RT4, ra, #39; \ vshl.u64 RT5, ra, #64 - 39; \ veor.64 RT0, ra, rb; \ veor.64 RT23q, RT23q, RT45q; \ vbsl.64 RT0, rc, rb; \ vadd.u64 rd, rd, RT1; /* d+=t1; */ \ veor.64 rh, RT2, RT3; \ \ /* t1 = g + Sum1 (d) + Ch (d, e, f) + k[t] + w[t]; */ \ vshr.u64 RT2, rd, #14; \ vshl.u64 RT3, rd, #64 - 14; \ vadd.u64 rh, rh, RT0; \ vshr.u64 RT4, rd, #18; \ vshl.u64 RT5, rd, #64 - 18; \ vadd.u64 rh, rh, RT1; /* h+=t1; */ \ vld1.64 {RT0}, [RK]!; \ veor.64 RT23q, RT23q, RT45q; \ vshr.u64 RT4, rd, #41; \ vshl.u64 RT5, rd, #64 - 41; \ vadd.u64 RT0, RT0, rw1; \ veor.64 RT23q, RT23q, RT45q; \ vmov.64 RT7, rd; \ veor.64 RT1, RT2, RT3; \ vbsl.64 RT7, re, rf; \ \ vadd.u64 RT1, RT1, rg; \ vshr.u64 RT2, rh, #28; \ vshl.u64 RT3, rh, #64 - 28; \ vadd.u64 RT1, RT1, RT0; \ vshr.u64 RT4, rh, #34; \ vshl.u64 RT5, rh, #64 - 34; \ vadd.u64 RT1, RT1, RT7; \ \ /* g = Sum0 (h) + Maj (h, a, b); */ \ veor.64 RT23q, RT23q, RT45q; \ vshr.u64 RT4, rh, #39; \ vshl.u64 RT5, rh, #64 - 39; \ veor.64 RT0, rh, ra; \ veor.64 RT23q, RT23q, RT45q; \ vbsl.64 RT0, rb, ra; \ vadd.u64 rc, rc, RT1; /* c+=t1; */ \ veor.64 rg, RT2, RT3; \ \ /* w[0] += S1 (w[14]) + w[9] + S0 (w[1]); */ \ /* w[1] += S1 (w[15]) + w[10] + S0 (w[2]); */ \ \ /**** S0(w[1:2]) */ \ \ /* w[0:1] += w[9:10] */ \ /* RT23q = rw1:rw2 */ \ vext.u64 RT23q, rw01q, rw23q, #1; \ vadd.u64 rw0, rw9; \ vadd.u64 rg, rg, RT0; \ vadd.u64 rw1, rw10;\ vadd.u64 rg, rg, 
RT1; /* g+=t1; */ \ \ vshr.u64 RT45q, RT23q, #1; \ vshl.u64 RT67q, RT23q, #64 - 1; \ vshr.u64 RT01q, RT23q, #8; \ veor.u64 RT45q, RT45q, RT67q; \ vshl.u64 RT67q, RT23q, #64 - 8; \ veor.u64 RT45q, RT45q, RT01q; \ vshr.u64 RT01q, RT23q, #7; \ veor.u64 RT45q, RT45q, RT67q; \ \ /**** S1(w[14:15]) */ \ vshr.u64 RT23q, rw1415q, #6; \ veor.u64 RT01q, RT01q, RT45q; \ vshr.u64 RT45q, rw1415q, #19; \ vshl.u64 RT67q, rw1415q, #64 - 19; \ veor.u64 RT23q, RT23q, RT45q; \ vshr.u64 RT45q, rw1415q, #61; \ veor.u64 RT23q, RT23q, RT67q; \ vshl.u64 RT67q, rw1415q, #64 - 61; \ veor.u64 RT23q, RT23q, RT45q; \ vadd.u64 rw01q, RT01q; /* w[0:1] += S(w[1:2]) */ \ veor.u64 RT01q, RT23q, RT67q; #define vadd_RT01q(rw01q) \ /* w[0:1] += S(w[14:15]) */ \ vadd.u64 rw01q, RT01q; #define dummy(_) /*_*/ #define rounds2_64_79(ra, rb, rc, rd, re, rf, rg, rh, rw0, rw1, interleave_op1, arg1, interleave_op2, arg2) \ /* t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[t]; */ \ vshr.u64 RT2, re, #14; \ vshl.u64 RT3, re, #64 - 14; \ interleave_op1(arg1); \ vshr.u64 RT4, re, #18; \ vshl.u64 RT5, re, #64 - 18; \ interleave_op2(arg2); \ vld1.64 {RT0}, [RK]!; \ veor.64 RT23q, RT23q, RT45q; \ vshr.u64 RT4, re, #41; \ vshl.u64 RT5, re, #64 - 41; \ vadd.u64 RT0, RT0, rw0; \ veor.64 RT23q, RT23q, RT45q; \ vmov.64 RT7, re; \ veor.64 RT1, RT2, RT3; \ vbsl.64 RT7, rf, rg; \ \ vadd.u64 RT1, RT1, rh; \ vshr.u64 RT2, ra, #28; \ vshl.u64 RT3, ra, #64 - 28; \ vadd.u64 RT1, RT1, RT0; \ vshr.u64 RT4, ra, #34; \ vshl.u64 RT5, ra, #64 - 34; \ vadd.u64 RT1, RT1, RT7; \ \ /* h = Sum0 (a) + Maj (a, b, c); */ \ veor.64 RT23q, RT23q, RT45q; \ vshr.u64 RT4, ra, #39; \ vshl.u64 RT5, ra, #64 - 39; \ veor.64 RT0, ra, rb; \ veor.64 RT23q, RT23q, RT45q; \ vbsl.64 RT0, rc, rb; \ vadd.u64 rd, rd, RT1; /* d+=t1; */ \ veor.64 rh, RT2, RT3; \ \ /* t1 = g + Sum1 (d) + Ch (d, e, f) + k[t] + w[t]; */ \ vshr.u64 RT2, rd, #14; \ vshl.u64 RT3, rd, #64 - 14; \ vadd.u64 rh, rh, RT0; \ vshr.u64 RT4, rd, #18; \ vshl.u64 RT5, rd, #64 - 18; \ vadd.u64 rh, rh, RT1; /* h+=t1; */ \ vld1.64 {RT0}, [RK]!; \ veor.64 RT23q, RT23q, RT45q; \ vshr.u64 RT4, rd, #41; \ vshl.u64 RT5, rd, #64 - 41; \ vadd.u64 RT0, RT0, rw1; \ veor.64 RT23q, RT23q, RT45q; \ vmov.64 RT7, rd; \ veor.64 RT1, RT2, RT3; \ vbsl.64 RT7, re, rf; \ \ vadd.u64 RT1, RT1, rg; \ vshr.u64 RT2, rh, #28; \ vshl.u64 RT3, rh, #64 - 28; \ vadd.u64 RT1, RT1, RT0; \ vshr.u64 RT4, rh, #34; \ vshl.u64 RT5, rh, #64 - 34; \ vadd.u64 RT1, RT1, RT7; \ \ /* g = Sum0 (h) + Maj (h, a, b); */ \ veor.64 RT23q, RT23q, RT45q; \ vshr.u64 RT4, rh, #39; \ vshl.u64 RT5, rh, #64 - 39; \ veor.64 RT0, rh, ra; \ veor.64 RT23q, RT23q, RT45q; \ vbsl.64 RT0, rb, ra; \ vadd.u64 rc, rc, RT1; /* c+=t1; */ \ veor.64 rg, RT2, RT3; #define vadd_rg_RT0(rg) \ vadd.u64 rg, rg, RT0; #define vadd_rg_RT1(rg) \ vadd.u64 rg, rg, RT1; /* g+=t1; */ .align 3 .globl _gcry_sha512_transform_armv7_neon .type _gcry_sha512_transform_armv7_neon,%function; _gcry_sha512_transform_armv7_neon: /* Input: * %r0: SHA512_CONTEXT * %r1: data * %r2: u64 k[] constants * %r3: nblks */ push {%lr}; mov %lr, #0; /* Load context to d0-d7 */ vld1.64 {RA-RD}, [%r0]!; vld1.64 {RE-RH}, [%r0]; sub %r0, #(4*8); /* Load input to w[16], d16-d31 */ /* NOTE: Assumes that on ARMv7 unaligned accesses are always allowed. 
*/ vld1.64 {RW0-RW3}, [%r1]!; vld1.64 {RW4-RW7}, [%r1]!; vld1.64 {RW8-RW11}, [%r1]!; vld1.64 {RW12-RW15}, [%r1]!; #ifdef __ARMEL__ /* byteswap */ vrev64.8 RW01q, RW01q; vrev64.8 RW23q, RW23q; vrev64.8 RW45q, RW45q; vrev64.8 RW67q, RW67q; vrev64.8 RW89q, RW89q; vrev64.8 RW1011q, RW1011q; vrev64.8 RW1213q, RW1213q; vrev64.8 RW1415q, RW1415q; #endif /* EABI says that d8-d15 must be preserved by callee. */ vpush {RT0-RT7}; .Loop: rounds2_0_63(RA, RB, RC, RD, RE, RF, RG, RH, RW0, RW1, RW01q, RW2, RW23q, RW1415q, RW9, RW10, dummy, _); b .Lenter_rounds; .Loop_rounds: rounds2_0_63(RA, RB, RC, RD, RE, RF, RG, RH, RW0, RW1, RW01q, RW2, RW23q, RW1415q, RW9, RW10, vadd_RT01q, RW1415q); .Lenter_rounds: rounds2_0_63(RG, RH, RA, RB, RC, RD, RE, RF, RW2, RW3, RW23q, RW4, RW45q, RW01q, RW11, RW12, vadd_RT01q, RW01q); rounds2_0_63(RE, RF, RG, RH, RA, RB, RC, RD, RW4, RW5, RW45q, RW6, RW67q, RW23q, RW13, RW14, vadd_RT01q, RW23q); rounds2_0_63(RC, RD, RE, RF, RG, RH, RA, RB, RW6, RW7, RW67q, RW8, RW89q, RW45q, RW15, RW0, vadd_RT01q, RW45q); rounds2_0_63(RA, RB, RC, RD, RE, RF, RG, RH, RW8, RW9, RW89q, RW10, RW1011q, RW67q, RW1, RW2, vadd_RT01q, RW67q); rounds2_0_63(RG, RH, RA, RB, RC, RD, RE, RF, RW10, RW11, RW1011q, RW12, RW1213q, RW89q, RW3, RW4, vadd_RT01q, RW89q); add %lr, #16; rounds2_0_63(RE, RF, RG, RH, RA, RB, RC, RD, RW12, RW13, RW1213q, RW14, RW1415q, RW1011q, RW5, RW6, vadd_RT01q, RW1011q); cmp %lr, #64; rounds2_0_63(RC, RD, RE, RF, RG, RH, RA, RB, RW14, RW15, RW1415q, RW0, RW01q, RW1213q, RW7, RW8, vadd_RT01q, RW1213q); bne .Loop_rounds; subs %r3, #1; rounds2_64_79(RA, RB, RC, RD, RE, RF, RG, RH, RW0, RW1, vadd_RT01q, RW1415q, dummy, _); rounds2_64_79(RG, RH, RA, RB, RC, RD, RE, RF, RW2, RW3, vadd_rg_RT0, RG, vadd_rg_RT1, RG); beq .Lhandle_tail; vld1.64 {RW0-RW3}, [%r1]!; rounds2_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW4, RW5, vadd_rg_RT0, RE, vadd_rg_RT1, RE); rounds2_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW6, RW7, vadd_rg_RT0, RC, vadd_rg_RT1, RC); #ifdef __ARMEL__ vrev64.8 RW01q, RW01q; vrev64.8 RW23q, RW23q; #endif vld1.64 {RW4-RW7}, [%r1]!; rounds2_64_79(RA, RB, RC, RD, RE, RF, RG, RH, RW8, RW9, vadd_rg_RT0, RA, vadd_rg_RT1, RA); rounds2_64_79(RG, RH, RA, RB, RC, RD, RE, RF, RW10, RW11, vadd_rg_RT0, RG, vadd_rg_RT1, RG); #ifdef __ARMEL__ vrev64.8 RW45q, RW45q; vrev64.8 RW67q, RW67q; #endif vld1.64 {RW8-RW11}, [%r1]!; rounds2_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW12, RW13, vadd_rg_RT0, RE, vadd_rg_RT1, RE); rounds2_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW14, RW15, vadd_rg_RT0, RC, vadd_rg_RT1, RC); #ifdef __ARMEL__ vrev64.8 RW89q, RW89q; vrev64.8 RW1011q, RW1011q; #endif vld1.64 {RW12-RW15}, [%r1]!; vadd_rg_RT0(RA); vadd_rg_RT1(RA); /* Load context */ vld1.64 {RT0-RT3}, [%r0]!; vld1.64 {RT4-RT7}, [%r0]; sub %r0, #(4*8); #ifdef __ARMEL__ vrev64.8 RW1213q, RW1213q; vrev64.8 RW1415q, RW1415q; #endif vadd.u64 RA, RT0; vadd.u64 RB, RT1; vadd.u64 RC, RT2; vadd.u64 RD, RT3; vadd.u64 RE, RT4; vadd.u64 RF, RT5; vadd.u64 RG, RT6; vadd.u64 RH, RT7; /* Store the first half of context */ vst1.64 {RA-RD}, [%r0]!; sub RK, $(8*80); vst1.64 {RE-RH}, [%r0]; /* Store the last half of context */ mov %lr, #0; sub %r0, #(4*8); b .Loop; .ltorg .Lhandle_tail: rounds2_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW4, RW5, vadd_rg_RT0, RE, vadd_rg_RT1, RE); rounds2_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW6, RW7, vadd_rg_RT0, RC, vadd_rg_RT1, RC); rounds2_64_79(RA, RB, RC, RD, RE, RF, RG, RH, RW8, RW9, vadd_rg_RT0, RA, vadd_rg_RT1, RA); rounds2_64_79(RG, RH, RA, RB, RC, RD, RE, RF, RW10, RW11, vadd_rg_RT0, RG, vadd_rg_RT1, 
RG); rounds2_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW12, RW13, vadd_rg_RT0, RE, vadd_rg_RT1, RE); rounds2_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW14, RW15, vadd_rg_RT0, RC, vadd_rg_RT1, RC); /* Load context to d16-d23 */ vld1.64 {RW0-RW3}, [%r0]!; vadd_rg_RT0(RA); vld1.64 {RW4-RW7}, [%r0]; vadd_rg_RT1(RA); sub %r0, #(4*8); vadd.u64 RA, RW0; vadd.u64 RB, RW1; vadd.u64 RC, RW2; vadd.u64 RD, RW3; vadd.u64 RE, RW4; vadd.u64 RF, RW5; vadd.u64 RG, RW6; vadd.u64 RH, RW7; /* Store the first half of context */ vst1.64 {RA-RD}, [%r0]!; /* Clear used registers */ /* d16-d31 */ veor.u64 RW01q, RW01q; veor.u64 RW23q, RW23q; veor.u64 RW45q, RW45q; veor.u64 RW67q, RW67q; vst1.64 {RE-RH}, [%r0]; /* Store the last half of context */ veor.u64 RW89q, RW89q; veor.u64 RW1011q, RW1011q; veor.u64 RW1213q, RW1213q; veor.u64 RW1415q, RW1415q; /* d8-d15 */ vpop {RT0-RT7}; /* d0-d7 (q0-q3) */ veor.u64 %q0, %q0; veor.u64 %q1, %q1; veor.u64 %q2, %q2; veor.u64 %q3, %q3; + eor %r0, %r0; pop {%pc}; .size _gcry_sha512_transform_armv7_neon,.-_gcry_sha512_transform_armv7_neon; #endif diff --git a/cipher/sha512.c b/cipher/sha512.c index 9405de80..721f3405 100644 --- a/cipher/sha512.c +++ b/cipher/sha512.c @@ -1,991 +1,951 @@ /* sha512.c - SHA384 and SHA512 hash functions * Copyright (C) 2003, 2008, 2009 Free Software Foundation, Inc. * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser general Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . */ /* Test vectors from FIPS-180-2: * * "abc" * 384: * CB00753F 45A35E8B B5A03D69 9AC65007 272C32AB 0EDED163 * 1A8B605A 43FF5BED 8086072B A1E7CC23 58BAECA1 34C825A7 * 512: * DDAF35A1 93617ABA CC417349 AE204131 12E6FA4E 89A97EA2 0A9EEEE6 4B55D39A * 2192992A 274FC1A8 36BA3C23 A3FEEBBD 454D4423 643CE80E 2A9AC94F A54CA49F * * "abcdefghbcdefghicdefghijdefghijkefghijklfghijklmghijklmnhijklmnoijklmnopjklmnopqklmnopqrlmnopqrsmnopqrstnopqrstu" * 384: * 09330C33 F71147E8 3D192FC7 82CD1B47 53111B17 3B3B05D2 * 2FA08086 E3B0F712 FCC7C71A 557E2DB9 66C3E9FA 91746039 * 512: * 8E959B75 DAE313DA 8CF4F728 14FC143F 8F7779C6 EB9F7FA1 7299AEAD B6889018 * 501D289E 4900F7E4 331B99DE C4B5433A C7D329EE B6DD2654 5E96E55B 874BE909 * * "a" x 1000000 * 384: * 9D0E1809 716474CB 086E834E 310A4A1C ED149E9C 00F24852 * 7972CEC5 704C2A5B 07B8B3DC 38ECC4EB AE97DDD8 7F3D8985 * 512: * E718483D 0CE76964 4E2E42C7 BC15B463 8E1F98B1 3B204428 5632A803 AFA973EB * DE0FF244 877EA60A 4CB0432C E577C31B EB009C5C 2C49AA2E 4EADB217 AD8CC09B */ #include #include #include "g10lib.h" #include "bithelp.h" #include "bufhelp.h" #include "cipher.h" #include "hash-common.h" /* USE_ARM_NEON_ASM indicates whether to enable ARM NEON assembly code. */ #undef USE_ARM_NEON_ASM #ifdef ENABLE_NEON_SUPPORT # if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) \ && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) \ && defined(HAVE_GCC_INLINE_ASM_NEON) # define USE_ARM_NEON_ASM 1 # endif #endif /*ENABLE_NEON_SUPPORT*/ /* USE_ARM_ASM indicates whether to enable ARM assembly code. 
*/ #undef USE_ARM_ASM #if defined(__ARMEL__) && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) # define USE_ARM_ASM 1 #endif /* USE_SSSE3 indicates whether to compile with Intel SSSE3 code. */ #undef USE_SSSE3 #if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_SSSE3) && \ defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) # define USE_SSSE3 1 #endif /* USE_AVX indicates whether to compile with Intel AVX code. */ #undef USE_AVX #if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_AVX) && \ defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) # define USE_AVX 1 #endif /* USE_AVX2 indicates whether to compile with Intel AVX2/rorx code. */ #undef USE_AVX2 #if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_AVX2) && \ defined(HAVE_GCC_INLINE_ASM_BMI2) && \ defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) # define USE_AVX2 1 #endif typedef struct { u64 h0, h1, h2, h3, h4, h5, h6, h7; } SHA512_STATE; typedef struct { gcry_md_block_ctx_t bctx; SHA512_STATE state; +} SHA512_CONTEXT; + + +static const u64 k[] = + { + U64_C(0x428a2f98d728ae22), U64_C(0x7137449123ef65cd), + U64_C(0xb5c0fbcfec4d3b2f), U64_C(0xe9b5dba58189dbbc), + U64_C(0x3956c25bf348b538), U64_C(0x59f111f1b605d019), + U64_C(0x923f82a4af194f9b), U64_C(0xab1c5ed5da6d8118), + U64_C(0xd807aa98a3030242), U64_C(0x12835b0145706fbe), + U64_C(0x243185be4ee4b28c), U64_C(0x550c7dc3d5ffb4e2), + U64_C(0x72be5d74f27b896f), U64_C(0x80deb1fe3b1696b1), + U64_C(0x9bdc06a725c71235), U64_C(0xc19bf174cf692694), + U64_C(0xe49b69c19ef14ad2), U64_C(0xefbe4786384f25e3), + U64_C(0x0fc19dc68b8cd5b5), U64_C(0x240ca1cc77ac9c65), + U64_C(0x2de92c6f592b0275), U64_C(0x4a7484aa6ea6e483), + U64_C(0x5cb0a9dcbd41fbd4), U64_C(0x76f988da831153b5), + U64_C(0x983e5152ee66dfab), U64_C(0xa831c66d2db43210), + U64_C(0xb00327c898fb213f), U64_C(0xbf597fc7beef0ee4), + U64_C(0xc6e00bf33da88fc2), U64_C(0xd5a79147930aa725), + U64_C(0x06ca6351e003826f), U64_C(0x142929670a0e6e70), + U64_C(0x27b70a8546d22ffc), U64_C(0x2e1b21385c26c926), + U64_C(0x4d2c6dfc5ac42aed), U64_C(0x53380d139d95b3df), + U64_C(0x650a73548baf63de), U64_C(0x766a0abb3c77b2a8), + U64_C(0x81c2c92e47edaee6), U64_C(0x92722c851482353b), + U64_C(0xa2bfe8a14cf10364), U64_C(0xa81a664bbc423001), + U64_C(0xc24b8b70d0f89791), U64_C(0xc76c51a30654be30), + U64_C(0xd192e819d6ef5218), U64_C(0xd69906245565a910), + U64_C(0xf40e35855771202a), U64_C(0x106aa07032bbd1b8), + U64_C(0x19a4c116b8d2d0c8), U64_C(0x1e376c085141ab53), + U64_C(0x2748774cdf8eeb99), U64_C(0x34b0bcb5e19b48a8), + U64_C(0x391c0cb3c5c95a63), U64_C(0x4ed8aa4ae3418acb), + U64_C(0x5b9cca4f7763e373), U64_C(0x682e6ff3d6b2b8a3), + U64_C(0x748f82ee5defb2fc), U64_C(0x78a5636f43172f60), + U64_C(0x84c87814a1f0ab72), U64_C(0x8cc702081a6439ec), + U64_C(0x90befffa23631e28), U64_C(0xa4506cebde82bde9), + U64_C(0xbef9a3f7b2c67915), U64_C(0xc67178f2e372532b), + U64_C(0xca273eceea26619c), U64_C(0xd186b8c721c0c207), + U64_C(0xeada7dd6cde0eb1e), U64_C(0xf57d4f7fee6ed178), + U64_C(0x06f067aa72176fba), U64_C(0x0a637dc5a2c898a6), + U64_C(0x113f9804bef90dae), U64_C(0x1b710b35131c471b), + U64_C(0x28db77f523047d84), U64_C(0x32caab7b40c72493), + U64_C(0x3c9ebe0a15c9bebc), U64_C(0x431d67c49c100d4c), + U64_C(0x4cc5d4becb3e42b6), U64_C(0x597f299cfc657e2a), + U64_C(0x5fcb6fab3ad6faec), U64_C(0x6c44198c4a475817) + }; 
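/* Illustrative sketch (editor's addition, not part of this patch): the
 * comments in the NEON macros above and the generic transform below both
 * express the same SHA-512 step, t1 = h + Sum1(e) + Ch(e,f,g) + k[t] + w[t]
 * and t2 = Sum0(a) + Maj(a,b,c), plus the message-schedule update
 * w[t] = S1(w[t-2]) + w[t-7] + S0(w[t-15]) + w[t-16].  The standalone scalar
 * routine below restates one 80-round compression of a single 128-byte block
 * in plain C99; sha512_compress_sketch, rotr64 and the K80 parameter are
 * illustrative names only (K80 is assumed to hold the same values as the
 * k[] table above) and are not libgcrypt interfaces. */
#include <stdint.h>

static uint64_t
rotr64 (uint64_t x, unsigned int n)
{
  return (x >> n) | (x << (64 - n));
}

static void
sha512_compress_sketch (uint64_t h[8], const unsigned char blk[128],
                        const uint64_t K80[80])
{
  uint64_t w[80], a, b, c, d, e, f, g, hh;
  int t;

  /* Load the 16 message words big-endian. */
  for (t = 0; t < 16; t++)
    w[t] = ((uint64_t) blk[8*t+0] << 56) | ((uint64_t) blk[8*t+1] << 48)
         | ((uint64_t) blk[8*t+2] << 40) | ((uint64_t) blk[8*t+3] << 32)
         | ((uint64_t) blk[8*t+4] << 24) | ((uint64_t) blk[8*t+5] << 16)
         | ((uint64_t) blk[8*t+6] <<  8) |  (uint64_t) blk[8*t+7];

  /* Message schedule: w[t] = S1(w[t-2]) + w[t-7] + S0(w[t-15]) + w[t-16]. */
  for (t = 16; t < 80; t++)
    w[t] = (rotr64 (w[t-2], 19) ^ rotr64 (w[t-2], 61) ^ (w[t-2] >> 6))
         + w[t-7]
         + (rotr64 (w[t-15], 1) ^ rotr64 (w[t-15], 8) ^ (w[t-15] >> 7))
         + w[t-16];

  a = h[0]; b = h[1]; c = h[2]; d = h[3];
  e = h[4]; f = h[5]; g = h[6]; hh = h[7];

  for (t = 0; t < 80; t++)
    {
      /* t1 = h + Sum1(e) + Ch(e,f,g) + k[t] + w[t] */
      uint64_t t1 = hh
        + (rotr64 (e, 14) ^ rotr64 (e, 18) ^ rotr64 (e, 41))
        + ((e & f) ^ (~e & g))
        + K80[t] + w[t];
      /* t2 = Sum0(a) + Maj(a,b,c) */
      uint64_t t2 = (rotr64 (a, 28) ^ rotr64 (a, 34) ^ rotr64 (a, 39))
        + ((a & b) ^ (a & c) ^ (b & c));

      hh = g; g = f; f = e; e = d + t1;
      d  = c; c = b; b = a; a = t1 + t2;
    }

  /* Update chaining variables. */
  h[0] += a; h[1] += b; h[2] += c; h[3] += d;
  h[4] += e; h[5] += f; h[6] += g; h[7] += hh;
}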
+ + +/* AMD64 assembly implementations use SystemV ABI, ABI conversion and additional + * stack to store XMM6-XMM15 needed on Win64. */ +#undef ASM_FUNC_ABI +#undef ASM_EXTRA_STACK +#if defined(USE_SSSE3) || defined(USE_AVX) || defined(USE_AVX2) +# ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS +# define ASM_FUNC_ABI __attribute__((sysv_abi)) +# define ASM_EXTRA_STACK (10 * 16 + 4 * sizeof(void *)) +# else +# define ASM_FUNC_ABI +# define ASM_EXTRA_STACK 0 +# endif +#endif + + #ifdef USE_ARM_NEON_ASM - unsigned int use_neon:1; +unsigned int _gcry_sha512_transform_armv7_neon (SHA512_STATE *hd, + const unsigned char *data, + const u64 k[], size_t num_blks); + +static unsigned int +do_sha512_transform_armv7_neon(void *ctx, const unsigned char *data, + size_t nblks) +{ + SHA512_CONTEXT *hd = ctx; + return _gcry_sha512_transform_armv7_neon (&hd->state, data, k, nblks); +} #endif + #ifdef USE_SSSE3 - unsigned int use_ssse3:1; +unsigned int _gcry_sha512_transform_amd64_ssse3(const void *input_data, + void *state, + size_t num_blks) ASM_FUNC_ABI; + +static unsigned int +do_sha512_transform_amd64_ssse3(void *ctx, const unsigned char *data, + size_t nblks) +{ + SHA512_CONTEXT *hd = ctx; + return _gcry_sha512_transform_amd64_ssse3 (data, &hd->state, nblks) + + ASM_EXTRA_STACK; +} #endif + #ifdef USE_AVX - unsigned int use_avx:1; +unsigned int _gcry_sha512_transform_amd64_avx(const void *input_data, + void *state, + size_t num_blks) ASM_FUNC_ABI; + +static unsigned int +do_sha512_transform_amd64_avx(void *ctx, const unsigned char *data, + size_t nblks) +{ + SHA512_CONTEXT *hd = ctx; + return _gcry_sha512_transform_amd64_avx (data, &hd->state, nblks) + + ASM_EXTRA_STACK; +} #endif + #ifdef USE_AVX2 - unsigned int use_avx2:1; +unsigned int _gcry_sha512_transform_amd64_avx2(const void *input_data, + void *state, + size_t num_blks) ASM_FUNC_ABI; + +static unsigned int +do_sha512_transform_amd64_avx2(void *ctx, const unsigned char *data, + size_t nblks) +{ + SHA512_CONTEXT *hd = ctx; + return _gcry_sha512_transform_amd64_avx2 (data, &hd->state, nblks) + + ASM_EXTRA_STACK; +} #endif -} SHA512_CONTEXT; + + +#ifdef USE_ARM_ASM +unsigned int _gcry_sha512_transform_arm (SHA512_STATE *hd, + const unsigned char *data, + const u64 k[], size_t num_blks); static unsigned int -transform (void *context, const unsigned char *data, size_t nblks); +do_transform_generic (void *context, const unsigned char *data, size_t nblks) +{ + SHA512_CONTEXT *hd = context; + return _gcry_sha512_transform_armv7_neon (&hd->state, data, k, nblks); +} +#else +static unsigned int +do_transform_generic (void *context, const unsigned char *data, size_t nblks); +#endif + static void sha512_init (void *context, unsigned int flags) { SHA512_CONTEXT *ctx = context; SHA512_STATE *hd = &ctx->state; unsigned int features = _gcry_get_hw_features (); (void)flags; + (void)k; hd->h0 = U64_C(0x6a09e667f3bcc908); hd->h1 = U64_C(0xbb67ae8584caa73b); hd->h2 = U64_C(0x3c6ef372fe94f82b); hd->h3 = U64_C(0xa54ff53a5f1d36f1); hd->h4 = U64_C(0x510e527fade682d1); hd->h5 = U64_C(0x9b05688c2b3e6c1f); hd->h6 = U64_C(0x1f83d9abfb41bd6b); hd->h7 = U64_C(0x5be0cd19137e2179); ctx->bctx.nblocks = 0; ctx->bctx.nblocks_high = 0; ctx->bctx.count = 0; ctx->bctx.blocksize = 128; - ctx->bctx.bwrite = transform; + /* Order of feature checks is important here; last match will be + * selected. Keep slower implementations at the top and faster at + * the bottom. 
*/ + ctx->bctx.bwrite = do_transform_generic; #ifdef USE_ARM_NEON_ASM - ctx->use_neon = (features & HWF_ARM_NEON) != 0; + if ((features & HWF_ARM_NEON) != 0) + ctx->bctx.bwrite = do_sha512_transform_armv7_neon; #endif #ifdef USE_SSSE3 - ctx->use_ssse3 = (features & HWF_INTEL_SSSE3) != 0; + if ((features & HWF_INTEL_SSSE3) != 0) + ctx->bctx.bwrite = do_sha512_transform_amd64_ssse3; #endif #ifdef USE_AVX - ctx->use_avx = (features & HWF_INTEL_AVX) && (features & HWF_INTEL_FAST_SHLD); + if ((features & HWF_INTEL_AVX) && (features & HWF_INTEL_FAST_SHLD)) + ctx->bctx.bwrite = do_sha512_transform_amd64_avx; #endif #ifdef USE_AVX2 - ctx->use_avx2 = (features & HWF_INTEL_AVX2) && (features & HWF_INTEL_BMI2); + if ((features & HWF_INTEL_AVX2) && (features & HWF_INTEL_BMI2)) + ctx->bctx.bwrite = do_sha512_transform_amd64_avx2; #endif - (void)features; } static void sha384_init (void *context, unsigned int flags) { SHA512_CONTEXT *ctx = context; SHA512_STATE *hd = &ctx->state; unsigned int features = _gcry_get_hw_features (); (void)flags; hd->h0 = U64_C(0xcbbb9d5dc1059ed8); hd->h1 = U64_C(0x629a292a367cd507); hd->h2 = U64_C(0x9159015a3070dd17); hd->h3 = U64_C(0x152fecd8f70e5939); hd->h4 = U64_C(0x67332667ffc00b31); hd->h5 = U64_C(0x8eb44a8768581511); hd->h6 = U64_C(0xdb0c2e0d64f98fa7); hd->h7 = U64_C(0x47b5481dbefa4fa4); ctx->bctx.nblocks = 0; ctx->bctx.nblocks_high = 0; ctx->bctx.count = 0; ctx->bctx.blocksize = 128; - ctx->bctx.bwrite = transform; + /* Order of feature checks is important here; last match will be + * selected. Keep slower implementations at the top and faster at + * the bottom. */ + ctx->bctx.bwrite = do_transform_generic; #ifdef USE_ARM_NEON_ASM - ctx->use_neon = (features & HWF_ARM_NEON) != 0; + if ((features & HWF_ARM_NEON) != 0) + ctx->bctx.bwrite = do_sha512_transform_armv7_neon; #endif #ifdef USE_SSSE3 - ctx->use_ssse3 = (features & HWF_INTEL_SSSE3) != 0; + if ((features & HWF_INTEL_SSSE3) != 0) + ctx->bctx.bwrite = do_sha512_transform_amd64_ssse3; #endif #ifdef USE_AVX - ctx->use_avx = (features & HWF_INTEL_AVX) && (features & HWF_INTEL_FAST_SHLD); + if ((features & HWF_INTEL_AVX) && (features & HWF_INTEL_FAST_SHLD)) + ctx->bctx.bwrite = do_sha512_transform_amd64_avx; #endif #ifdef USE_AVX2 - ctx->use_avx2 = (features & HWF_INTEL_AVX2) && (features & HWF_INTEL_BMI2); + if ((features & HWF_INTEL_AVX2) && (features & HWF_INTEL_BMI2)) + ctx->bctx.bwrite = do_sha512_transform_amd64_avx2; #endif - (void)features; } -static const u64 k[] = - { - U64_C(0x428a2f98d728ae22), U64_C(0x7137449123ef65cd), - U64_C(0xb5c0fbcfec4d3b2f), U64_C(0xe9b5dba58189dbbc), - U64_C(0x3956c25bf348b538), U64_C(0x59f111f1b605d019), - U64_C(0x923f82a4af194f9b), U64_C(0xab1c5ed5da6d8118), - U64_C(0xd807aa98a3030242), U64_C(0x12835b0145706fbe), - U64_C(0x243185be4ee4b28c), U64_C(0x550c7dc3d5ffb4e2), - U64_C(0x72be5d74f27b896f), U64_C(0x80deb1fe3b1696b1), - U64_C(0x9bdc06a725c71235), U64_C(0xc19bf174cf692694), - U64_C(0xe49b69c19ef14ad2), U64_C(0xefbe4786384f25e3), - U64_C(0x0fc19dc68b8cd5b5), U64_C(0x240ca1cc77ac9c65), - U64_C(0x2de92c6f592b0275), U64_C(0x4a7484aa6ea6e483), - U64_C(0x5cb0a9dcbd41fbd4), U64_C(0x76f988da831153b5), - U64_C(0x983e5152ee66dfab), U64_C(0xa831c66d2db43210), - U64_C(0xb00327c898fb213f), U64_C(0xbf597fc7beef0ee4), - U64_C(0xc6e00bf33da88fc2), U64_C(0xd5a79147930aa725), - U64_C(0x06ca6351e003826f), U64_C(0x142929670a0e6e70), - U64_C(0x27b70a8546d22ffc), U64_C(0x2e1b21385c26c926), - U64_C(0x4d2c6dfc5ac42aed), U64_C(0x53380d139d95b3df), - U64_C(0x650a73548baf63de), 
U64_C(0x766a0abb3c77b2a8), - U64_C(0x81c2c92e47edaee6), U64_C(0x92722c851482353b), - U64_C(0xa2bfe8a14cf10364), U64_C(0xa81a664bbc423001), - U64_C(0xc24b8b70d0f89791), U64_C(0xc76c51a30654be30), - U64_C(0xd192e819d6ef5218), U64_C(0xd69906245565a910), - U64_C(0xf40e35855771202a), U64_C(0x106aa07032bbd1b8), - U64_C(0x19a4c116b8d2d0c8), U64_C(0x1e376c085141ab53), - U64_C(0x2748774cdf8eeb99), U64_C(0x34b0bcb5e19b48a8), - U64_C(0x391c0cb3c5c95a63), U64_C(0x4ed8aa4ae3418acb), - U64_C(0x5b9cca4f7763e373), U64_C(0x682e6ff3d6b2b8a3), - U64_C(0x748f82ee5defb2fc), U64_C(0x78a5636f43172f60), - U64_C(0x84c87814a1f0ab72), U64_C(0x8cc702081a6439ec), - U64_C(0x90befffa23631e28), U64_C(0xa4506cebde82bde9), - U64_C(0xbef9a3f7b2c67915), U64_C(0xc67178f2e372532b), - U64_C(0xca273eceea26619c), U64_C(0xd186b8c721c0c207), - U64_C(0xeada7dd6cde0eb1e), U64_C(0xf57d4f7fee6ed178), - U64_C(0x06f067aa72176fba), U64_C(0x0a637dc5a2c898a6), - U64_C(0x113f9804bef90dae), U64_C(0x1b710b35131c471b), - U64_C(0x28db77f523047d84), U64_C(0x32caab7b40c72493), - U64_C(0x3c9ebe0a15c9bebc), U64_C(0x431d67c49c100d4c), - U64_C(0x4cc5d4becb3e42b6), U64_C(0x597f299cfc657e2a), - U64_C(0x5fcb6fab3ad6faec), U64_C(0x6c44198c4a475817) - }; - #ifndef USE_ARM_ASM static inline u64 ROTR (u64 x, u64 n) { return ((x >> n) | (x << (64 - n))); } static inline u64 Ch (u64 x, u64 y, u64 z) { return ((x & y) ^ ( ~x & z)); } static inline u64 Maj (u64 x, u64 y, u64 z) { return ((x & y) ^ (x & z) ^ (y & z)); } static inline u64 Sum0 (u64 x) { return (ROTR (x, 28) ^ ROTR (x, 34) ^ ROTR (x, 39)); } static inline u64 Sum1 (u64 x) { return (ROTR (x, 14) ^ ROTR (x, 18) ^ ROTR (x, 41)); } /**************** * Transform the message W which consists of 16 64-bit-words */ static unsigned int -transform_blk (SHA512_STATE *hd, const unsigned char *data) -{ - u64 a, b, c, d, e, f, g, h; - u64 w[16]; - int t; - - /* get values from the chaining vars */ - a = hd->h0; - b = hd->h1; - c = hd->h2; - d = hd->h3; - e = hd->h4; - f = hd->h5; - g = hd->h6; - h = hd->h7; - - for ( t = 0; t < 16; t++ ) - w[t] = buf_get_be64(data + t * 8); - -#define S0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7)) -#define S1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6)) - - for (t = 0; t < 80 - 16; ) - { - u64 t1, t2; - - /* Performance on a AMD Athlon(tm) Dual Core Processor 4050e - with gcc 4.3.3 using gcry_md_hash_buffer of each 10000 bytes - initialized to 0,1,2,3...255,0,... and 1000 iterations: - - Not unrolled with macros: 440ms - Unrolled with macros: 350ms - Unrolled with inline: 330ms - */ -#if 0 /* Not unrolled. */ - t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[t%16]; - w[t%16] += S1 (w[(t - 2)%16]) + w[(t - 7)%16] + S0 (w[(t - 15)%16]); - t2 = Sum0 (a) + Maj (a, b, c); - h = g; - g = f; - f = e; - e = d + t1; - d = c; - c = b; - b = a; - a = t1 + t2; - t++; -#else /* Unrolled to interweave the chain variables. 
*/ - t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[0]; - w[0] += S1 (w[14]) + w[9] + S0 (w[1]); - t2 = Sum0 (a) + Maj (a, b, c); - d += t1; - h = t1 + t2; - - t1 = g + Sum1 (d) + Ch (d, e, f) + k[t+1] + w[1]; - w[1] += S1 (w[15]) + w[10] + S0 (w[2]); - t2 = Sum0 (h) + Maj (h, a, b); - c += t1; - g = t1 + t2; - - t1 = f + Sum1 (c) + Ch (c, d, e) + k[t+2] + w[2]; - w[2] += S1 (w[0]) + w[11] + S0 (w[3]); - t2 = Sum0 (g) + Maj (g, h, a); - b += t1; - f = t1 + t2; - - t1 = e + Sum1 (b) + Ch (b, c, d) + k[t+3] + w[3]; - w[3] += S1 (w[1]) + w[12] + S0 (w[4]); - t2 = Sum0 (f) + Maj (f, g, h); - a += t1; - e = t1 + t2; - - t1 = d + Sum1 (a) + Ch (a, b, c) + k[t+4] + w[4]; - w[4] += S1 (w[2]) + w[13] + S0 (w[5]); - t2 = Sum0 (e) + Maj (e, f, g); - h += t1; - d = t1 + t2; - - t1 = c + Sum1 (h) + Ch (h, a, b) + k[t+5] + w[5]; - w[5] += S1 (w[3]) + w[14] + S0 (w[6]); - t2 = Sum0 (d) + Maj (d, e, f); - g += t1; - c = t1 + t2; - - t1 = b + Sum1 (g) + Ch (g, h, a) + k[t+6] + w[6]; - w[6] += S1 (w[4]) + w[15] + S0 (w[7]); - t2 = Sum0 (c) + Maj (c, d, e); - f += t1; - b = t1 + t2; - - t1 = a + Sum1 (f) + Ch (f, g, h) + k[t+7] + w[7]; - w[7] += S1 (w[5]) + w[0] + S0 (w[8]); - t2 = Sum0 (b) + Maj (b, c, d); - e += t1; - a = t1 + t2; - - t1 = h + Sum1 (e) + Ch (e, f, g) + k[t+8] + w[8]; - w[8] += S1 (w[6]) + w[1] + S0 (w[9]); - t2 = Sum0 (a) + Maj (a, b, c); - d += t1; - h = t1 + t2; - - t1 = g + Sum1 (d) + Ch (d, e, f) + k[t+9] + w[9]; - w[9] += S1 (w[7]) + w[2] + S0 (w[10]); - t2 = Sum0 (h) + Maj (h, a, b); - c += t1; - g = t1 + t2; - - t1 = f + Sum1 (c) + Ch (c, d, e) + k[t+10] + w[10]; - w[10] += S1 (w[8]) + w[3] + S0 (w[11]); - t2 = Sum0 (g) + Maj (g, h, a); - b += t1; - f = t1 + t2; - - t1 = e + Sum1 (b) + Ch (b, c, d) + k[t+11] + w[11]; - w[11] += S1 (w[9]) + w[4] + S0 (w[12]); - t2 = Sum0 (f) + Maj (f, g, h); - a += t1; - e = t1 + t2; - - t1 = d + Sum1 (a) + Ch (a, b, c) + k[t+12] + w[12]; - w[12] += S1 (w[10]) + w[5] + S0 (w[13]); - t2 = Sum0 (e) + Maj (e, f, g); - h += t1; - d = t1 + t2; - - t1 = c + Sum1 (h) + Ch (h, a, b) + k[t+13] + w[13]; - w[13] += S1 (w[11]) + w[6] + S0 (w[14]); - t2 = Sum0 (d) + Maj (d, e, f); - g += t1; - c = t1 + t2; - - t1 = b + Sum1 (g) + Ch (g, h, a) + k[t+14] + w[14]; - w[14] += S1 (w[12]) + w[7] + S0 (w[15]); - t2 = Sum0 (c) + Maj (c, d, e); - f += t1; - b = t1 + t2; - - t1 = a + Sum1 (f) + Ch (f, g, h) + k[t+15] + w[15]; - w[15] += S1 (w[13]) + w[8] + S0 (w[0]); - t2 = Sum0 (b) + Maj (b, c, d); - e += t1; - a = t1 + t2; - - t += 16; -#endif - } - - for (; t < 80; ) - { - u64 t1, t2; - -#if 0 /* Not unrolled. */ - t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[t%16]; - t2 = Sum0 (a) + Maj (a, b, c); - h = g; - g = f; - f = e; - e = d + t1; - d = c; - c = b; - b = a; - a = t1 + t2; - t++; -#else /* Unrolled to interweave the chain variables. 
*/ - t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[0]; - t2 = Sum0 (a) + Maj (a, b, c); - d += t1; - h = t1 + t2; - - t1 = g + Sum1 (d) + Ch (d, e, f) + k[t+1] + w[1]; - t2 = Sum0 (h) + Maj (h, a, b); - c += t1; - g = t1 + t2; - - t1 = f + Sum1 (c) + Ch (c, d, e) + k[t+2] + w[2]; - t2 = Sum0 (g) + Maj (g, h, a); - b += t1; - f = t1 + t2; - - t1 = e + Sum1 (b) + Ch (b, c, d) + k[t+3] + w[3]; - t2 = Sum0 (f) + Maj (f, g, h); - a += t1; - e = t1 + t2; - - t1 = d + Sum1 (a) + Ch (a, b, c) + k[t+4] + w[4]; - t2 = Sum0 (e) + Maj (e, f, g); - h += t1; - d = t1 + t2; - - t1 = c + Sum1 (h) + Ch (h, a, b) + k[t+5] + w[5]; - t2 = Sum0 (d) + Maj (d, e, f); - g += t1; - c = t1 + t2; - - t1 = b + Sum1 (g) + Ch (g, h, a) + k[t+6] + w[6]; - t2 = Sum0 (c) + Maj (c, d, e); - f += t1; - b = t1 + t2; - - t1 = a + Sum1 (f) + Ch (f, g, h) + k[t+7] + w[7]; - t2 = Sum0 (b) + Maj (b, c, d); - e += t1; - a = t1 + t2; - - t1 = h + Sum1 (e) + Ch (e, f, g) + k[t+8] + w[8]; - t2 = Sum0 (a) + Maj (a, b, c); - d += t1; - h = t1 + t2; - - t1 = g + Sum1 (d) + Ch (d, e, f) + k[t+9] + w[9]; - t2 = Sum0 (h) + Maj (h, a, b); - c += t1; - g = t1 + t2; - - t1 = f + Sum1 (c) + Ch (c, d, e) + k[t+10] + w[10]; - t2 = Sum0 (g) + Maj (g, h, a); - b += t1; - f = t1 + t2; - - t1 = e + Sum1 (b) + Ch (b, c, d) + k[t+11] + w[11]; - t2 = Sum0 (f) + Maj (f, g, h); - a += t1; - e = t1 + t2; - - t1 = d + Sum1 (a) + Ch (a, b, c) + k[t+12] + w[12]; - t2 = Sum0 (e) + Maj (e, f, g); - h += t1; - d = t1 + t2; - - t1 = c + Sum1 (h) + Ch (h, a, b) + k[t+13] + w[13]; - t2 = Sum0 (d) + Maj (d, e, f); - g += t1; - c = t1 + t2; - - t1 = b + Sum1 (g) + Ch (g, h, a) + k[t+14] + w[14]; - t2 = Sum0 (c) + Maj (c, d, e); - f += t1; - b = t1 + t2; - - t1 = a + Sum1 (f) + Ch (f, g, h) + k[t+15] + w[15]; - t2 = Sum0 (b) + Maj (b, c, d); - e += t1; - a = t1 + t2; - - t += 16; -#endif - } - - /* Update chaining vars. */ - hd->h0 += a; - hd->h1 += b; - hd->h2 += c; - hd->h3 += d; - hd->h4 += e; - hd->h5 += f; - hd->h6 += g; - hd->h7 += h; - - return /* burn_stack */ (8 + 16) * sizeof(u64) + sizeof(u32) + - 3 * sizeof(void*); -} -#endif /*!USE_ARM_ASM*/ - -/* AMD64 assembly implementations use SystemV ABI, ABI conversion and additional - * stack to store XMM6-XMM15 needed on Win64. 
*/ -#undef ASM_FUNC_ABI -#undef ASM_EXTRA_STACK -#if defined(USE_SSSE3) || defined(USE_AVX) || defined(USE_AVX2) -# ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS -# define ASM_FUNC_ABI __attribute__((sysv_abi)) -# define ASM_EXTRA_STACK (10 * 16) -# else -# define ASM_FUNC_ABI -# define ASM_EXTRA_STACK 0 -# endif -#endif - - -#ifdef USE_ARM_NEON_ASM -void _gcry_sha512_transform_armv7_neon (SHA512_STATE *hd, - const unsigned char *data, - const u64 k[], size_t num_blks); -#endif - -#ifdef USE_ARM_ASM -unsigned int _gcry_sha512_transform_arm (SHA512_STATE *hd, - const unsigned char *data, - const u64 k[], size_t num_blks); -#endif - -#ifdef USE_SSSE3 -unsigned int _gcry_sha512_transform_amd64_ssse3(const void *input_data, - void *state, - size_t num_blks) ASM_FUNC_ABI; -#endif - -#ifdef USE_AVX -unsigned int _gcry_sha512_transform_amd64_avx(const void *input_data, - void *state, - size_t num_blks) ASM_FUNC_ABI; -#endif - -#ifdef USE_AVX2 -unsigned int _gcry_sha512_transform_amd64_avx2(const void *input_data, - void *state, - size_t num_blks) ASM_FUNC_ABI; -#endif - - -static unsigned int -transform (void *context, const unsigned char *data, size_t nblks) +do_transform_generic (void *context, const unsigned char *data, size_t nblks) { SHA512_CONTEXT *ctx = context; - unsigned int burn; - -#ifdef USE_AVX2 - if (ctx->use_avx2) - return _gcry_sha512_transform_amd64_avx2 (data, &ctx->state, nblks) - + 4 * sizeof(void*) + ASM_EXTRA_STACK; -#endif - -#ifdef USE_AVX - if (ctx->use_avx) - return _gcry_sha512_transform_amd64_avx (data, &ctx->state, nblks) - + 4 * sizeof(void*) + ASM_EXTRA_STACK; -#endif - -#ifdef USE_SSSE3 - if (ctx->use_ssse3) - return _gcry_sha512_transform_amd64_ssse3 (data, &ctx->state, nblks) - + 4 * sizeof(void*) + ASM_EXTRA_STACK; -#endif + SHA512_STATE *hd = &ctx->state; -#ifdef USE_ARM_NEON_ASM - if (ctx->use_neon) + do { - _gcry_sha512_transform_armv7_neon (&ctx->state, data, k, nblks); + u64 a, b, c, d, e, f, g, h; + u64 w[16]; + int t; + + /* get values from the chaining vars */ + a = hd->h0; + b = hd->h1; + c = hd->h2; + d = hd->h3; + e = hd->h4; + f = hd->h5; + g = hd->h6; + h = hd->h7; + + for ( t = 0; t < 16; t++ ) + w[t] = buf_get_be64(data + t * 8); - /* _gcry_sha512_transform_armv7_neon does not store sensitive data - * to stack. 
*/ - return /* no burn_stack */ 0; - } -#endif +#define S0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7)) +#define S1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6)) + + for (t = 0; t < 80 - 16; ) + { + u64 t1, t2; + + t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[0]; + w[0] += S1 (w[14]) + w[9] + S0 (w[1]); + t2 = Sum0 (a) + Maj (a, b, c); + d += t1; + h = t1 + t2; + + t1 = g + Sum1 (d) + Ch (d, e, f) + k[t+1] + w[1]; + w[1] += S1 (w[15]) + w[10] + S0 (w[2]); + t2 = Sum0 (h) + Maj (h, a, b); + c += t1; + g = t1 + t2; + + t1 = f + Sum1 (c) + Ch (c, d, e) + k[t+2] + w[2]; + w[2] += S1 (w[0]) + w[11] + S0 (w[3]); + t2 = Sum0 (g) + Maj (g, h, a); + b += t1; + f = t1 + t2; + + t1 = e + Sum1 (b) + Ch (b, c, d) + k[t+3] + w[3]; + w[3] += S1 (w[1]) + w[12] + S0 (w[4]); + t2 = Sum0 (f) + Maj (f, g, h); + a += t1; + e = t1 + t2; + + t1 = d + Sum1 (a) + Ch (a, b, c) + k[t+4] + w[4]; + w[4] += S1 (w[2]) + w[13] + S0 (w[5]); + t2 = Sum0 (e) + Maj (e, f, g); + h += t1; + d = t1 + t2; + + t1 = c + Sum1 (h) + Ch (h, a, b) + k[t+5] + w[5]; + w[5] += S1 (w[3]) + w[14] + S0 (w[6]); + t2 = Sum0 (d) + Maj (d, e, f); + g += t1; + c = t1 + t2; + + t1 = b + Sum1 (g) + Ch (g, h, a) + k[t+6] + w[6]; + w[6] += S1 (w[4]) + w[15] + S0 (w[7]); + t2 = Sum0 (c) + Maj (c, d, e); + f += t1; + b = t1 + t2; + + t1 = a + Sum1 (f) + Ch (f, g, h) + k[t+7] + w[7]; + w[7] += S1 (w[5]) + w[0] + S0 (w[8]); + t2 = Sum0 (b) + Maj (b, c, d); + e += t1; + a = t1 + t2; + + t1 = h + Sum1 (e) + Ch (e, f, g) + k[t+8] + w[8]; + w[8] += S1 (w[6]) + w[1] + S0 (w[9]); + t2 = Sum0 (a) + Maj (a, b, c); + d += t1; + h = t1 + t2; + + t1 = g + Sum1 (d) + Ch (d, e, f) + k[t+9] + w[9]; + w[9] += S1 (w[7]) + w[2] + S0 (w[10]); + t2 = Sum0 (h) + Maj (h, a, b); + c += t1; + g = t1 + t2; + + t1 = f + Sum1 (c) + Ch (c, d, e) + k[t+10] + w[10]; + w[10] += S1 (w[8]) + w[3] + S0 (w[11]); + t2 = Sum0 (g) + Maj (g, h, a); + b += t1; + f = t1 + t2; + + t1 = e + Sum1 (b) + Ch (b, c, d) + k[t+11] + w[11]; + w[11] += S1 (w[9]) + w[4] + S0 (w[12]); + t2 = Sum0 (f) + Maj (f, g, h); + a += t1; + e = t1 + t2; + + t1 = d + Sum1 (a) + Ch (a, b, c) + k[t+12] + w[12]; + w[12] += S1 (w[10]) + w[5] + S0 (w[13]); + t2 = Sum0 (e) + Maj (e, f, g); + h += t1; + d = t1 + t2; + + t1 = c + Sum1 (h) + Ch (h, a, b) + k[t+13] + w[13]; + w[13] += S1 (w[11]) + w[6] + S0 (w[14]); + t2 = Sum0 (d) + Maj (d, e, f); + g += t1; + c = t1 + t2; + + t1 = b + Sum1 (g) + Ch (g, h, a) + k[t+14] + w[14]; + w[14] += S1 (w[12]) + w[7] + S0 (w[15]); + t2 = Sum0 (c) + Maj (c, d, e); + f += t1; + b = t1 + t2; + + t1 = a + Sum1 (f) + Ch (f, g, h) + k[t+15] + w[15]; + w[15] += S1 (w[13]) + w[8] + S0 (w[0]); + t2 = Sum0 (b) + Maj (b, c, d); + e += t1; + a = t1 + t2; + + t += 16; + } + + for (; t < 80; ) + { + u64 t1, t2; + + t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[0]; + t2 = Sum0 (a) + Maj (a, b, c); + d += t1; + h = t1 + t2; + + t1 = g + Sum1 (d) + Ch (d, e, f) + k[t+1] + w[1]; + t2 = Sum0 (h) + Maj (h, a, b); + c += t1; + g = t1 + t2; + + t1 = f + Sum1 (c) + Ch (c, d, e) + k[t+2] + w[2]; + t2 = Sum0 (g) + Maj (g, h, a); + b += t1; + f = t1 + t2; + + t1 = e + Sum1 (b) + Ch (b, c, d) + k[t+3] + w[3]; + t2 = Sum0 (f) + Maj (f, g, h); + a += t1; + e = t1 + t2; + + t1 = d + Sum1 (a) + Ch (a, b, c) + k[t+4] + w[4]; + t2 = Sum0 (e) + Maj (e, f, g); + h += t1; + d = t1 + t2; + + t1 = c + Sum1 (h) + Ch (h, a, b) + k[t+5] + w[5]; + t2 = Sum0 (d) + Maj (d, e, f); + g += t1; + c = t1 + t2; + + t1 = b + Sum1 (g) + Ch (g, h, a) + k[t+6] + w[6]; + t2 = Sum0 (c) + Maj (c, d, e); + f += t1; + b = t1 + t2; + + t1 = a + Sum1 (f) 
+ Ch (f, g, h) + k[t+7] + w[7]; + t2 = Sum0 (b) + Maj (b, c, d); + e += t1; + a = t1 + t2; + + t1 = h + Sum1 (e) + Ch (e, f, g) + k[t+8] + w[8]; + t2 = Sum0 (a) + Maj (a, b, c); + d += t1; + h = t1 + t2; + + t1 = g + Sum1 (d) + Ch (d, e, f) + k[t+9] + w[9]; + t2 = Sum0 (h) + Maj (h, a, b); + c += t1; + g = t1 + t2; + + t1 = f + Sum1 (c) + Ch (c, d, e) + k[t+10] + w[10]; + t2 = Sum0 (g) + Maj (g, h, a); + b += t1; + f = t1 + t2; + + t1 = e + Sum1 (b) + Ch (b, c, d) + k[t+11] + w[11]; + t2 = Sum0 (f) + Maj (f, g, h); + a += t1; + e = t1 + t2; + + t1 = d + Sum1 (a) + Ch (a, b, c) + k[t+12] + w[12]; + t2 = Sum0 (e) + Maj (e, f, g); + h += t1; + d = t1 + t2; + + t1 = c + Sum1 (h) + Ch (h, a, b) + k[t+13] + w[13]; + t2 = Sum0 (d) + Maj (d, e, f); + g += t1; + c = t1 + t2; + + t1 = b + Sum1 (g) + Ch (g, h, a) + k[t+14] + w[14]; + t2 = Sum0 (c) + Maj (c, d, e); + f += t1; + b = t1 + t2; + + t1 = a + Sum1 (f) + Ch (f, g, h) + k[t+15] + w[15]; + t2 = Sum0 (b) + Maj (b, c, d); + e += t1; + a = t1 + t2; + + t += 16; + } + + /* Update chaining vars. */ + hd->h0 += a; + hd->h1 += b; + hd->h2 += c; + hd->h3 += d; + hd->h4 += e; + hd->h5 += f; + hd->h6 += g; + hd->h7 += h; -#ifdef USE_ARM_ASM - burn = _gcry_sha512_transform_arm (&ctx->state, data, k, nblks); -#else - do - { - burn = transform_blk (&ctx->state, data) + 3 * sizeof(void*); data += 128; } while (--nblks); -#ifdef ASM_EXTRA_STACK - /* 'transform_blk' is typically inlined and XMM6-XMM15 are stored at - * the prologue of this function. Therefore need to add ASM_EXTRA_STACK to - * here too. - */ - burn += ASM_EXTRA_STACK; -#endif -#endif - - return burn; + return (8 + 16) * sizeof(u64) + sizeof(u32) + 3 * sizeof(void*); } +#endif /*!USE_ARM_ASM*/ /* The routine final terminates the computation and * returns the digest. * The handle is prepared for a new cycle, but adding bytes to the * handle will the destroy the returned buffer. * Returns: 64 bytes representing the digest. When used for sha384, * we take the leftmost 48 of those bytes. */ static void sha512_final (void *context) { SHA512_CONTEXT *hd = context; unsigned int stack_burn_depth; u64 t, th, msb, lsb; byte *p; _gcry_md_block_write (context, NULL, 0); /* flush */ ; t = hd->bctx.nblocks; /* if (sizeof t == sizeof hd->bctx.nblocks) */ th = hd->bctx.nblocks_high; /* else */ /* th = hd->bctx.nblocks >> 64; In case we ever use u128 */ /* multiply by 128 to make a byte count */ lsb = t << 7; msb = (th << 7) | (t >> 57); /* add the count */ t = lsb; if ((lsb += hd->bctx.count) < t) msb++; /* multiply by 8 to make a bit count */ t = lsb; lsb <<= 3; msb <<= 3; msb |= t >> 61; if (hd->bctx.count < 112) { /* enough room */ hd->bctx.buf[hd->bctx.count++] = 0x80; /* pad */ while (hd->bctx.count < 112) hd->bctx.buf[hd->bctx.count++] = 0; /* pad */ } else { /* need one extra block */ hd->bctx.buf[hd->bctx.count++] = 0x80; /* pad character */ while (hd->bctx.count < 128) hd->bctx.buf[hd->bctx.count++] = 0; _gcry_md_block_write (context, NULL, 0); /* flush */ ; memset (hd->bctx.buf, 0, 112); /* fill next block with zeroes */ } /* append the 128 bit count */ buf_put_be64(hd->bctx.buf + 112, msb); buf_put_be64(hd->bctx.buf + 120, lsb); - stack_burn_depth = transform (hd, hd->bctx.buf, 1); + stack_burn_depth = (*hd->bctx.bwrite) (hd, hd->bctx.buf, 1); _gcry_burn_stack (stack_burn_depth); p = hd->bctx.buf; #define X(a) do { buf_put_be64(p, hd->state.h##a); p += 8; } while (0) X (0); X (1); X (2); X (3); X (4); X (5); /* Note that these last two chunks are included even for SHA384. We just ignore them. 
*/ X (6); X (7); #undef X } static byte * sha512_read (void *context) { SHA512_CONTEXT *hd = (SHA512_CONTEXT *) context; return hd->bctx.buf; } /* Shortcut functions which puts the hash value of the supplied buffer * into outbuf which must have a size of 64 bytes. */ void _gcry_sha512_hash_buffer (void *outbuf, const void *buffer, size_t length) { SHA512_CONTEXT hd; sha512_init (&hd, 0); _gcry_md_block_write (&hd, buffer, length); sha512_final (&hd); memcpy (outbuf, hd.bctx.buf, 64); } /* Variant of the above shortcut function using multiple buffers. */ void _gcry_sha512_hash_buffers (void *outbuf, const gcry_buffer_t *iov, int iovcnt) { SHA512_CONTEXT hd; sha512_init (&hd, 0); for (;iovcnt > 0; iov++, iovcnt--) _gcry_md_block_write (&hd, (const char*)iov[0].data + iov[0].off, iov[0].len); sha512_final (&hd); memcpy (outbuf, hd.bctx.buf, 64); } /* Shortcut functions which puts the hash value of the supplied buffer * into outbuf which must have a size of 48 bytes. */ static void _gcry_sha384_hash_buffer (void *outbuf, const void *buffer, size_t length) { SHA512_CONTEXT hd; sha384_init (&hd, 0); _gcry_md_block_write (&hd, buffer, length); sha512_final (&hd); memcpy (outbuf, hd.bctx.buf, 48); } /* Variant of the above shortcut function using multiple buffers. */ static void _gcry_sha384_hash_buffers (void *outbuf, const gcry_buffer_t *iov, int iovcnt) { SHA512_CONTEXT hd; sha384_init (&hd, 0); for (;iovcnt > 0; iov++, iovcnt--) _gcry_md_block_write (&hd, (const char*)iov[0].data + iov[0].off, iov[0].len); sha512_final (&hd); memcpy (outbuf, hd.bctx.buf, 48); } /* Self-test section. */ static gpg_err_code_t selftests_sha384 (int extended, selftest_report_func_t report) { const char *what; const char *errtxt; what = "short string"; errtxt = _gcry_hash_selftest_check_one (GCRY_MD_SHA384, 0, "abc", 3, "\xcb\x00\x75\x3f\x45\xa3\x5e\x8b\xb5\xa0\x3d\x69\x9a\xc6\x50\x07" "\x27\x2c\x32\xab\x0e\xde\xd1\x63\x1a\x8b\x60\x5a\x43\xff\x5b\xed" "\x80\x86\x07\x2b\xa1\xe7\xcc\x23\x58\xba\xec\xa1\x34\xc8\x25\xa7", 48); if (errtxt) goto failed; if (extended) { what = "long string"; errtxt = _gcry_hash_selftest_check_one (GCRY_MD_SHA384, 0, "abcdefghbcdefghicdefghijdefghijkefghijklfghijklmghijklmn" "hijklmnoijklmnopjklmnopqklmnopqrlmnopqrsmnopqrstnopqrstu", 112, "\x09\x33\x0C\x33\xF7\x11\x47\xE8\x3D\x19\x2F\xC7\x82\xCD\x1B\x47" "\x53\x11\x1B\x17\x3B\x3B\x05\xD2\x2F\xA0\x80\x86\xE3\xB0\xF7\x12" "\xFC\xC7\xC7\x1A\x55\x7E\x2D\xB9\x66\xC3\xE9\xFA\x91\x74\x60\x39", 48); if (errtxt) goto failed; what = "one million \"a\""; errtxt = _gcry_hash_selftest_check_one (GCRY_MD_SHA384, 1, NULL, 0, "\x9D\x0E\x18\x09\x71\x64\x74\xCB\x08\x6E\x83\x4E\x31\x0A\x4A\x1C" "\xED\x14\x9E\x9C\x00\xF2\x48\x52\x79\x72\xCE\xC5\x70\x4C\x2A\x5B" "\x07\xB8\xB3\xDC\x38\xEC\xC4\xEB\xAE\x97\xDD\xD8\x7F\x3D\x89\x85", 48); if (errtxt) goto failed; } return 0; /* Succeeded. 
*/ failed: if (report) report ("digest", GCRY_MD_SHA384, what, errtxt); return GPG_ERR_SELFTEST_FAILED; } static gpg_err_code_t selftests_sha512 (int extended, selftest_report_func_t report) { const char *what; const char *errtxt; what = "short string"; errtxt = _gcry_hash_selftest_check_one (GCRY_MD_SHA512, 0, "abc", 3, "\xDD\xAF\x35\xA1\x93\x61\x7A\xBA\xCC\x41\x73\x49\xAE\x20\x41\x31" "\x12\xE6\xFA\x4E\x89\xA9\x7E\xA2\x0A\x9E\xEE\xE6\x4B\x55\xD3\x9A" "\x21\x92\x99\x2A\x27\x4F\xC1\xA8\x36\xBA\x3C\x23\xA3\xFE\xEB\xBD" "\x45\x4D\x44\x23\x64\x3C\xE8\x0E\x2A\x9A\xC9\x4F\xA5\x4C\xA4\x9F", 64); if (errtxt) goto failed; if (extended) { what = "long string"; errtxt = _gcry_hash_selftest_check_one (GCRY_MD_SHA512, 0, "abcdefghbcdefghicdefghijdefghijkefghijklfghijklmghijklmn" "hijklmnoijklmnopjklmnopqklmnopqrlmnopqrsmnopqrstnopqrstu", 112, "\x8E\x95\x9B\x75\xDA\xE3\x13\xDA\x8C\xF4\xF7\x28\x14\xFC\x14\x3F" "\x8F\x77\x79\xC6\xEB\x9F\x7F\xA1\x72\x99\xAE\xAD\xB6\x88\x90\x18" "\x50\x1D\x28\x9E\x49\x00\xF7\xE4\x33\x1B\x99\xDE\xC4\xB5\x43\x3A" "\xC7\xD3\x29\xEE\xB6\xDD\x26\x54\x5E\x96\xE5\x5B\x87\x4B\xE9\x09", 64); if (errtxt) goto failed; what = "one million \"a\""; errtxt = _gcry_hash_selftest_check_one (GCRY_MD_SHA512, 1, NULL, 0, "\xE7\x18\x48\x3D\x0C\xE7\x69\x64\x4E\x2E\x42\xC7\xBC\x15\xB4\x63" "\x8E\x1F\x98\xB1\x3B\x20\x44\x28\x56\x32\xA8\x03\xAF\xA9\x73\xEB" "\xDE\x0F\xF2\x44\x87\x7E\xA6\x0A\x4C\xB0\x43\x2C\xE5\x77\xC3\x1B" "\xEB\x00\x9C\x5C\x2C\x49\xAA\x2E\x4E\xAD\xB2\x17\xAD\x8C\xC0\x9B", 64); if (errtxt) goto failed; } return 0; /* Succeeded. */ failed: if (report) report ("digest", GCRY_MD_SHA512, what, errtxt); return GPG_ERR_SELFTEST_FAILED; } /* Run a full self-test for ALGO and return 0 on success. */ static gpg_err_code_t run_selftests (int algo, int extended, selftest_report_func_t report) { gpg_err_code_t ec; switch (algo) { case GCRY_MD_SHA384: ec = selftests_sha384 (extended, report); break; case GCRY_MD_SHA512: ec = selftests_sha512 (extended, report); break; default: ec = GPG_ERR_DIGEST_ALGO; break; } return ec; } static byte sha512_asn[] = /* Object ID is 2.16.840.1.101.3.4.2.3 */ { 0x30, 0x51, 0x30, 0x0d, 0x06, 0x09, 0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x03, 0x05, 0x00, 0x04, 0x40 }; static gcry_md_oid_spec_t oid_spec_sha512[] = { { "2.16.840.1.101.3.4.2.3" }, /* PKCS#1 sha512WithRSAEncryption */ { "1.2.840.113549.1.1.13" }, { NULL } }; gcry_md_spec_t _gcry_digest_spec_sha512 = { GCRY_MD_SHA512, {0, 1}, "SHA512", sha512_asn, DIM (sha512_asn), oid_spec_sha512, 64, sha512_init, _gcry_md_block_write, sha512_final, sha512_read, NULL, _gcry_sha512_hash_buffer, _gcry_sha512_hash_buffers, sizeof (SHA512_CONTEXT), run_selftests }; static byte sha384_asn[] = /* Object ID is 2.16.840.1.101.3.4.2.2 */ { 0x30, 0x41, 0x30, 0x0d, 0x06, 0x09, 0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x02, 0x05, 0x00, 0x04, 0x30 }; static gcry_md_oid_spec_t oid_spec_sha384[] = { { "2.16.840.1.101.3.4.2.2" }, /* PKCS#1 sha384WithRSAEncryption */ { "1.2.840.113549.1.1.12" }, /* SHA384WithECDSA: RFC 7427 (A.3.3.) */ { "1.2.840.10045.4.3.3" }, { NULL }, }; gcry_md_spec_t _gcry_digest_spec_sha384 = { GCRY_MD_SHA384, {0, 1}, "SHA384", sha384_asn, DIM (sha384_asn), oid_spec_sha384, 48, sha384_init, _gcry_md_block_write, sha512_final, sha512_read, NULL, _gcry_sha384_hash_buffer, _gcry_sha384_hash_buffers, sizeof (SHA512_CONTEXT), run_selftests };
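/* Usage sketch (editor's addition, not part of this patch): with the
 * transform now selected through bctx.bwrite at init time, callers keep
 * using the unchanged public API.  The FIPS-180-2 "abc" vector quoted at
 * the top of sha512.c can be checked with gcry_md_hash_buffer(); the test
 * program itself is illustrative and assumes libgcrypt is installed and
 * linked with -lgcrypt. */
#include <stdio.h>
#include <string.h>
#include <gcrypt.h>

int
main (void)
{
  static const unsigned char expected[64] =
    "\xdd\xaf\x35\xa1\x93\x61\x7a\xba\xcc\x41\x73\x49\xae\x20\x41\x31"
    "\x12\xe6\xfa\x4e\x89\xa9\x7e\xa2\x0a\x9e\xee\xe6\x4b\x55\xd3\x9a"
    "\x21\x92\x99\x2a\x27\x4f\xc1\xa8\x36\xba\x3c\x23\xa3\xfe\xeb\xbd"
    "\x45\x4d\x44\x23\x64\x3c\xe8\x0e\x2a\x9a\xc9\x4f\xa5\x4c\xa4\x9f";
  unsigned char digest[64];

  if (!gcry_check_version (NULL))   /* initialize libgcrypt */
    return 1;

  gcry_md_hash_buffer (GCRY_MD_SHA512, digest, "abc", 3);
  printf ("SHA-512(\"abc\") %s\n",
          memcmp (digest, expected, 64) == 0 ? "matches" : "MISMATCH");
  return 0;
}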