diff --git a/cipher/salsa20.c b/cipher/salsa20.c
index 72b28b03..d75fe515 100644
--- a/cipher/salsa20.c
+++ b/cipher/salsa20.c
@@ -1,576 +1,577 @@
/* salsa20.c - Bernstein's Salsa20 cipher
 * Copyright (C) 2012 Simon Josefsson, Niels Möller
 * Copyright (C) 2013 g10 Code GmbH
 *
 * This file is part of Libgcrypt.
 *
 * Libgcrypt is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * Libgcrypt is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, see <http://www.gnu.org/licenses/>.
 *
 * For a description of the algorithm, see:
 *   http://cr.yp.to/snuffle/spec.pdf
 *   http://cr.yp.to/snuffle/design.pdf
 */

/* The code is based on the code in Nettle
   (git commit id 9d2d8ddaee35b91a4e1a32ae77cba04bea3480e7)
   which in turn is based on
   salsa20-ref.c version 20051118
   D. J. Bernstein
   Public domain.
*/

#include <config.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "types.h"
#include "g10lib.h"
#include "cipher.h"
#include "bufhelp.h"


/* USE_AMD64 indicates whether to compile with AMD64 code. */
#undef USE_AMD64
#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)
# define USE_AMD64 1
#endif

/* USE_ARM_NEON_ASM indicates whether to enable ARM NEON assembly code. */
#undef USE_ARM_NEON_ASM
-#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__)
-# if defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \
-    defined(HAVE_GCC_INLINE_ASM_NEON)
+#ifdef ENABLE_NEON_SUPPORT
+# if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) \
+     && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) \
+     && defined(HAVE_GCC_INLINE_ASM_NEON)
#  define USE_ARM_NEON_ASM 1
# endif
-#endif
+#endif /*ENABLE_NEON_SUPPORT*/


#define SALSA20_MIN_KEY_SIZE 16  /* Bytes.  */
#define SALSA20_MAX_KEY_SIZE 32  /* Bytes.  */
#define SALSA20_BLOCK_SIZE   64  /* Bytes.  */
#define SALSA20_IV_SIZE       8  /* Bytes.  */
#define SALSA20_INPUT_LENGTH 16  /* Bytes.  */

/* Number of rounds.  The standard uses 20 rounds.  In any case the
   number of rounds must be even.  */
#define SALSA20_ROUNDS       20
#define SALSA20R12_ROUNDS    12

struct SALSA20_context_s;

typedef unsigned int (*salsa20_core_t) (u32 *dst, struct SALSA20_context_s *ctx,
                                        unsigned int rounds);
typedef void (* salsa20_keysetup_t)(struct SALSA20_context_s *ctx,
                                    const byte *key, int keylen);
typedef void (* salsa20_ivsetup_t)(struct SALSA20_context_s *ctx,
                                   const byte *iv);

typedef struct SALSA20_context_s
{
  /* Indices 1-4 and 11-14 hold the key (two identical copies for the
     shorter key size), indices 0, 5, 10, 15 are constant, indices 6, 7
     are the IV, and indices 8, 9 are the block counter:

     C K K K
     K C I I
     B B C K
     K K K C
  */
  u32 input[SALSA20_INPUT_LENGTH];
  u32 pad[SALSA20_INPUT_LENGTH];
  unsigned int unused; /* bytes in the pad.  */
#ifdef USE_ARM_NEON_ASM
  int use_neon;
#endif
  salsa20_keysetup_t keysetup;
  salsa20_ivsetup_t ivsetup;
  salsa20_core_t core;
} SALSA20_context_t;


/* The masking of the right shift is needed to allow n == 0 (using
   just 32 - n and 64 - n results in undefined behaviour).  Most uses
   of these macros use a constant and non-zero rotation count.
*/ #define ROTL32(n,x) (((x)<<(n)) | ((x)>>((-(n)&31)))) #define LE_SWAP32(v) le_bswap32(v) #define LE_READ_UINT32(p) buf_get_le32(p) static void salsa20_setiv (void *context, const byte *iv, size_t ivlen); static const char *selftest (void); #ifdef USE_AMD64 /* AMD64 assembly implementations of Salsa20. */ void _gcry_salsa20_amd64_keysetup(u32 *ctxinput, const void *key, int keybits); void _gcry_salsa20_amd64_ivsetup(u32 *ctxinput, const void *iv); unsigned int _gcry_salsa20_amd64_encrypt_blocks(u32 *ctxinput, const void *src, void *dst, size_t len, int rounds); static void salsa20_keysetup(SALSA20_context_t *ctx, const byte *key, int keylen) { _gcry_salsa20_amd64_keysetup(ctx->input, key, keylen * 8); } static void salsa20_ivsetup(SALSA20_context_t *ctx, const byte *iv) { _gcry_salsa20_amd64_ivsetup(ctx->input, iv); } static unsigned int salsa20_core (u32 *dst, SALSA20_context_t *ctx, unsigned int rounds) { memset(dst, 0, SALSA20_BLOCK_SIZE); return _gcry_salsa20_amd64_encrypt_blocks(ctx->input, dst, dst, 1, rounds); } #else /* USE_AMD64 */ #if 0 # define SALSA20_CORE_DEBUG(i) do { \ unsigned debug_j; \ for (debug_j = 0; debug_j < 16; debug_j++) \ { \ if (debug_j == 0) \ fprintf(stderr, "%2d:", (i)); \ else if (debug_j % 4 == 0) \ fprintf(stderr, "\n "); \ fprintf(stderr, " %8x", pad[debug_j]); \ } \ fprintf(stderr, "\n"); \ } while (0) #else # define SALSA20_CORE_DEBUG(i) #endif #define QROUND(x0, x1, x2, x3) \ do { \ x1 ^= ROTL32 ( 7, x0 + x3); \ x2 ^= ROTL32 ( 9, x1 + x0); \ x3 ^= ROTL32 (13, x2 + x1); \ x0 ^= ROTL32 (18, x3 + x2); \ } while(0) static unsigned int salsa20_core (u32 *dst, SALSA20_context_t *ctx, unsigned rounds) { u32 pad[SALSA20_INPUT_LENGTH], *src = ctx->input; unsigned int i; memcpy (pad, src, sizeof(pad)); for (i = 0; i < rounds; i += 2) { SALSA20_CORE_DEBUG (i); QROUND (pad[0], pad[4], pad[8], pad[12]); QROUND (pad[5], pad[9], pad[13], pad[1] ); QROUND (pad[10], pad[14], pad[2], pad[6] ); QROUND (pad[15], pad[3], pad[7], pad[11]); SALSA20_CORE_DEBUG (i+1); QROUND (pad[0], pad[1], pad[2], pad[3] ); QROUND (pad[5], pad[6], pad[7], pad[4] ); QROUND (pad[10], pad[11], pad[8], pad[9] ); QROUND (pad[15], pad[12], pad[13], pad[14]); } SALSA20_CORE_DEBUG (i); for (i = 0; i < SALSA20_INPUT_LENGTH; i++) { u32 t = pad[i] + src[i]; dst[i] = LE_SWAP32 (t); } /* Update counter. */ if (!++src[8]) src[9]++; /* burn_stack */ return ( 3*sizeof (void*) \ + 2*sizeof (void*) \ + 64 \ + sizeof (unsigned int) \ + sizeof (u32) ); } #undef QROUND #undef SALSA20_CORE_DEBUG static void salsa20_keysetup(SALSA20_context_t *ctx, const byte *key, int keylen) { /* These constants are the little endian encoding of the string "expand 32-byte k". For the 128 bit variant, the "32" in that string will be fixed up to "16". */ ctx->input[0] = 0x61707865; /* "apxe" */ ctx->input[5] = 0x3320646e; /* "3 dn" */ ctx->input[10] = 0x79622d32; /* "yb-2" */ ctx->input[15] = 0x6b206574; /* "k et" */ ctx->input[1] = LE_READ_UINT32(key + 0); ctx->input[2] = LE_READ_UINT32(key + 4); ctx->input[3] = LE_READ_UINT32(key + 8); ctx->input[4] = LE_READ_UINT32(key + 12); if (keylen == SALSA20_MAX_KEY_SIZE) /* 256 bits */ { ctx->input[11] = LE_READ_UINT32(key + 16); ctx->input[12] = LE_READ_UINT32(key + 20); ctx->input[13] = LE_READ_UINT32(key + 24); ctx->input[14] = LE_READ_UINT32(key + 28); } else /* 128 bits */ { ctx->input[11] = ctx->input[1]; ctx->input[12] = ctx->input[2]; ctx->input[13] = ctx->input[3]; ctx->input[14] = ctx->input[4]; ctx->input[5] -= 0x02000000; /* Change to "1 dn". 
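
The shift-count masking in ROTL32 above deserves one concrete illustration. Below is a minimal stand-alone sketch, not part of the patch (the macro is renamed and the test values are mine), showing that -(n) & 31 turns the n == 0 case into two well-defined 0-bit shifts, where the naive x >> (32 - n) would be undefined:

#include <stdint.h>
#include <stdio.h>

#define DEMO_ROTL32(n,x) (((x)<<(n)) | ((x)>>((-(n)&31))))

int
main (void)
{
  uint32_t v = 0x80000001u;

  /* n == 0: (-(0) & 31) == 0, so this evaluates to v | v == v,
     instead of the undefined v >> 32. */
  printf ("%08x\n", DEMO_ROTL32 (0, v));   /* prints 80000001 */
  printf ("%08x\n", DEMO_ROTL32 (7, v));   /* prints 000000c0 */
  return 0;
}
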
*/ ctx->input[10] += 0x00000004; /* Change to "yb-6". */ } } static void salsa20_ivsetup(SALSA20_context_t *ctx, const byte *iv) { ctx->input[6] = LE_READ_UINT32(iv + 0); ctx->input[7] = LE_READ_UINT32(iv + 4); /* Reset the block counter. */ ctx->input[8] = 0; ctx->input[9] = 0; } #endif /*!USE_AMD64*/ #ifdef USE_ARM_NEON_ASM /* ARM NEON implementation of Salsa20. */ unsigned int _gcry_arm_neon_salsa20_encrypt(void *c, const void *m, unsigned int nblks, void *k, unsigned int rounds); static unsigned int salsa20_core_neon (u32 *dst, SALSA20_context_t *ctx, unsigned int rounds) { return _gcry_arm_neon_salsa20_encrypt(dst, NULL, 1, ctx->input, rounds); } static void salsa20_ivsetup_neon(SALSA20_context_t *ctx, const byte *iv) { memcpy(ctx->input + 8, iv, 8); /* Reset the block counter. */ memset(ctx->input + 10, 0, 8); } static void salsa20_keysetup_neon(SALSA20_context_t *ctx, const byte *key, int klen) { static const unsigned char sigma32[16] = "expand 32-byte k"; static const unsigned char sigma16[16] = "expand 16-byte k"; if (klen == 16) { memcpy (ctx->input, key, 16); memcpy (ctx->input + 4, key, 16); /* Duplicate 128-bit key. */ memcpy (ctx->input + 12, sigma16, 16); } else { /* 32-byte key */ memcpy (ctx->input, key, 32); memcpy (ctx->input + 12, sigma32, 16); } } #endif /*USE_ARM_NEON_ASM*/ static gcry_err_code_t salsa20_do_setkey (SALSA20_context_t *ctx, const byte *key, unsigned int keylen) { static int initialized; static const char *selftest_failed; if (!initialized ) { initialized = 1; selftest_failed = selftest (); if (selftest_failed) log_error ("SALSA20 selftest failed (%s)\n", selftest_failed ); } if (selftest_failed) return GPG_ERR_SELFTEST_FAILED; if (keylen != SALSA20_MIN_KEY_SIZE && keylen != SALSA20_MAX_KEY_SIZE) return GPG_ERR_INV_KEYLEN; /* Default ops. */ ctx->keysetup = salsa20_keysetup; ctx->ivsetup = salsa20_ivsetup; ctx->core = salsa20_core; #ifdef USE_ARM_NEON_ASM ctx->use_neon = (_gcry_get_hw_features () & HWF_ARM_NEON) != 0; if (ctx->use_neon) { /* Use ARM NEON ops instead. */ ctx->keysetup = salsa20_keysetup_neon; ctx->ivsetup = salsa20_ivsetup_neon; ctx->core = salsa20_core_neon; } #endif ctx->keysetup (ctx, key, keylen); /* We default to a zero nonce. */ salsa20_setiv (ctx, NULL, 0); return 0; } static gcry_err_code_t salsa20_setkey (void *context, const byte *key, unsigned int keylen) { SALSA20_context_t *ctx = (SALSA20_context_t *)context; gcry_err_code_t rc = salsa20_do_setkey (ctx, key, keylen); _gcry_burn_stack (4 + sizeof (void *) + 4 * sizeof (void *)); return rc; } static void salsa20_setiv (void *context, const byte *iv, size_t ivlen) { SALSA20_context_t *ctx = (SALSA20_context_t *)context; byte tmp[SALSA20_IV_SIZE]; if (iv && ivlen != SALSA20_IV_SIZE) log_info ("WARNING: salsa20_setiv: bad ivlen=%u\n", (u32)ivlen); if (!iv || ivlen != SALSA20_IV_SIZE) memset (tmp, 0, sizeof(tmp)); else memcpy (tmp, iv, SALSA20_IV_SIZE); ctx->ivsetup (ctx, tmp); /* Reset the unused pad bytes counter. */ ctx->unused = 0; wipememory (tmp, sizeof(tmp)); } /* Note: This function requires LENGTH > 0. 
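
The two constant adjustments in the 128-bit branch of salsa20_keysetup above are exactly the byte edits that turn the sigma string "expand 32-byte k" into the tau string "expand 16-byte k": subtracting 0x02000000 rewrites '3' to '1' in word 5, and adding 4 rewrites '2' to '6' in word 10. A quick stand-alone check (the program is mine, not part of the patch, and assumes a little-endian host):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int
main (void)
{
  /* The four constant words; w[1] plays ctx->input[5] and w[2] plays
     ctx->input[10] from the code above. */
  uint32_t w[4] = { 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574 };
  char s[17];

  w[1] -= 0x02000000;   /* "nd 3" -> "nd 1" */
  w[2] += 0x00000004;   /* "2-by" -> "6-by" */

  memcpy (s, w, 16);    /* little-endian host assumed */
  s[16] = 0;
  puts (s);             /* prints: expand 16-byte k */
  return 0;
}
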
*/ static void salsa20_do_encrypt_stream (SALSA20_context_t *ctx, byte *outbuf, const byte *inbuf, size_t length, unsigned rounds) { unsigned int nburn, burn = 0; if (ctx->unused) { unsigned char *p = (void*)ctx->pad; size_t n; gcry_assert (ctx->unused < SALSA20_BLOCK_SIZE); n = ctx->unused; if (n > length) n = length; buf_xor (outbuf, inbuf, p + SALSA20_BLOCK_SIZE - ctx->unused, n); length -= n; outbuf += n; inbuf += n; ctx->unused -= n; if (!length) return; gcry_assert (!ctx->unused); } #ifdef USE_AMD64 if (length >= SALSA20_BLOCK_SIZE) { size_t nblocks = length / SALSA20_BLOCK_SIZE; burn = _gcry_salsa20_amd64_encrypt_blocks(ctx->input, inbuf, outbuf, nblocks, rounds); length -= SALSA20_BLOCK_SIZE * nblocks; outbuf += SALSA20_BLOCK_SIZE * nblocks; inbuf += SALSA20_BLOCK_SIZE * nblocks; } #endif #ifdef USE_ARM_NEON_ASM if (ctx->use_neon && length >= SALSA20_BLOCK_SIZE) { unsigned int nblocks = length / SALSA20_BLOCK_SIZE; _gcry_arm_neon_salsa20_encrypt (outbuf, inbuf, nblocks, ctx->input, rounds); length -= SALSA20_BLOCK_SIZE * nblocks; outbuf += SALSA20_BLOCK_SIZE * nblocks; inbuf += SALSA20_BLOCK_SIZE * nblocks; } #endif while (length > 0) { /* Create the next pad and bump the block counter. Note that it is the user's duty to change to another nonce not later than after 2^70 processed bytes. */ nburn = ctx->core (ctx->pad, ctx, rounds); burn = nburn > burn ? nburn : burn; if (length <= SALSA20_BLOCK_SIZE) { buf_xor (outbuf, inbuf, ctx->pad, length); ctx->unused = SALSA20_BLOCK_SIZE - length; break; } buf_xor (outbuf, inbuf, ctx->pad, SALSA20_BLOCK_SIZE); length -= SALSA20_BLOCK_SIZE; outbuf += SALSA20_BLOCK_SIZE; inbuf += SALSA20_BLOCK_SIZE; } _gcry_burn_stack (burn); } static void salsa20_encrypt_stream (void *context, byte *outbuf, const byte *inbuf, size_t length) { SALSA20_context_t *ctx = (SALSA20_context_t *)context; if (length) salsa20_do_encrypt_stream (ctx, outbuf, inbuf, length, SALSA20_ROUNDS); } static void salsa20r12_encrypt_stream (void *context, byte *outbuf, const byte *inbuf, size_t length) { SALSA20_context_t *ctx = (SALSA20_context_t *)context; if (length) salsa20_do_encrypt_stream (ctx, outbuf, inbuf, length, SALSA20R12_ROUNDS); } static const char* selftest (void) { SALSA20_context_t ctx; byte scratch[8+1]; byte buf[256+64+4]; int i; static byte key_1[] = { 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; static const byte nonce_1[] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; static const byte plaintext_1[] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; static const byte ciphertext_1[] = { 0xE3, 0xBE, 0x8F, 0xDD, 0x8B, 0xEC, 0xA2, 0xE3}; salsa20_setkey (&ctx, key_1, sizeof key_1); salsa20_setiv (&ctx, nonce_1, sizeof nonce_1); scratch[8] = 0; salsa20_encrypt_stream (&ctx, scratch, plaintext_1, sizeof plaintext_1); if (memcmp (scratch, ciphertext_1, sizeof ciphertext_1)) return "Salsa20 encryption test 1 failed."; if (scratch[8]) return "Salsa20 wrote too much."; salsa20_setkey( &ctx, key_1, sizeof(key_1)); salsa20_setiv (&ctx, nonce_1, sizeof nonce_1); salsa20_encrypt_stream (&ctx, scratch, scratch, sizeof plaintext_1); if (memcmp (scratch, plaintext_1, sizeof plaintext_1)) return "Salsa20 decryption test 1 failed."; for (i = 0; i < sizeof buf; i++) buf[i] = i; salsa20_setkey (&ctx, key_1, sizeof key_1); salsa20_setiv (&ctx, nonce_1, sizeof nonce_1); /*encrypt*/ salsa20_encrypt_stream (&ctx, buf, buf, 
sizeof buf);

  /* decrypt */
  salsa20_setkey (&ctx, key_1, sizeof key_1);
  salsa20_setiv (&ctx, nonce_1, sizeof nonce_1);
  salsa20_encrypt_stream (&ctx, buf, buf, 1);
  salsa20_encrypt_stream (&ctx, buf+1, buf+1, (sizeof buf)-1-1);
  salsa20_encrypt_stream (&ctx, buf+(sizeof buf)-1, buf+(sizeof buf)-1, 1);
  for (i = 0; i < sizeof buf; i++)
    if (buf[i] != (byte)i)
      return "Salsa20 encryption test 2 failed.";

  return NULL;
}


gcry_cipher_spec_t _gcry_cipher_spec_salsa20 =
  {
    GCRY_CIPHER_SALSA20,
    {0, 0},                   /* flags */
    "SALSA20",                /* name */
    NULL,                     /* aliases */
    NULL,                     /* oids */
    1,                        /* blocksize in bytes. */
    SALSA20_MAX_KEY_SIZE*8,   /* standard key length in bits. */
    sizeof (SALSA20_context_t),
    salsa20_setkey,
    NULL,
    NULL,
    salsa20_encrypt_stream,
    salsa20_encrypt_stream,
    NULL,
    NULL,
    salsa20_setiv
  };

gcry_cipher_spec_t _gcry_cipher_spec_salsa20r12 =
  {
    GCRY_CIPHER_SALSA20R12,
    {0, 0},                   /* flags */
    "SALSA20R12",             /* name */
    NULL,                     /* aliases */
    NULL,                     /* oids */
    1,                        /* blocksize in bytes. */
    SALSA20_MAX_KEY_SIZE*8,   /* standard key length in bits. */
    sizeof (SALSA20_context_t),
    salsa20_setkey,
    NULL,
    NULL,
    salsa20r12_encrypt_stream,
    salsa20r12_encrypt_stream,
    NULL,
    NULL,
    salsa20_setiv
  };
diff --git a/cipher/serpent.c b/cipher/serpent.c
index 8e647d4f..0be49da4 100644
--- a/cipher/serpent.c
+++ b/cipher/serpent.c
@@ -1,1324 +1,1324 @@
/* serpent.c - Implementation of the Serpent encryption algorithm.
 * Copyright (C) 2003, 2004, 2005 Free Software Foundation, Inc.
 *
 * This file is part of Libgcrypt.
 *
 * Libgcrypt is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * Libgcrypt is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
 * 02111-1307, USA.
 */

#include <config.h>
#include <stdio.h>
#include <string.h>

#include "types.h"
#include "g10lib.h"
#include "cipher.h"
#include "bithelp.h"
#include "bufhelp.h"
#include "cipher-selftest.h"


/* USE_SSE2 indicates whether to compile with AMD64 SSE2 code. */
#undef USE_SSE2
#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)
# define USE_SSE2 1
#endif

/* USE_AVX2 indicates whether to compile with AMD64 AVX2 code. */
#undef USE_AVX2
#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)
# if defined(ENABLE_AVX2_SUPPORT)
#  define USE_AVX2 1
# endif
#endif

/* USE_NEON indicates whether to enable ARM NEON assembly code. */
#undef USE_NEON
-#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__)
-# if defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \
-    defined(HAVE_GCC_INLINE_ASM_NEON)
+#ifdef ENABLE_NEON_SUPPORT
+# if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) \
+     && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) \
+     && defined(HAVE_GCC_INLINE_ASM_NEON)
#  define USE_NEON 1
# endif
-#endif
-
+#endif /*ENABLE_NEON_SUPPORT*/

/* Number of rounds per Serpent encrypt/decrypt operation.  */
#define ROUNDS 32

/* Magic number, used during generating of the subkeys.  */
#define PHI 0x9E3779B9

/* Serpent works on 128 bit blocks.  */
typedef u32 serpent_block_t[4];

/* Serpent key, provided by the user.
If the original key is shorter than 256 bits, it is padded.  */
typedef u32 serpent_key_t[8];

/* The key schedule consists of 33 128 bit subkeys.  */
typedef u32 serpent_subkeys_t[ROUNDS + 1][4];

/* A Serpent context.  */
typedef struct serpent_context
{
  serpent_subkeys_t keys;	/* Generated subkeys.  */

#ifdef USE_AVX2
  int use_avx2;
#endif
#ifdef USE_NEON
  int use_neon;
#endif
} serpent_context_t;


#ifdef USE_SSE2
/* Assembler implementations of Serpent using SSE2.  Process 8 blocks in
   parallel.  */
extern void _gcry_serpent_sse2_ctr_enc(serpent_context_t *ctx,
                                       unsigned char *out,
                                       const unsigned char *in,
                                       unsigned char *ctr);

extern void _gcry_serpent_sse2_cbc_dec(serpent_context_t *ctx,
                                       unsigned char *out,
                                       const unsigned char *in,
                                       unsigned char *iv);

extern void _gcry_serpent_sse2_cfb_dec(serpent_context_t *ctx,
                                       unsigned char *out,
                                       const unsigned char *in,
                                       unsigned char *iv);
#endif

#ifdef USE_AVX2
/* Assembler implementations of Serpent using AVX2.  Process 16 blocks in
   parallel.  */
extern void _gcry_serpent_avx2_ctr_enc(serpent_context_t *ctx,
                                       unsigned char *out,
                                       const unsigned char *in,
                                       unsigned char *ctr);

extern void _gcry_serpent_avx2_cbc_dec(serpent_context_t *ctx,
                                       unsigned char *out,
                                       const unsigned char *in,
                                       unsigned char *iv);

extern void _gcry_serpent_avx2_cfb_dec(serpent_context_t *ctx,
                                       unsigned char *out,
                                       const unsigned char *in,
                                       unsigned char *iv);
#endif

#ifdef USE_NEON
/* Assembler implementations of Serpent using ARM NEON.  Process 8 blocks in
   parallel.  */
extern void _gcry_serpent_neon_ctr_enc(serpent_context_t *ctx,
                                       unsigned char *out,
                                       const unsigned char *in,
                                       unsigned char *ctr);

extern void _gcry_serpent_neon_cbc_dec(serpent_context_t *ctx,
                                       unsigned char *out,
                                       const unsigned char *in,
                                       unsigned char *iv);

extern void _gcry_serpent_neon_cfb_dec(serpent_context_t *ctx,
                                       unsigned char *out,
                                       const unsigned char *in,
                                       unsigned char *iv);
#endif


/* A prototype.  */
static const char *serpent_test (void);


/*
 * These are the S-Boxes of Serpent from the following research paper.
 *
 *  D. A. Osvik, “Speeding up Serpent,” in Third AES Candidate Conference,
 *   (New York, New York, USA), p. 317–329, National Institute of Standards and
 *   Technology, 2000.
* * Paper is also available at: http://www.ii.uib.no/~osvik/pub/aes3.pdf * */ #define SBOX0(r0, r1, r2, r3, w, x, y, z) \ { \ u32 r4; \ \ r3 ^= r0; r4 = r1; \ r1 &= r3; r4 ^= r2; \ r1 ^= r0; r0 |= r3; \ r0 ^= r4; r4 ^= r3; \ r3 ^= r2; r2 |= r1; \ r2 ^= r4; r4 = ~r4; \ r4 |= r1; r1 ^= r3; \ r1 ^= r4; r3 |= r0; \ r1 ^= r3; r4 ^= r3; \ \ w = r1; x = r4; y = r2; z = r0; \ } #define SBOX0_INVERSE(r0, r1, r2, r3, w, x, y, z) \ { \ u32 r4; \ \ r2 = ~r2; r4 = r1; \ r1 |= r0; r4 = ~r4; \ r1 ^= r2; r2 |= r4; \ r1 ^= r3; r0 ^= r4; \ r2 ^= r0; r0 &= r3; \ r4 ^= r0; r0 |= r1; \ r0 ^= r2; r3 ^= r4; \ r2 ^= r1; r3 ^= r0; \ r3 ^= r1; \ r2 &= r3; \ r4 ^= r2; \ \ w = r0; x = r4; y = r1; z = r3; \ } #define SBOX1(r0, r1, r2, r3, w, x, y, z) \ { \ u32 r4; \ \ r0 = ~r0; r2 = ~r2; \ r4 = r0; r0 &= r1; \ r2 ^= r0; r0 |= r3; \ r3 ^= r2; r1 ^= r0; \ r0 ^= r4; r4 |= r1; \ r1 ^= r3; r2 |= r0; \ r2 &= r4; r0 ^= r1; \ r1 &= r2; \ r1 ^= r0; r0 &= r2; \ r0 ^= r4; \ \ w = r2; x = r0; y = r3; z = r1; \ } #define SBOX1_INVERSE(r0, r1, r2, r3, w, x, y, z) \ { \ u32 r4; \ \ r4 = r1; r1 ^= r3; \ r3 &= r1; r4 ^= r2; \ r3 ^= r0; r0 |= r1; \ r2 ^= r3; r0 ^= r4; \ r0 |= r2; r1 ^= r3; \ r0 ^= r1; r1 |= r3; \ r1 ^= r0; r4 = ~r4; \ r4 ^= r1; r1 |= r0; \ r1 ^= r0; \ r1 |= r4; \ r3 ^= r1; \ \ w = r4; x = r0; y = r3; z = r2; \ } #define SBOX2(r0, r1, r2, r3, w, x, y, z) \ { \ u32 r4; \ \ r4 = r0; r0 &= r2; \ r0 ^= r3; r2 ^= r1; \ r2 ^= r0; r3 |= r4; \ r3 ^= r1; r4 ^= r2; \ r1 = r3; r3 |= r4; \ r3 ^= r0; r0 &= r1; \ r4 ^= r0; r1 ^= r3; \ r1 ^= r4; r4 = ~r4; \ \ w = r2; x = r3; y = r1; z = r4; \ } #define SBOX2_INVERSE(r0, r1, r2, r3, w, x, y, z) \ { \ u32 r4; \ \ r2 ^= r3; r3 ^= r0; \ r4 = r3; r3 &= r2; \ r3 ^= r1; r1 |= r2; \ r1 ^= r4; r4 &= r3; \ r2 ^= r3; r4 &= r0; \ r4 ^= r2; r2 &= r1; \ r2 |= r0; r3 = ~r3; \ r2 ^= r3; r0 ^= r3; \ r0 &= r1; r3 ^= r4; \ r3 ^= r0; \ \ w = r1; x = r4; y = r2; z = r3; \ } #define SBOX3(r0, r1, r2, r3, w, x, y, z) \ { \ u32 r4; \ \ r4 = r0; r0 |= r3; \ r3 ^= r1; r1 &= r4; \ r4 ^= r2; r2 ^= r3; \ r3 &= r0; r4 |= r1; \ r3 ^= r4; r0 ^= r1; \ r4 &= r0; r1 ^= r3; \ r4 ^= r2; r1 |= r0; \ r1 ^= r2; r0 ^= r3; \ r2 = r1; r1 |= r3; \ r1 ^= r0; \ \ w = r1; x = r2; y = r3; z = r4; \ } #define SBOX3_INVERSE(r0, r1, r2, r3, w, x, y, z) \ { \ u32 r4; \ \ r4 = r2; r2 ^= r1; \ r0 ^= r2; r4 &= r2; \ r4 ^= r0; r0 &= r1; \ r1 ^= r3; r3 |= r4; \ r2 ^= r3; r0 ^= r3; \ r1 ^= r4; r3 &= r2; \ r3 ^= r1; r1 ^= r0; \ r1 |= r2; r0 ^= r3; \ r1 ^= r4; \ r0 ^= r1; \ \ w = r2; x = r1; y = r3; z = r0; \ } #define SBOX4(r0, r1, r2, r3, w, x, y, z) \ { \ u32 r4; \ \ r1 ^= r3; r3 = ~r3; \ r2 ^= r3; r3 ^= r0; \ r4 = r1; r1 &= r3; \ r1 ^= r2; r4 ^= r3; \ r0 ^= r4; r2 &= r4; \ r2 ^= r0; r0 &= r1; \ r3 ^= r0; r4 |= r1; \ r4 ^= r0; r0 |= r3; \ r0 ^= r2; r2 &= r3; \ r0 = ~r0; r4 ^= r2; \ \ w = r1; x = r4; y = r0; z = r3; \ } #define SBOX4_INVERSE(r0, r1, r2, r3, w, x, y, z) \ { \ u32 r4; \ \ r4 = r2; r2 &= r3; \ r2 ^= r1; r1 |= r3; \ r1 &= r0; r4 ^= r2; \ r4 ^= r1; r1 &= r2; \ r0 = ~r0; r3 ^= r4; \ r1 ^= r3; r3 &= r0; \ r3 ^= r2; r0 ^= r1; \ r2 &= r0; r3 ^= r0; \ r2 ^= r4; \ r2 |= r3; r3 ^= r0; \ r2 ^= r1; \ \ w = r0; x = r3; y = r2; z = r4; \ } #define SBOX5(r0, r1, r2, r3, w, x, y, z) \ { \ u32 r4; \ \ r0 ^= r1; r1 ^= r3; \ r3 = ~r3; r4 = r1; \ r1 &= r0; r2 ^= r3; \ r1 ^= r2; r2 |= r4; \ r4 ^= r3; r3 &= r1; \ r3 ^= r0; r4 ^= r1; \ r4 ^= r2; r2 ^= r0; \ r0 &= r3; r2 = ~r2; \ r0 ^= r4; r4 |= r3; \ r2 ^= r4; \ \ w = r1; x = r3; y = r0; z = r2; \ } #define SBOX5_INVERSE(r0, r1, r2, r3, w, x, y, z) \ { \ u32 r4; \ \ r1 = ~r1; r4 = r3; \ r2 ^= r1; r3 
|= r0; \ r3 ^= r2; r2 |= r1; \ r2 &= r0; r4 ^= r3; \ r2 ^= r4; r4 |= r0; \ r4 ^= r1; r1 &= r2; \ r1 ^= r3; r4 ^= r2; \ r3 &= r4; r4 ^= r1; \ r3 ^= r4; r4 = ~r4; \ r3 ^= r0; \ \ w = r1; x = r4; y = r3; z = r2; \ } #define SBOX6(r0, r1, r2, r3, w, x, y, z) \ { \ u32 r4; \ \ r2 = ~r2; r4 = r3; \ r3 &= r0; r0 ^= r4; \ r3 ^= r2; r2 |= r4; \ r1 ^= r3; r2 ^= r0; \ r0 |= r1; r2 ^= r1; \ r4 ^= r0; r0 |= r3; \ r0 ^= r2; r4 ^= r3; \ r4 ^= r0; r3 = ~r3; \ r2 &= r4; \ r2 ^= r3; \ \ w = r0; x = r1; y = r4; z = r2; \ } #define SBOX6_INVERSE(r0, r1, r2, r3, w, x, y, z) \ { \ u32 r4; \ \ r0 ^= r2; r4 = r2; \ r2 &= r0; r4 ^= r3; \ r2 = ~r2; r3 ^= r1; \ r2 ^= r3; r4 |= r0; \ r0 ^= r2; r3 ^= r4; \ r4 ^= r1; r1 &= r3; \ r1 ^= r0; r0 ^= r3; \ r0 |= r2; r3 ^= r1; \ r4 ^= r0; \ \ w = r1; x = r2; y = r4; z = r3; \ } #define SBOX7(r0, r1, r2, r3, w, x, y, z) \ { \ u32 r4; \ \ r4 = r1; r1 |= r2; \ r1 ^= r3; r4 ^= r2; \ r2 ^= r1; r3 |= r4; \ r3 &= r0; r4 ^= r2; \ r3 ^= r1; r1 |= r4; \ r1 ^= r0; r0 |= r4; \ r0 ^= r2; r1 ^= r4; \ r2 ^= r1; r1 &= r0; \ r1 ^= r4; r2 = ~r2; \ r2 |= r0; \ r4 ^= r2; \ \ w = r4; x = r3; y = r1; z = r0; \ } #define SBOX7_INVERSE(r0, r1, r2, r3, w, x, y, z) \ { \ u32 r4; \ \ r4 = r2; r2 ^= r0; \ r0 &= r3; r4 |= r3; \ r2 = ~r2; r3 ^= r1; \ r1 |= r0; r0 ^= r2; \ r2 &= r4; r3 &= r4; \ r1 ^= r2; r2 ^= r0; \ r0 |= r2; r4 ^= r1; \ r0 ^= r3; r3 ^= r4; \ r4 |= r0; r3 ^= r2; \ r4 ^= r2; \ \ w = r3; x = r0; y = r1; z = r4; \ } /* XOR BLOCK1 into BLOCK0. */ #define BLOCK_XOR(block0, block1) \ { \ block0[0] ^= block1[0]; \ block0[1] ^= block1[1]; \ block0[2] ^= block1[2]; \ block0[3] ^= block1[3]; \ } /* Copy BLOCK_SRC to BLOCK_DST. */ #define BLOCK_COPY(block_dst, block_src) \ { \ block_dst[0] = block_src[0]; \ block_dst[1] = block_src[1]; \ block_dst[2] = block_src[2]; \ block_dst[3] = block_src[3]; \ } /* Apply SBOX number WHICH to to the block found in ARRAY0, writing the output to the block found in ARRAY1. */ #define SBOX(which, array0, array1) \ SBOX##which (array0[0], array0[1], array0[2], array0[3], \ array1[0], array1[1], array1[2], array1[3]); /* Apply inverse SBOX number WHICH to to the block found in ARRAY0, writing the output to the block found in ARRAY1. */ #define SBOX_INVERSE(which, array0, array1) \ SBOX##which##_INVERSE (array0[0], array0[1], array0[2], array0[3], \ array1[0], array1[1], array1[2], array1[3]); /* Apply the linear transformation to BLOCK. */ #define LINEAR_TRANSFORMATION(block) \ { \ block[0] = rol (block[0], 13); \ block[2] = rol (block[2], 3); \ block[1] = block[1] ^ block[0] ^ block[2]; \ block[3] = block[3] ^ block[2] ^ (block[0] << 3); \ block[1] = rol (block[1], 1); \ block[3] = rol (block[3], 7); \ block[0] = block[0] ^ block[1] ^ block[3]; \ block[2] = block[2] ^ block[3] ^ (block[1] << 7); \ block[0] = rol (block[0], 5); \ block[2] = rol (block[2], 22); \ } /* Apply the inverse linear transformation to BLOCK. */ #define LINEAR_TRANSFORMATION_INVERSE(block) \ { \ block[2] = ror (block[2], 22); \ block[0] = ror (block[0] , 5); \ block[2] = block[2] ^ block[3] ^ (block[1] << 7); \ block[0] = block[0] ^ block[1] ^ block[3]; \ block[3] = ror (block[3], 7); \ block[1] = ror (block[1], 1); \ block[3] = block[3] ^ block[2] ^ (block[0] << 3); \ block[1] = block[1] ^ block[0] ^ block[2]; \ block[2] = ror (block[2], 3); \ block[0] = ror (block[0], 13); \ } /* Apply a Serpent round to BLOCK, using the SBOX number WHICH and the subkeys contained in SUBKEYS. Use BLOCK_TMP as temporary storage. This macro increments `round'. 
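
These bitsliced networks can be sanity-checked against the Serpent lookup tables without any cipher scaffolding: load nibble value v into bit position v of each input register, run one macro invocation, and read 32 parallel results back out. A sketch for SBOX0, reusing the macro defined above; the S0 table is transcribed here from the Serpent submission, so treat both the table and the bit-order convention as assumptions to verify against the specification:

#include <assert.h>
#include <stdint.h>

typedef uint32_t u32;

/* S0 as I transcribe it from the Serpent specification (assumption). */
static const unsigned char S0[16] =
  { 3, 8, 15, 1, 10, 6, 5, 11, 14, 13, 4, 2, 7, 0, 9, 12 };

static void
check_sbox0 (void)
{
  u32 r0 = 0, r1 = 0, r2 = 0, r3 = 0, w, x, y, z;
  unsigned int v;

  /* Bit v of register ri holds bit i of the input nibble v. */
  for (v = 0; v < 16; v++)
    {
      r0 |= (u32)((v >> 0) & 1) << v;
      r1 |= (u32)((v >> 1) & 1) << v;
      r2 |= (u32)((v >> 2) & 1) << v;
      r3 |= (u32)((v >> 3) & 1) << v;
    }

  SBOX0 (r0, r1, r2, r3, w, x, y, z);

  for (v = 0; v < 16; v++)
    assert ((((w >> v) & 1)
             | (((x >> v) & 1) << 1)
             | (((y >> v) & 1) << 2)
             | (((z >> v) & 1) << 3)) == S0[v]);
}
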
*/ #define ROUND(which, subkeys, block, block_tmp) \ { \ BLOCK_XOR (block, subkeys[round]); \ round++; \ SBOX (which, block, block_tmp); \ LINEAR_TRANSFORMATION (block_tmp); \ BLOCK_COPY (block, block_tmp); \ } /* Apply the last Serpent round to BLOCK, using the SBOX number WHICH and the subkeys contained in SUBKEYS. Use BLOCK_TMP as temporary storage. The result will be stored in BLOCK_TMP. This macro increments `round'. */ #define ROUND_LAST(which, subkeys, block, block_tmp) \ { \ BLOCK_XOR (block, subkeys[round]); \ round++; \ SBOX (which, block, block_tmp); \ BLOCK_XOR (block_tmp, subkeys[round]); \ round++; \ } /* Apply an inverse Serpent round to BLOCK, using the SBOX number WHICH and the subkeys contained in SUBKEYS. Use BLOCK_TMP as temporary storage. This macro increments `round'. */ #define ROUND_INVERSE(which, subkey, block, block_tmp) \ { \ LINEAR_TRANSFORMATION_INVERSE (block); \ SBOX_INVERSE (which, block, block_tmp); \ BLOCK_XOR (block_tmp, subkey[round]); \ round--; \ BLOCK_COPY (block, block_tmp); \ } /* Apply the first Serpent round to BLOCK, using the SBOX number WHICH and the subkeys contained in SUBKEYS. Use BLOCK_TMP as temporary storage. The result will be stored in BLOCK_TMP. This macro increments `round'. */ #define ROUND_FIRST_INVERSE(which, subkeys, block, block_tmp) \ { \ BLOCK_XOR (block, subkeys[round]); \ round--; \ SBOX_INVERSE (which, block, block_tmp); \ BLOCK_XOR (block_tmp, subkeys[round]); \ round--; \ } /* Convert the user provided key KEY of KEY_LENGTH bytes into the internally used format. */ static void serpent_key_prepare (const byte *key, unsigned int key_length, serpent_key_t key_prepared) { int i; /* Copy key. */ key_length /= 4; for (i = 0; i < key_length; i++) key_prepared[i] = buf_get_le32 (key + i * 4); if (i < 8) { /* Key must be padded according to the Serpent specification. */ key_prepared[i] = 0x00000001; for (i++; i < 8; i++) key_prepared[i] = 0; } } /* Derive the 33 subkeys from KEY and store them in SUBKEYS. */ static void serpent_subkeys_generate (serpent_key_t key, serpent_subkeys_t subkeys) { u32 w[8]; /* The `prekey'. */ u32 ws[4]; u32 wt[4]; /* Initialize with key values. */ w[0] = key[0]; w[1] = key[1]; w[2] = key[2]; w[3] = key[3]; w[4] = key[4]; w[5] = key[5]; w[6] = key[6]; w[7] = key[7]; /* Expand to intermediate key using the affine recurrence. */ #define EXPAND_KEY4(wo, r) \ wo[0] = w[(r+0)%8] = \ rol (w[(r+0)%8] ^ w[(r+3)%8] ^ w[(r+5)%8] ^ w[(r+7)%8] ^ PHI ^ (r+0), 11); \ wo[1] = w[(r+1)%8] = \ rol (w[(r+1)%8] ^ w[(r+4)%8] ^ w[(r+6)%8] ^ w[(r+0)%8] ^ PHI ^ (r+1), 11); \ wo[2] = w[(r+2)%8] = \ rol (w[(r+2)%8] ^ w[(r+5)%8] ^ w[(r+7)%8] ^ w[(r+1)%8] ^ PHI ^ (r+2), 11); \ wo[3] = w[(r+3)%8] = \ rol (w[(r+3)%8] ^ w[(r+6)%8] ^ w[(r+0)%8] ^ w[(r+2)%8] ^ PHI ^ (r+3), 11); #define EXPAND_KEY(r) \ EXPAND_KEY4(ws, (r)); \ EXPAND_KEY4(wt, (r + 4)); /* Calculate subkeys via S-Boxes, in bitslice mode. 
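
serpent_key_prepare above implements the specification's padding rule: a key shorter than 256 bits is extended with a single 1 bit (the 0x00000001 word) followed by zeros. For illustration, a stand-alone equivalent for the common 128-bit case; the helper name and the open-coded little-endian load are mine, where the real code uses buf_get_le32:

#include <stdint.h>

static void
demo_pad_128bit_key (const unsigned char key[16], uint32_t out[8])
{
  int i;

  for (i = 0; i < 4; i++)        /* little-endian word loads */
    out[i] = (uint32_t)key[4*i]
             | (uint32_t)key[4*i+1] << 8
             | (uint32_t)key[4*i+2] << 16
             | (uint32_t)key[4*i+3] << 24;
  out[4] = 0x00000001;           /* the appended 1 bit */
  for (i = 5; i < 8; i++)        /* zero fill up to 256 bits */
    out[i] = 0;
}
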
*/ EXPAND_KEY (0); SBOX (3, ws, subkeys[0]); SBOX (2, wt, subkeys[1]); EXPAND_KEY (8); SBOX (1, ws, subkeys[2]); SBOX (0, wt, subkeys[3]); EXPAND_KEY (16); SBOX (7, ws, subkeys[4]); SBOX (6, wt, subkeys[5]); EXPAND_KEY (24); SBOX (5, ws, subkeys[6]); SBOX (4, wt, subkeys[7]); EXPAND_KEY (32); SBOX (3, ws, subkeys[8]); SBOX (2, wt, subkeys[9]); EXPAND_KEY (40); SBOX (1, ws, subkeys[10]); SBOX (0, wt, subkeys[11]); EXPAND_KEY (48); SBOX (7, ws, subkeys[12]); SBOX (6, wt, subkeys[13]); EXPAND_KEY (56); SBOX (5, ws, subkeys[14]); SBOX (4, wt, subkeys[15]); EXPAND_KEY (64); SBOX (3, ws, subkeys[16]); SBOX (2, wt, subkeys[17]); EXPAND_KEY (72); SBOX (1, ws, subkeys[18]); SBOX (0, wt, subkeys[19]); EXPAND_KEY (80); SBOX (7, ws, subkeys[20]); SBOX (6, wt, subkeys[21]); EXPAND_KEY (88); SBOX (5, ws, subkeys[22]); SBOX (4, wt, subkeys[23]); EXPAND_KEY (96); SBOX (3, ws, subkeys[24]); SBOX (2, wt, subkeys[25]); EXPAND_KEY (104); SBOX (1, ws, subkeys[26]); SBOX (0, wt, subkeys[27]); EXPAND_KEY (112); SBOX (7, ws, subkeys[28]); SBOX (6, wt, subkeys[29]); EXPAND_KEY (120); SBOX (5, ws, subkeys[30]); SBOX (4, wt, subkeys[31]); EXPAND_KEY4 (ws, 128); SBOX (3, ws, subkeys[32]); wipememory (ws, sizeof (ws)); wipememory (wt, sizeof (wt)); wipememory (w, sizeof (w)); } /* Initialize CONTEXT with the key KEY of KEY_LENGTH bits. */ static void serpent_setkey_internal (serpent_context_t *context, const byte *key, unsigned int key_length) { serpent_key_t key_prepared; serpent_key_prepare (key, key_length, key_prepared); serpent_subkeys_generate (key_prepared, context->keys); #ifdef USE_AVX2 context->use_avx2 = 0; if ((_gcry_get_hw_features () & HWF_INTEL_AVX2)) { context->use_avx2 = 1; } #endif #ifdef USE_NEON context->use_neon = 0; if ((_gcry_get_hw_features () & HWF_ARM_NEON)) { context->use_neon = 1; } #endif wipememory (key_prepared, sizeof(key_prepared)); } /* Initialize CTX with the key KEY of KEY_LENGTH bytes. */ static gcry_err_code_t serpent_setkey (void *ctx, const byte *key, unsigned int key_length) { serpent_context_t *context = ctx; static const char *serpent_test_ret; static int serpent_init_done; gcry_err_code_t ret = GPG_ERR_NO_ERROR; if (! serpent_init_done) { /* Execute a self-test the first time, Serpent is used. 
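
EXPAND_KEY4 above is an unrolled, in-place rendering of the Serpent prekey recurrence w_i = (w_{i-8} ^ w_{i-5} ^ w_{i-3} ^ w_{i-1} ^ PHI ^ i) rotated left by 11, for i = 0..131. A loop-shaped sketch of the same recurrence (illustration only, with renamed macros; the real code keeps just the sliding eight-word window and feeds each batch of four words straight into an S-box):

#include <stdint.h>

#define DEMO_PHI 0x9E3779B9
#define DEMO_ROL(x,n) (((x) << (n)) | ((x) >> (32 - (n))))

/* w[0..7] holds the prepared key; w[8..139] receives the 132 prekey
   words, indexed so that array slot i is spec word i - 8. */
static void
demo_prekey (uint32_t w[140])
{
  unsigned int i;

  for (i = 8; i < 140; i++)
    w[i] = DEMO_ROL (w[i-8] ^ w[i-5] ^ w[i-3] ^ w[i-1]
                     ^ DEMO_PHI ^ (i - 8), 11);
}
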
*/ serpent_init_done = 1; serpent_test_ret = serpent_test (); if (serpent_test_ret) log_error ("Serpent test failure: %s\n", serpent_test_ret); } if (serpent_test_ret) ret = GPG_ERR_SELFTEST_FAILED; else serpent_setkey_internal (context, key, key_length); return ret; } static void serpent_encrypt_internal (serpent_context_t *context, const byte *input, byte *output) { serpent_block_t b, b_next; int round = 0; b[0] = buf_get_le32 (input + 0); b[1] = buf_get_le32 (input + 4); b[2] = buf_get_le32 (input + 8); b[3] = buf_get_le32 (input + 12); ROUND (0, context->keys, b, b_next); ROUND (1, context->keys, b, b_next); ROUND (2, context->keys, b, b_next); ROUND (3, context->keys, b, b_next); ROUND (4, context->keys, b, b_next); ROUND (5, context->keys, b, b_next); ROUND (6, context->keys, b, b_next); ROUND (7, context->keys, b, b_next); ROUND (0, context->keys, b, b_next); ROUND (1, context->keys, b, b_next); ROUND (2, context->keys, b, b_next); ROUND (3, context->keys, b, b_next); ROUND (4, context->keys, b, b_next); ROUND (5, context->keys, b, b_next); ROUND (6, context->keys, b, b_next); ROUND (7, context->keys, b, b_next); ROUND (0, context->keys, b, b_next); ROUND (1, context->keys, b, b_next); ROUND (2, context->keys, b, b_next); ROUND (3, context->keys, b, b_next); ROUND (4, context->keys, b, b_next); ROUND (5, context->keys, b, b_next); ROUND (6, context->keys, b, b_next); ROUND (7, context->keys, b, b_next); ROUND (0, context->keys, b, b_next); ROUND (1, context->keys, b, b_next); ROUND (2, context->keys, b, b_next); ROUND (3, context->keys, b, b_next); ROUND (4, context->keys, b, b_next); ROUND (5, context->keys, b, b_next); ROUND (6, context->keys, b, b_next); ROUND_LAST (7, context->keys, b, b_next); buf_put_le32 (output + 0, b_next[0]); buf_put_le32 (output + 4, b_next[1]); buf_put_le32 (output + 8, b_next[2]); buf_put_le32 (output + 12, b_next[3]); } static void serpent_decrypt_internal (serpent_context_t *context, const byte *input, byte *output) { serpent_block_t b, b_next; int round = ROUNDS; b_next[0] = buf_get_le32 (input + 0); b_next[1] = buf_get_le32 (input + 4); b_next[2] = buf_get_le32 (input + 8); b_next[3] = buf_get_le32 (input + 12); ROUND_FIRST_INVERSE (7, context->keys, b_next, b); ROUND_INVERSE (6, context->keys, b, b_next); ROUND_INVERSE (5, context->keys, b, b_next); ROUND_INVERSE (4, context->keys, b, b_next); ROUND_INVERSE (3, context->keys, b, b_next); ROUND_INVERSE (2, context->keys, b, b_next); ROUND_INVERSE (1, context->keys, b, b_next); ROUND_INVERSE (0, context->keys, b, b_next); ROUND_INVERSE (7, context->keys, b, b_next); ROUND_INVERSE (6, context->keys, b, b_next); ROUND_INVERSE (5, context->keys, b, b_next); ROUND_INVERSE (4, context->keys, b, b_next); ROUND_INVERSE (3, context->keys, b, b_next); ROUND_INVERSE (2, context->keys, b, b_next); ROUND_INVERSE (1, context->keys, b, b_next); ROUND_INVERSE (0, context->keys, b, b_next); ROUND_INVERSE (7, context->keys, b, b_next); ROUND_INVERSE (6, context->keys, b, b_next); ROUND_INVERSE (5, context->keys, b, b_next); ROUND_INVERSE (4, context->keys, b, b_next); ROUND_INVERSE (3, context->keys, b, b_next); ROUND_INVERSE (2, context->keys, b, b_next); ROUND_INVERSE (1, context->keys, b, b_next); ROUND_INVERSE (0, context->keys, b, b_next); ROUND_INVERSE (7, context->keys, b, b_next); ROUND_INVERSE (6, context->keys, b, b_next); ROUND_INVERSE (5, context->keys, b, b_next); ROUND_INVERSE (4, context->keys, b, b_next); ROUND_INVERSE (3, context->keys, b, b_next); ROUND_INVERSE (2, context->keys, b, b_next); 
ROUND_INVERSE (1, context->keys, b, b_next); ROUND_INVERSE (0, context->keys, b, b_next); buf_put_le32 (output + 0, b_next[0]); buf_put_le32 (output + 4, b_next[1]); buf_put_le32 (output + 8, b_next[2]); buf_put_le32 (output + 12, b_next[3]); } static unsigned int serpent_encrypt (void *ctx, byte *buffer_out, const byte *buffer_in) { serpent_context_t *context = ctx; serpent_encrypt_internal (context, buffer_in, buffer_out); return /*burn_stack*/ (2 * sizeof (serpent_block_t)); } static unsigned int serpent_decrypt (void *ctx, byte *buffer_out, const byte *buffer_in) { serpent_context_t *context = ctx; serpent_decrypt_internal (context, buffer_in, buffer_out); return /*burn_stack*/ (2 * sizeof (serpent_block_t)); } /* Bulk encryption of complete blocks in CTR mode. This function is only intended for the bulk encryption feature of cipher.c. CTR is expected to be of size sizeof(serpent_block_t). */ void _gcry_serpent_ctr_enc(void *context, unsigned char *ctr, void *outbuf_arg, const void *inbuf_arg, size_t nblocks) { serpent_context_t *ctx = context; unsigned char *outbuf = outbuf_arg; const unsigned char *inbuf = inbuf_arg; unsigned char tmpbuf[sizeof(serpent_block_t)]; int burn_stack_depth = 2 * sizeof (serpent_block_t); int i; #ifdef USE_AVX2 if (ctx->use_avx2) { int did_use_avx2 = 0; /* Process data in 16 block chunks. */ while (nblocks >= 16) { _gcry_serpent_avx2_ctr_enc(ctx, outbuf, inbuf, ctr); nblocks -= 16; outbuf += 16 * sizeof(serpent_block_t); inbuf += 16 * sizeof(serpent_block_t); did_use_avx2 = 1; } if (did_use_avx2) { /* serpent-avx2 assembly code does not use stack */ if (nblocks == 0) burn_stack_depth = 0; } /* Use generic/sse2 code to handle smaller chunks... */ /* TODO: use caching instead? */ } #endif #ifdef USE_SSE2 { int did_use_sse2 = 0; /* Process data in 8 block chunks. */ while (nblocks >= 8) { _gcry_serpent_sse2_ctr_enc(ctx, outbuf, inbuf, ctr); nblocks -= 8; outbuf += 8 * sizeof(serpent_block_t); inbuf += 8 * sizeof(serpent_block_t); did_use_sse2 = 1; } if (did_use_sse2) { /* serpent-sse2 assembly code does not use stack */ if (nblocks == 0) burn_stack_depth = 0; } /* Use generic code to handle smaller chunks... */ /* TODO: use caching instead? */ } #endif #ifdef USE_NEON if (ctx->use_neon) { int did_use_neon = 0; /* Process data in 8 block chunks. */ while (nblocks >= 8) { _gcry_serpent_neon_ctr_enc(ctx, outbuf, inbuf, ctr); nblocks -= 8; outbuf += 8 * sizeof(serpent_block_t); inbuf += 8 * sizeof(serpent_block_t); did_use_neon = 1; } if (did_use_neon) { /* serpent-neon assembly code does not use stack */ if (nblocks == 0) burn_stack_depth = 0; } /* Use generic code to handle smaller chunks... */ /* TODO: use caching instead? */ } #endif for ( ;nblocks; nblocks-- ) { /* Encrypt the counter. */ serpent_encrypt_internal(ctx, ctr, tmpbuf); /* XOR the input with the encrypted counter and store in output. */ buf_xor(outbuf, tmpbuf, inbuf, sizeof(serpent_block_t)); outbuf += sizeof(serpent_block_t); inbuf += sizeof(serpent_block_t); /* Increment the counter. */ for (i = sizeof(serpent_block_t); i > 0; i--) { ctr[i-1]++; if (ctr[i-1]) break; } } wipememory(tmpbuf, sizeof(tmpbuf)); _gcry_burn_stack(burn_stack_depth); } /* Bulk decryption of complete blocks in CBC mode. This function is only intended for the bulk encryption feature of cipher.c. 
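
The counter update at the end of the generic CTR loop above treats CTR as a single 16-byte big-endian integer: increment the last byte and walk left only while bytes wrap to zero. The same idiom as a stand-alone helper (the name is mine):

#include <stddef.h>

/* Increment a big-endian counter of LEN bytes, as in the loop above. */
static void
ctr_increment_be (unsigned char *ctr, size_t len)
{
  size_t i;

  for (i = len; i > 0; i--)
    {
      ctr[i-1]++;
      if (ctr[i-1])      /* no wrap to 0x00, so no carry to propagate */
        break;
    }
}
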
*/ void _gcry_serpent_cbc_dec(void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks) { serpent_context_t *ctx = context; unsigned char *outbuf = outbuf_arg; const unsigned char *inbuf = inbuf_arg; unsigned char savebuf[sizeof(serpent_block_t)]; int burn_stack_depth = 2 * sizeof (serpent_block_t); #ifdef USE_AVX2 if (ctx->use_avx2) { int did_use_avx2 = 0; /* Process data in 16 block chunks. */ while (nblocks >= 16) { _gcry_serpent_avx2_cbc_dec(ctx, outbuf, inbuf, iv); nblocks -= 16; outbuf += 16 * sizeof(serpent_block_t); inbuf += 16 * sizeof(serpent_block_t); did_use_avx2 = 1; } if (did_use_avx2) { /* serpent-avx2 assembly code does not use stack */ if (nblocks == 0) burn_stack_depth = 0; } /* Use generic/sse2 code to handle smaller chunks... */ } #endif #ifdef USE_SSE2 { int did_use_sse2 = 0; /* Process data in 8 block chunks. */ while (nblocks >= 8) { _gcry_serpent_sse2_cbc_dec(ctx, outbuf, inbuf, iv); nblocks -= 8; outbuf += 8 * sizeof(serpent_block_t); inbuf += 8 * sizeof(serpent_block_t); did_use_sse2 = 1; } if (did_use_sse2) { /* serpent-sse2 assembly code does not use stack */ if (nblocks == 0) burn_stack_depth = 0; } /* Use generic code to handle smaller chunks... */ } #endif #ifdef USE_NEON if (ctx->use_neon) { int did_use_neon = 0; /* Process data in 8 block chunks. */ while (nblocks >= 8) { _gcry_serpent_neon_cbc_dec(ctx, outbuf, inbuf, iv); nblocks -= 8; outbuf += 8 * sizeof(serpent_block_t); inbuf += 8 * sizeof(serpent_block_t); did_use_neon = 1; } if (did_use_neon) { /* serpent-neon assembly code does not use stack */ if (nblocks == 0) burn_stack_depth = 0; } /* Use generic code to handle smaller chunks... */ } #endif for ( ;nblocks; nblocks-- ) { /* INBUF is needed later and it may be identical to OUTBUF, so store the intermediate result to SAVEBUF. */ serpent_decrypt_internal (ctx, inbuf, savebuf); buf_xor_n_copy_2(outbuf, savebuf, iv, inbuf, sizeof(serpent_block_t)); inbuf += sizeof(serpent_block_t); outbuf += sizeof(serpent_block_t); } wipememory(savebuf, sizeof(savebuf)); _gcry_burn_stack(burn_stack_depth); } /* Bulk decryption of complete blocks in CFB mode. This function is only intended for the bulk encryption feature of cipher.c. */ void _gcry_serpent_cfb_dec(void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks) { serpent_context_t *ctx = context; unsigned char *outbuf = outbuf_arg; const unsigned char *inbuf = inbuf_arg; int burn_stack_depth = 2 * sizeof (serpent_block_t); #ifdef USE_AVX2 if (ctx->use_avx2) { int did_use_avx2 = 0; /* Process data in 16 block chunks. */ while (nblocks >= 16) { _gcry_serpent_avx2_cfb_dec(ctx, outbuf, inbuf, iv); nblocks -= 16; outbuf += 16 * sizeof(serpent_block_t); inbuf += 16 * sizeof(serpent_block_t); did_use_avx2 = 1; } if (did_use_avx2) { /* serpent-avx2 assembly code does not use stack */ if (nblocks == 0) burn_stack_depth = 0; } /* Use generic/sse2 code to handle smaller chunks... */ } #endif #ifdef USE_SSE2 { int did_use_sse2 = 0; /* Process data in 8 block chunks. */ while (nblocks >= 8) { _gcry_serpent_sse2_cfb_dec(ctx, outbuf, inbuf, iv); nblocks -= 8; outbuf += 8 * sizeof(serpent_block_t); inbuf += 8 * sizeof(serpent_block_t); did_use_sse2 = 1; } if (did_use_sse2) { /* serpent-sse2 assembly code does not use stack */ if (nblocks == 0) burn_stack_depth = 0; } /* Use generic code to handle smaller chunks... */ } #endif #ifdef USE_NEON if (ctx->use_neon) { int did_use_neon = 0; /* Process data in 8 block chunks. 
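
The SAVEBUF indirection in the CBC loop above exists because INBUF and OUTBUF may alias when decrypting in place, and buf_xor_n_copy_2 fuses the final XOR and the IV update into one pass over the data. Spelled out in plain C, one step of that pattern looks roughly like this; this is my reading of the bufhelp semantics, so check bufhelp.h for the authoritative definition:

#include <string.h>

#define BLK 16

/* One CBC-decrypt step that tolerates in == out: decrypt into a
   scratch block, remember the ciphertext as the next IV before the
   output is written, then XOR with the previous IV.  DECRYPT_BLOCK
   stands in for serpent_decrypt_internal. */
static void
cbc_dec_step (void (*decrypt_block)(const unsigned char *in,
                                    unsigned char *out),
              unsigned char *iv, unsigned char *out,
              const unsigned char *in)
{
  unsigned char save[BLK], next_iv[BLK];
  int i;

  decrypt_block (in, save);
  memcpy (next_iv, in, BLK);     /* must happen before OUT is written */
  for (i = 0; i < BLK; i++)
    out[i] = save[i] ^ iv[i];
  memcpy (iv, next_iv, BLK);
}
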
*/
      while (nblocks >= 8)
        {
          _gcry_serpent_neon_cfb_dec(ctx, outbuf, inbuf, iv);

          nblocks -= 8;
          outbuf += 8 * sizeof(serpent_block_t);
          inbuf  += 8 * sizeof(serpent_block_t);
          did_use_neon = 1;
        }

      if (did_use_neon)
        {
          /* serpent-neon assembly code does not use stack */
          if (nblocks == 0)
            burn_stack_depth = 0;
        }

      /* Use generic code to handle smaller chunks... */
    }
#endif

  for ( ;nblocks; nblocks-- )
    {
      serpent_encrypt_internal(ctx, iv, iv);
      buf_xor_n_copy(outbuf, iv, inbuf, sizeof(serpent_block_t));
      outbuf += sizeof(serpent_block_t);
      inbuf  += sizeof(serpent_block_t);
    }

  _gcry_burn_stack(burn_stack_depth);
}


/* Run the self-tests for SERPENT-CTR-128, tests IV increment of bulk CTR
   encryption.  Returns NULL on success. */
static const char*
selftest_ctr_128 (void)
{
  const int nblocks = 16+8+1;
  const int blocksize = sizeof(serpent_block_t);
  const int context_size = sizeof(serpent_context_t);

  return _gcry_selftest_helper_ctr("SERPENT", &serpent_setkey,
           &serpent_encrypt, &_gcry_serpent_ctr_enc, nblocks, blocksize,
           context_size);
}


/* Run the self-tests for SERPENT-CBC-128, tests bulk CBC decryption.
   Returns NULL on success. */
static const char*
selftest_cbc_128 (void)
{
  const int nblocks = 16+8+2;
  const int blocksize = sizeof(serpent_block_t);
  const int context_size = sizeof(serpent_context_t);

  return _gcry_selftest_helper_cbc("SERPENT", &serpent_setkey,
           &serpent_encrypt, &_gcry_serpent_cbc_dec, nblocks, blocksize,
           context_size);
}


/* Run the self-tests for SERPENT-CFB-128, tests bulk CFB decryption.
   Returns NULL on success. */
static const char*
selftest_cfb_128 (void)
{
  const int nblocks = 16+8+2;
  const int blocksize = sizeof(serpent_block_t);
  const int context_size = sizeof(serpent_context_t);

  return _gcry_selftest_helper_cfb("SERPENT", &serpent_setkey,
           &serpent_encrypt, &_gcry_serpent_cfb_dec, nblocks, blocksize,
           context_size);
}


/* Serpent test.
*/ static const char * serpent_test (void) { serpent_context_t context; unsigned char scratch[16]; unsigned int i; const char *r; static struct test { int key_length; unsigned char key[32]; unsigned char text_plain[16]; unsigned char text_cipher[16]; } test_data[] = { { 16, "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", "\xD2\x9D\x57\x6F\xCE\xA3\xA3\xA7\xED\x90\x99\xF2\x92\x73\xD7\x8E", "\xB2\x28\x8B\x96\x8A\xE8\xB0\x86\x48\xD1\xCE\x96\x06\xFD\x99\x2D" }, { 24, "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" "\x00\x00\x00\x00\x00\x00\x00\x00", "\xD2\x9D\x57\x6F\xCE\xAB\xA3\xA7\xED\x98\x99\xF2\x92\x7B\xD7\x8E", "\x13\x0E\x35\x3E\x10\x37\xC2\x24\x05\xE8\xFA\xEF\xB2\xC3\xC3\xE9" }, { 32, "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", "\xD0\x95\x57\x6F\xCE\xA3\xE3\xA7\xED\x98\xD9\xF2\x90\x73\xD7\x8E", "\xB9\x0E\xE5\x86\x2D\xE6\x91\x68\xF2\xBD\xD5\x12\x5B\x45\x47\x2B" }, { 32, "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", "\x00\x00\x00\x00\x01\x00\x00\x00\x02\x00\x00\x00\x03\x00\x00\x00", "\x20\x61\xA4\x27\x82\xBD\x52\xEC\x69\x1E\xC3\x83\xB0\x3B\xA7\x7C" }, { 0 }, }; for (i = 0; test_data[i].key_length; i++) { serpent_setkey_internal (&context, test_data[i].key, test_data[i].key_length); serpent_encrypt_internal (&context, test_data[i].text_plain, scratch); if (memcmp (scratch, test_data[i].text_cipher, sizeof (serpent_block_t))) switch (test_data[i].key_length) { case 16: return "Serpent-128 test encryption failed."; case 24: return "Serpent-192 test encryption failed."; case 32: return "Serpent-256 test encryption failed."; } serpent_decrypt_internal (&context, test_data[i].text_cipher, scratch); if (memcmp (scratch, test_data[i].text_plain, sizeof (serpent_block_t))) switch (test_data[i].key_length) { case 16: return "Serpent-128 test decryption failed."; case 24: return "Serpent-192 test decryption failed."; case 32: return "Serpent-256 test decryption failed."; } } if ( (r = selftest_ctr_128 ()) ) return r; if ( (r = selftest_cbc_128 ()) ) return r; if ( (r = selftest_cfb_128 ()) ) return r; return NULL; } /* "SERPENT" is an alias for "SERPENT128". */ static const char *cipher_spec_serpent128_aliases[] = { "SERPENT", NULL }; gcry_cipher_spec_t _gcry_cipher_spec_serpent128 = { GCRY_CIPHER_SERPENT128, {0, 0}, "SERPENT128", cipher_spec_serpent128_aliases, NULL, 16, 128, sizeof (serpent_context_t), serpent_setkey, serpent_encrypt, serpent_decrypt }; gcry_cipher_spec_t _gcry_cipher_spec_serpent192 = { GCRY_CIPHER_SERPENT192, {0, 0}, "SERPENT192", NULL, NULL, 16, 192, sizeof (serpent_context_t), serpent_setkey, serpent_encrypt, serpent_decrypt }; gcry_cipher_spec_t _gcry_cipher_spec_serpent256 = { GCRY_CIPHER_SERPENT256, {0, 0}, "SERPENT256", NULL, NULL, 16, 256, sizeof (serpent_context_t), serpent_setkey, serpent_encrypt, serpent_decrypt }; diff --git a/cipher/sha1.c b/cipher/sha1.c index 65bd6866..00c57dd4 100644 --- a/cipher/sha1.c +++ b/cipher/sha1.c @@ -1,551 +1,552 @@ /* sha1.c - SHA1 hash function * Copyright (C) 1998, 2001, 2002, 2003, 2008 Free Software Foundation, Inc. * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. 
 *
 * Libgcrypt is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, see <http://www.gnu.org/licenses/>.
 */

/* Test vectors:
 *
 *  "abc"
 *  A999 3E36 4706 816A BA3E 2571 7850 C26C 9CD0 D89D
 *
 *  "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq"
 *  8498 3E44 1C3B D26E BAAE 4AA1 F951 29E5 E546 70F1
 */

#include <config.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifdef HAVE_STDINT_H
# include <stdint.h>
#endif

#include "g10lib.h"
#include "bithelp.h"
#include "bufhelp.h"
#include "cipher.h"
#include "hash-common.h"

/* USE_SSSE3 indicates whether to compile with Intel SSSE3 code. */
#undef USE_SSSE3
#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \
    defined(HAVE_GCC_INLINE_ASM_SSSE3)
# define USE_SSSE3 1
#endif

/* USE_AVX indicates whether to compile with Intel AVX code. */
#undef USE_AVX
#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \
    defined(HAVE_GCC_INLINE_ASM_AVX)
# define USE_AVX 1
#endif

/* USE_BMI2 indicates whether to compile with Intel AVX/BMI2 code. */
#undef USE_BMI2
#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \
    defined(HAVE_GCC_INLINE_ASM_AVX) && defined(HAVE_GCC_INLINE_ASM_BMI2)
# define USE_BMI2 1
#endif

/* USE_NEON indicates whether to enable ARM NEON assembly code. */
#undef USE_NEON
-#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__)
-# if defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \
-    defined(HAVE_GCC_INLINE_ASM_NEON)
+#ifdef ENABLE_NEON_SUPPORT
+# if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) \
+     && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) \
+     && defined(HAVE_GCC_INLINE_ASM_NEON)
#  define USE_NEON 1
# endif
-#endif
+#endif /*ENABLE_NEON_SUPPORT*/

/* A macro to test whether P is properly aligned for an u32 type.
   Note that config.h provides a suitable replacement for uintptr_t if
   it does not exist in stdint.h. */
/* #if __GNUC__ >= 2 */
/* # define U32_ALIGNED_P(p) (!(((uintptr_t)p) % __alignof__ (u32))) */
/* #else */
/* # define U32_ALIGNED_P(p) (!(((uintptr_t)p) % sizeof (u32))) */
/* #endif */

typedef struct
{
  gcry_md_block_ctx_t bctx;
  u32          h0,h1,h2,h3,h4;
#ifdef USE_SSSE3
  unsigned int use_ssse3:1;
#endif
#ifdef USE_AVX
  unsigned int use_avx:1;
#endif
#ifdef USE_BMI2
  unsigned int use_bmi2:1;
#endif
#ifdef USE_NEON
  unsigned int use_neon:1;
#endif
} SHA1_CONTEXT;

static unsigned int
transform (void *c, const unsigned char *data, size_t nblks);


static void
sha1_init (void *context, unsigned int flags)
{
  SHA1_CONTEXT *hd = context;
  unsigned int features = _gcry_get_hw_features ();

  (void)flags;

  hd->h0 = 0x67452301;
  hd->h1 = 0xefcdab89;
  hd->h2 = 0x98badcfe;
  hd->h3 = 0x10325476;
  hd->h4 = 0xc3d2e1f0;

  hd->bctx.nblocks = 0;
  hd->bctx.nblocks_high = 0;
  hd->bctx.count = 0;
  hd->bctx.blocksize = 64;
  hd->bctx.bwrite = transform;

#ifdef USE_SSSE3
  hd->use_ssse3 = (features & HWF_INTEL_SSSE3) != 0;
#endif
#ifdef USE_AVX
  /* AVX implementation uses SHLD which is known to be slow on non-Intel CPUs.
   * Therefore use this implementation on Intel CPUs only. */
  hd->use_avx = (features & HWF_INTEL_AVX) && (features & HWF_INTEL_CPU);
#endif
#ifdef USE_BMI2
  hd->use_bmi2 = (features & HWF_INTEL_AVX) && (features & HWF_INTEL_BMI2);
#endif
#ifdef USE_NEON
  hd->use_neon = (features & HWF_ARM_NEON) != 0;
#endif

  (void)features;
}

/* Round function macros.
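
A pattern worth noting in sha1_init above: hardware capabilities are probed once with _gcry_get_hw_features () and cached as one-bit flags in the context, so the per-block transform () only tests plain integers. Reduced to its skeleton, it looks like the sketch below; all names are hypothetical stand-ins, and the sketch dispatches through a function pointer where the real code branches on the flag bits:

#include <stddef.h>
#include <stdint.h>

#define FEAT_SIMD (1u << 0)                 /* hypothetical feature bit */

typedef unsigned int (*transform_fn)(void *state,
                                     const unsigned char *data, size_t nblks);

/* Stubs standing in for the generic and SIMD block functions. */
static unsigned int
transform_generic (void *state, const unsigned char *data, size_t nblks)
{ (void)state; (void)data; return (unsigned int)nblks; }

static unsigned int
transform_simd (void *state, const unsigned char *data, size_t nblks)
{ (void)state; (void)data; return (unsigned int)nblks; }

/* Stands in for _gcry_get_hw_features (). */
static unsigned int
get_hw_features (void)
{ return 0; }

struct hash_ctx
{
  uint32_t state[5];
  transform_fn transform;   /* resolved once, at init time */
};

static void
hash_init (struct hash_ctx *ctx)
{
  ctx->transform = (get_hw_features () & FEAT_SIMD)
                   ? transform_simd : transform_generic;
}
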
*/ #define K1 0x5A827999L #define K2 0x6ED9EBA1L #define K3 0x8F1BBCDCL #define K4 0xCA62C1D6L #define F1(x,y,z) ( z ^ ( x & ( y ^ z ) ) ) #define F2(x,y,z) ( x ^ y ^ z ) #define F3(x,y,z) ( ( x & y ) | ( z & ( x | y ) ) ) #define F4(x,y,z) ( x ^ y ^ z ) #define M(i) ( tm = x[ i &0x0f] \ ^ x[(i-14)&0x0f] \ ^ x[(i-8) &0x0f] \ ^ x[(i-3) &0x0f], \ (x[i&0x0f] = rol(tm, 1))) #define R(a,b,c,d,e,f,k,m) do { e += rol( a, 5 ) \ + f( b, c, d ) \ + k \ + m; \ b = rol( b, 30 ); \ } while(0) #ifdef USE_NEON unsigned int _gcry_sha1_transform_armv7_neon (void *state, const unsigned char *data, size_t nblks); #endif /* * Transform NBLOCKS of each 64 bytes (16 32-bit words) at DATA. */ static unsigned int transform_blk (void *ctx, const unsigned char *data) { SHA1_CONTEXT *hd = ctx; const u32 *idata = (const void *)data; register u32 a, b, c, d, e; /* Local copies of the chaining variables. */ register u32 tm; /* Helper. */ u32 x[16]; /* The array we work on. */ #define I(i) (x[i] = buf_get_be32(idata + i)) /* Get the values of the chaining variables. */ a = hd->h0; b = hd->h1; c = hd->h2; d = hd->h3; e = hd->h4; /* Transform. */ R( a, b, c, d, e, F1, K1, I( 0) ); R( e, a, b, c, d, F1, K1, I( 1) ); R( d, e, a, b, c, F1, K1, I( 2) ); R( c, d, e, a, b, F1, K1, I( 3) ); R( b, c, d, e, a, F1, K1, I( 4) ); R( a, b, c, d, e, F1, K1, I( 5) ); R( e, a, b, c, d, F1, K1, I( 6) ); R( d, e, a, b, c, F1, K1, I( 7) ); R( c, d, e, a, b, F1, K1, I( 8) ); R( b, c, d, e, a, F1, K1, I( 9) ); R( a, b, c, d, e, F1, K1, I(10) ); R( e, a, b, c, d, F1, K1, I(11) ); R( d, e, a, b, c, F1, K1, I(12) ); R( c, d, e, a, b, F1, K1, I(13) ); R( b, c, d, e, a, F1, K1, I(14) ); R( a, b, c, d, e, F1, K1, I(15) ); R( e, a, b, c, d, F1, K1, M(16) ); R( d, e, a, b, c, F1, K1, M(17) ); R( c, d, e, a, b, F1, K1, M(18) ); R( b, c, d, e, a, F1, K1, M(19) ); R( a, b, c, d, e, F2, K2, M(20) ); R( e, a, b, c, d, F2, K2, M(21) ); R( d, e, a, b, c, F2, K2, M(22) ); R( c, d, e, a, b, F2, K2, M(23) ); R( b, c, d, e, a, F2, K2, M(24) ); R( a, b, c, d, e, F2, K2, M(25) ); R( e, a, b, c, d, F2, K2, M(26) ); R( d, e, a, b, c, F2, K2, M(27) ); R( c, d, e, a, b, F2, K2, M(28) ); R( b, c, d, e, a, F2, K2, M(29) ); R( a, b, c, d, e, F2, K2, M(30) ); R( e, a, b, c, d, F2, K2, M(31) ); R( d, e, a, b, c, F2, K2, M(32) ); R( c, d, e, a, b, F2, K2, M(33) ); R( b, c, d, e, a, F2, K2, M(34) ); R( a, b, c, d, e, F2, K2, M(35) ); R( e, a, b, c, d, F2, K2, M(36) ); R( d, e, a, b, c, F2, K2, M(37) ); R( c, d, e, a, b, F2, K2, M(38) ); R( b, c, d, e, a, F2, K2, M(39) ); R( a, b, c, d, e, F3, K3, M(40) ); R( e, a, b, c, d, F3, K3, M(41) ); R( d, e, a, b, c, F3, K3, M(42) ); R( c, d, e, a, b, F3, K3, M(43) ); R( b, c, d, e, a, F3, K3, M(44) ); R( a, b, c, d, e, F3, K3, M(45) ); R( e, a, b, c, d, F3, K3, M(46) ); R( d, e, a, b, c, F3, K3, M(47) ); R( c, d, e, a, b, F3, K3, M(48) ); R( b, c, d, e, a, F3, K3, M(49) ); R( a, b, c, d, e, F3, K3, M(50) ); R( e, a, b, c, d, F3, K3, M(51) ); R( d, e, a, b, c, F3, K3, M(52) ); R( c, d, e, a, b, F3, K3, M(53) ); R( b, c, d, e, a, F3, K3, M(54) ); R( a, b, c, d, e, F3, K3, M(55) ); R( e, a, b, c, d, F3, K3, M(56) ); R( d, e, a, b, c, F3, K3, M(57) ); R( c, d, e, a, b, F3, K3, M(58) ); R( b, c, d, e, a, F3, K3, M(59) ); R( a, b, c, d, e, F4, K4, M(60) ); R( e, a, b, c, d, F4, K4, M(61) ); R( d, e, a, b, c, F4, K4, M(62) ); R( c, d, e, a, b, F4, K4, M(63) ); R( b, c, d, e, a, F4, K4, M(64) ); R( a, b, c, d, e, F4, K4, M(65) ); R( e, a, b, c, d, F4, K4, M(66) ); R( d, e, a, b, c, F4, K4, M(67) ); R( c, d, e, a, b, F4, K4, M(68) ); R( b, 
c, d, e, a, F4, K4, M(69) ); R( a, b, c, d, e, F4, K4, M(70) ); R( e, a, b, c, d, F4, K4, M(71) ); R( d, e, a, b, c, F4, K4, M(72) ); R( c, d, e, a, b, F4, K4, M(73) ); R( b, c, d, e, a, F4, K4, M(74) ); R( a, b, c, d, e, F4, K4, M(75) ); R( e, a, b, c, d, F4, K4, M(76) ); R( d, e, a, b, c, F4, K4, M(77) ); R( c, d, e, a, b, F4, K4, M(78) ); R( b, c, d, e, a, F4, K4, M(79) ); /* Update the chaining variables. */ hd->h0 += a; hd->h1 += b; hd->h2 += c; hd->h3 += d; hd->h4 += e; return /* burn_stack */ 88+4*sizeof(void*); } #ifdef USE_SSSE3 unsigned int _gcry_sha1_transform_amd64_ssse3 (void *state, const unsigned char *data, size_t nblks); #endif #ifdef USE_AVX unsigned int _gcry_sha1_transform_amd64_avx (void *state, const unsigned char *data, size_t nblks); #endif #ifdef USE_BMI2 unsigned int _gcry_sha1_transform_amd64_avx_bmi2 (void *state, const unsigned char *data, size_t nblks); #endif static unsigned int transform (void *ctx, const unsigned char *data, size_t nblks) { SHA1_CONTEXT *hd = ctx; unsigned int burn; #ifdef USE_BMI2 if (hd->use_bmi2) return _gcry_sha1_transform_amd64_avx_bmi2 (&hd->h0, data, nblks) + 4 * sizeof(void*); #endif #ifdef USE_AVX if (hd->use_avx) return _gcry_sha1_transform_amd64_avx (&hd->h0, data, nblks) + 4 * sizeof(void*); #endif #ifdef USE_SSSE3 if (hd->use_ssse3) return _gcry_sha1_transform_amd64_ssse3 (&hd->h0, data, nblks) + 4 * sizeof(void*); #endif #ifdef USE_NEON if (hd->use_neon) return _gcry_sha1_transform_armv7_neon (&hd->h0, data, nblks) + 4 * sizeof(void*); #endif do { burn = transform_blk (hd, data); data += 64; } while (--nblks); return burn; } /* The routine final terminates the computation and * returns the digest. * The handle is prepared for a new cycle, but adding bytes to the * handle will the destroy the returned buffer. * Returns: 20 bytes representing the digest. */ static void sha1_final(void *context) { SHA1_CONTEXT *hd = context; u32 t, th, msb, lsb; unsigned char *p; unsigned int burn; _gcry_md_block_write (hd, NULL, 0); /* flush */; t = hd->bctx.nblocks; if (sizeof t == sizeof hd->bctx.nblocks) th = hd->bctx.nblocks_high; else th = hd->bctx.nblocks >> 32; /* multiply by 64 to make a byte count */ lsb = t << 6; msb = (th << 6) | (t >> 26); /* add the count */ t = lsb; if( (lsb += hd->bctx.count) < t ) msb++; /* multiply by 8 to make a bit count */ t = lsb; lsb <<= 3; msb <<= 3; msb |= t >> 29; if( hd->bctx.count < 56 ) /* enough room */ { hd->bctx.buf[hd->bctx.count++] = 0x80; /* pad */ while( hd->bctx.count < 56 ) hd->bctx.buf[hd->bctx.count++] = 0; /* pad */ } else /* need one extra block */ { hd->bctx.buf[hd->bctx.count++] = 0x80; /* pad character */ while( hd->bctx.count < 64 ) hd->bctx.buf[hd->bctx.count++] = 0; _gcry_md_block_write(hd, NULL, 0); /* flush */; memset(hd->bctx.buf, 0, 56 ); /* fill next block with zeroes */ } /* append the 64 bit count */ buf_put_be32(hd->bctx.buf + 56, msb); buf_put_be32(hd->bctx.buf + 60, lsb); burn = transform( hd, hd->bctx.buf, 1 ); _gcry_burn_stack (burn); p = hd->bctx.buf; #define X(a) do { *(u32*)p = be_bswap32(hd->h##a) ; p += 4; } while(0) X(0); X(1); X(2); X(3); X(4); #undef X } static unsigned char * sha1_read( void *context ) { SHA1_CONTEXT *hd = context; return hd->bctx.buf; } /**************** * Shortcut functions which puts the hash value of the supplied buffer * into outbuf which must have a size of 20 bytes. 
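
The msb/lsb arithmetic in sha1_final above assembles the 64-bit message *bit* count out of 32-bit halves: bytes = nblocks * 64 + count, then bits = bytes * 8, with the carries done by hand. A stand-alone cross-check against native 64-bit arithmetic (my sketch; both sides wrap identically modulo 2^64):

#include <assert.h>
#include <stdint.h>

/* Recompute the message bit length the way sha1_final does, and
   compare against plain 64-bit arithmetic. */
static void
check_bitcount (uint32_t nblocks, uint32_t nblocks_high, uint32_t count)
{
  uint32_t t, th, msb, lsb;

  t = nblocks;
  th = nblocks_high;
  lsb = t << 6;                    /* multiply by 64 ...           */
  msb = (th << 6) | (t >> 26);
  t = lsb;
  if ((lsb += count) < t)          /* ... add buffered bytes, with carry */
    msb++;
  t = lsb;
  lsb <<= 3;                       /* multiply by 8: bytes -> bits */
  msb <<= 3;
  msb |= t >> 29;

  assert ((((uint64_t)msb << 32) | lsb)
          == ((((uint64_t)nblocks_high << 32 | nblocks) * 64 + count) * 8));
}
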
*/ void _gcry_sha1_hash_buffer (void *outbuf, const void *buffer, size_t length) { SHA1_CONTEXT hd; sha1_init (&hd, 0); _gcry_md_block_write (&hd, buffer, length); sha1_final (&hd); memcpy (outbuf, hd.bctx.buf, 20); } /* Variant of the above shortcut function using multiple buffers. */ void _gcry_sha1_hash_buffers (void *outbuf, const gcry_buffer_t *iov, int iovcnt) { SHA1_CONTEXT hd; sha1_init (&hd, 0); for (;iovcnt > 0; iov++, iovcnt--) _gcry_md_block_write (&hd, (const char*)iov[0].data + iov[0].off, iov[0].len); sha1_final (&hd); memcpy (outbuf, hd.bctx.buf, 20); } /* Self-test section. */ static gpg_err_code_t selftests_sha1 (int extended, selftest_report_func_t report) { const char *what; const char *errtxt; what = "short string"; errtxt = _gcry_hash_selftest_check_one (GCRY_MD_SHA1, 0, "abc", 3, "\xA9\x99\x3E\x36\x47\x06\x81\x6A\xBA\x3E" "\x25\x71\x78\x50\xC2\x6C\x9C\xD0\xD8\x9D", 20); if (errtxt) goto failed; if (extended) { what = "long string"; errtxt = _gcry_hash_selftest_check_one (GCRY_MD_SHA1, 0, "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq", 56, "\x84\x98\x3E\x44\x1C\x3B\xD2\x6E\xBA\xAE" "\x4A\xA1\xF9\x51\x29\xE5\xE5\x46\x70\xF1", 20); if (errtxt) goto failed; what = "one million \"a\""; errtxt = _gcry_hash_selftest_check_one (GCRY_MD_SHA1, 1, NULL, 0, "\x34\xAA\x97\x3C\xD4\xC4\xDA\xA4\xF6\x1E" "\xEB\x2B\xDB\xAD\x27\x31\x65\x34\x01\x6F", 20); if (errtxt) goto failed; } return 0; /* Succeeded. */ failed: if (report) report ("digest", GCRY_MD_SHA1, what, errtxt); return GPG_ERR_SELFTEST_FAILED; } /* Run a full self-test for ALGO and return 0 on success. */ static gpg_err_code_t run_selftests (int algo, int extended, selftest_report_func_t report) { gpg_err_code_t ec; switch (algo) { case GCRY_MD_SHA1: ec = selftests_sha1 (extended, report); break; default: ec = GPG_ERR_DIGEST_ALGO; break; } return ec; } static unsigned char asn[15] = /* Object ID is 1.3.14.3.2.26 */ { 0x30, 0x21, 0x30, 0x09, 0x06, 0x05, 0x2b, 0x0e, 0x03, 0x02, 0x1a, 0x05, 0x00, 0x04, 0x14 }; static gcry_md_oid_spec_t oid_spec_sha1[] = { /* iso.member-body.us.rsadsi.pkcs.pkcs-1.5 (sha1WithRSAEncryption) */ { "1.2.840.113549.1.1.5" }, /* iso.member-body.us.x9-57.x9cm.3 (dsaWithSha1) */ { "1.2.840.10040.4.3" }, /* from NIST's OIW (sha1) */ { "1.3.14.3.2.26" }, /* from NIST OIW (sha-1WithRSAEncryption) */ { "1.3.14.3.2.29" }, /* iso.member-body.us.ansi-x9-62.signatures.ecdsa-with-sha1 */ { "1.2.840.10045.4.1" }, { NULL }, }; gcry_md_spec_t _gcry_digest_spec_sha1 = { GCRY_MD_SHA1, {0, 1}, "SHA1", asn, DIM (asn), oid_spec_sha1, 20, sha1_init, _gcry_md_block_write, sha1_final, sha1_read, sizeof (SHA1_CONTEXT), run_selftests }; diff --git a/cipher/sha512.c b/cipher/sha512.c index 92b49131..7d60df0f 100644 --- a/cipher/sha512.c +++ b/cipher/sha512.c @@ -1,878 +1,879 @@ /* sha512.c - SHA384 and SHA512 hash functions * Copyright (C) 2003, 2008, 2009 Free Software Foundation, Inc. * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser general Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details.
* * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see <http://www.gnu.org/licenses/>. */ /* Test vectors from FIPS-180-2: * * "abc" * 384: * CB00753F 45A35E8B B5A03D69 9AC65007 272C32AB 0EDED163 * 1A8B605A 43FF5BED 8086072B A1E7CC23 58BAECA1 34C825A7 * 512: * DDAF35A1 93617ABA CC417349 AE204131 12E6FA4E 89A97EA2 0A9EEEE6 4B55D39A * 2192992A 274FC1A8 36BA3C23 A3FEEBBD 454D4423 643CE80E 2A9AC94F A54CA49F * * "abcdefghbcdefghicdefghijdefghijkefghijklfghijklmghijklmnhijklmnoijklmnopjklmnopqklmnopqrlmnopqrsmnopqrstnopqrstu" * 384: * 09330C33 F71147E8 3D192FC7 82CD1B47 53111B17 3B3B05D2 * 2FA08086 E3B0F712 FCC7C71A 557E2DB9 66C3E9FA 91746039 * 512: * 8E959B75 DAE313DA 8CF4F728 14FC143F 8F7779C6 EB9F7FA1 7299AEAD B6889018 * 501D289E 4900F7E4 331B99DE C4B5433A C7D329EE B6DD2654 5E96E55B 874BE909 * * "a" x 1000000 * 384: * 9D0E1809 716474CB 086E834E 310A4A1C ED149E9C 00F24852 * 7972CEC5 704C2A5B 07B8B3DC 38ECC4EB AE97DDD8 7F3D8985 * 512: * E718483D 0CE76964 4E2E42C7 BC15B463 8E1F98B1 3B204428 5632A803 AFA973EB * DE0FF244 877EA60A 4CB0432C E577C31B EB009C5C 2C49AA2E 4EADB217 AD8CC09B */ #include <config.h> #include <string.h> #include "g10lib.h" #include "bithelp.h" #include "bufhelp.h" #include "cipher.h" #include "hash-common.h" /* USE_ARM_NEON_ASM indicates whether to enable ARM NEON assembly code. */ #undef USE_ARM_NEON_ASM -#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) -# if defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \ - defined(HAVE_GCC_INLINE_ASM_NEON) +#ifdef ENABLE_NEON_SUPPORT +# if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) \ + && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) \ + && defined(HAVE_GCC_INLINE_ASM_NEON) # define USE_ARM_NEON_ASM 1 # endif -#endif +#endif /*ENABLE_NEON_SUPPORT*/ /* USE_SSSE3 indicates whether to compile with Intel SSSE3 code. */ #undef USE_SSSE3 #if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \ defined(HAVE_GCC_INLINE_ASM_SSSE3) && \ defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) # define USE_SSSE3 1 #endif /* USE_AVX indicates whether to compile with Intel AVX code. */ #undef USE_AVX #if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \ defined(HAVE_GCC_INLINE_ASM_AVX) && \ defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) # define USE_AVX 1 #endif /* USE_AVX2 indicates whether to compile with Intel AVX2/rorx code.
*/ #undef USE_AVX2 #if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \ defined(HAVE_GCC_INLINE_ASM_AVX2) && defined(HAVE_GCC_INLINE_ASM_BMI2) && \ defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) # define USE_AVX2 1 #endif typedef struct { u64 h0, h1, h2, h3, h4, h5, h6, h7; } SHA512_STATE; typedef struct { gcry_md_block_ctx_t bctx; SHA512_STATE state; #ifdef USE_ARM_NEON_ASM unsigned int use_neon:1; #endif #ifdef USE_SSSE3 unsigned int use_ssse3:1; #endif #ifdef USE_AVX unsigned int use_avx:1; #endif #ifdef USE_AVX2 unsigned int use_avx2:1; #endif } SHA512_CONTEXT; static unsigned int transform (void *context, const unsigned char *data, size_t nblks); static void sha512_init (void *context, unsigned int flags) { SHA512_CONTEXT *ctx = context; SHA512_STATE *hd = &ctx->state; unsigned int features = _gcry_get_hw_features (); (void)flags; hd->h0 = U64_C(0x6a09e667f3bcc908); hd->h1 = U64_C(0xbb67ae8584caa73b); hd->h2 = U64_C(0x3c6ef372fe94f82b); hd->h3 = U64_C(0xa54ff53a5f1d36f1); hd->h4 = U64_C(0x510e527fade682d1); hd->h5 = U64_C(0x9b05688c2b3e6c1f); hd->h6 = U64_C(0x1f83d9abfb41bd6b); hd->h7 = U64_C(0x5be0cd19137e2179); ctx->bctx.nblocks = 0; ctx->bctx.nblocks_high = 0; ctx->bctx.count = 0; ctx->bctx.blocksize = 128; ctx->bctx.bwrite = transform; #ifdef USE_ARM_NEON_ASM ctx->use_neon = (features & HWF_ARM_NEON) != 0; #endif #ifdef USE_SSSE3 ctx->use_ssse3 = (features & HWF_INTEL_SSSE3) != 0; #endif #ifdef USE_AVX ctx->use_avx = (features & HWF_INTEL_AVX) && (features & HWF_INTEL_CPU); #endif #ifdef USE_AVX2 ctx->use_avx2 = (features & HWF_INTEL_AVX2) && (features & HWF_INTEL_BMI2); #endif (void)features; } static void sha384_init (void *context, unsigned int flags) { SHA512_CONTEXT *ctx = context; SHA512_STATE *hd = &ctx->state; unsigned int features = _gcry_get_hw_features (); (void)flags; hd->h0 = U64_C(0xcbbb9d5dc1059ed8); hd->h1 = U64_C(0x629a292a367cd507); hd->h2 = U64_C(0x9159015a3070dd17); hd->h3 = U64_C(0x152fecd8f70e5939); hd->h4 = U64_C(0x67332667ffc00b31); hd->h5 = U64_C(0x8eb44a8768581511); hd->h6 = U64_C(0xdb0c2e0d64f98fa7); hd->h7 = U64_C(0x47b5481dbefa4fa4); ctx->bctx.nblocks = 0; ctx->bctx.nblocks_high = 0; ctx->bctx.count = 0; ctx->bctx.blocksize = 128; ctx->bctx.bwrite = transform; #ifdef USE_ARM_NEON_ASM ctx->use_neon = (features & HWF_ARM_NEON) != 0; #endif #ifdef USE_SSSE3 ctx->use_ssse3 = (features & HWF_INTEL_SSSE3) != 0; #endif #ifdef USE_AVX ctx->use_avx = (features & HWF_INTEL_AVX) && (features & HWF_INTEL_CPU); #endif #ifdef USE_AVX2 ctx->use_avx2 = (features & HWF_INTEL_AVX2) && (features & HWF_INTEL_BMI2); #endif (void)features; } static inline u64 ROTR (u64 x, u64 n) { return ((x >> n) | (x << (64 - n))); } static inline u64 Ch (u64 x, u64 y, u64 z) { return ((x & y) ^ ( ~x & z)); } static inline u64 Maj (u64 x, u64 y, u64 z) { return ((x & y) ^ (x & z) ^ (y & z)); } static inline u64 Sum0 (u64 x) { return (ROTR (x, 28) ^ ROTR (x, 34) ^ ROTR (x, 39)); } static inline u64 Sum1 (u64 x) { return (ROTR (x, 14) ^ ROTR (x, 18) ^ ROTR (x, 41)); } static const u64 k[] = { U64_C(0x428a2f98d728ae22), U64_C(0x7137449123ef65cd), U64_C(0xb5c0fbcfec4d3b2f), U64_C(0xe9b5dba58189dbbc), U64_C(0x3956c25bf348b538), U64_C(0x59f111f1b605d019), U64_C(0x923f82a4af194f9b), U64_C(0xab1c5ed5da6d8118), U64_C(0xd807aa98a3030242), U64_C(0x12835b0145706fbe), U64_C(0x243185be4ee4b28c), U64_C(0x550c7dc3d5ffb4e2), U64_C(0x72be5d74f27b896f), U64_C(0x80deb1fe3b1696b1), U64_C(0x9bdc06a725c71235), U64_C(0xc19bf174cf692694), U64_C(0xe49b69c19ef14ad2), U64_C(0xefbe4786384f25e3), 
U64_C(0x0fc19dc68b8cd5b5), U64_C(0x240ca1cc77ac9c65), U64_C(0x2de92c6f592b0275), U64_C(0x4a7484aa6ea6e483), U64_C(0x5cb0a9dcbd41fbd4), U64_C(0x76f988da831153b5), U64_C(0x983e5152ee66dfab), U64_C(0xa831c66d2db43210), U64_C(0xb00327c898fb213f), U64_C(0xbf597fc7beef0ee4), U64_C(0xc6e00bf33da88fc2), U64_C(0xd5a79147930aa725), U64_C(0x06ca6351e003826f), U64_C(0x142929670a0e6e70), U64_C(0x27b70a8546d22ffc), U64_C(0x2e1b21385c26c926), U64_C(0x4d2c6dfc5ac42aed), U64_C(0x53380d139d95b3df), U64_C(0x650a73548baf63de), U64_C(0x766a0abb3c77b2a8), U64_C(0x81c2c92e47edaee6), U64_C(0x92722c851482353b), U64_C(0xa2bfe8a14cf10364), U64_C(0xa81a664bbc423001), U64_C(0xc24b8b70d0f89791), U64_C(0xc76c51a30654be30), U64_C(0xd192e819d6ef5218), U64_C(0xd69906245565a910), U64_C(0xf40e35855771202a), U64_C(0x106aa07032bbd1b8), U64_C(0x19a4c116b8d2d0c8), U64_C(0x1e376c085141ab53), U64_C(0x2748774cdf8eeb99), U64_C(0x34b0bcb5e19b48a8), U64_C(0x391c0cb3c5c95a63), U64_C(0x4ed8aa4ae3418acb), U64_C(0x5b9cca4f7763e373), U64_C(0x682e6ff3d6b2b8a3), U64_C(0x748f82ee5defb2fc), U64_C(0x78a5636f43172f60), U64_C(0x84c87814a1f0ab72), U64_C(0x8cc702081a6439ec), U64_C(0x90befffa23631e28), U64_C(0xa4506cebde82bde9), U64_C(0xbef9a3f7b2c67915), U64_C(0xc67178f2e372532b), U64_C(0xca273eceea26619c), U64_C(0xd186b8c721c0c207), U64_C(0xeada7dd6cde0eb1e), U64_C(0xf57d4f7fee6ed178), U64_C(0x06f067aa72176fba), U64_C(0x0a637dc5a2c898a6), U64_C(0x113f9804bef90dae), U64_C(0x1b710b35131c471b), U64_C(0x28db77f523047d84), U64_C(0x32caab7b40c72493), U64_C(0x3c9ebe0a15c9bebc), U64_C(0x431d67c49c100d4c), U64_C(0x4cc5d4becb3e42b6), U64_C(0x597f299cfc657e2a), U64_C(0x5fcb6fab3ad6faec), U64_C(0x6c44198c4a475817) }; /**************** * Transform the message W which consists of 16 64-bit words. */ static unsigned int transform_blk (SHA512_STATE *hd, const unsigned char *data) { u64 a, b, c, d, e, f, g, h; u64 w[16]; int t; /* get values from the chaining vars */ a = hd->h0; b = hd->h1; c = hd->h2; d = hd->h3; e = hd->h4; f = hd->h5; g = hd->h6; h = hd->h7; for ( t = 0; t < 16; t++ ) w[t] = buf_get_be64(data + t * 8); #define S0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7)) #define S1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6)) for (t = 0; t < 80 - 16; ) { u64 t1, t2; /* Performance on an AMD Athlon(tm) Dual Core Processor 4050e with gcc 4.3.3 using gcry_md_hash_buffer of each 10000 bytes initialized to 0,1,2,3...255,0,... and 1000 iterations: Not unrolled with macros: 440ms Unrolled with macros: 350ms Unrolled with inline: 330ms */ #if 0 /* Not unrolled. */ t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[t%16]; w[t%16] += S1 (w[(t - 2)%16]) + w[(t - 7)%16] + S0 (w[(t - 15)%16]); t2 = Sum0 (a) + Maj (a, b, c); h = g; g = f; f = e; e = d + t1; d = c; c = b; b = a; a = t1 + t2; t++; #else /* Unrolled to interweave the chain variables.
*/ t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[0]; w[0] += S1 (w[14]) + w[9] + S0 (w[1]); t2 = Sum0 (a) + Maj (a, b, c); d += t1; h = t1 + t2; t1 = g + Sum1 (d) + Ch (d, e, f) + k[t+1] + w[1]; w[1] += S1 (w[15]) + w[10] + S0 (w[2]); t2 = Sum0 (h) + Maj (h, a, b); c += t1; g = t1 + t2; t1 = f + Sum1 (c) + Ch (c, d, e) + k[t+2] + w[2]; w[2] += S1 (w[0]) + w[11] + S0 (w[3]); t2 = Sum0 (g) + Maj (g, h, a); b += t1; f = t1 + t2; t1 = e + Sum1 (b) + Ch (b, c, d) + k[t+3] + w[3]; w[3] += S1 (w[1]) + w[12] + S0 (w[4]); t2 = Sum0 (f) + Maj (f, g, h); a += t1; e = t1 + t2; t1 = d + Sum1 (a) + Ch (a, b, c) + k[t+4] + w[4]; w[4] += S1 (w[2]) + w[13] + S0 (w[5]); t2 = Sum0 (e) + Maj (e, f, g); h += t1; d = t1 + t2; t1 = c + Sum1 (h) + Ch (h, a, b) + k[t+5] + w[5]; w[5] += S1 (w[3]) + w[14] + S0 (w[6]); t2 = Sum0 (d) + Maj (d, e, f); g += t1; c = t1 + t2; t1 = b + Sum1 (g) + Ch (g, h, a) + k[t+6] + w[6]; w[6] += S1 (w[4]) + w[15] + S0 (w[7]); t2 = Sum0 (c) + Maj (c, d, e); f += t1; b = t1 + t2; t1 = a + Sum1 (f) + Ch (f, g, h) + k[t+7] + w[7]; w[7] += S1 (w[5]) + w[0] + S0 (w[8]); t2 = Sum0 (b) + Maj (b, c, d); e += t1; a = t1 + t2; t1 = h + Sum1 (e) + Ch (e, f, g) + k[t+8] + w[8]; w[8] += S1 (w[6]) + w[1] + S0 (w[9]); t2 = Sum0 (a) + Maj (a, b, c); d += t1; h = t1 + t2; t1 = g + Sum1 (d) + Ch (d, e, f) + k[t+9] + w[9]; w[9] += S1 (w[7]) + w[2] + S0 (w[10]); t2 = Sum0 (h) + Maj (h, a, b); c += t1; g = t1 + t2; t1 = f + Sum1 (c) + Ch (c, d, e) + k[t+10] + w[10]; w[10] += S1 (w[8]) + w[3] + S0 (w[11]); t2 = Sum0 (g) + Maj (g, h, a); b += t1; f = t1 + t2; t1 = e + Sum1 (b) + Ch (b, c, d) + k[t+11] + w[11]; w[11] += S1 (w[9]) + w[4] + S0 (w[12]); t2 = Sum0 (f) + Maj (f, g, h); a += t1; e = t1 + t2; t1 = d + Sum1 (a) + Ch (a, b, c) + k[t+12] + w[12]; w[12] += S1 (w[10]) + w[5] + S0 (w[13]); t2 = Sum0 (e) + Maj (e, f, g); h += t1; d = t1 + t2; t1 = c + Sum1 (h) + Ch (h, a, b) + k[t+13] + w[13]; w[13] += S1 (w[11]) + w[6] + S0 (w[14]); t2 = Sum0 (d) + Maj (d, e, f); g += t1; c = t1 + t2; t1 = b + Sum1 (g) + Ch (g, h, a) + k[t+14] + w[14]; w[14] += S1 (w[12]) + w[7] + S0 (w[15]); t2 = Sum0 (c) + Maj (c, d, e); f += t1; b = t1 + t2; t1 = a + Sum1 (f) + Ch (f, g, h) + k[t+15] + w[15]; w[15] += S1 (w[13]) + w[8] + S0 (w[0]); t2 = Sum0 (b) + Maj (b, c, d); e += t1; a = t1 + t2; t += 16; #endif } for (; t < 80; ) { u64 t1, t2; #if 0 /* Not unrolled. */ t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[t%16]; t2 = Sum0 (a) + Maj (a, b, c); h = g; g = f; f = e; e = d + t1; d = c; c = b; b = a; a = t1 + t2; t++; #else /* Unrolled to interweave the chain variables. 
*/ t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[0]; t2 = Sum0 (a) + Maj (a, b, c); d += t1; h = t1 + t2; t1 = g + Sum1 (d) + Ch (d, e, f) + k[t+1] + w[1]; t2 = Sum0 (h) + Maj (h, a, b); c += t1; g = t1 + t2; t1 = f + Sum1 (c) + Ch (c, d, e) + k[t+2] + w[2]; t2 = Sum0 (g) + Maj (g, h, a); b += t1; f = t1 + t2; t1 = e + Sum1 (b) + Ch (b, c, d) + k[t+3] + w[3]; t2 = Sum0 (f) + Maj (f, g, h); a += t1; e = t1 + t2; t1 = d + Sum1 (a) + Ch (a, b, c) + k[t+4] + w[4]; t2 = Sum0 (e) + Maj (e, f, g); h += t1; d = t1 + t2; t1 = c + Sum1 (h) + Ch (h, a, b) + k[t+5] + w[5]; t2 = Sum0 (d) + Maj (d, e, f); g += t1; c = t1 + t2; t1 = b + Sum1 (g) + Ch (g, h, a) + k[t+6] + w[6]; t2 = Sum0 (c) + Maj (c, d, e); f += t1; b = t1 + t2; t1 = a + Sum1 (f) + Ch (f, g, h) + k[t+7] + w[7]; t2 = Sum0 (b) + Maj (b, c, d); e += t1; a = t1 + t2; t1 = h + Sum1 (e) + Ch (e, f, g) + k[t+8] + w[8]; t2 = Sum0 (a) + Maj (a, b, c); d += t1; h = t1 + t2; t1 = g + Sum1 (d) + Ch (d, e, f) + k[t+9] + w[9]; t2 = Sum0 (h) + Maj (h, a, b); c += t1; g = t1 + t2; t1 = f + Sum1 (c) + Ch (c, d, e) + k[t+10] + w[10]; t2 = Sum0 (g) + Maj (g, h, a); b += t1; f = t1 + t2; t1 = e + Sum1 (b) + Ch (b, c, d) + k[t+11] + w[11]; t2 = Sum0 (f) + Maj (f, g, h); a += t1; e = t1 + t2; t1 = d + Sum1 (a) + Ch (a, b, c) + k[t+12] + w[12]; t2 = Sum0 (e) + Maj (e, f, g); h += t1; d = t1 + t2; t1 = c + Sum1 (h) + Ch (h, a, b) + k[t+13] + w[13]; t2 = Sum0 (d) + Maj (d, e, f); g += t1; c = t1 + t2; t1 = b + Sum1 (g) + Ch (g, h, a) + k[t+14] + w[14]; t2 = Sum0 (c) + Maj (c, d, e); f += t1; b = t1 + t2; t1 = a + Sum1 (f) + Ch (f, g, h) + k[t+15] + w[15]; t2 = Sum0 (b) + Maj (b, c, d); e += t1; a = t1 + t2; t += 16; #endif } /* Update chaining vars. */ hd->h0 += a; hd->h1 += b; hd->h2 += c; hd->h3 += d; hd->h4 += e; hd->h5 += f; hd->h6 += g; hd->h7 += h; return /* burn_stack */ (8 + 16) * sizeof(u64) + sizeof(u32) + 3 * sizeof(void*); } #ifdef USE_ARM_NEON_ASM void _gcry_sha512_transform_armv7_neon (SHA512_STATE *hd, const unsigned char *data, const u64 k[], size_t num_blks); #endif #ifdef USE_SSSE3 unsigned int _gcry_sha512_transform_amd64_ssse3(const void *input_data, void *state, size_t num_blks); #endif #ifdef USE_AVX unsigned int _gcry_sha512_transform_amd64_avx(const void *input_data, void *state, size_t num_blks); #endif #ifdef USE_AVX2 unsigned int _gcry_sha512_transform_amd64_avx2(const void *input_data, void *state, size_t num_blks); #endif static unsigned int transform (void *context, const unsigned char *data, size_t nblks) { SHA512_CONTEXT *ctx = context; unsigned int burn; #ifdef USE_AVX2 if (ctx->use_avx2) return _gcry_sha512_transform_amd64_avx2 (data, &ctx->state, nblks) + 4 * sizeof(void*); #endif #ifdef USE_AVX if (ctx->use_avx) return _gcry_sha512_transform_amd64_avx (data, &ctx->state, nblks) + 4 * sizeof(void*); #endif #ifdef USE_SSSE3 if (ctx->use_ssse3) return _gcry_sha512_transform_amd64_ssse3 (data, &ctx->state, nblks) + 4 * sizeof(void*); #endif #ifdef USE_ARM_NEON_ASM if (ctx->use_neon) { _gcry_sha512_transform_armv7_neon (&ctx->state, data, k, nblks); /* _gcry_sha512_transform_armv7_neon does not store sensitive data * to stack. */ return /* no burn_stack */ 0; } #endif do { burn = transform_blk (&ctx->state, data) + 3 * sizeof(void*); data += 128; } while (--nblks); return burn; } /* The routine final terminates the computation and * returns the digest. * The handle is prepared for a new cycle, but adding bytes to the * handle will destroy the returned buffer. * Returns: 64 bytes representing the digest.
When used for sha384, * we take the leftmost 48 of those bytes. */ static void sha512_final (void *context) { SHA512_CONTEXT *hd = context; unsigned int stack_burn_depth; u64 t, th, msb, lsb; byte *p; _gcry_md_block_write (context, NULL, 0); /* flush */ ; t = hd->bctx.nblocks; /* if (sizeof t == sizeof hd->bctx.nblocks) */ th = hd->bctx.nblocks_high; /* else */ /* th = hd->bctx.nblocks >> 64; In case we ever use u128 */ /* multiply by 128 to make a byte count */ lsb = t << 7; msb = (th << 7) | (t >> 57); /* add the count */ t = lsb; if ((lsb += hd->bctx.count) < t) msb++; /* multiply by 8 to make a bit count */ t = lsb; lsb <<= 3; msb <<= 3; msb |= t >> 61; if (hd->bctx.count < 112) { /* enough room */ hd->bctx.buf[hd->bctx.count++] = 0x80; /* pad */ while (hd->bctx.count < 112) hd->bctx.buf[hd->bctx.count++] = 0; /* pad */ } else { /* need one extra block */ hd->bctx.buf[hd->bctx.count++] = 0x80; /* pad character */ while (hd->bctx.count < 128) hd->bctx.buf[hd->bctx.count++] = 0; _gcry_md_block_write (context, NULL, 0); /* flush */ ; memset (hd->bctx.buf, 0, 112); /* fill next block with zeroes */ } /* append the 128 bit count */ buf_put_be64(hd->bctx.buf + 112, msb); buf_put_be64(hd->bctx.buf + 120, lsb); stack_burn_depth = transform (hd, hd->bctx.buf, 1); _gcry_burn_stack (stack_burn_depth); p = hd->bctx.buf; #define X(a) do { *(u64*)p = be_bswap64(hd->state.h##a) ; p += 8; } while (0) X (0); X (1); X (2); X (3); X (4); X (5); /* Note that these last two chunks are included even for SHA384. We just ignore them. */ X (6); X (7); #undef X } static byte * sha512_read (void *context) { SHA512_CONTEXT *hd = (SHA512_CONTEXT *) context; return hd->bctx.buf; } /* Self-test section. */ static gpg_err_code_t selftests_sha384 (int extended, selftest_report_func_t report) { const char *what; const char *errtxt; what = "short string"; errtxt = _gcry_hash_selftest_check_one (GCRY_MD_SHA384, 0, "abc", 3, "\xcb\x00\x75\x3f\x45\xa3\x5e\x8b\xb5\xa0\x3d\x69\x9a\xc6\x50\x07" "\x27\x2c\x32\xab\x0e\xde\xd1\x63\x1a\x8b\x60\x5a\x43\xff\x5b\xed" "\x80\x86\x07\x2b\xa1\xe7\xcc\x23\x58\xba\xec\xa1\x34\xc8\x25\xa7", 48); if (errtxt) goto failed; if (extended) { what = "long string"; errtxt = _gcry_hash_selftest_check_one (GCRY_MD_SHA384, 0, "abcdefghbcdefghicdefghijdefghijkefghijklfghijklmghijklmn" "hijklmnoijklmnopjklmnopqklmnopqrlmnopqrsmnopqrstnopqrstu", 112, "\x09\x33\x0C\x33\xF7\x11\x47\xE8\x3D\x19\x2F\xC7\x82\xCD\x1B\x47" "\x53\x11\x1B\x17\x3B\x3B\x05\xD2\x2F\xA0\x80\x86\xE3\xB0\xF7\x12" "\xFC\xC7\xC7\x1A\x55\x7E\x2D\xB9\x66\xC3\xE9\xFA\x91\x74\x60\x39", 48); if (errtxt) goto failed; what = "one million \"a\""; errtxt = _gcry_hash_selftest_check_one (GCRY_MD_SHA384, 1, NULL, 0, "\x9D\x0E\x18\x09\x71\x64\x74\xCB\x08\x6E\x83\x4E\x31\x0A\x4A\x1C" "\xED\x14\x9E\x9C\x00\xF2\x48\x52\x79\x72\xCE\xC5\x70\x4C\x2A\x5B" "\x07\xB8\xB3\xDC\x38\xEC\xC4\xEB\xAE\x97\xDD\xD8\x7F\x3D\x89\x85", 48); if (errtxt) goto failed; } return 0; /* Succeeded. 
*/ failed: if (report) report ("digest", GCRY_MD_SHA384, what, errtxt); return GPG_ERR_SELFTEST_FAILED; } static gpg_err_code_t selftests_sha512 (int extended, selftest_report_func_t report) { const char *what; const char *errtxt; what = "short string"; errtxt = _gcry_hash_selftest_check_one (GCRY_MD_SHA512, 0, "abc", 3, "\xDD\xAF\x35\xA1\x93\x61\x7A\xBA\xCC\x41\x73\x49\xAE\x20\x41\x31" "\x12\xE6\xFA\x4E\x89\xA9\x7E\xA2\x0A\x9E\xEE\xE6\x4B\x55\xD3\x9A" "\x21\x92\x99\x2A\x27\x4F\xC1\xA8\x36\xBA\x3C\x23\xA3\xFE\xEB\xBD" "\x45\x4D\x44\x23\x64\x3C\xE8\x0E\x2A\x9A\xC9\x4F\xA5\x4C\xA4\x9F", 64); if (errtxt) goto failed; if (extended) { what = "long string"; errtxt = _gcry_hash_selftest_check_one (GCRY_MD_SHA512, 0, "abcdefghbcdefghicdefghijdefghijkefghijklfghijklmghijklmn" "hijklmnoijklmnopjklmnopqklmnopqrlmnopqrsmnopqrstnopqrstu", 112, "\x8E\x95\x9B\x75\xDA\xE3\x13\xDA\x8C\xF4\xF7\x28\x14\xFC\x14\x3F" "\x8F\x77\x79\xC6\xEB\x9F\x7F\xA1\x72\x99\xAE\xAD\xB6\x88\x90\x18" "\x50\x1D\x28\x9E\x49\x00\xF7\xE4\x33\x1B\x99\xDE\xC4\xB5\x43\x3A" "\xC7\xD3\x29\xEE\xB6\xDD\x26\x54\x5E\x96\xE5\x5B\x87\x4B\xE9\x09", 64); if (errtxt) goto failed; what = "one million \"a\""; errtxt = _gcry_hash_selftest_check_one (GCRY_MD_SHA512, 1, NULL, 0, "\xE7\x18\x48\x3D\x0C\xE7\x69\x64\x4E\x2E\x42\xC7\xBC\x15\xB4\x63" "\x8E\x1F\x98\xB1\x3B\x20\x44\x28\x56\x32\xA8\x03\xAF\xA9\x73\xEB" "\xDE\x0F\xF2\x44\x87\x7E\xA6\x0A\x4C\xB0\x43\x2C\xE5\x77\xC3\x1B" "\xEB\x00\x9C\x5C\x2C\x49\xAA\x2E\x4E\xAD\xB2\x17\xAD\x8C\xC0\x9B", 64); if (errtxt) goto failed; } return 0; /* Succeeded. */ failed: if (report) report ("digest", GCRY_MD_SHA512, what, errtxt); return GPG_ERR_SELFTEST_FAILED; } /* Run a full self-test for ALGO and return 0 on success. */ static gpg_err_code_t run_selftests (int algo, int extended, selftest_report_func_t report) { gpg_err_code_t ec; switch (algo) { case GCRY_MD_SHA384: ec = selftests_sha384 (extended, report); break; case GCRY_MD_SHA512: ec = selftests_sha512 (extended, report); break; default: ec = GPG_ERR_DIGEST_ALGO; break; } return ec; } static byte sha512_asn[] = /* Object ID is 2.16.840.1.101.3.4.2.3 */ { 0x30, 0x51, 0x30, 0x0d, 0x06, 0x09, 0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x03, 0x05, 0x00, 0x04, 0x40 }; static gcry_md_oid_spec_t oid_spec_sha512[] = { { "2.16.840.1.101.3.4.2.3" }, /* PKCS#1 sha512WithRSAEncryption */ { "1.2.840.113549.1.1.13" }, { NULL } }; gcry_md_spec_t _gcry_digest_spec_sha512 = { GCRY_MD_SHA512, {0, 1}, "SHA512", sha512_asn, DIM (sha512_asn), oid_spec_sha512, 64, sha512_init, _gcry_md_block_write, sha512_final, sha512_read, sizeof (SHA512_CONTEXT), run_selftests }; static byte sha384_asn[] = /* Object ID is 2.16.840.1.101.3.4.2.2 */ { 0x30, 0x41, 0x30, 0x0d, 0x06, 0x09, 0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x02, 0x05, 0x00, 0x04, 0x30 }; static gcry_md_oid_spec_t oid_spec_sha384[] = { { "2.16.840.1.101.3.4.2.2" }, /* PKCS#1 sha384WithRSAEncryption */ { "1.2.840.113549.1.1.12" }, { NULL }, }; gcry_md_spec_t _gcry_digest_spec_sha384 = { GCRY_MD_SHA384, {0, 1}, "SHA384", sha384_asn, DIM (sha384_asn), oid_spec_sha384, 48, sha384_init, _gcry_md_block_write, sha512_final, sha512_read, sizeof (SHA512_CONTEXT), run_selftests };
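
The M(i) macro in the SHA-1 code above computes the FIPS 180-4 message schedule W[t] = rol(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1) on a 16-word circular buffer instead of the textbook 80-word array. A minimal standalone sketch (not part of the libgcrypt tree; rotl32 is a local stand-in for the rol used in the real code) that checks the two formulations agree:

/* sha1-schedule-sketch.c - compare the 80-word and 16-word schedules. */
#include <stdint.h>
#include <stdio.h>

static uint32_t rotl32 (uint32_t x, unsigned n) { return (x << n) | (x >> (32 - n)); }

int main (void)
{
  uint32_t w80[80], x[16];
  int i;

  /* Arbitrary test block. */
  for (i = 0; i < 16; i++)
    w80[i] = x[i] = 0x01020304u * (i + 1);

  /* Textbook schedule: W[t] = ROTL1(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16]). */
  for (i = 16; i < 80; i++)
    w80[i] = rotl32 (w80[i-3] ^ w80[i-8] ^ w80[i-14] ^ w80[i-16], 1);

  /* Circular-buffer form, as in the M(i) macro: slot i&15 still holds
     W[i-16], slot (i-14)&15 holds W[i-14], and so on.  */
  for (i = 16; i < 80; i++)
    {
      uint32_t tm = x[i & 15] ^ x[(i-14) & 15] ^ x[(i-8) & 15] ^ x[(i-3) & 15];
      x[i & 15] = rotl32 (tm, 1);
      if (x[i & 15] != w80[i])
        {
          printf ("mismatch at t=%d\n", i);
          return 1;
        }
    }
  printf ("schedules agree\n");
  return 0;
}

The circular buffer is why the per-block working set of transform_blk stays at 16 u32 words, which also keeps the burn_stack estimate it returns small.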
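
sha512_final above folds (nblocks, nblocks_high, count) into a 128-bit big-endian bit count before appending it to the last block: shift the 128-byte block count left by 7 with carry into the high word, add the buffered byte count, then shift by 3 with carry to convert bytes to bits. A minimal sketch of the same shift-and-carry arithmetic (bit_count_128 is a hypothetical helper, not a libgcrypt function):

/* sha512-bitcount-sketch.c - redo the length encoding from sha512_final. */
#include <stdint.h>
#include <stdio.h>

static void
bit_count_128 (uint64_t nblocks, uint64_t nblocks_high, uint64_t count,
               uint64_t *msb, uint64_t *lsb)
{
  uint64_t t;

  /* Multiply the 128-byte block count by 128 (7-bit shift with carry). */
  *lsb = nblocks << 7;
  *msb = (nblocks_high << 7) | (nblocks >> 57);

  /* Add the bytes still sitting in the buffer. */
  t = *lsb;
  if ((*lsb += count) < t)
    (*msb)++;

  /* Multiply by 8 to get bits (3-bit shift with carry). */
  t = *lsb;
  *lsb <<= 3;
  *msb = (*msb << 3) | (t >> 61);
}

int main (void)
{
  uint64_t msb, lsb;

  /* Example: 3 full blocks plus 5 buffered bytes = 389 bytes = 3112 bits. */
  bit_count_128 (3, 0, 5, &msb, &lsb);
  printf ("bit count = %llu:%llu\n",
          (unsigned long long)msb, (unsigned long long)lsb);
  return 0;
}

The SHA-1 path in sha1_final does the same dance with 32-bit halves and a 64-bit count (shifts of 6 and 26 instead of 7 and 57), since its blocks are 64 bytes.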
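
For callers, the one-shot _gcry_sha1_hash_buffer above sits behind the public gcry_md_hash_buffer entry point. A hedged usage sketch, checking the "abc" vector from selftests_sha1 (assumes a program linked against libgcrypt with -lgcrypt):

/* sha1-usage-sketch.c - verify SHA-1("abc") through the public API. */
#include <gcrypt.h>
#include <stdio.h>
#include <string.h>

int main (void)
{
  unsigned char digest[20];
  static const unsigned char expect[20] =
    { 0xA9, 0x99, 0x3E, 0x36, 0x47, 0x06, 0x81, 0x6A, 0xBA, 0x3E,
      0x25, 0x71, 0x78, 0x50, 0xC2, 0x6C, 0x9C, 0xD0, 0xD8, 0x9D };

  /* One-shot hash of a memory buffer; digest must hold 20 bytes for SHA-1. */
  gcry_md_hash_buffer (GCRY_MD_SHA1, digest, "abc", 3);

  printf ("SHA-1(\"abc\") %s\n",
          memcmp (digest, expect, 20) == 0 ? "matches" : "MISMATCH");
  return 0;
}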