diff --git a/cipher/salsa20.c b/cipher/salsa20.c
index 72b28b03..d75fe515 100644
--- a/cipher/salsa20.c
+++ b/cipher/salsa20.c
@@ -1,576 +1,577 @@
/* salsa20.c - Bernstein's Salsa20 cipher
* Copyright (C) 2012 Simon Josefsson, Niels Möller
* Copyright (C) 2013 g10 Code GmbH
*
* This file is part of Libgcrypt.
*
* Libgcrypt is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser general Public License as
* published by the Free Software Foundation; either version 2.1 of
* the License, or (at your option) any later version.
*
* Libgcrypt is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this program; if not, see .
*
* For a description of the algorithm, see:
* http://cr.yp.to/snuffle/spec.pdf
* http://cr.yp.to/snuffle/design.pdf
*/
/* The code is based on the code in Nettle
(git commit id 9d2d8ddaee35b91a4e1a32ae77cba04bea3480e7)
which in turn is based on
salsa20-ref.c version 20051118
D. J. Bernstein
Public domain.
*/
#include
#include
#include
#include
#include "types.h"
#include "g10lib.h"
#include "cipher.h"
#include "bufhelp.h"
/* USE_AMD64 indicates whether to compile with AMD64 code. */
#undef USE_AMD64
#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)
# define USE_AMD64 1
#endif
/* USE_ARM_NEON_ASM indicates whether to enable ARM NEON assembly code. */
#undef USE_ARM_NEON_ASM
-#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__)
-# if defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \
- defined(HAVE_GCC_INLINE_ASM_NEON)
+#ifdef ENABLE_NEON_SUPPORT
+# if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) \
+ && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) \
+ && defined(HAVE_GCC_INLINE_ASM_NEON)
# define USE_ARM_NEON_ASM 1
# endif
-#endif
+#endif /*ENABLE_NEON_SUPPORT*/
#define SALSA20_MIN_KEY_SIZE 16 /* Bytes. */
#define SALSA20_MAX_KEY_SIZE 32 /* Bytes. */
#define SALSA20_BLOCK_SIZE 64 /* Bytes. */
#define SALSA20_IV_SIZE 8 /* Bytes. */
#define SALSA20_INPUT_LENGTH 16 /* Bytes. */
/* Number of rounds. The standard uses 20 rounds. In any case the
number of rounds must be even. */
#define SALSA20_ROUNDS 20
#define SALSA20R12_ROUNDS 12
struct SALSA20_context_s;
typedef unsigned int (*salsa20_core_t) (u32 *dst, struct SALSA20_context_s *ctx,
unsigned int rounds);
typedef void (* salsa20_keysetup_t)(struct SALSA20_context_s *ctx,
const byte *key, int keylen);
typedef void (* salsa20_ivsetup_t)(struct SALSA20_context_s *ctx,
const byte *iv);
typedef struct SALSA20_context_s
{
/* Indices 1-4 and 11-14 holds the key (two identical copies for the
shorter key size), indices 0, 5, 10, 15 are constant, indices 6, 7
are the IV, and indices 8, 9 are the block counter:
C K K K
K C I I
B B C K
K K K C
*/
u32 input[SALSA20_INPUT_LENGTH];
u32 pad[SALSA20_INPUT_LENGTH];
unsigned int unused; /* bytes in the pad. */
#ifdef USE_ARM_NEON_ASM
int use_neon;
#endif
salsa20_keysetup_t keysetup;
salsa20_ivsetup_t ivsetup;
salsa20_core_t core;
} SALSA20_context_t;
/* The masking of the right shift is needed to allow n == 0 (using
just 32 - n and 64 - n results in undefined behaviour). Most uses
of these macros use a constant and non-zero rotation count. */
#define ROTL32(n,x) (((x)<<(n)) | ((x)>>((-(n)&31))))
#define LE_SWAP32(v) le_bswap32(v)
#define LE_READ_UINT32(p) buf_get_le32(p)
static void salsa20_setiv (void *context, const byte *iv, size_t ivlen);
static const char *selftest (void);
#ifdef USE_AMD64
/* AMD64 assembly implementations of Salsa20. */
void _gcry_salsa20_amd64_keysetup(u32 *ctxinput, const void *key, int keybits);
void _gcry_salsa20_amd64_ivsetup(u32 *ctxinput, const void *iv);
unsigned int
_gcry_salsa20_amd64_encrypt_blocks(u32 *ctxinput, const void *src, void *dst,
size_t len, int rounds);
/* Key setup wrapper for the AMD64 assembly implementation.  KEYLEN is
   given in bytes (16 or 32); the assembly routine expects bits, hence
   the multiplication by 8.  */
static void
salsa20_keysetup(SALSA20_context_t *ctx, const byte *key, int keylen)
{
_gcry_salsa20_amd64_keysetup(ctx->input, key, keylen * 8);
}
/* IV setup wrapper for the AMD64 assembly implementation; installs
   the 8 byte IV into the cipher state.  */
static void
salsa20_ivsetup(SALSA20_context_t *ctx, const byte *iv)
{
_gcry_salsa20_amd64_ivsetup(ctx->input, iv);
}
/* Produce one 64 byte key stream block into DST by encrypting an
   all-zero block in place with the AMD64 bulk routine (one block).
   Returns the stack burn depth reported by the assembly code.  */
static unsigned int
salsa20_core (u32 *dst, SALSA20_context_t *ctx, unsigned int rounds)
{
memset(dst, 0, SALSA20_BLOCK_SIZE);
return _gcry_salsa20_amd64_encrypt_blocks(ctx->input, dst, dst, 1, rounds);
}
#else /* USE_AMD64 */
#if 0
# define SALSA20_CORE_DEBUG(i) do { \
unsigned debug_j; \
for (debug_j = 0; debug_j < 16; debug_j++) \
{ \
if (debug_j == 0) \
fprintf(stderr, "%2d:", (i)); \
else if (debug_j % 4 == 0) \
fprintf(stderr, "\n "); \
fprintf(stderr, " %8x", pad[debug_j]); \
} \
fprintf(stderr, "\n"); \
} while (0)
#else
# define SALSA20_CORE_DEBUG(i)
#endif
#define QROUND(x0, x1, x2, x3) \
do { \
x1 ^= ROTL32 ( 7, x0 + x3); \
x2 ^= ROTL32 ( 9, x1 + x0); \
x3 ^= ROTL32 (13, x2 + x1); \
x0 ^= ROTL32 (18, x3 + x2); \
} while(0)
/* Generic C implementation of the Salsa20 core: generate one 64 byte
   key stream block into DST from the state in CTX using ROUNDS
   rounds (two per loop iteration), then advance the 64 bit block
   counter in words 8/9.  Returns the number of stack bytes the
   caller should burn.  */
static unsigned int
salsa20_core (u32 *dst, SALSA20_context_t *ctx, unsigned rounds)
{
u32 pad[SALSA20_INPUT_LENGTH], *src = ctx->input;
unsigned int i;
/* Work on a copy so the state itself is only changed by the
   counter increment below.  */
memcpy (pad, src, sizeof(pad));
for (i = 0; i < rounds; i += 2)
{
SALSA20_CORE_DEBUG (i);
/* Column round: quarter-rounds down the columns of the 4x4
   state matrix.  */
QROUND (pad[0], pad[4], pad[8], pad[12]);
QROUND (pad[5], pad[9], pad[13], pad[1] );
QROUND (pad[10], pad[14], pad[2], pad[6] );
QROUND (pad[15], pad[3], pad[7], pad[11]);
SALSA20_CORE_DEBUG (i+1);
/* Row round: quarter-rounds along the rows.  */
QROUND (pad[0], pad[1], pad[2], pad[3] );
QROUND (pad[5], pad[6], pad[7], pad[4] );
QROUND (pad[10], pad[11], pad[8], pad[9] );
QROUND (pad[15], pad[12], pad[13], pad[14]);
}
SALSA20_CORE_DEBUG (i);
/* Feed-forward: add the original input words and store the result
   little endian.  */
for (i = 0; i < SALSA20_INPUT_LENGTH; i++)
{
u32 t = pad[i] + src[i];
dst[i] = LE_SWAP32 (t);
}
/* Update counter. */
if (!++src[8])
src[9]++;
/* burn_stack */
return ( 3*sizeof (void*) \
+ 2*sizeof (void*) \
+ 64 \
+ sizeof (unsigned int) \
+ sizeof (u32) );
}
#undef QROUND
#undef SALSA20_CORE_DEBUG
/* Generic key setup.  KEYLEN is 16 or 32 bytes (checked by the
   caller).  Installs the sigma/tau constants and the key words into
   the 4x4 state matrix.  */
static void
salsa20_keysetup(SALSA20_context_t *ctx, const byte *key, int keylen)
{
  /* Little endian encoding of "expand 32-byte k"; for the 128 bit
     variant the "32" is patched to "16" below.  */
  ctx->input[0]  = 0x61707865; /* "apxe" */
  ctx->input[5]  = 0x3320646e; /* "3 dn" */
  ctx->input[10] = 0x79622d32; /* "yb-2" */
  ctx->input[15] = 0x6b206574; /* "k et" */

  /* The first 128 key bits always go into words 1..4.  */
  ctx->input[1] = LE_READ_UINT32(key + 0);
  ctx->input[2] = LE_READ_UINT32(key + 4);
  ctx->input[3] = LE_READ_UINT32(key + 8);
  ctx->input[4] = LE_READ_UINT32(key + 12);

  if (keylen != SALSA20_MAX_KEY_SIZE)
    {
      /* 128 bit key: duplicate the key material into words 11..14
         and fix the constants up to read "expand 16-byte k".  */
      ctx->input[11] = ctx->input[1];
      ctx->input[12] = ctx->input[2];
      ctx->input[13] = ctx->input[3];
      ctx->input[14] = ctx->input[4];
      ctx->input[5] -= 0x02000000; /* "3 dn" -> "1 dn".  */
      ctx->input[10] += 0x00000004; /* "yb-2" -> "yb-6".  */
    }
  else
    {
      /* 256 bit key: the second half fills words 11..14.  */
      ctx->input[11] = LE_READ_UINT32(key + 16);
      ctx->input[12] = LE_READ_UINT32(key + 20);
      ctx->input[13] = LE_READ_UINT32(key + 24);
      ctx->input[14] = LE_READ_UINT32(key + 28);
    }
}
/* Generic IV setup: words 6/7 hold the 64 bit nonce, words 8/9 the
   block counter, which restarts at zero for every fresh IV.  */
static void salsa20_ivsetup(SALSA20_context_t *ctx, const byte *iv)
{
  ctx->input[8] = 0;
  ctx->input[9] = 0;
  ctx->input[6] = LE_READ_UINT32(iv + 0);
  ctx->input[7] = LE_READ_UINT32(iv + 4);
}
#endif /*!USE_AMD64*/
#ifdef USE_ARM_NEON_ASM
/* ARM NEON implementation of Salsa20. */
unsigned int
_gcry_arm_neon_salsa20_encrypt(void *c, const void *m, unsigned int nblks,
void *k, unsigned int rounds);
/* Produce one key stream block with the NEON bulk routine.
   NOTE(review): passing a NULL source appears to make the assembly
   emit the raw key stream into DST -- confirm against the NEON
   implementation.  Returns the stack burn depth.  */
static unsigned int
salsa20_core_neon (u32 *dst, SALSA20_context_t *ctx, unsigned int rounds)
{
return _gcry_arm_neon_salsa20_encrypt(dst, NULL, 1, ctx->input, rounds);
}
/* IV setup for the NEON implementation, which uses a different state
   layout than the generic code: the 8 byte IV lives at words 8..9
   and the 64 bit block counter at words 10..11.  */
static void salsa20_ivsetup_neon(SALSA20_context_t *ctx, const byte *iv)
{
memcpy(ctx->input + 8, iv, 8);
/* Reset the block counter. */
memset(ctx->input + 10, 0, 8);
}
/* Key setup for the NEON implementation.  Layout differs from the
   generic code: key material occupies words 0..7 (a 128 bit key is
   stored twice) and the sigma/tau constant words 12..15.  The sigma
   arrays are exactly 16 bytes and intentionally not NUL-terminated
   (legal in C: the terminator of the 16-char literal is dropped).  */
static void
salsa20_keysetup_neon(SALSA20_context_t *ctx, const byte *key, int klen)
{
static const unsigned char sigma32[16] = "expand 32-byte k";
static const unsigned char sigma16[16] = "expand 16-byte k";
if (klen == 16)
{
memcpy (ctx->input, key, 16);
memcpy (ctx->input + 4, key, 16); /* Duplicate 128-bit key. */
memcpy (ctx->input + 12, sigma16, 16);
}
else
{
/* 32-byte key */
memcpy (ctx->input, key, 32);
memcpy (ctx->input + 12, sigma32, 16);
}
}
#endif /*USE_ARM_NEON_ASM*/
/* Validate KEYLEN, run the one-time self-test, select the best
   available implementation, install KEY and default to a zero nonce.
   Returns 0 on success or GPG_ERR_SELFTEST_FAILED /
   GPG_ERR_INV_KEYLEN.  */
static gcry_err_code_t
salsa20_do_setkey (SALSA20_context_t *ctx,
const byte *key, unsigned int keylen)
{
static int initialized;
static const char *selftest_failed;
/* NOTE(review): this lazy self-test initialization is not protected
   against concurrent callers -- presumably serialized at a higher
   layer; verify.  */
if (!initialized )
{
initialized = 1;
selftest_failed = selftest ();
if (selftest_failed)
log_error ("SALSA20 selftest failed (%s)\n", selftest_failed );
}
if (selftest_failed)
return GPG_ERR_SELFTEST_FAILED;
/* Only 128 and 256 bit keys are accepted.  */
if (keylen != SALSA20_MIN_KEY_SIZE
&& keylen != SALSA20_MAX_KEY_SIZE)
return GPG_ERR_INV_KEYLEN;
/* Default ops. */
ctx->keysetup = salsa20_keysetup;
ctx->ivsetup = salsa20_ivsetup;
ctx->core = salsa20_core;
#ifdef USE_ARM_NEON_ASM
ctx->use_neon = (_gcry_get_hw_features () & HWF_ARM_NEON) != 0;
if (ctx->use_neon)
{
/* Use ARM NEON ops instead. */
ctx->keysetup = salsa20_keysetup_neon;
ctx->ivsetup = salsa20_ivsetup_neon;
ctx->core = salsa20_core_neon;
}
#endif
ctx->keysetup (ctx, key, keylen);
/* We default to a zero nonce. */
salsa20_setiv (ctx, NULL, 0);
return 0;
}
/* Public setkey entry point; delegates to salsa20_do_setkey and then
   clears an upper bound of the stack used by the setkey path.  */
static gcry_err_code_t
salsa20_setkey (void *context, const byte *key, unsigned int keylen)
{
  gcry_err_code_t rc;
  SALSA20_context_t *ctx = context;

  rc = salsa20_do_setkey (ctx, key, keylen);
  _gcry_burn_stack (4 + sizeof (void *) + 4 * sizeof (void *));
  return rc;
}
/* Install an IV.  Anything other than a non-NULL, 8 byte IV results
   in the all-zero nonce (with a warning for a wrong length); the
   buffered key stream is always invalidated.  */
static void
salsa20_setiv (void *context, const byte *iv, size_t ivlen)
{
  SALSA20_context_t *ctx = context;
  byte tmp[SALSA20_IV_SIZE];

  if (iv && ivlen == SALSA20_IV_SIZE)
    memcpy (tmp, iv, SALSA20_IV_SIZE);
  else
    {
      if (iv)
        log_info ("WARNING: salsa20_setiv: bad ivlen=%u\n", (u32)ivlen);
      /* Missing or malformed IV: fall back to the zero nonce.  */
      memset (tmp, 0, sizeof(tmp));
    }

  ctx->ivsetup (ctx, tmp);

  /* Reset the unused pad bytes counter. */
  ctx->unused = 0;

  wipememory (tmp, sizeof(tmp));
}
/* Note: This function requires LENGTH > 0. */
/* Encrypt/decrypt LENGTH bytes from INBUF to OUTBUF using ROUNDS
   Salsa20 rounds.  Requires LENGTH > 0.  Leftover key stream bytes
   in ctx->pad are consumed first, then bulk assembly implementations
   handle whole blocks where available, and the generic core covers
   the remainder.  */
static void
salsa20_do_encrypt_stream (SALSA20_context_t *ctx,
                           byte *outbuf, const byte *inbuf,
                           size_t length, unsigned rounds)
{
  unsigned int nburn, burn = 0;

  /* Use up any key stream bytes left over from the previous call.  */
  if (ctx->unused)
    {
      unsigned char *p = (void*)ctx->pad;
      size_t n;

      gcry_assert (ctx->unused < SALSA20_BLOCK_SIZE);

      n = ctx->unused;
      if (n > length)
        n = length;
      buf_xor (outbuf, inbuf, p + SALSA20_BLOCK_SIZE - ctx->unused, n);
      length -= n;
      outbuf += n;
      inbuf += n;
      ctx->unused -= n;
      if (!length)
        return;
      gcry_assert (!ctx->unused);
    }

#ifdef USE_AMD64
  if (length >= SALSA20_BLOCK_SIZE)
    {
      size_t nblocks = length / SALSA20_BLOCK_SIZE;
      nburn = _gcry_salsa20_amd64_encrypt_blocks(ctx->input, inbuf, outbuf,
                                                 nblocks, rounds);
      burn = nburn > burn ? nburn : burn;
      length -= SALSA20_BLOCK_SIZE * nblocks;
      outbuf += SALSA20_BLOCK_SIZE * nblocks;
      inbuf  += SALSA20_BLOCK_SIZE * nblocks;
    }
#endif

#ifdef USE_ARM_NEON_ASM
  if (ctx->use_neon && length >= SALSA20_BLOCK_SIZE)
    {
      unsigned int nblocks = length / SALSA20_BLOCK_SIZE;
      /* Fix: the stack depth reported by the NEON routine was
         previously discarded; track it like the other paths so that
         _gcry_burn_stack below clears enough of the stack.  */
      nburn = _gcry_arm_neon_salsa20_encrypt (outbuf, inbuf, nblocks,
                                              ctx->input, rounds);
      burn = nburn > burn ? nburn : burn;
      length -= SALSA20_BLOCK_SIZE * nblocks;
      outbuf += SALSA20_BLOCK_SIZE * nblocks;
      inbuf  += SALSA20_BLOCK_SIZE * nblocks;
    }
#endif

  while (length > 0)
    {
      /* Create the next pad and bump the block counter.  Note that it
         is the user's duty to change to another nonce not later than
         after 2^70 processed bytes.  */
      nburn = ctx->core (ctx->pad, ctx, rounds);
      burn = nburn > burn ? nburn : burn;

      if (length <= SALSA20_BLOCK_SIZE)
        {
          /* Final (partial or exact) block; remember how many pad
             bytes remain for the next call.  */
          buf_xor (outbuf, inbuf, ctx->pad, length);
          ctx->unused = SALSA20_BLOCK_SIZE - length;
          break;
        }
      buf_xor (outbuf, inbuf, ctx->pad, SALSA20_BLOCK_SIZE);
      length -= SALSA20_BLOCK_SIZE;
      outbuf += SALSA20_BLOCK_SIZE;
      inbuf  += SALSA20_BLOCK_SIZE;
    }

  _gcry_burn_stack (burn);
}
/* Public stream encrypt/decrypt entry point for Salsa20/20.  */
static void
salsa20_encrypt_stream (void *context,
                        byte *outbuf, const byte *inbuf, size_t length)
{
  SALSA20_context_t *ctx = context;

  if (!length)
    return;

  salsa20_do_encrypt_stream (ctx, outbuf, inbuf, length, SALSA20_ROUNDS);
}
/* Public stream encrypt/decrypt entry point for the reduced-round
   Salsa20/12 variant.  */
static void
salsa20r12_encrypt_stream (void *context,
                           byte *outbuf, const byte *inbuf, size_t length)
{
  SALSA20_context_t *ctx = context;

  if (!length)
    return;

  salsa20_do_encrypt_stream (ctx, outbuf, inbuf, length, SALSA20R12_ROUNDS);
}
/* Basic consistency test of the Salsa20 implementation.  Returns
   NULL on success or a static string describing the failure.
   Fixes: `i' is now size_t to avoid the signed/unsigned comparison
   with `sizeof buf', and the key vector is const (it is only ever
   passed to a const parameter).  */
static const char*
selftest (void)
{
  SALSA20_context_t ctx;
  byte scratch[8+1];
  byte buf[256+64+4];
  size_t i;

  /* 256 bit key with a single leading 1 bit, zero nonce, and the
     first 8 expected key stream bytes.  */
  static const byte key_1[] =
    { 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
  static const byte nonce_1[] =
    { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
  static const byte plaintext_1[] =
    { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
  static const byte ciphertext_1[] =
    { 0xE3, 0xBE, 0x8F, 0xDD, 0x8B, 0xEC, 0xA2, 0xE3};

  /* Test 1: encrypt a known plaintext; the guard byte scratch[8]
     detects writes beyond the requested 8 bytes.  */
  salsa20_setkey (&ctx, key_1, sizeof key_1);
  salsa20_setiv (&ctx, nonce_1, sizeof nonce_1);
  scratch[8] = 0;
  salsa20_encrypt_stream (&ctx, scratch, plaintext_1, sizeof plaintext_1);
  if (memcmp (scratch, ciphertext_1, sizeof ciphertext_1))
    return "Salsa20 encryption test 1 failed.";
  if (scratch[8])
    return "Salsa20 wrote too much.";

  /* Decryption is the same operation and must restore the plaintext.  */
  salsa20_setkey( &ctx, key_1, sizeof(key_1));
  salsa20_setiv (&ctx, nonce_1, sizeof nonce_1);
  salsa20_encrypt_stream (&ctx, scratch, scratch, sizeof plaintext_1);
  if (memcmp (scratch, plaintext_1, sizeof plaintext_1))
    return "Salsa20 decryption test 1 failed.";

  /* Test 2: encrypting in one big chunk and in odd-sized chunks must
     produce the same key stream (exercises the ctx->unused path).  */
  for (i = 0; i < sizeof buf; i++)
    buf[i] = i;
  salsa20_setkey (&ctx, key_1, sizeof key_1);
  salsa20_setiv (&ctx, nonce_1, sizeof nonce_1);
  /*encrypt*/
  salsa20_encrypt_stream (&ctx, buf, buf, sizeof buf);
  /*decrypt*/
  salsa20_setkey (&ctx, key_1, sizeof key_1);
  salsa20_setiv (&ctx, nonce_1, sizeof nonce_1);
  salsa20_encrypt_stream (&ctx, buf, buf, 1);
  salsa20_encrypt_stream (&ctx, buf+1, buf+1, (sizeof buf)-1-1);
  salsa20_encrypt_stream (&ctx, buf+(sizeof buf)-1, buf+(sizeof buf)-1, 1);

  for (i = 0; i < sizeof buf; i++)
    if (buf[i] != (byte)i)
      return "Salsa20 encryption test 2 failed.";

  return NULL;
}
/* Cipher specification for Salsa20 with the full 20 rounds.  */
gcry_cipher_spec_t _gcry_cipher_spec_salsa20 =
{
GCRY_CIPHER_SALSA20,
{0, 0}, /* flags */
"SALSA20", /* name */
NULL, /* aliases */
NULL, /* oids */
1, /* blocksize in bytes. */
SALSA20_MAX_KEY_SIZE*8, /* standard key length in bits. */
sizeof (SALSA20_context_t),
salsa20_setkey,
/* NOTE(review): the NULL slots below are presumed to be the block
   encrypt/decrypt and auxiliary hooks, unused by this stream
   cipher -- verify against gcry_cipher_spec_t in cipher.h.  */
NULL,
NULL,
salsa20_encrypt_stream, /* Stream encrypt. */
salsa20_encrypt_stream, /* Stream decrypt is the same operation. */
NULL,
NULL,
salsa20_setiv /* Set IV/nonce. */
};
/* Cipher specification for the reduced-round Salsa20/12 variant.  */
gcry_cipher_spec_t _gcry_cipher_spec_salsa20r12 =
{
GCRY_CIPHER_SALSA20R12,
{0, 0}, /* flags */
"SALSA20R12", /* name */
NULL, /* aliases */
NULL, /* oids */
1, /* blocksize in bytes. */
SALSA20_MAX_KEY_SIZE*8, /* standard key length in bits. */
sizeof (SALSA20_context_t),
salsa20_setkey,
/* NOTE(review): NULL slots presumed block encrypt/decrypt and
   auxiliary hooks, unused here -- verify against cipher.h.  */
NULL,
NULL,
salsa20r12_encrypt_stream, /* Stream encrypt. */
salsa20r12_encrypt_stream, /* Stream decrypt is the same operation. */
NULL,
NULL,
salsa20_setiv /* Set IV/nonce. */
};
diff --git a/cipher/serpent.c b/cipher/serpent.c
index 8e647d4f..0be49da4 100644
--- a/cipher/serpent.c
+++ b/cipher/serpent.c
@@ -1,1324 +1,1324 @@
/* serpent.c - Implementation of the Serpent encryption algorithm.
* Copyright (C) 2003, 2004, 2005 Free Software Foundation, Inc.
*
* This file is part of Libgcrypt.
*
* Libgcrypt is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser general Public License as
* published by the Free Software Foundation; either version 2.1 of
* the License, or (at your option) any later version.
*
* Libgcrypt is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
* 02111-1307, USA.
*/
#include
#include
#include
#include "types.h"
#include "g10lib.h"
#include "cipher.h"
#include "bithelp.h"
#include "bufhelp.h"
#include "cipher-selftest.h"
/* USE_SSE2 indicates whether to compile with AMD64 SSE2 code. */
#undef USE_SSE2
#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)
# define USE_SSE2 1
#endif
/* USE_AVX2 indicates whether to compile with AMD64 AVX2 code. */
#undef USE_AVX2
#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)
# if defined(ENABLE_AVX2_SUPPORT)
# define USE_AVX2 1
# endif
#endif
/* USE_NEON indicates whether to enable ARM NEON assembly code. */
#undef USE_NEON
-#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__)
-# if defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \
- defined(HAVE_GCC_INLINE_ASM_NEON)
+#ifdef ENABLE_NEON_SUPPORT
+# if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) \
+ && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) \
+ && defined(HAVE_GCC_INLINE_ASM_NEON)
# define USE_NEON 1
# endif
-#endif
-
+#endif /*ENABLE_NEON_SUPPORT*/
/* Number of rounds per Serpent encrypt/decrypt operation. */
#define ROUNDS 32
/* Magic number, used during generating of the subkeys. */
#define PHI 0x9E3779B9
/* Serpent works on 128 bit blocks. */
typedef u32 serpent_block_t[4];
/* Serpent key, provided by the user. If the original key is shorter
than 256 bits, it is padded. */
typedef u32 serpent_key_t[8];
/* The key schedule consists of 33 128 bit subkeys. */
typedef u32 serpent_subkeys_t[ROUNDS + 1][4];
/* A Serpent context. */
typedef struct serpent_context
{
serpent_subkeys_t keys; /* Generated subkeys. */
#ifdef USE_AVX2
int use_avx2;
#endif
#ifdef USE_NEON
int use_neon;
#endif
} serpent_context_t;
#ifdef USE_SSE2
/* Assembler implementations of Serpent using SSE2. Process 8 block in
parallel.
*/
extern void _gcry_serpent_sse2_ctr_enc(serpent_context_t *ctx,
unsigned char *out,
const unsigned char *in,
unsigned char *ctr);
extern void _gcry_serpent_sse2_cbc_dec(serpent_context_t *ctx,
unsigned char *out,
const unsigned char *in,
unsigned char *iv);
extern void _gcry_serpent_sse2_cfb_dec(serpent_context_t *ctx,
unsigned char *out,
const unsigned char *in,
unsigned char *iv);
#endif
#ifdef USE_AVX2
/* Assembler implementations of Serpent using SSE2. Process 16 block in
parallel.
*/
extern void _gcry_serpent_avx2_ctr_enc(serpent_context_t *ctx,
unsigned char *out,
const unsigned char *in,
unsigned char *ctr);
extern void _gcry_serpent_avx2_cbc_dec(serpent_context_t *ctx,
unsigned char *out,
const unsigned char *in,
unsigned char *iv);
extern void _gcry_serpent_avx2_cfb_dec(serpent_context_t *ctx,
unsigned char *out,
const unsigned char *in,
unsigned char *iv);
#endif
#ifdef USE_NEON
/* Assembler implementations of Serpent using ARM NEON. Process 8 block in
parallel.
*/
extern void _gcry_serpent_neon_ctr_enc(serpent_context_t *ctx,
unsigned char *out,
const unsigned char *in,
unsigned char *ctr);
extern void _gcry_serpent_neon_cbc_dec(serpent_context_t *ctx,
unsigned char *out,
const unsigned char *in,
unsigned char *iv);
extern void _gcry_serpent_neon_cfb_dec(serpent_context_t *ctx,
unsigned char *out,
const unsigned char *in,
unsigned char *iv);
#endif
/* A prototype. */
static const char *serpent_test (void);
/*
* These are the S-Boxes of Serpent from following research paper.
*
* D. A. Osvik, “Speeding up Serpent,” in Third AES Candidate Conference,
* (New York, New York, USA), p. 317–329, National Institute of Standards and
* Technology, 2000.
*
* Paper is also available at: http://www.ii.uib.no/~osvik/pub/aes3.pdf
*
*/
#define SBOX0(r0, r1, r2, r3, w, x, y, z) \
{ \
u32 r4; \
\
r3 ^= r0; r4 = r1; \
r1 &= r3; r4 ^= r2; \
r1 ^= r0; r0 |= r3; \
r0 ^= r4; r4 ^= r3; \
r3 ^= r2; r2 |= r1; \
r2 ^= r4; r4 = ~r4; \
r4 |= r1; r1 ^= r3; \
r1 ^= r4; r3 |= r0; \
r1 ^= r3; r4 ^= r3; \
\
w = r1; x = r4; y = r2; z = r0; \
}
#define SBOX0_INVERSE(r0, r1, r2, r3, w, x, y, z) \
{ \
u32 r4; \
\
r2 = ~r2; r4 = r1; \
r1 |= r0; r4 = ~r4; \
r1 ^= r2; r2 |= r4; \
r1 ^= r3; r0 ^= r4; \
r2 ^= r0; r0 &= r3; \
r4 ^= r0; r0 |= r1; \
r0 ^= r2; r3 ^= r4; \
r2 ^= r1; r3 ^= r0; \
r3 ^= r1; \
r2 &= r3; \
r4 ^= r2; \
\
w = r0; x = r4; y = r1; z = r3; \
}
#define SBOX1(r0, r1, r2, r3, w, x, y, z) \
{ \
u32 r4; \
\
r0 = ~r0; r2 = ~r2; \
r4 = r0; r0 &= r1; \
r2 ^= r0; r0 |= r3; \
r3 ^= r2; r1 ^= r0; \
r0 ^= r4; r4 |= r1; \
r1 ^= r3; r2 |= r0; \
r2 &= r4; r0 ^= r1; \
r1 &= r2; \
r1 ^= r0; r0 &= r2; \
r0 ^= r4; \
\
w = r2; x = r0; y = r3; z = r1; \
}
#define SBOX1_INVERSE(r0, r1, r2, r3, w, x, y, z) \
{ \
u32 r4; \
\
r4 = r1; r1 ^= r3; \
r3 &= r1; r4 ^= r2; \
r3 ^= r0; r0 |= r1; \
r2 ^= r3; r0 ^= r4; \
r0 |= r2; r1 ^= r3; \
r0 ^= r1; r1 |= r3; \
r1 ^= r0; r4 = ~r4; \
r4 ^= r1; r1 |= r0; \
r1 ^= r0; \
r1 |= r4; \
r3 ^= r1; \
\
w = r4; x = r0; y = r3; z = r2; \
}
#define SBOX2(r0, r1, r2, r3, w, x, y, z) \
{ \
u32 r4; \
\
r4 = r0; r0 &= r2; \
r0 ^= r3; r2 ^= r1; \
r2 ^= r0; r3 |= r4; \
r3 ^= r1; r4 ^= r2; \
r1 = r3; r3 |= r4; \
r3 ^= r0; r0 &= r1; \
r4 ^= r0; r1 ^= r3; \
r1 ^= r4; r4 = ~r4; \
\
w = r2; x = r3; y = r1; z = r4; \
}
#define SBOX2_INVERSE(r0, r1, r2, r3, w, x, y, z) \
{ \
u32 r4; \
\
r2 ^= r3; r3 ^= r0; \
r4 = r3; r3 &= r2; \
r3 ^= r1; r1 |= r2; \
r1 ^= r4; r4 &= r3; \
r2 ^= r3; r4 &= r0; \
r4 ^= r2; r2 &= r1; \
r2 |= r0; r3 = ~r3; \
r2 ^= r3; r0 ^= r3; \
r0 &= r1; r3 ^= r4; \
r3 ^= r0; \
\
w = r1; x = r4; y = r2; z = r3; \
}
#define SBOX3(r0, r1, r2, r3, w, x, y, z) \
{ \
u32 r4; \
\
r4 = r0; r0 |= r3; \
r3 ^= r1; r1 &= r4; \
r4 ^= r2; r2 ^= r3; \
r3 &= r0; r4 |= r1; \
r3 ^= r4; r0 ^= r1; \
r4 &= r0; r1 ^= r3; \
r4 ^= r2; r1 |= r0; \
r1 ^= r2; r0 ^= r3; \
r2 = r1; r1 |= r3; \
r1 ^= r0; \
\
w = r1; x = r2; y = r3; z = r4; \
}
#define SBOX3_INVERSE(r0, r1, r2, r3, w, x, y, z) \
{ \
u32 r4; \
\
r4 = r2; r2 ^= r1; \
r0 ^= r2; r4 &= r2; \
r4 ^= r0; r0 &= r1; \
r1 ^= r3; r3 |= r4; \
r2 ^= r3; r0 ^= r3; \
r1 ^= r4; r3 &= r2; \
r3 ^= r1; r1 ^= r0; \
r1 |= r2; r0 ^= r3; \
r1 ^= r4; \
r0 ^= r1; \
\
w = r2; x = r1; y = r3; z = r0; \
}
#define SBOX4(r0, r1, r2, r3, w, x, y, z) \
{ \
u32 r4; \
\
r1 ^= r3; r3 = ~r3; \
r2 ^= r3; r3 ^= r0; \
r4 = r1; r1 &= r3; \
r1 ^= r2; r4 ^= r3; \
r0 ^= r4; r2 &= r4; \
r2 ^= r0; r0 &= r1; \
r3 ^= r0; r4 |= r1; \
r4 ^= r0; r0 |= r3; \
r0 ^= r2; r2 &= r3; \
r0 = ~r0; r4 ^= r2; \
\
w = r1; x = r4; y = r0; z = r3; \
}
#define SBOX4_INVERSE(r0, r1, r2, r3, w, x, y, z) \
{ \
u32 r4; \
\
r4 = r2; r2 &= r3; \
r2 ^= r1; r1 |= r3; \
r1 &= r0; r4 ^= r2; \
r4 ^= r1; r1 &= r2; \
r0 = ~r0; r3 ^= r4; \
r1 ^= r3; r3 &= r0; \
r3 ^= r2; r0 ^= r1; \
r2 &= r0; r3 ^= r0; \
r2 ^= r4; \
r2 |= r3; r3 ^= r0; \
r2 ^= r1; \
\
w = r0; x = r3; y = r2; z = r4; \
}
#define SBOX5(r0, r1, r2, r3, w, x, y, z) \
{ \
u32 r4; \
\
r0 ^= r1; r1 ^= r3; \
r3 = ~r3; r4 = r1; \
r1 &= r0; r2 ^= r3; \
r1 ^= r2; r2 |= r4; \
r4 ^= r3; r3 &= r1; \
r3 ^= r0; r4 ^= r1; \
r4 ^= r2; r2 ^= r0; \
r0 &= r3; r2 = ~r2; \
r0 ^= r4; r4 |= r3; \
r2 ^= r4; \
\
w = r1; x = r3; y = r0; z = r2; \
}
#define SBOX5_INVERSE(r0, r1, r2, r3, w, x, y, z) \
{ \
u32 r4; \
\
r1 = ~r1; r4 = r3; \
r2 ^= r1; r3 |= r0; \
r3 ^= r2; r2 |= r1; \
r2 &= r0; r4 ^= r3; \
r2 ^= r4; r4 |= r0; \
r4 ^= r1; r1 &= r2; \
r1 ^= r3; r4 ^= r2; \
r3 &= r4; r4 ^= r1; \
r3 ^= r4; r4 = ~r4; \
r3 ^= r0; \
\
w = r1; x = r4; y = r3; z = r2; \
}
#define SBOX6(r0, r1, r2, r3, w, x, y, z) \
{ \
u32 r4; \
\
r2 = ~r2; r4 = r3; \
r3 &= r0; r0 ^= r4; \
r3 ^= r2; r2 |= r4; \
r1 ^= r3; r2 ^= r0; \
r0 |= r1; r2 ^= r1; \
r4 ^= r0; r0 |= r3; \
r0 ^= r2; r4 ^= r3; \
r4 ^= r0; r3 = ~r3; \
r2 &= r4; \
r2 ^= r3; \
\
w = r0; x = r1; y = r4; z = r2; \
}
#define SBOX6_INVERSE(r0, r1, r2, r3, w, x, y, z) \
{ \
u32 r4; \
\
r0 ^= r2; r4 = r2; \
r2 &= r0; r4 ^= r3; \
r2 = ~r2; r3 ^= r1; \
r2 ^= r3; r4 |= r0; \
r0 ^= r2; r3 ^= r4; \
r4 ^= r1; r1 &= r3; \
r1 ^= r0; r0 ^= r3; \
r0 |= r2; r3 ^= r1; \
r4 ^= r0; \
\
w = r1; x = r2; y = r4; z = r3; \
}
#define SBOX7(r0, r1, r2, r3, w, x, y, z) \
{ \
u32 r4; \
\
r4 = r1; r1 |= r2; \
r1 ^= r3; r4 ^= r2; \
r2 ^= r1; r3 |= r4; \
r3 &= r0; r4 ^= r2; \
r3 ^= r1; r1 |= r4; \
r1 ^= r0; r0 |= r4; \
r0 ^= r2; r1 ^= r4; \
r2 ^= r1; r1 &= r0; \
r1 ^= r4; r2 = ~r2; \
r2 |= r0; \
r4 ^= r2; \
\
w = r4; x = r3; y = r1; z = r0; \
}
#define SBOX7_INVERSE(r0, r1, r2, r3, w, x, y, z) \
{ \
u32 r4; \
\
r4 = r2; r2 ^= r0; \
r0 &= r3; r4 |= r3; \
r2 = ~r2; r3 ^= r1; \
r1 |= r0; r0 ^= r2; \
r2 &= r4; r3 &= r4; \
r1 ^= r2; r2 ^= r0; \
r0 |= r2; r4 ^= r1; \
r0 ^= r3; r3 ^= r4; \
r4 |= r0; r3 ^= r2; \
r4 ^= r2; \
\
w = r3; x = r0; y = r1; z = r4; \
}
/* XOR BLOCK1 into BLOCK0. */
#define BLOCK_XOR(block0, block1) \
{ \
block0[0] ^= block1[0]; \
block0[1] ^= block1[1]; \
block0[2] ^= block1[2]; \
block0[3] ^= block1[3]; \
}
/* Copy BLOCK_SRC to BLOCK_DST. */
#define BLOCK_COPY(block_dst, block_src) \
{ \
block_dst[0] = block_src[0]; \
block_dst[1] = block_src[1]; \
block_dst[2] = block_src[2]; \
block_dst[3] = block_src[3]; \
}
/* Apply SBOX number WHICH to to the block found in ARRAY0, writing
the output to the block found in ARRAY1. */
#define SBOX(which, array0, array1) \
SBOX##which (array0[0], array0[1], array0[2], array0[3], \
array1[0], array1[1], array1[2], array1[3]);
/* Apply inverse SBOX number WHICH to to the block found in ARRAY0, writing
the output to the block found in ARRAY1. */
#define SBOX_INVERSE(which, array0, array1) \
SBOX##which##_INVERSE (array0[0], array0[1], array0[2], array0[3], \
array1[0], array1[1], array1[2], array1[3]);
/* Apply the linear transformation to BLOCK. */
#define LINEAR_TRANSFORMATION(block) \
{ \
block[0] = rol (block[0], 13); \
block[2] = rol (block[2], 3); \
block[1] = block[1] ^ block[0] ^ block[2]; \
block[3] = block[3] ^ block[2] ^ (block[0] << 3); \
block[1] = rol (block[1], 1); \
block[3] = rol (block[3], 7); \
block[0] = block[0] ^ block[1] ^ block[3]; \
block[2] = block[2] ^ block[3] ^ (block[1] << 7); \
block[0] = rol (block[0], 5); \
block[2] = rol (block[2], 22); \
}
/* Apply the inverse linear transformation to BLOCK. */
#define LINEAR_TRANSFORMATION_INVERSE(block) \
{ \
block[2] = ror (block[2], 22); \
block[0] = ror (block[0] , 5); \
block[2] = block[2] ^ block[3] ^ (block[1] << 7); \
block[0] = block[0] ^ block[1] ^ block[3]; \
block[3] = ror (block[3], 7); \
block[1] = ror (block[1], 1); \
block[3] = block[3] ^ block[2] ^ (block[0] << 3); \
block[1] = block[1] ^ block[0] ^ block[2]; \
block[2] = ror (block[2], 3); \
block[0] = ror (block[0], 13); \
}
/* Apply a Serpent round to BLOCK, using the SBOX number WHICH and the
subkeys contained in SUBKEYS. Use BLOCK_TMP as temporary storage.
This macro increments `round'. */
#define ROUND(which, subkeys, block, block_tmp) \
{ \
BLOCK_XOR (block, subkeys[round]); \
round++; \
SBOX (which, block, block_tmp); \
LINEAR_TRANSFORMATION (block_tmp); \
BLOCK_COPY (block, block_tmp); \
}
/* Apply the last Serpent round to BLOCK, using the SBOX number WHICH
and the subkeys contained in SUBKEYS. Use BLOCK_TMP as temporary
storage. The result will be stored in BLOCK_TMP. This macro
increments `round'. */
#define ROUND_LAST(which, subkeys, block, block_tmp) \
{ \
BLOCK_XOR (block, subkeys[round]); \
round++; \
SBOX (which, block, block_tmp); \
BLOCK_XOR (block_tmp, subkeys[round]); \
round++; \
}
/* Apply an inverse Serpent round to BLOCK, using the SBOX number
WHICH and the subkeys contained in SUBKEYS. Use BLOCK_TMP as
temporary storage. This macro increments `round'. */
#define ROUND_INVERSE(which, subkey, block, block_tmp) \
{ \
LINEAR_TRANSFORMATION_INVERSE (block); \
SBOX_INVERSE (which, block, block_tmp); \
BLOCK_XOR (block_tmp, subkey[round]); \
round--; \
BLOCK_COPY (block, block_tmp); \
}
/* Apply the first Serpent round to BLOCK, using the SBOX number WHICH
and the subkeys contained in SUBKEYS. Use BLOCK_TMP as temporary
storage. The result will be stored in BLOCK_TMP. This macro
increments `round'. */
#define ROUND_FIRST_INVERSE(which, subkeys, block, block_tmp) \
{ \
BLOCK_XOR (block, subkeys[round]); \
round--; \
SBOX_INVERSE (which, block, block_tmp); \
BLOCK_XOR (block_tmp, subkeys[round]); \
round--; \
}
/* Convert the user provided key KEY of KEY_LENGTH bytes into the
internally used format. */
/* Convert the user supplied key KEY of KEY_LENGTH bytes into the
   internally used 256 bit format, applying the padding required by
   the Serpent specification for shorter keys.  */
static void
serpent_key_prepare (const byte *key, unsigned int key_length,
                     serpent_key_t key_prepared)
{
  unsigned int words = key_length / 4;
  unsigned int i;

  /* Copy the key as little endian 32 bit words.  */
  for (i = 0; i < words; i++)
    key_prepared[i] = buf_get_le32 (key + i * 4);

  if (words < 8)
    {
      /* Pad with a single 1 bit followed by zeros, per the Serpent
         specification.  */
      key_prepared[words] = 0x00000001;
      for (i = words + 1; i < 8; i++)
        key_prepared[i] = 0;
    }
}
/* Derive the 33 subkeys from KEY and store them in SUBKEYS. */
/* Derive the 33 subkeys from KEY and store them in SUBKEYS.  The
   prekey is expanded with the affine recurrence and then pushed
   through the S-boxes in bitslice mode; all intermediate key
   material is wiped before returning.  */
static void
serpent_subkeys_generate (serpent_key_t key, serpent_subkeys_t subkeys)
{
u32 w[8]; /* The `prekey'. */
u32 ws[4];
u32 wt[4];
/* Initialize with key values. */
w[0] = key[0];
w[1] = key[1];
w[2] = key[2];
w[3] = key[3];
w[4] = key[4];
w[5] = key[5];
w[6] = key[6];
w[7] = key[7];
/* Expand to intermediate key using the affine recurrence.  Each
   step rewrites one word of the 8 word sliding window W and also
   stores it into WO.  */
#define EXPAND_KEY4(wo, r) \
wo[0] = w[(r+0)%8] = \
rol (w[(r+0)%8] ^ w[(r+3)%8] ^ w[(r+5)%8] ^ w[(r+7)%8] ^ PHI ^ (r+0), 11); \
wo[1] = w[(r+1)%8] = \
rol (w[(r+1)%8] ^ w[(r+4)%8] ^ w[(r+6)%8] ^ w[(r+0)%8] ^ PHI ^ (r+1), 11); \
wo[2] = w[(r+2)%8] = \
rol (w[(r+2)%8] ^ w[(r+5)%8] ^ w[(r+7)%8] ^ w[(r+1)%8] ^ PHI ^ (r+2), 11); \
wo[3] = w[(r+3)%8] = \
rol (w[(r+3)%8] ^ w[(r+6)%8] ^ w[(r+0)%8] ^ w[(r+2)%8] ^ PHI ^ (r+3), 11);
/* Produce eight prekey words: four into WS and four into WT.  */
#define EXPAND_KEY(r) \
EXPAND_KEY4(ws, (r)); \
EXPAND_KEY4(wt, (r + 4));
/* Calculate subkeys via S-Boxes, in bitslice mode.  The S-box
   sequence 3,2,1,0,7,6,5,4 repeats every eight subkeys.  */
EXPAND_KEY (0); SBOX (3, ws, subkeys[0]); SBOX (2, wt, subkeys[1]);
EXPAND_KEY (8); SBOX (1, ws, subkeys[2]); SBOX (0, wt, subkeys[3]);
EXPAND_KEY (16); SBOX (7, ws, subkeys[4]); SBOX (6, wt, subkeys[5]);
EXPAND_KEY (24); SBOX (5, ws, subkeys[6]); SBOX (4, wt, subkeys[7]);
EXPAND_KEY (32); SBOX (3, ws, subkeys[8]); SBOX (2, wt, subkeys[9]);
EXPAND_KEY (40); SBOX (1, ws, subkeys[10]); SBOX (0, wt, subkeys[11]);
EXPAND_KEY (48); SBOX (7, ws, subkeys[12]); SBOX (6, wt, subkeys[13]);
EXPAND_KEY (56); SBOX (5, ws, subkeys[14]); SBOX (4, wt, subkeys[15]);
EXPAND_KEY (64); SBOX (3, ws, subkeys[16]); SBOX (2, wt, subkeys[17]);
EXPAND_KEY (72); SBOX (1, ws, subkeys[18]); SBOX (0, wt, subkeys[19]);
EXPAND_KEY (80); SBOX (7, ws, subkeys[20]); SBOX (6, wt, subkeys[21]);
EXPAND_KEY (88); SBOX (5, ws, subkeys[22]); SBOX (4, wt, subkeys[23]);
EXPAND_KEY (96); SBOX (3, ws, subkeys[24]); SBOX (2, wt, subkeys[25]);
EXPAND_KEY (104); SBOX (1, ws, subkeys[26]); SBOX (0, wt, subkeys[27]);
EXPAND_KEY (112); SBOX (7, ws, subkeys[28]); SBOX (6, wt, subkeys[29]);
EXPAND_KEY (120); SBOX (5, ws, subkeys[30]); SBOX (4, wt, subkeys[31]);
EXPAND_KEY4 (ws, 128); SBOX (3, ws, subkeys[32]);
/* Erase the intermediate key material.  */
wipememory (ws, sizeof (ws));
wipememory (wt, sizeof (wt));
wipememory (w, sizeof (w));
}
/* Initialize CONTEXT with the key KEY of KEY_LENGTH bits. */
/* Initialize CONTEXT with the key KEY of KEY_LENGTH bytes: generate
   the subkeys, detect the usable SIMD implementations, and wipe the
   prepared key material.  */
static void
serpent_setkey_internal (serpent_context_t *context,
                         const byte *key, unsigned int key_length)
{
  serpent_key_t key_prepared;

  serpent_key_prepare (key, key_length, key_prepared);
  serpent_subkeys_generate (key_prepared, context->keys);

#ifdef USE_AVX2
  /* Enable the AVX2 bulk code when the CPU supports it.  */
  context->use_avx2 = (_gcry_get_hw_features () & HWF_INTEL_AVX2) ? 1 : 0;
#endif

#ifdef USE_NEON
  /* Enable the NEON bulk code when the CPU supports it.  */
  context->use_neon = (_gcry_get_hw_features () & HWF_ARM_NEON) ? 1 : 0;
#endif

  wipememory (key_prepared, sizeof(key_prepared));
}
/* Initialize CTX with the key KEY of KEY_LENGTH bytes. */
/* Public setkey entry point.  Runs the self-test once on first use;
   returns GPG_ERR_SELFTEST_FAILED if it ever failed, otherwise
   installs the key and returns GPG_ERR_NO_ERROR.  */
static gcry_err_code_t
serpent_setkey (void *ctx,
                const byte *key, unsigned int key_length)
{
  static const char *serpent_test_ret;
  static int serpent_init_done;
  serpent_context_t *context = ctx;

  if (!serpent_init_done)
    {
      /* Execute a self-test the first time, Serpent is used. */
      serpent_init_done = 1;
      serpent_test_ret = serpent_test ();
      if (serpent_test_ret)
        log_error ("Serpent test failure: %s\n", serpent_test_ret);
    }

  if (serpent_test_ret)
    return GPG_ERR_SELFTEST_FAILED;

  serpent_setkey_internal (context, key, key_length);
  return GPG_ERR_NO_ERROR;
}
/* Encrypt the 16 byte block at INPUT into OUTPUT using the subkeys
   in CONTEXT.  The block is processed as four little endian 32 bit
   words; the ROUND macros increment `round' as a side effect.  */
static void
serpent_encrypt_internal (serpent_context_t *context,
const byte *input, byte *output)
{
serpent_block_t b, b_next;
int round = 0;
/* Load the block as little endian words. */
b[0] = buf_get_le32 (input + 0);
b[1] = buf_get_le32 (input + 4);
b[2] = buf_get_le32 (input + 8);
b[3] = buf_get_le32 (input + 12);
/* 32 rounds; the S-box index cycles through 0..7 four times, and
   the last round applies an extra subkey instead of the linear
   transformation.  */
ROUND (0, context->keys, b, b_next);
ROUND (1, context->keys, b, b_next);
ROUND (2, context->keys, b, b_next);
ROUND (3, context->keys, b, b_next);
ROUND (4, context->keys, b, b_next);
ROUND (5, context->keys, b, b_next);
ROUND (6, context->keys, b, b_next);
ROUND (7, context->keys, b, b_next);
ROUND (0, context->keys, b, b_next);
ROUND (1, context->keys, b, b_next);
ROUND (2, context->keys, b, b_next);
ROUND (3, context->keys, b, b_next);
ROUND (4, context->keys, b, b_next);
ROUND (5, context->keys, b, b_next);
ROUND (6, context->keys, b, b_next);
ROUND (7, context->keys, b, b_next);
ROUND (0, context->keys, b, b_next);
ROUND (1, context->keys, b, b_next);
ROUND (2, context->keys, b, b_next);
ROUND (3, context->keys, b, b_next);
ROUND (4, context->keys, b, b_next);
ROUND (5, context->keys, b, b_next);
ROUND (6, context->keys, b, b_next);
ROUND (7, context->keys, b, b_next);
ROUND (0, context->keys, b, b_next);
ROUND (1, context->keys, b, b_next);
ROUND (2, context->keys, b, b_next);
ROUND (3, context->keys, b, b_next);
ROUND (4, context->keys, b, b_next);
ROUND (5, context->keys, b, b_next);
ROUND (6, context->keys, b, b_next);
ROUND_LAST (7, context->keys, b, b_next);
/* Store the result little endian. */
buf_put_le32 (output + 0, b_next[0]);
buf_put_le32 (output + 4, b_next[1]);
buf_put_le32 (output + 8, b_next[2]);
buf_put_le32 (output + 12, b_next[3]);
}
/* Decrypt the 16-byte block at INPUT into OUTPUT with the subkeys in
   CONTEXT.  Mirror image of serpent_encrypt_internal: the inverse
   rounds run with the S-box index counting down 7..0 four times.  */
static void
serpent_decrypt_internal (serpent_context_t *context,
                          const byte *input, byte *output)
{
  serpent_block_t b, b_next;
  /* NOTE(review): presumably decremented by the ROUND_*INVERSE macros
     to walk the subkey array backwards — verify against the macro
     definitions; not a dead variable.  */
  int round = ROUNDS;

  /* Load the ciphertext as four little-endian 32-bit words.  */
  b_next[0] = buf_get_le32 (input + 0);
  b_next[1] = buf_get_le32 (input + 4);
  b_next[2] = buf_get_le32 (input + 8);
  b_next[3] = buf_get_le32 (input + 12);

  /* The first inverse round undoes the encryption's ROUND_LAST.  */
  ROUND_FIRST_INVERSE (7, context->keys, b_next, b);
  ROUND_INVERSE (6, context->keys, b, b_next);
  ROUND_INVERSE (5, context->keys, b, b_next);
  ROUND_INVERSE (4, context->keys, b, b_next);
  ROUND_INVERSE (3, context->keys, b, b_next);
  ROUND_INVERSE (2, context->keys, b, b_next);
  ROUND_INVERSE (1, context->keys, b, b_next);
  ROUND_INVERSE (0, context->keys, b, b_next);
  ROUND_INVERSE (7, context->keys, b, b_next);
  ROUND_INVERSE (6, context->keys, b, b_next);
  ROUND_INVERSE (5, context->keys, b, b_next);
  ROUND_INVERSE (4, context->keys, b, b_next);
  ROUND_INVERSE (3, context->keys, b, b_next);
  ROUND_INVERSE (2, context->keys, b, b_next);
  ROUND_INVERSE (1, context->keys, b, b_next);
  ROUND_INVERSE (0, context->keys, b, b_next);
  ROUND_INVERSE (7, context->keys, b, b_next);
  ROUND_INVERSE (6, context->keys, b, b_next);
  ROUND_INVERSE (5, context->keys, b, b_next);
  ROUND_INVERSE (4, context->keys, b, b_next);
  ROUND_INVERSE (3, context->keys, b, b_next);
  ROUND_INVERSE (2, context->keys, b, b_next);
  ROUND_INVERSE (1, context->keys, b, b_next);
  ROUND_INVERSE (0, context->keys, b, b_next);
  ROUND_INVERSE (7, context->keys, b, b_next);
  ROUND_INVERSE (6, context->keys, b, b_next);
  ROUND_INVERSE (5, context->keys, b, b_next);
  ROUND_INVERSE (4, context->keys, b, b_next);
  ROUND_INVERSE (3, context->keys, b, b_next);
  ROUND_INVERSE (2, context->keys, b, b_next);
  ROUND_INVERSE (1, context->keys, b, b_next);
  ROUND_INVERSE (0, context->keys, b, b_next);

  /* Store the plaintext little-endian.  */
  buf_put_le32 (output + 0, b_next[0]);
  buf_put_le32 (output + 4, b_next[1]);
  buf_put_le32 (output + 8, b_next[2]);
  buf_put_le32 (output + 12, b_next[3]);
}
/* Cipher-spec entry point: encrypt one block from BUFFER_IN into
   BUFFER_OUT.  The return value is the stack burn depth for the
   caller.  */
static unsigned int
serpent_encrypt (void *ctx, byte *buffer_out, const byte *buffer_in)
{
  serpent_encrypt_internal (ctx, buffer_in, buffer_out);
  return /*burn_stack*/ (2 * sizeof (serpent_block_t));
}
/* Cipher-spec entry point: decrypt one block from BUFFER_IN into
   BUFFER_OUT.  The return value is the stack burn depth for the
   caller.  */
static unsigned int
serpent_decrypt (void *ctx, byte *buffer_out, const byte *buffer_in)
{
  serpent_decrypt_internal (ctx, buffer_in, buffer_out);
  return /*burn_stack*/ (2 * sizeof (serpent_block_t));
}
/* Bulk encryption of complete blocks in CTR mode.  This function is only
   intended for the bulk encryption feature of cipher.c.  CTR is expected to be
   of size sizeof(serpent_block_t).  Large runs are handed to the AVX2
   (16 blocks), SSE2/NEON (8 blocks) assembly where available; the
   remainder is done one block at a time in C.  */
void
_gcry_serpent_ctr_enc(void *context, unsigned char *ctr,
                      void *outbuf_arg, const void *inbuf_arg,
                      size_t nblocks)
{
  serpent_context_t *ctx = context;
  unsigned char *outbuf = outbuf_arg;
  const unsigned char *inbuf = inbuf_arg;
  unsigned char tmpbuf[sizeof(serpent_block_t)];
  /* Worst case for the generic C path; lowered to zero below when an
     assembly path handled everything.  */
  int burn_stack_depth = 2 * sizeof (serpent_block_t);
  int i;

#ifdef USE_AVX2
  if (ctx->use_avx2)
    {
      int did_use_avx2 = 0;

      /* Process data in 16 block chunks. */
      while (nblocks >= 16)
        {
          _gcry_serpent_avx2_ctr_enc(ctx, outbuf, inbuf, ctr);
          nblocks -= 16;
          outbuf += 16 * sizeof(serpent_block_t);
          inbuf += 16 * sizeof(serpent_block_t);
          did_use_avx2 = 1;
        }

      if (did_use_avx2)
        {
          /* serpent-avx2 assembly code does not use stack */
          if (nblocks == 0)
            burn_stack_depth = 0;
        }

      /* Use generic/sse2 code to handle smaller chunks... */
      /* TODO: use caching instead? */
    }
#endif

#ifdef USE_SSE2
  {
    int did_use_sse2 = 0;

    /* Process data in 8 block chunks. */
    while (nblocks >= 8)
      {
        _gcry_serpent_sse2_ctr_enc(ctx, outbuf, inbuf, ctr);
        nblocks -= 8;
        outbuf += 8 * sizeof(serpent_block_t);
        inbuf += 8 * sizeof(serpent_block_t);
        did_use_sse2 = 1;
      }

    if (did_use_sse2)
      {
        /* serpent-sse2 assembly code does not use stack */
        if (nblocks == 0)
          burn_stack_depth = 0;
      }

    /* Use generic code to handle smaller chunks... */
    /* TODO: use caching instead? */
  }
#endif

#ifdef USE_NEON
  if (ctx->use_neon)
    {
      int did_use_neon = 0;

      /* Process data in 8 block chunks. */
      while (nblocks >= 8)
        {
          _gcry_serpent_neon_ctr_enc(ctx, outbuf, inbuf, ctr);
          nblocks -= 8;
          outbuf += 8 * sizeof(serpent_block_t);
          inbuf += 8 * sizeof(serpent_block_t);
          did_use_neon = 1;
        }

      if (did_use_neon)
        {
          /* serpent-neon assembly code does not use stack */
          if (nblocks == 0)
            burn_stack_depth = 0;
        }

      /* Use generic code to handle smaller chunks... */
      /* TODO: use caching instead? */
    }
#endif

  /* Generic one-block-at-a-time fallback.  */
  for ( ;nblocks; nblocks-- )
    {
      /* Encrypt the counter. */
      serpent_encrypt_internal(ctx, ctr, tmpbuf);
      /* XOR the input with the encrypted counter and store in output. */
      buf_xor(outbuf, tmpbuf, inbuf, sizeof(serpent_block_t));
      outbuf += sizeof(serpent_block_t);
      inbuf += sizeof(serpent_block_t);
      /* Increment the counter: big-endian increment with carry, from
         the last byte towards the first, stopping once a byte does
         not wrap to zero.  */
      for (i = sizeof(serpent_block_t); i > 0; i--)
        {
          ctr[i-1]++;
          if (ctr[i-1])
            break;
        }
    }

  /* Clear the keystream block left on the stack.  */
  wipememory(tmpbuf, sizeof(tmpbuf));
  _gcry_burn_stack(burn_stack_depth);
}
/* Bulk decryption of complete blocks in CBC mode.  This function is only
   intended for the bulk encryption feature of cipher.c.  IV holds the
   previous ciphertext block (sizeof(serpent_block_t) bytes) and is
   updated in place.  */
void
_gcry_serpent_cbc_dec(void *context, unsigned char *iv,
                      void *outbuf_arg, const void *inbuf_arg,
                      size_t nblocks)
{
  serpent_context_t *ctx = context;
  unsigned char *outbuf = outbuf_arg;
  const unsigned char *inbuf = inbuf_arg;
  unsigned char savebuf[sizeof(serpent_block_t)];
  /* Worst case for the generic C path; lowered to zero below when an
     assembly path handled everything.  */
  int burn_stack_depth = 2 * sizeof (serpent_block_t);

#ifdef USE_AVX2
  if (ctx->use_avx2)
    {
      int did_use_avx2 = 0;

      /* Process data in 16 block chunks. */
      while (nblocks >= 16)
        {
          _gcry_serpent_avx2_cbc_dec(ctx, outbuf, inbuf, iv);
          nblocks -= 16;
          outbuf += 16 * sizeof(serpent_block_t);
          inbuf += 16 * sizeof(serpent_block_t);
          did_use_avx2 = 1;
        }

      if (did_use_avx2)
        {
          /* serpent-avx2 assembly code does not use stack */
          if (nblocks == 0)
            burn_stack_depth = 0;
        }

      /* Use generic/sse2 code to handle smaller chunks... */
    }
#endif

#ifdef USE_SSE2
  {
    int did_use_sse2 = 0;

    /* Process data in 8 block chunks. */
    while (nblocks >= 8)
      {
        _gcry_serpent_sse2_cbc_dec(ctx, outbuf, inbuf, iv);
        nblocks -= 8;
        outbuf += 8 * sizeof(serpent_block_t);
        inbuf += 8 * sizeof(serpent_block_t);
        did_use_sse2 = 1;
      }

    if (did_use_sse2)
      {
        /* serpent-sse2 assembly code does not use stack */
        if (nblocks == 0)
          burn_stack_depth = 0;
      }

    /* Use generic code to handle smaller chunks... */
  }
#endif

#ifdef USE_NEON
  if (ctx->use_neon)
    {
      int did_use_neon = 0;

      /* Process data in 8 block chunks. */
      while (nblocks >= 8)
        {
          _gcry_serpent_neon_cbc_dec(ctx, outbuf, inbuf, iv);
          nblocks -= 8;
          outbuf += 8 * sizeof(serpent_block_t);
          inbuf += 8 * sizeof(serpent_block_t);
          did_use_neon = 1;
        }

      if (did_use_neon)
        {
          /* serpent-neon assembly code does not use stack */
          if (nblocks == 0)
            burn_stack_depth = 0;
        }

      /* Use generic code to handle smaller chunks... */
    }
#endif

  /* Generic one-block-at-a-time fallback.  */
  for ( ;nblocks; nblocks-- )
    {
      /* INBUF is needed later and it may be identical to OUTBUF, so store
         the intermediate result to SAVEBUF. */
      serpent_decrypt_internal (ctx, inbuf, savebuf);

      /* Presumably: OUT = D(IN) ^ IV and IV = IN, done in a way that
         is safe when OUTBUF aliases INBUF — verify the exact contract
         of buf_xor_n_copy_2 in bufhelp.h.  */
      buf_xor_n_copy_2(outbuf, savebuf, iv, inbuf, sizeof(serpent_block_t));
      inbuf += sizeof(serpent_block_t);
      outbuf += sizeof(serpent_block_t);
    }

  /* Clear the decrypted block left on the stack.  */
  wipememory(savebuf, sizeof(savebuf));
  _gcry_burn_stack(burn_stack_depth);
}
/* Bulk decryption of complete blocks in CFB mode.  This function is only
   intended for the bulk encryption feature of cipher.c.  IV holds the
   previous ciphertext block (sizeof(serpent_block_t) bytes) and is
   updated in place.  */
void
_gcry_serpent_cfb_dec(void *context, unsigned char *iv,
                      void *outbuf_arg, const void *inbuf_arg,
                      size_t nblocks)
{
  serpent_context_t *ctx = context;
  unsigned char *outbuf = outbuf_arg;
  const unsigned char *inbuf = inbuf_arg;
  /* Worst case for the generic C path; lowered to zero below when an
     assembly path handled everything.  */
  int burn_stack_depth = 2 * sizeof (serpent_block_t);

#ifdef USE_AVX2
  if (ctx->use_avx2)
    {
      int did_use_avx2 = 0;

      /* Process data in 16 block chunks. */
      while (nblocks >= 16)
        {
          _gcry_serpent_avx2_cfb_dec(ctx, outbuf, inbuf, iv);
          nblocks -= 16;
          outbuf += 16 * sizeof(serpent_block_t);
          inbuf += 16 * sizeof(serpent_block_t);
          did_use_avx2 = 1;
        }

      if (did_use_avx2)
        {
          /* serpent-avx2 assembly code does not use stack */
          if (nblocks == 0)
            burn_stack_depth = 0;
        }

      /* Use generic/sse2 code to handle smaller chunks... */
    }
#endif

#ifdef USE_SSE2
  {
    int did_use_sse2 = 0;

    /* Process data in 8 block chunks. */
    while (nblocks >= 8)
      {
        _gcry_serpent_sse2_cfb_dec(ctx, outbuf, inbuf, iv);
        nblocks -= 8;
        outbuf += 8 * sizeof(serpent_block_t);
        inbuf += 8 * sizeof(serpent_block_t);
        did_use_sse2 = 1;
      }

    if (did_use_sse2)
      {
        /* serpent-sse2 assembly code does not use stack */
        if (nblocks == 0)
          burn_stack_depth = 0;
      }

    /* Use generic code to handle smaller chunks... */
  }
#endif

#ifdef USE_NEON
  if (ctx->use_neon)
    {
      int did_use_neon = 0;

      /* Process data in 8 block chunks. */
      while (nblocks >= 8)
        {
          _gcry_serpent_neon_cfb_dec(ctx, outbuf, inbuf, iv);
          nblocks -= 8;
          outbuf += 8 * sizeof(serpent_block_t);
          inbuf += 8 * sizeof(serpent_block_t);
          did_use_neon = 1;
        }

      if (did_use_neon)
        {
          /* serpent-neon assembly code does not use stack */
          if (nblocks == 0)
            burn_stack_depth = 0;
        }

      /* Use generic code to handle smaller chunks... */
    }
#endif

  /* Generic one-block-at-a-time fallback: encrypt the IV in place to
     get the keystream, then presumably buf_xor_n_copy XORs it with
     the ciphertext into OUTBUF while saving the ciphertext as the
     next IV — verify its contract in bufhelp.h.  */
  for ( ;nblocks; nblocks-- )
    {
      serpent_encrypt_internal(ctx, iv, iv);
      buf_xor_n_copy(outbuf, iv, inbuf, sizeof(serpent_block_t));
      outbuf += sizeof(serpent_block_t);
      inbuf += sizeof(serpent_block_t);
    }

  _gcry_burn_stack(burn_stack_depth);
}
/* Run the self-tests for SERPENT-CTR-128, tests IV increment of bulk CTR
   encryption.  Returns NULL on success. */
static const char*
selftest_ctr_128 (void)
{
  /* 16 + 8 + 1 blocks exercise the AVX2 (16-block), SSE2/NEON
     (8-block) and generic single-block code paths in one run.  */
  const int blocks = 16 + 8 + 1;
  const int blksize = sizeof (serpent_block_t);
  const int ctxsize = sizeof (serpent_context_t);

  return _gcry_selftest_helper_ctr ("SERPENT", &serpent_setkey,
                                    &serpent_encrypt, &_gcry_serpent_ctr_enc,
                                    blocks, blksize, ctxsize);
}
/* Run the self-tests for SERPENT-CBC-128, tests bulk CBC decryption.
   Returns NULL on success. */
static const char*
selftest_cbc_128 (void)
{
  /* 16 + 8 + 2 blocks exercise the AVX2 (16-block), SSE2/NEON
     (8-block) and generic code paths in one run.  */
  const int blocks = 16 + 8 + 2;
  const int blksize = sizeof (serpent_block_t);
  const int ctxsize = sizeof (serpent_context_t);

  return _gcry_selftest_helper_cbc ("SERPENT", &serpent_setkey,
                                    &serpent_encrypt, &_gcry_serpent_cbc_dec,
                                    blocks, blksize, ctxsize);
}
/* Run the self-tests for SERPENT-CFB-128, tests bulk CFB decryption.
   Returns NULL on success.  (The previous comment said CBC — this is
   the CFB test.)  */
static const char*
selftest_cfb_128 (void)
{
  /* 16 + 8 + 2 blocks exercise the AVX2 (16-block), SSE2/NEON
     (8-block) and generic code paths in one run.  */
  const int blocks = 16 + 8 + 2;
  const int blksize = sizeof (serpent_block_t);
  const int ctxsize = sizeof (serpent_context_t);

  return _gcry_selftest_helper_cfb ("SERPENT", &serpent_setkey,
                                    &serpent_encrypt, &_gcry_serpent_cfb_dec,
                                    blocks, blksize, ctxsize);
}
/* Serpent test.  Known-answer tests for the 128/192/256-bit key
   sizes: each vector is encrypted and then decrypted and compared
   against the expected values; afterwards the bulk-mode helpers are
   self-tested.  Returns NULL on success, else a static error string.  */
static const char *
serpent_test (void)
{
  serpent_context_t context;
  unsigned char scratch[16];
  unsigned int i;
  const char *r;

  /* Key-length 0 terminates the table.  */
  static struct test
    {
      int key_length;
      unsigned char key[32];
      unsigned char text_plain[16];
      unsigned char text_cipher[16];
    } test_data[] =
    {
      {
        16,
        "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
        "\xD2\x9D\x57\x6F\xCE\xA3\xA3\xA7\xED\x90\x99\xF2\x92\x73\xD7\x8E",
        "\xB2\x28\x8B\x96\x8A\xE8\xB0\x86\x48\xD1\xCE\x96\x06\xFD\x99\x2D"
      },
      {
        24,
        "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
        "\x00\x00\x00\x00\x00\x00\x00\x00",
        "\xD2\x9D\x57\x6F\xCE\xAB\xA3\xA7\xED\x98\x99\xF2\x92\x7B\xD7\x8E",
        "\x13\x0E\x35\x3E\x10\x37\xC2\x24\x05\xE8\xFA\xEF\xB2\xC3\xC3\xE9"
      },
      {
        32,
        "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
        "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
        "\xD0\x95\x57\x6F\xCE\xA3\xE3\xA7\xED\x98\xD9\xF2\x90\x73\xD7\x8E",
        "\xB9\x0E\xE5\x86\x2D\xE6\x91\x68\xF2\xBD\xD5\x12\x5B\x45\x47\x2B"
      },
      {
        32,
        "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
        "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
        "\x00\x00\x00\x00\x01\x00\x00\x00\x02\x00\x00\x00\x03\x00\x00\x00",
        "\x20\x61\xA4\x27\x82\xBD\x52\xEC\x69\x1E\xC3\x83\xB0\x3B\xA7\x7C"
      },
      {
        0
      },
    };

  for (i = 0; test_data[i].key_length; i++)
    {
      serpent_setkey_internal (&context, test_data[i].key,
                               test_data[i].key_length);
      /* Encrypt and compare against the expected ciphertext.  */
      serpent_encrypt_internal (&context, test_data[i].text_plain, scratch);
      if (memcmp (scratch, test_data[i].text_cipher, sizeof (serpent_block_t)))
        switch (test_data[i].key_length)
          {
          case 16:
            return "Serpent-128 test encryption failed.";
          case  24:
            return "Serpent-192 test encryption failed.";
          case 32:
            return "Serpent-256 test encryption failed.";
          }
      /* Decrypt and compare against the original plaintext.  */
      serpent_decrypt_internal (&context, test_data[i].text_cipher, scratch);
      if (memcmp (scratch, test_data[i].text_plain, sizeof (serpent_block_t)))
        switch (test_data[i].key_length)
          {
          case 16:
            return "Serpent-128 test decryption failed.";
          case  24:
            return "Serpent-192 test decryption failed.";
          case 32:
            return "Serpent-256 test decryption failed.";
          }
    }

  /* Exercise the bulk CTR/CBC/CFB implementations as well.  */
  if ( (r = selftest_ctr_128 ()) )
    return r;

  if ( (r = selftest_cbc_128 ()) )
    return r;

  if ( (r = selftest_cfb_128 ()) )
    return r;

  return NULL;
}
/* "SERPENT" is an alias for "SERPENT128".  NULL-terminated list used
   by the cipher spec below.  */
static const char *cipher_spec_serpent128_aliases[] =
  {
    "SERPENT",
    NULL
  };
/* Cipher specs registering the three Serpent key sizes with cipher.c.
   Fields: algo id, flags, name, aliases, OIDs, block size (bytes),
   key length (bits), context size, and the setkey/encrypt/decrypt
   handlers.  */
gcry_cipher_spec_t _gcry_cipher_spec_serpent128 =
  {
    GCRY_CIPHER_SERPENT128, {0, 0},
    "SERPENT128", cipher_spec_serpent128_aliases, NULL, 16, 128,
    sizeof (serpent_context_t),
    serpent_setkey, serpent_encrypt, serpent_decrypt
  };

gcry_cipher_spec_t _gcry_cipher_spec_serpent192 =
  {
    GCRY_CIPHER_SERPENT192, {0, 0},
    "SERPENT192", NULL, NULL, 16, 192,
    sizeof (serpent_context_t),
    serpent_setkey, serpent_encrypt, serpent_decrypt
  };

gcry_cipher_spec_t _gcry_cipher_spec_serpent256 =
  {
    GCRY_CIPHER_SERPENT256, {0, 0},
    "SERPENT256", NULL, NULL, 16, 256,
    sizeof (serpent_context_t),
    serpent_setkey, serpent_encrypt, serpent_decrypt
  };
diff --git a/cipher/sha1.c b/cipher/sha1.c
index 65bd6866..00c57dd4 100644
--- a/cipher/sha1.c
+++ b/cipher/sha1.c
@@ -1,551 +1,552 @@
/* sha1.c - SHA1 hash function
* Copyright (C) 1998, 2001, 2002, 2003, 2008 Free Software Foundation, Inc.
*
* This file is part of Libgcrypt.
*
* Libgcrypt is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1 of
* the License, or (at your option) any later version.
*
* Libgcrypt is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this program; if not, see .
*/
/* Test vectors:
*
* "abc"
* A999 3E36 4706 816A BA3E 2571 7850 C26C 9CD0 D89D
*
* "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq"
* 8498 3E44 1C3B D26E BAAE 4AA1 F951 29E5 E546 70F1
*/
#include
#include
#include
#include
#ifdef HAVE_STDINT_H
# include
#endif
#include "g10lib.h"
#include "bithelp.h"
#include "bufhelp.h"
#include "cipher.h"
#include "hash-common.h"
/* USE_SSSE3 indicates whether to compile with Intel SSSE3 code. */
#undef USE_SSSE3
#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \
defined(HAVE_GCC_INLINE_ASM_SSSE3)
# define USE_SSSE3 1
#endif
/* USE_AVX indicates whether to compile with Intel AVX code. */
#undef USE_AVX
#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \
defined(HAVE_GCC_INLINE_ASM_AVX)
# define USE_AVX 1
#endif
/* USE_BMI2 indicates whether to compile with Intel AVX/BMI2 code. */
#undef USE_BMI2
#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \
defined(HAVE_GCC_INLINE_ASM_AVX) && defined(HAVE_GCC_INLINE_ASM_BMI2)
# define USE_BMI2 1
#endif
/* USE_NEON indicates whether to enable ARM NEON assembly code. */
#undef USE_NEON
-#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__)
-# if defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \
- defined(HAVE_GCC_INLINE_ASM_NEON)
+#ifdef ENABLE_NEON_SUPPORT
+# if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) \
+ && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) \
+ && defined(HAVE_GCC_INLINE_ASM_NEON)
# define USE_NEON 1
# endif
-#endif
+#endif /*ENABLE_NEON_SUPPORT*/
/* A macro to test whether P is properly aligned for an u32 type.
Note that config.h provides a suitable replacement for uintptr_t if
it does not exist in stdint.h. */
/* #if __GNUC__ >= 2 */
/* # define U32_ALIGNED_P(p) (!(((uintptr_t)p) % __alignof__ (u32))) */
/* #else */
/* # define U32_ALIGNED_P(p) (!(((uintptr_t)p) % sizeof (u32))) */
/* #endif */
/* SHA-1 hash context.  BCTX must remain the first member: the context
   pointer is passed directly to _gcry_md_block_write (see sha1_final)
   which expects a gcry_md_block_ctx_t at offset 0.  */
typedef struct
{
  gcry_md_block_ctx_t bctx;    /* Common block buffering state.  */
  u32          h0,h1,h2,h3,h4; /* Chaining variables H0..H4.  */
#ifdef USE_SSSE3
  unsigned int use_ssse3:1;    /* Dispatch flags set once in sha1_init.  */
#endif
#ifdef USE_AVX
  unsigned int use_avx:1;
#endif
#ifdef USE_BMI2
  unsigned int use_bmi2:1;
#endif
#ifdef USE_NEON
  unsigned int use_neon:1;
#endif
} SHA1_CONTEXT;
static unsigned int
transform (void *c, const unsigned char *data, size_t nblks);
/* Reset CONTEXT for a fresh SHA-1 computation and pick the fastest
   transform implementation the CPU supports.  FLAGS is unused.  */
static void
sha1_init (void *context, unsigned int flags)
{
  SHA1_CONTEXT *hd = context;
  unsigned int features = _gcry_get_hw_features ();

  (void)flags;

  /* Initial chaining values per FIPS 180.  */
  hd->h0 = 0x67452301;
  hd->h1 = 0xefcdab89;
  hd->h2 = 0x98badcfe;
  hd->h3 = 0x10325476;
  hd->h4 = 0xc3d2e1f0;

  /* Reset the generic block-buffering state.  */
  hd->bctx.nblocks = 0;
  hd->bctx.nblocks_high = 0;
  hd->bctx.count = 0;
  hd->bctx.blocksize = 64;
  hd->bctx.bwrite = transform;

#ifdef USE_SSSE3
  hd->use_ssse3 = !!(features & HWF_INTEL_SSSE3);
#endif
#ifdef USE_AVX
  /* AVX implementation uses SHLD which is known to be slow on non-Intel CPUs.
   * Therefore use this implementation on Intel CPUs only. */
  hd->use_avx = !!(features & HWF_INTEL_AVX) && !!(features & HWF_INTEL_CPU);
#endif
#ifdef USE_BMI2
  hd->use_bmi2 = !!(features & HWF_INTEL_AVX) && !!(features & HWF_INTEL_BMI2);
#endif
#ifdef USE_NEON
  hd->use_neon = !!(features & HWF_ARM_NEON);
#endif

  (void)features;
}
/* Round function macros. */

/* Round constants for rounds 0-19, 20-39, 40-59, 60-79
   (FIPS 180-4, section 4.2.1).  */
#define K1  0x5A827999L
#define K2  0x6ED9EBA1L
#define K3  0x8F1BBCDCL
#define K4  0xCA62C1D6L
/* Round logical functions: Ch, Parity, Maj, Parity (FIPS 180-4,
   section 4.1.1), in forms that need fewer operations than the
   textbook expressions.  */
#define F1(x,y,z)   ( z ^ ( x & ( y ^ z ) ) )
#define F2(x,y,z)   ( x ^ y ^ z )
#define F3(x,y,z)   ( ( x & y ) | ( z & ( x | y ) ) )
#define F4(x,y,z)   ( x ^ y ^ z )
/* Message schedule kept in a 16-word circular buffer:
   W[i] = rol(W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1), overwriting the
   slot of W[i-16].  Evaluates `tm' and `x' from the enclosing scope.  */
#define M(i) ( tm = x[ i &0x0f] \
             ^ x[(i-14)&0x0f] \
             ^ x[(i-8) &0x0f] \
             ^ x[(i-3) &0x0f], \
             (x[i&0x0f] = rol(tm, 1)))
/* One SHA-1 round: E += rol(A,5) + f(B,C,D) + K + W; B = rol(B,30).
   Callers rotate the variable roles between invocations instead of
   shuffling data.  */
#define R(a,b,c,d,e,f,k,m)  do { e += rol( a, 5 )     \
                                      + f( b, c, d )  \
                                      + k             \
                                      + m;            \
                                 b = rol( b, 30 );    \
                               } while(0)

#ifdef USE_NEON
/* External ARM/NEON assembly implementation; returns the stack burn
   depth.  */
unsigned int
_gcry_sha1_transform_armv7_neon (void *state, const unsigned char *data,
                                 size_t nblks);
#endif
/*
 * Transform one 64-byte block (16 big-endian 32-bit words) at DATA.
 * Returns the stack burn depth.
 */
static unsigned int
transform_blk (void *ctx, const unsigned char *data)
{
  SHA1_CONTEXT *hd = ctx;
  const u32 *idata = (const void *)data;
  register u32 a, b, c, d, e; /* Local copies of the chaining variables.  */
  register u32 tm;            /* Helper (used by the M macro).  */
  u32 x[16];                  /* The array we work on.  */

  /* Load input word I big-endian into the schedule buffer.  */
#define I(i) (x[i] = buf_get_be32(idata + i))

  /* Get the values of the chaining variables. */
  a = hd->h0;
  b = hd->h1;
  c = hd->h2;
  d = hd->h3;
  e = hd->h4;

  /* Transform.  Rounds 0-15 consume the message words directly (I);
     rounds 16-79 use the expanded schedule (M).  The (f, k) pair
     changes every 20 rounds: (F1,K1), (F2,K2), (F3,K3), (F4,K4).  */
  R( a, b, c, d, e, F1, K1, I( 0) );
  R( e, a, b, c, d, F1, K1, I( 1) );
  R( d, e, a, b, c, F1, K1, I( 2) );
  R( c, d, e, a, b, F1, K1, I( 3) );
  R( b, c, d, e, a, F1, K1, I( 4) );
  R( a, b, c, d, e, F1, K1, I( 5) );
  R( e, a, b, c, d, F1, K1, I( 6) );
  R( d, e, a, b, c, F1, K1, I( 7) );
  R( c, d, e, a, b, F1, K1, I( 8) );
  R( b, c, d, e, a, F1, K1, I( 9) );
  R( a, b, c, d, e, F1, K1, I(10) );
  R( e, a, b, c, d, F1, K1, I(11) );
  R( d, e, a, b, c, F1, K1, I(12) );
  R( c, d, e, a, b, F1, K1, I(13) );
  R( b, c, d, e, a, F1, K1, I(14) );
  R( a, b, c, d, e, F1, K1, I(15) );
  R( e, a, b, c, d, F1, K1, M(16) );
  R( d, e, a, b, c, F1, K1, M(17) );
  R( c, d, e, a, b, F1, K1, M(18) );
  R( b, c, d, e, a, F1, K1, M(19) );
  R( a, b, c, d, e, F2, K2, M(20) );
  R( e, a, b, c, d, F2, K2, M(21) );
  R( d, e, a, b, c, F2, K2, M(22) );
  R( c, d, e, a, b, F2, K2, M(23) );
  R( b, c, d, e, a, F2, K2, M(24) );
  R( a, b, c, d, e, F2, K2, M(25) );
  R( e, a, b, c, d, F2, K2, M(26) );
  R( d, e, a, b, c, F2, K2, M(27) );
  R( c, d, e, a, b, F2, K2, M(28) );
  R( b, c, d, e, a, F2, K2, M(29) );
  R( a, b, c, d, e, F2, K2, M(30) );
  R( e, a, b, c, d, F2, K2, M(31) );
  R( d, e, a, b, c, F2, K2, M(32) );
  R( c, d, e, a, b, F2, K2, M(33) );
  R( b, c, d, e, a, F2, K2, M(34) );
  R( a, b, c, d, e, F2, K2, M(35) );
  R( e, a, b, c, d, F2, K2, M(36) );
  R( d, e, a, b, c, F2, K2, M(37) );
  R( c, d, e, a, b, F2, K2, M(38) );
  R( b, c, d, e, a, F2, K2, M(39) );
  R( a, b, c, d, e, F3, K3, M(40) );
  R( e, a, b, c, d, F3, K3, M(41) );
  R( d, e, a, b, c, F3, K3, M(42) );
  R( c, d, e, a, b, F3, K3, M(43) );
  R( b, c, d, e, a, F3, K3, M(44) );
  R( a, b, c, d, e, F3, K3, M(45) );
  R( e, a, b, c, d, F3, K3, M(46) );
  R( d, e, a, b, c, F3, K3, M(47) );
  R( c, d, e, a, b, F3, K3, M(48) );
  R( b, c, d, e, a, F3, K3, M(49) );
  R( a, b, c, d, e, F3, K3, M(50) );
  R( e, a, b, c, d, F3, K3, M(51) );
  R( d, e, a, b, c, F3, K3, M(52) );
  R( c, d, e, a, b, F3, K3, M(53) );
  R( b, c, d, e, a, F3, K3, M(54) );
  R( a, b, c, d, e, F3, K3, M(55) );
  R( e, a, b, c, d, F3, K3, M(56) );
  R( d, e, a, b, c, F3, K3, M(57) );
  R( c, d, e, a, b, F3, K3, M(58) );
  R( b, c, d, e, a, F3, K3, M(59) );
  R( a, b, c, d, e, F4, K4, M(60) );
  R( e, a, b, c, d, F4, K4, M(61) );
  R( d, e, a, b, c, F4, K4, M(62) );
  R( c, d, e, a, b, F4, K4, M(63) );
  R( b, c, d, e, a, F4, K4, M(64) );
  R( a, b, c, d, e, F4, K4, M(65) );
  R( e, a, b, c, d, F4, K4, M(66) );
  R( d, e, a, b, c, F4, K4, M(67) );
  R( c, d, e, a, b, F4, K4, M(68) );
  R( b, c, d, e, a, F4, K4, M(69) );
  R( a, b, c, d, e, F4, K4, M(70) );
  R( e, a, b, c, d, F4, K4, M(71) );
  R( d, e, a, b, c, F4, K4, M(72) );
  R( c, d, e, a, b, F4, K4, M(73) );
  R( b, c, d, e, a, F4, K4, M(74) );
  R( a, b, c, d, e, F4, K4, M(75) );
  R( e, a, b, c, d, F4, K4, M(76) );
  R( d, e, a, b, c, F4, K4, M(77) );
  R( c, d, e, a, b, F4, K4, M(78) );
  R( b, c, d, e, a, F4, K4, M(79) );

  /* Update the chaining variables. */
  hd->h0 += a;
  hd->h1 += b;
  hd->h2 += c;
  hd->h3 += d;
  hd->h4 += e;

  return /* burn_stack */ 88+4*sizeof(void*);
}
#ifdef USE_SSSE3
unsigned int
_gcry_sha1_transform_amd64_ssse3 (void *state, const unsigned char *data,
size_t nblks);
#endif
#ifdef USE_AVX
unsigned int
_gcry_sha1_transform_amd64_avx (void *state, const unsigned char *data,
size_t nblks);
#endif
#ifdef USE_BMI2
unsigned int
_gcry_sha1_transform_amd64_avx_bmi2 (void *state, const unsigned char *data,
size_t nblks);
#endif
/* Hash NBLKS 64-byte blocks at DATA, dispatching to the fastest
   implementation selected in sha1_init.  Returns the stack burn
   depth.  NOTE: the generic fallback is a do/while loop, so callers
   must pass NBLKS >= 1.  */
static unsigned int
transform (void *ctx, const unsigned char *data, size_t nblks)
{
  SHA1_CONTEXT *hd = ctx;
  unsigned int burn;

  /* Fastest implementations first; each handles all NBLKS at once.  */
#ifdef USE_BMI2
  if (hd->use_bmi2)
    return _gcry_sha1_transform_amd64_avx_bmi2 (&hd->h0, data, nblks)
           + 4 * sizeof(void*);
#endif
#ifdef USE_AVX
  if (hd->use_avx)
    return _gcry_sha1_transform_amd64_avx (&hd->h0, data, nblks)
           + 4 * sizeof(void*);
#endif
#ifdef USE_SSSE3
  if (hd->use_ssse3)
    return _gcry_sha1_transform_amd64_ssse3 (&hd->h0, data, nblks)
           + 4 * sizeof(void*);
#endif
#ifdef USE_NEON
  if (hd->use_neon)
    return _gcry_sha1_transform_armv7_neon (&hd->h0, data, nblks)
           + 4 * sizeof(void*);
#endif

  /* Generic C fallback, one block at a time.  */
  do
    {
      burn = transform_blk (hd, data);
      data += 64;
    }
  while (--nblks);

  return burn;
}
/* The routine final terminates the computation and
* returns the digest.
* The handle is prepared for a new cycle, but adding bytes to the
* handle will the destroy the returned buffer.
* Returns: 20 bytes representing the digest.
*/
static void
sha1_final(void *context)
{
SHA1_CONTEXT *hd = context;
u32 t, th, msb, lsb;
unsigned char *p;
unsigned int burn;
_gcry_md_block_write (hd, NULL, 0); /* flush */;
t = hd->bctx.nblocks;
if (sizeof t == sizeof hd->bctx.nblocks)
th = hd->bctx.nblocks_high;
else
th = hd->bctx.nblocks >> 32;
/* multiply by 64 to make a byte count */
lsb = t << 6;
msb = (th << 6) | (t >> 26);
/* add the count */
t = lsb;
if( (lsb += hd->bctx.count) < t )
msb++;
/* multiply by 8 to make a bit count */
t = lsb;
lsb <<= 3;
msb <<= 3;
msb |= t >> 29;
if( hd->bctx.count < 56 ) /* enough room */
{
hd->bctx.buf[hd->bctx.count++] = 0x80; /* pad */
while( hd->bctx.count < 56 )
hd->bctx.buf[hd->bctx.count++] = 0; /* pad */
}
else /* need one extra block */
{
hd->bctx.buf[hd->bctx.count++] = 0x80; /* pad character */
while( hd->bctx.count < 64 )
hd->bctx.buf[hd->bctx.count++] = 0;
_gcry_md_block_write(hd, NULL, 0); /* flush */;
memset(hd->bctx.buf, 0, 56 ); /* fill next block with zeroes */
}
/* append the 64 bit count */
buf_put_be32(hd->bctx.buf + 56, msb);
buf_put_be32(hd->bctx.buf + 60, lsb);
burn = transform( hd, hd->bctx.buf, 1 );
_gcry_burn_stack (burn);
p = hd->bctx.buf;
#define X(a) do { *(u32*)p = be_bswap32(hd->h##a) ; p += 4; } while(0)
X(0);
X(1);
X(2);
X(3);
X(4);
#undef X
}
static unsigned char *
sha1_read( void *context )
{
SHA1_CONTEXT *hd = context;
return hd->bctx.buf;
}
/****************
 * Shortcut function computing the SHA-1 digest of BUFFER (LENGTH
 * bytes) in one call; OUTBUF must provide room for 20 bytes.
 */
void
_gcry_sha1_hash_buffer (void *outbuf, const void *buffer, size_t length)
{
  SHA1_CONTEXT ctx;

  sha1_init (&ctx, 0);
  _gcry_md_block_write (&ctx, buffer, length);
  sha1_final (&ctx);
  memcpy (outbuf, ctx.bctx.buf, 20);
}
/* Variant of the above shortcut function using a multiple buffers:
   hash the IOVCNT segments described by IOV (data + offset + length
   each) into the 20-byte OUTBUF.  */
void
_gcry_sha1_hash_buffers (void *outbuf, const gcry_buffer_t *iov, int iovcnt)
{
  SHA1_CONTEXT ctx;
  int i;

  sha1_init (&ctx, 0);
  for (i = 0; i < iovcnt; i++)
    _gcry_md_block_write (&ctx,
                          (const char *) iov[i].data + iov[i].off,
                          iov[i].len);
  sha1_final (&ctx);
  memcpy (outbuf, ctx.bctx.buf, 20);
}
/*
Self-test section.
*/
/* Run the SHA-1 known-answer self-tests.  The basic test always runs;
   the longer vectors only when EXTENDED is set.  On failure REPORT
   (if non-NULL) is called with a description and the error text.  */
static gpg_err_code_t
selftests_sha1 (int extended, selftest_report_func_t report)
{
  static const struct
  {
    const char *what;    /* Description passed to REPORT on failure.  */
    int ext_only;        /* Run only in extended mode.  */
    int datamode;        /* Second argument of the check function.  */
    const char *data;
    int datalen;
    const char *expect;  /* Expected 20-byte digest.  */
  } tv[] =
  {
    { "short string", 0, 0, "abc", 3,
      "\xA9\x99\x3E\x36\x47\x06\x81\x6A\xBA\x3E"
      "\x25\x71\x78\x50\xC2\x6C\x9C\xD0\xD8\x9D" },
    { "long string", 1, 0,
      "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq", 56,
      "\x84\x98\x3E\x44\x1C\x3B\xD2\x6E\xBA\xAE"
      "\x4A\xA1\xF9\x51\x29\xE5\xE5\x46\x70\xF1" },
    { "one million \"a\"", 1, 1, NULL, 0,
      "\x34\xAA\x97\x3C\xD4\xC4\xDA\xA4\xF6\x1E"
      "\xEB\x2B\xDB\xAD\x27\x31\x65\x34\x01\x6F" }
  };
  const char *errtxt;
  size_t i;

  for (i = 0; i < sizeof tv / sizeof tv[0]; i++)
    {
      if (tv[i].ext_only && !extended)
        continue;
      errtxt = _gcry_hash_selftest_check_one (GCRY_MD_SHA1, tv[i].datamode,
                                              tv[i].data, tv[i].datalen,
                                              tv[i].expect, 20);
      if (errtxt)
        {
          if (report)
            report ("digest", GCRY_MD_SHA1, tv[i].what, errtxt);
          return GPG_ERR_SELFTEST_FAILED;
        }
    }

  return 0; /* Succeeded. */
}
/* Run a full self-test for ALGO and return 0 on success.  Only SHA-1
   is implemented here; anything else yields GPG_ERR_DIGEST_ALGO.  */
static gpg_err_code_t
run_selftests (int algo, int extended, selftest_report_func_t report)
{
  if (algo == GCRY_MD_SHA1)
    return selftests_sha1 (extended, report);

  return GPG_ERR_DIGEST_ALGO;
}
/* NOTE(review): presumably the DER-encoded DigestInfo prefix used for
   PKCS#1 signatures — confirm against RFC 8017 appendix.  Object ID
   is 1.3.14.3.2.26.  */
static unsigned char asn[15] = /* Object ID is 1.3.14.3.2.26 */
  { 0x30, 0x21, 0x30, 0x09, 0x06, 0x05, 0x2b, 0x0e, 0x03,
    0x02, 0x1a, 0x05, 0x00, 0x04, 0x14 };

/* OIDs under which SHA-1 is known; NULL-terminated.  */
static gcry_md_oid_spec_t oid_spec_sha1[] =
  {
    /* iso.member-body.us.rsadsi.pkcs.pkcs-1.5 (sha1WithRSAEncryption) */
    { "1.2.840.113549.1.1.5" },
    /* iso.member-body.us.x9-57.x9cm.3 (dsaWithSha1)*/
    { "1.2.840.10040.4.3" },
    /* from NIST's OIW  (sha1) */
    { "1.3.14.3.2.26" },
    /* from NIST OIW (sha-1WithRSAEncryption) */
    { "1.3.14.3.2.29" },
    /* iso.member-body.us.ansi-x9-62.signatures.ecdsa-with-sha1 */
    { "1.2.840.10045.4.1" },
    { NULL },
  };
/* Digest spec registering SHA-1 with md.c: algo id, flags, name, ASN
   prefix, OIDs, digest length (20 bytes), handlers and context
   size.  */
gcry_md_spec_t _gcry_digest_spec_sha1 =
  {
    GCRY_MD_SHA1, {0, 1},
    "SHA1", asn, DIM (asn), oid_spec_sha1, 20,
    sha1_init, _gcry_md_block_write, sha1_final, sha1_read,
    sizeof (SHA1_CONTEXT),
    run_selftests
  };
diff --git a/cipher/sha512.c b/cipher/sha512.c
index 92b49131..7d60df0f 100644
--- a/cipher/sha512.c
+++ b/cipher/sha512.c
@@ -1,878 +1,879 @@
/* sha512.c - SHA384 and SHA512 hash functions
* Copyright (C) 2003, 2008, 2009 Free Software Foundation, Inc.
*
* This file is part of Libgcrypt.
*
* Libgcrypt is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser general Public License as
* published by the Free Software Foundation; either version 2.1 of
* the License, or (at your option) any later version.
*
* Libgcrypt is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this program; if not, see .
*/
/* Test vectors from FIPS-180-2:
*
* "abc"
* 384:
* CB00753F 45A35E8B B5A03D69 9AC65007 272C32AB 0EDED163
* 1A8B605A 43FF5BED 8086072B A1E7CC23 58BAECA1 34C825A7
* 512:
* DDAF35A1 93617ABA CC417349 AE204131 12E6FA4E 89A97EA2 0A9EEEE6 4B55D39A
* 2192992A 274FC1A8 36BA3C23 A3FEEBBD 454D4423 643CE80E 2A9AC94F A54CA49F
*
* "abcdefghbcdefghicdefghijdefghijkefghijklfghijklmghijklmnhijklmnoijklmnopjklmnopqklmnopqrlmnopqrsmnopqrstnopqrstu"
* 384:
* 09330C33 F71147E8 3D192FC7 82CD1B47 53111B17 3B3B05D2
* 2FA08086 E3B0F712 FCC7C71A 557E2DB9 66C3E9FA 91746039
* 512:
* 8E959B75 DAE313DA 8CF4F728 14FC143F 8F7779C6 EB9F7FA1 7299AEAD B6889018
* 501D289E 4900F7E4 331B99DE C4B5433A C7D329EE B6DD2654 5E96E55B 874BE909
*
* "a" x 1000000
* 384:
* 9D0E1809 716474CB 086E834E 310A4A1C ED149E9C 00F24852
* 7972CEC5 704C2A5B 07B8B3DC 38ECC4EB AE97DDD8 7F3D8985
* 512:
* E718483D 0CE76964 4E2E42C7 BC15B463 8E1F98B1 3B204428 5632A803 AFA973EB
* DE0FF244 877EA60A 4CB0432C E577C31B EB009C5C 2C49AA2E 4EADB217 AD8CC09B
*/
#include
#include
#include "g10lib.h"
#include "bithelp.h"
#include "bufhelp.h"
#include "cipher.h"
#include "hash-common.h"
/* USE_ARM_NEON_ASM indicates whether to enable ARM NEON assembly code. */
#undef USE_ARM_NEON_ASM
-#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__)
-# if defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \
- defined(HAVE_GCC_INLINE_ASM_NEON)
+#ifdef ENABLE_NEON_SUPPORT
+# if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) \
+ && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) \
+ && defined(HAVE_GCC_INLINE_ASM_NEON)
# define USE_ARM_NEON_ASM 1
# endif
-#endif
+#endif /*ENABLE_NEON_SUPPORT*/
/* USE_SSSE3 indicates whether to compile with Intel SSSE3 code. */
#undef USE_SSSE3
#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \
defined(HAVE_GCC_INLINE_ASM_SSSE3) && \
defined(HAVE_INTEL_SYNTAX_PLATFORM_AS)
# define USE_SSSE3 1
#endif
/* USE_AVX indicates whether to compile with Intel AVX code. */
#undef USE_AVX
#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \
defined(HAVE_GCC_INLINE_ASM_AVX) && \
defined(HAVE_INTEL_SYNTAX_PLATFORM_AS)
# define USE_AVX 1
#endif
/* USE_AVX2 indicates whether to compile with Intel AVX2/rorx code. */
#undef USE_AVX2
#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \
defined(HAVE_GCC_INLINE_ASM_AVX2) && defined(HAVE_GCC_INLINE_ASM_BMI2) && \
defined(HAVE_INTEL_SYNTAX_PLATFORM_AS)
# define USE_AVX2 1
#endif
/* The eight 64-bit chaining variables, kept separate so their address
   can be handed to the assembly implementations as one unit.  */
typedef struct
{
  u64 h0, h1, h2, h3, h4, h5, h6, h7;
} SHA512_STATE;

/* SHA-512/SHA-384 hash context.  BCTX holds the common block
   buffering state; the use_* flags are set once at init time.  */
typedef struct
{
  gcry_md_block_ctx_t bctx;
  SHA512_STATE state;
#ifdef USE_ARM_NEON_ASM
  unsigned int use_neon:1;
#endif
#ifdef USE_SSSE3
  unsigned int use_ssse3:1;
#endif
#ifdef USE_AVX
  unsigned int use_avx:1;
#endif
#ifdef USE_AVX2
  unsigned int use_avx2:1;
#endif
} SHA512_CONTEXT;
/* Forward declaration: process NBLKS 128-byte blocks from DATA and
   return the required stack burn depth in bytes.  Defined below, after
   the scalar compression function and the assembly prototypes.  */
static unsigned int
transform (void *context, const unsigned char *data, size_t nblks);
/* Initialize CONTEXT for a fresh SHA-512 computation.  FLAGS is
   currently unused.  */
static void
sha512_init (void *context, unsigned int flags)
{
  SHA512_CONTEXT *ctx = context;
  SHA512_STATE *st = &ctx->state;
  unsigned int hwf = _gcry_get_hw_features ();

  (void)flags;

  /* Reset the generic block machinery: 128-byte blocks, our transform
     as the block-write callback.  */
  ctx->bctx.nblocks = 0;
  ctx->bctx.nblocks_high = 0;
  ctx->bctx.count = 0;
  ctx->bctx.blocksize = 128;
  ctx->bctx.bwrite = transform;

  /* SHA-512 initial chaining values.  */
  st->h0 = U64_C(0x6a09e667f3bcc908);
  st->h1 = U64_C(0xbb67ae8584caa73b);
  st->h2 = U64_C(0x3c6ef372fe94f82b);
  st->h3 = U64_C(0xa54ff53a5f1d36f1);
  st->h4 = U64_C(0x510e527fade682d1);
  st->h5 = U64_C(0x9b05688c2b3e6c1f);
  st->h6 = U64_C(0x1f83d9abfb41bd6b);
  st->h7 = U64_C(0x5be0cd19137e2179);

  /* Pick an accelerated transform when the hardware supports it.  */
#ifdef USE_ARM_NEON_ASM
  ctx->use_neon = (hwf & HWF_ARM_NEON) != 0;
#endif
#ifdef USE_SSSE3
  ctx->use_ssse3 = (hwf & HWF_INTEL_SSSE3) != 0;
#endif
#ifdef USE_AVX
  /* NOTE(review): AVX is additionally gated on HWF_INTEL_CPU --
     presumably the AVX code path favors Intel CPUs; confirm before
     relaxing this condition.  */
  ctx->use_avx = (hwf & HWF_INTEL_AVX) && (hwf & HWF_INTEL_CPU);
#endif
#ifdef USE_AVX2
  ctx->use_avx2 = (hwf & HWF_INTEL_AVX2) && (hwf & HWF_INTEL_BMI2);
#endif

  (void)hwf;
}
/* Initialize CONTEXT for a fresh SHA-384 computation.  Identical to
   sha512_init except for the initial chaining values; FLAGS is
   currently unused.  */
static void
sha384_init (void *context, unsigned int flags)
{
  SHA512_CONTEXT *ctx = context;
  SHA512_STATE *st = &ctx->state;
  unsigned int hwf = _gcry_get_hw_features ();

  (void)flags;

  /* Reset the generic block machinery: 128-byte blocks, our transform
     as the block-write callback.  */
  ctx->bctx.nblocks = 0;
  ctx->bctx.nblocks_high = 0;
  ctx->bctx.count = 0;
  ctx->bctx.blocksize = 128;
  ctx->bctx.bwrite = transform;

  /* SHA-384 initial chaining values.  */
  st->h0 = U64_C(0xcbbb9d5dc1059ed8);
  st->h1 = U64_C(0x629a292a367cd507);
  st->h2 = U64_C(0x9159015a3070dd17);
  st->h3 = U64_C(0x152fecd8f70e5939);
  st->h4 = U64_C(0x67332667ffc00b31);
  st->h5 = U64_C(0x8eb44a8768581511);
  st->h6 = U64_C(0xdb0c2e0d64f98fa7);
  st->h7 = U64_C(0x47b5481dbefa4fa4);

  /* Pick an accelerated transform when the hardware supports it.  */
#ifdef USE_ARM_NEON_ASM
  ctx->use_neon = (hwf & HWF_ARM_NEON) != 0;
#endif
#ifdef USE_SSSE3
  ctx->use_ssse3 = (hwf & HWF_INTEL_SSSE3) != 0;
#endif
#ifdef USE_AVX
  /* NOTE(review): AVX is additionally gated on HWF_INTEL_CPU --
     presumably the AVX code path favors Intel CPUs; confirm before
     relaxing this condition.  */
  ctx->use_avx = (hwf & HWF_INTEL_AVX) && (hwf & HWF_INTEL_CPU);
#endif
#ifdef USE_AVX2
  ctx->use_avx2 = (hwf & HWF_INTEL_AVX2) && (hwf & HWF_INTEL_BMI2);
#endif

  (void)hwf;
}
/* Rotate the 64-bit value X right by N bits.  The shift counts are
   masked to 0..63 so that N == 0 (or N == 64) no longer evaluates
   `x << 64`, which is undefined behavior (C11 6.5.7).  All current
   callers pass 1 <= N <= 63, for which the result is unchanged.  */
static inline u64
ROTR (u64 x, u64 n)
{
  return ((x >> (n & 63)) | (x << ((64 - n) & 63)));
}
/* SHA-2 "choose" function: for each bit position, select the bit of Y
   where X is 1 and the bit of Z where X is 0.  Written in the
   one-XOR-fewer form equivalent to (x & y) ^ (~x & z).  */
static inline u64
Ch (u64 x, u64 y, u64 z)
{
  return (z ^ (x & (y ^ z)));
}
static inline u64
Maj (u64 x, u64 y, u64 z)
{
return ((x & y) ^ (x & z) ^ (y & z));
}
/* SHA-512 big-sigma-0 function: XOR of three fixed rotations of X.  */
static inline u64
Sum0 (u64 x)
{
  u64 r28 = ROTR (x, 28);
  u64 r34 = ROTR (x, 34);
  u64 r39 = ROTR (x, 39);

  return r28 ^ r34 ^ r39;
}
/* SHA-512 big-sigma-1 function: XOR of three fixed rotations of X.  */
static inline u64
Sum1 (u64 x)
{
  u64 r14 = ROTR (x, 14);
  u64 r18 = ROTR (x, 18);
  u64 r41 = ROTR (x, 41);

  return r14 ^ r18 ^ r41;
}
/* The 80 round constants K[t] for SHA-384/SHA-512.  Indexed k[t] by
   the compression function below and also passed as a table to the
   NEON assembly transform.  */
static const u64 k[] =
  {
    U64_C(0x428a2f98d728ae22), U64_C(0x7137449123ef65cd),
    U64_C(0xb5c0fbcfec4d3b2f), U64_C(0xe9b5dba58189dbbc),
    U64_C(0x3956c25bf348b538), U64_C(0x59f111f1b605d019),
    U64_C(0x923f82a4af194f9b), U64_C(0xab1c5ed5da6d8118),
    U64_C(0xd807aa98a3030242), U64_C(0x12835b0145706fbe),
    U64_C(0x243185be4ee4b28c), U64_C(0x550c7dc3d5ffb4e2),
    U64_C(0x72be5d74f27b896f), U64_C(0x80deb1fe3b1696b1),
    U64_C(0x9bdc06a725c71235), U64_C(0xc19bf174cf692694),
    U64_C(0xe49b69c19ef14ad2), U64_C(0xefbe4786384f25e3),
    U64_C(0x0fc19dc68b8cd5b5), U64_C(0x240ca1cc77ac9c65),
    U64_C(0x2de92c6f592b0275), U64_C(0x4a7484aa6ea6e483),
    U64_C(0x5cb0a9dcbd41fbd4), U64_C(0x76f988da831153b5),
    U64_C(0x983e5152ee66dfab), U64_C(0xa831c66d2db43210),
    U64_C(0xb00327c898fb213f), U64_C(0xbf597fc7beef0ee4),
    U64_C(0xc6e00bf33da88fc2), U64_C(0xd5a79147930aa725),
    U64_C(0x06ca6351e003826f), U64_C(0x142929670a0e6e70),
    U64_C(0x27b70a8546d22ffc), U64_C(0x2e1b21385c26c926),
    U64_C(0x4d2c6dfc5ac42aed), U64_C(0x53380d139d95b3df),
    U64_C(0x650a73548baf63de), U64_C(0x766a0abb3c77b2a8),
    U64_C(0x81c2c92e47edaee6), U64_C(0x92722c851482353b),
    U64_C(0xa2bfe8a14cf10364), U64_C(0xa81a664bbc423001),
    U64_C(0xc24b8b70d0f89791), U64_C(0xc76c51a30654be30),
    U64_C(0xd192e819d6ef5218), U64_C(0xd69906245565a910),
    U64_C(0xf40e35855771202a), U64_C(0x106aa07032bbd1b8),
    U64_C(0x19a4c116b8d2d0c8), U64_C(0x1e376c085141ab53),
    U64_C(0x2748774cdf8eeb99), U64_C(0x34b0bcb5e19b48a8),
    U64_C(0x391c0cb3c5c95a63), U64_C(0x4ed8aa4ae3418acb),
    U64_C(0x5b9cca4f7763e373), U64_C(0x682e6ff3d6b2b8a3),
    U64_C(0x748f82ee5defb2fc), U64_C(0x78a5636f43172f60),
    U64_C(0x84c87814a1f0ab72), U64_C(0x8cc702081a6439ec),
    U64_C(0x90befffa23631e28), U64_C(0xa4506cebde82bde9),
    U64_C(0xbef9a3f7b2c67915), U64_C(0xc67178f2e372532b),
    U64_C(0xca273eceea26619c), U64_C(0xd186b8c721c0c207),
    U64_C(0xeada7dd6cde0eb1e), U64_C(0xf57d4f7fee6ed178),
    U64_C(0x06f067aa72176fba), U64_C(0x0a637dc5a2c898a6),
    U64_C(0x113f9804bef90dae), U64_C(0x1b710b35131c471b),
    U64_C(0x28db77f523047d84), U64_C(0x32caab7b40c72493),
    U64_C(0x3c9ebe0a15c9bebc), U64_C(0x431d67c49c100d4c),
    U64_C(0x4cc5d4becb3e42b6), U64_C(0x597f299cfc657e2a),
    U64_C(0x5fcb6fab3ad6faec), U64_C(0x6c44198c4a475817)
  };
/****************
 * Transform the message W which consists of 16 64-bit-words.
 * Compresses one 128-byte block at DATA into the chaining variables
 * in HD.  Returns the number of stack bytes to burn afterwards.
 */
static unsigned int
transform_blk (SHA512_STATE *hd, const unsigned char *data)
{
  u64 a, b, c, d, e, f, g, h;
  u64 w[16];  /* Message schedule, maintained as a 16-word ring.  */
  int t;

  /* get values from the chaining vars */
  a = hd->h0;
  b = hd->h1;
  c = hd->h2;
  d = hd->h3;
  e = hd->h4;
  f = hd->h5;
  g = hd->h6;
  h = hd->h7;

  /* Load the block as 16 big-endian 64-bit words.  */
  for ( t = 0; t < 16; t++ )
    w[t] = buf_get_be64(data + t * 8);

  /* Small-sigma message schedule functions.  */
#define S0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
#define S1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))

  /* Rounds 0..63: each round also extends the message schedule
     in place (w[t mod 16]).  */
  for (t = 0; t < 80 - 16; )
    {
      u64 t1, t2;

      /* Performance on a AMD Athlon(tm) Dual Core Processor 4050e
         with gcc 4.3.3 using gcry_md_hash_buffer of each 10000 bytes
         initialized to 0,1,2,3...255,0,...  and 1000 iterations:

         Not unrolled with macros:  440ms
         Unrolled with macros:      350ms
         Unrolled with inline:      330ms
      */
#if 0 /* Not unrolled.  */
      t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[t%16];
      w[t%16] += S1 (w[(t - 2)%16]) + w[(t - 7)%16] + S0 (w[(t - 15)%16]);
      t2 = Sum0 (a) + Maj (a, b, c);
      h = g;
      g = f;
      f = e;
      e = d + t1;
      d = c;
      c = b;
      b = a;
      a = t1 + t2;
      t++;
#else /* Unrolled to interweave the chain variables.  The eight chain
         variables are rotated by renaming instead of shifting, so each
         of the 16 inlined rounds uses a different permutation of
         a..h.  */
      t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[0];
      w[0] += S1 (w[14]) + w[9] + S0 (w[1]);
      t2 = Sum0 (a) + Maj (a, b, c);
      d += t1;
      h = t1 + t2;

      t1 = g + Sum1 (d) + Ch (d, e, f) + k[t+1] + w[1];
      w[1] += S1 (w[15]) + w[10] + S0 (w[2]);
      t2 = Sum0 (h) + Maj (h, a, b);
      c += t1;
      g = t1 + t2;

      t1 = f + Sum1 (c) + Ch (c, d, e) + k[t+2] + w[2];
      w[2] += S1 (w[0]) + w[11] + S0 (w[3]);
      t2 = Sum0 (g) + Maj (g, h, a);
      b += t1;
      f = t1 + t2;

      t1 = e + Sum1 (b) + Ch (b, c, d) + k[t+3] + w[3];
      w[3] += S1 (w[1]) + w[12] + S0 (w[4]);
      t2 = Sum0 (f) + Maj (f, g, h);
      a += t1;
      e = t1 + t2;

      t1 = d + Sum1 (a) + Ch (a, b, c) + k[t+4] + w[4];
      w[4] += S1 (w[2]) + w[13] + S0 (w[5]);
      t2 = Sum0 (e) + Maj (e, f, g);
      h += t1;
      d = t1 + t2;

      t1 = c + Sum1 (h) + Ch (h, a, b) + k[t+5] + w[5];
      w[5] += S1 (w[3]) + w[14] + S0 (w[6]);
      t2 = Sum0 (d) + Maj (d, e, f);
      g += t1;
      c = t1 + t2;

      t1 = b + Sum1 (g) + Ch (g, h, a) + k[t+6] + w[6];
      w[6] += S1 (w[4]) + w[15] + S0 (w[7]);
      t2 = Sum0 (c) + Maj (c, d, e);
      f += t1;
      b = t1 + t2;

      t1 = a + Sum1 (f) + Ch (f, g, h) + k[t+7] + w[7];
      w[7] += S1 (w[5]) + w[0] + S0 (w[8]);
      t2 = Sum0 (b) + Maj (b, c, d);
      e += t1;
      a = t1 + t2;

      t1 = h + Sum1 (e) + Ch (e, f, g) + k[t+8] + w[8];
      w[8] += S1 (w[6]) + w[1] + S0 (w[9]);
      t2 = Sum0 (a) + Maj (a, b, c);
      d += t1;
      h = t1 + t2;

      t1 = g + Sum1 (d) + Ch (d, e, f) + k[t+9] + w[9];
      w[9] += S1 (w[7]) + w[2] + S0 (w[10]);
      t2 = Sum0 (h) + Maj (h, a, b);
      c += t1;
      g = t1 + t2;

      t1 = f + Sum1 (c) + Ch (c, d, e) + k[t+10] + w[10];
      w[10] += S1 (w[8]) + w[3] + S0 (w[11]);
      t2 = Sum0 (g) + Maj (g, h, a);
      b += t1;
      f = t1 + t2;

      t1 = e + Sum1 (b) + Ch (b, c, d) + k[t+11] + w[11];
      w[11] += S1 (w[9]) + w[4] + S0 (w[12]);
      t2 = Sum0 (f) + Maj (f, g, h);
      a += t1;
      e = t1 + t2;

      t1 = d + Sum1 (a) + Ch (a, b, c) + k[t+12] + w[12];
      w[12] += S1 (w[10]) + w[5] + S0 (w[13]);
      t2 = Sum0 (e) + Maj (e, f, g);
      h += t1;
      d = t1 + t2;

      t1 = c + Sum1 (h) + Ch (h, a, b) + k[t+13] + w[13];
      w[13] += S1 (w[11]) + w[6] + S0 (w[14]);
      t2 = Sum0 (d) + Maj (d, e, f);
      g += t1;
      c = t1 + t2;

      t1 = b + Sum1 (g) + Ch (g, h, a) + k[t+14] + w[14];
      w[14] += S1 (w[12]) + w[7] + S0 (w[15]);
      t2 = Sum0 (c) + Maj (c, d, e);
      f += t1;
      b = t1 + t2;

      t1 = a + Sum1 (f) + Ch (f, g, h) + k[t+15] + w[15];
      w[15] += S1 (w[13]) + w[8] + S0 (w[0]);
      t2 = Sum0 (b) + Maj (b, c, d);
      e += t1;
      a = t1 + t2;

      t += 16;
#endif
    }

  /* Rounds 64..79: no further schedule expansion is needed, the last
     16 schedule words are consumed as-is.  */
  for (; t < 80; )
    {
      u64 t1, t2;

#if 0 /* Not unrolled.  */
      t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[t%16];
      t2 = Sum0 (a) + Maj (a, b, c);
      h = g;
      g = f;
      f = e;
      e = d + t1;
      d = c;
      c = b;
      b = a;
      a = t1 + t2;
      t++;
#else /* Unrolled to interweave the chain variables.  */
      t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[0];
      t2 = Sum0 (a) + Maj (a, b, c);
      d += t1;
      h = t1 + t2;

      t1 = g + Sum1 (d) + Ch (d, e, f) + k[t+1] + w[1];
      t2 = Sum0 (h) + Maj (h, a, b);
      c += t1;
      g = t1 + t2;

      t1 = f + Sum1 (c) + Ch (c, d, e) + k[t+2] + w[2];
      t2 = Sum0 (g) + Maj (g, h, a);
      b += t1;
      f = t1 + t2;

      t1 = e + Sum1 (b) + Ch (b, c, d) + k[t+3] + w[3];
      t2 = Sum0 (f) + Maj (f, g, h);
      a += t1;
      e = t1 + t2;

      t1 = d + Sum1 (a) + Ch (a, b, c) + k[t+4] + w[4];
      t2 = Sum0 (e) + Maj (e, f, g);
      h += t1;
      d = t1 + t2;

      t1 = c + Sum1 (h) + Ch (h, a, b) + k[t+5] + w[5];
      t2 = Sum0 (d) + Maj (d, e, f);
      g += t1;
      c = t1 + t2;

      t1 = b + Sum1 (g) + Ch (g, h, a) + k[t+6] + w[6];
      t2 = Sum0 (c) + Maj (c, d, e);
      f += t1;
      b = t1 + t2;

      t1 = a + Sum1 (f) + Ch (f, g, h) + k[t+7] + w[7];
      t2 = Sum0 (b) + Maj (b, c, d);
      e += t1;
      a = t1 + t2;

      t1 = h + Sum1 (e) + Ch (e, f, g) + k[t+8] + w[8];
      t2 = Sum0 (a) + Maj (a, b, c);
      d += t1;
      h = t1 + t2;

      t1 = g + Sum1 (d) + Ch (d, e, f) + k[t+9] + w[9];
      t2 = Sum0 (h) + Maj (h, a, b);
      c += t1;
      g = t1 + t2;

      t1 = f + Sum1 (c) + Ch (c, d, e) + k[t+10] + w[10];
      t2 = Sum0 (g) + Maj (g, h, a);
      b += t1;
      f = t1 + t2;

      t1 = e + Sum1 (b) + Ch (b, c, d) + k[t+11] + w[11];
      t2 = Sum0 (f) + Maj (f, g, h);
      a += t1;
      e = t1 + t2;

      t1 = d + Sum1 (a) + Ch (a, b, c) + k[t+12] + w[12];
      t2 = Sum0 (e) + Maj (e, f, g);
      h += t1;
      d = t1 + t2;

      t1 = c + Sum1 (h) + Ch (h, a, b) + k[t+13] + w[13];
      t2 = Sum0 (d) + Maj (d, e, f);
      g += t1;
      c = t1 + t2;

      t1 = b + Sum1 (g) + Ch (g, h, a) + k[t+14] + w[14];
      t2 = Sum0 (c) + Maj (c, d, e);
      f += t1;
      b = t1 + t2;

      t1 = a + Sum1 (f) + Ch (f, g, h) + k[t+15] + w[15];
      t2 = Sum0 (b) + Maj (b, c, d);
      e += t1;
      a = t1 + t2;

      t += 16;
#endif
    }

  /* Update chaining vars.  */
  hd->h0 += a;
  hd->h1 += b;
  hd->h2 += c;
  hd->h3 += d;
  hd->h4 += e;
  hd->h5 += f;
  hd->h6 += g;
  hd->h7 += h;

  /* Estimate of the sensitive stack footprint of this function.  */
  return /* burn_stack */ (8 + 16) * sizeof(u64) + sizeof(u32) +
                          3 * sizeof(void*);
}
#ifdef USE_ARM_NEON_ASM
/* ARMv7/NEON assembly transform; processes NUM_BLKS 128-byte blocks.
   Takes the round-constant table K as an explicit argument.  */
void _gcry_sha512_transform_armv7_neon (SHA512_STATE *hd,
                                        const unsigned char *data,
                                        const u64 k[], size_t num_blks);
#endif

#ifdef USE_SSSE3
/* AMD64/SSSE3 assembly transform; returns extra stack burn depth.  */
unsigned int _gcry_sha512_transform_amd64_ssse3(const void *input_data,
                                                void *state, size_t num_blks);
#endif

#ifdef USE_AVX
/* AMD64/AVX assembly transform; returns extra stack burn depth.  */
unsigned int _gcry_sha512_transform_amd64_avx(const void *input_data,
                                              void *state, size_t num_blks);
#endif

#ifdef USE_AVX2
/* AMD64/AVX2+BMI2 assembly transform; returns extra stack burn depth.  */
unsigned int _gcry_sha512_transform_amd64_avx2(const void *input_data,
                                               void *state, size_t num_blks);
#endif
/* Dispatch the compression of NBLKS 128-byte blocks at DATA to the
   fastest available implementation.  Returns the stack burn depth in
   bytes (0 when the implementation keeps no sensitive data on the
   stack).  */
static unsigned int
transform (void *context, const unsigned char *data, size_t nblks)
{
  SHA512_CONTEXT *ctx = context;
  unsigned int burn_depth = 0;

#ifdef USE_AVX2
  if (ctx->use_avx2)
    return _gcry_sha512_transform_amd64_avx2 (data, &ctx->state, nblks)
           + 4 * sizeof(void*);
#endif

#ifdef USE_AVX
  if (ctx->use_avx)
    return _gcry_sha512_transform_amd64_avx (data, &ctx->state, nblks)
           + 4 * sizeof(void*);
#endif

#ifdef USE_SSSE3
  if (ctx->use_ssse3)
    return _gcry_sha512_transform_amd64_ssse3 (data, &ctx->state, nblks)
           + 4 * sizeof(void*);
#endif

#ifdef USE_ARM_NEON_ASM
  if (ctx->use_neon)
    {
      _gcry_sha512_transform_armv7_neon (&ctx->state, data, k, nblks);

      /* _gcry_sha512_transform_armv7_neon does not store sensitive data
       * to stack.  */
      return /* no burn_stack */ 0;
    }
#endif

  /* Generic C implementation: one block per call.  */
  for (;;)
    {
      burn_depth = transform_blk (&ctx->state, data) + 3 * sizeof(void*);
      data += 128;
      if (!--nblks)
        break;
    }

  return burn_depth;
}
/* The routine final terminates the computation and
* returns the digest.
* The handle is prepared for a new cycle, but adding bytes to the
* handle will the destroy the returned buffer.
* Returns: 64 bytes representing the digest. When used for sha384,
* we take the leftmost 48 of those bytes.
*/
static void
sha512_final (void *context)
{
SHA512_CONTEXT *hd = context;
unsigned int stack_burn_depth;
u64 t, th, msb, lsb;
byte *p;
_gcry_md_block_write (context, NULL, 0); /* flush */ ;
t = hd->bctx.nblocks;
/* if (sizeof t == sizeof hd->bctx.nblocks) */
th = hd->bctx.nblocks_high;
/* else */
/* th = hd->bctx.nblocks >> 64; In case we ever use u128 */
/* multiply by 128 to make a byte count */
lsb = t << 7;
msb = (th << 7) | (t >> 57);
/* add the count */
t = lsb;
if ((lsb += hd->bctx.count) < t)
msb++;
/* multiply by 8 to make a bit count */
t = lsb;
lsb <<= 3;
msb <<= 3;
msb |= t >> 61;
if (hd->bctx.count < 112)
{ /* enough room */
hd->bctx.buf[hd->bctx.count++] = 0x80; /* pad */
while (hd->bctx.count < 112)
hd->bctx.buf[hd->bctx.count++] = 0; /* pad */
}
else
{ /* need one extra block */
hd->bctx.buf[hd->bctx.count++] = 0x80; /* pad character */
while (hd->bctx.count < 128)
hd->bctx.buf[hd->bctx.count++] = 0;
_gcry_md_block_write (context, NULL, 0); /* flush */ ;
memset (hd->bctx.buf, 0, 112); /* fill next block with zeroes */
}
/* append the 128 bit count */
buf_put_be64(hd->bctx.buf + 112, msb);
buf_put_be64(hd->bctx.buf + 120, lsb);
stack_burn_depth = transform (hd, hd->bctx.buf, 1);
_gcry_burn_stack (stack_burn_depth);
p = hd->bctx.buf;
#define X(a) do { *(u64*)p = be_bswap64(hd->state.h##a) ; p += 8; } while (0)
X (0);
X (1);
X (2);
X (3);
X (4);
X (5);
/* Note that these last two chunks are included even for SHA384.
We just ignore them. */
X (6);
X (7);
#undef X
}
/* Return a pointer to the digest bytes that sha512_final left in the
   context's block buffer.  */
static byte *
sha512_read (void *context)
{
  SHA512_CONTEXT *hd = context;

  return hd->bctx.buf;
}
/*
Self-test section.
*/
/* Run the SHA-384 known-answer self-tests.  With EXTENDED non-zero the
   longer vectors are checked as well.  Returns 0 on success; on
   failure reports via REPORT (if non-NULL) and returns
   GPG_ERR_SELFTEST_FAILED.  */
static gpg_err_code_t
selftests_sha384 (int extended, selftest_report_func_t report)
{
  const char *what = "short string";
  const char *errtxt;

  errtxt = _gcry_hash_selftest_check_one
    (GCRY_MD_SHA384, 0,
     "abc", 3,
     "\xcb\x00\x75\x3f\x45\xa3\x5e\x8b\xb5\xa0\x3d\x69\x9a\xc6\x50\x07"
     "\x27\x2c\x32\xab\x0e\xde\xd1\x63\x1a\x8b\x60\x5a\x43\xff\x5b\xed"
     "\x80\x86\x07\x2b\xa1\xe7\xcc\x23\x58\xba\xec\xa1\x34\xc8\x25\xa7", 48);

  if (!errtxt && extended)
    {
      what = "long string";
      errtxt = _gcry_hash_selftest_check_one
        (GCRY_MD_SHA384, 0,
         "abcdefghbcdefghicdefghijdefghijkefghijklfghijklmghijklmn"
         "hijklmnoijklmnopjklmnopqklmnopqrlmnopqrsmnopqrstnopqrstu", 112,
         "\x09\x33\x0C\x33\xF7\x11\x47\xE8\x3D\x19\x2F\xC7\x82\xCD\x1B\x47"
         "\x53\x11\x1B\x17\x3B\x3B\x05\xD2\x2F\xA0\x80\x86\xE3\xB0\xF7\x12"
         "\xFC\xC7\xC7\x1A\x55\x7E\x2D\xB9\x66\xC3\xE9\xFA\x91\x74\x60\x39",
         48);
    }

  if (!errtxt && extended)
    {
      what = "one million \"a\"";
      errtxt = _gcry_hash_selftest_check_one
        (GCRY_MD_SHA384, 1,
         NULL, 0,
         "\x9D\x0E\x18\x09\x71\x64\x74\xCB\x08\x6E\x83\x4E\x31\x0A\x4A\x1C"
         "\xED\x14\x9E\x9C\x00\xF2\x48\x52\x79\x72\xCE\xC5\x70\x4C\x2A\x5B"
         "\x07\xB8\xB3\xDC\x38\xEC\xC4\xEB\xAE\x97\xDD\xD8\x7F\x3D\x89\x85",
         48);
    }

  if (!errtxt)
    return 0; /* Succeeded. */

  if (report)
    report ("digest", GCRY_MD_SHA384, what, errtxt);
  return GPG_ERR_SELFTEST_FAILED;
}
/* Run the SHA-512 known-answer self-tests.  With EXTENDED non-zero the
   longer vectors are checked as well.  Returns 0 on success; on
   failure reports via REPORT (if non-NULL) and returns
   GPG_ERR_SELFTEST_FAILED.  */
static gpg_err_code_t
selftests_sha512 (int extended, selftest_report_func_t report)
{
  const char *what = "short string";
  const char *errtxt;

  errtxt = _gcry_hash_selftest_check_one
    (GCRY_MD_SHA512, 0,
     "abc", 3,
     "\xDD\xAF\x35\xA1\x93\x61\x7A\xBA\xCC\x41\x73\x49\xAE\x20\x41\x31"
     "\x12\xE6\xFA\x4E\x89\xA9\x7E\xA2\x0A\x9E\xEE\xE6\x4B\x55\xD3\x9A"
     "\x21\x92\x99\x2A\x27\x4F\xC1\xA8\x36\xBA\x3C\x23\xA3\xFE\xEB\xBD"
     "\x45\x4D\x44\x23\x64\x3C\xE8\x0E\x2A\x9A\xC9\x4F\xA5\x4C\xA4\x9F", 64);

  if (!errtxt && extended)
    {
      what = "long string";
      errtxt = _gcry_hash_selftest_check_one
        (GCRY_MD_SHA512, 0,
         "abcdefghbcdefghicdefghijdefghijkefghijklfghijklmghijklmn"
         "hijklmnoijklmnopjklmnopqklmnopqrlmnopqrsmnopqrstnopqrstu", 112,
         "\x8E\x95\x9B\x75\xDA\xE3\x13\xDA\x8C\xF4\xF7\x28\x14\xFC\x14\x3F"
         "\x8F\x77\x79\xC6\xEB\x9F\x7F\xA1\x72\x99\xAE\xAD\xB6\x88\x90\x18"
         "\x50\x1D\x28\x9E\x49\x00\xF7\xE4\x33\x1B\x99\xDE\xC4\xB5\x43\x3A"
         "\xC7\xD3\x29\xEE\xB6\xDD\x26\x54\x5E\x96\xE5\x5B\x87\x4B\xE9\x09",
         64);
    }

  if (!errtxt && extended)
    {
      what = "one million \"a\"";
      errtxt = _gcry_hash_selftest_check_one
        (GCRY_MD_SHA512, 1,
         NULL, 0,
         "\xE7\x18\x48\x3D\x0C\xE7\x69\x64\x4E\x2E\x42\xC7\xBC\x15\xB4\x63"
         "\x8E\x1F\x98\xB1\x3B\x20\x44\x28\x56\x32\xA8\x03\xAF\xA9\x73\xEB"
         "\xDE\x0F\xF2\x44\x87\x7E\xA6\x0A\x4C\xB0\x43\x2C\xE5\x77\xC3\x1B"
         "\xEB\x00\x9C\x5C\x2C\x49\xAA\x2E\x4E\xAD\xB2\x17\xAD\x8C\xC0\x9B",
         64);
    }

  if (!errtxt)
    return 0; /* Succeeded. */

  if (report)
    report ("digest", GCRY_MD_SHA512, what, errtxt);
  return GPG_ERR_SELFTEST_FAILED;
}
/* Run a full self-test for ALGO and return 0 on success.  */
static gpg_err_code_t
run_selftests (int algo, int extended, selftest_report_func_t report)
{
  switch (algo)
    {
    case GCRY_MD_SHA384:
      return selftests_sha384 (extended, report);
    case GCRY_MD_SHA512:
      return selftests_sha512 (extended, report);
    default:
      return GPG_ERR_DIGEST_ALGO;
    }
}
/* ASN.1 prefix prepended to the raw digest for PKCS#1 use; presumably
   the DER-encoded DigestInfo header -- verify against RFC 8017.  */
static byte sha512_asn[] = /* Object ID is 2.16.840.1.101.3.4.2.3 */
  {
    0x30, 0x51, 0x30, 0x0d, 0x06, 0x09, 0x60, 0x86,
    0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x03, 0x05,
    0x00, 0x04, 0x40
  };
/* Object identifiers under which SHA-512 is registered.  */
static gcry_md_oid_spec_t oid_spec_sha512[] =
  {
    { "2.16.840.1.101.3.4.2.3" },

    /* PKCS#1 sha512WithRSAEncryption */
    { "1.2.840.113549.1.1.13" },

    { NULL }
  };
/* Public algorithm descriptor for SHA-512 (64-byte digest).  */
gcry_md_spec_t _gcry_digest_spec_sha512 =
  {
    GCRY_MD_SHA512, {0, 1},
    "SHA512", sha512_asn, DIM (sha512_asn), oid_spec_sha512, 64,
    sha512_init, _gcry_md_block_write, sha512_final, sha512_read,
    sizeof (SHA512_CONTEXT),
    run_selftests
  };
/* ASN.1 prefix prepended to the raw digest for PKCS#1 use; presumably
   the DER-encoded DigestInfo header -- verify against RFC 8017.  */
static byte sha384_asn[] = /* Object ID is 2.16.840.1.101.3.4.2.2 */
  {
    0x30, 0x41, 0x30, 0x0d, 0x06, 0x09, 0x60, 0x86,
    0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x02, 0x05,
    0x00, 0x04, 0x30
  };
/* Object identifiers under which SHA-384 is registered.  */
static gcry_md_oid_spec_t oid_spec_sha384[] =
  {
    { "2.16.840.1.101.3.4.2.2" },

    /* PKCS#1 sha384WithRSAEncryption */
    { "1.2.840.113549.1.1.12" },

    { NULL },
  };
/* Public algorithm descriptor for SHA-384 (48-byte digest).  It shares
   the context, final and read routines with SHA-512; only the first 48
   digest bytes are used.  */
gcry_md_spec_t _gcry_digest_spec_sha384 =
  {
    GCRY_MD_SHA384, {0, 1},
    "SHA384", sha384_asn, DIM (sha384_asn), oid_spec_sha384, 48,
    sha384_init, _gcry_md_block_write, sha512_final, sha512_read,
    sizeof (SHA512_CONTEXT),
    run_selftests
  };