diff --git a/cipher/sha1.c b/cipher/sha1.c index e50262ff..76c486c7 100644 --- a/cipher/sha1.c +++ b/cipher/sha1.c @@ -1,671 +1,664 @@ /* sha1.c - SHA1 hash function * Copyright (C) 1998, 2001, 2002, 2003, 2008 Free Software Foundation, Inc. * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . */ /* Test vectors: * * "abc" * A999 3E36 4706 816A BA3E 2571 7850 C26C 9CD0 D89D * * "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq" * 8498 3E44 1C3B D26E BAAE 4AA1 F951 29E5 E546 70F1 */ #include #include #include #include #ifdef HAVE_STDINT_H # include #endif #include "g10lib.h" #include "bithelp.h" #include "bufhelp.h" #include "cipher.h" #include "sha1.h" /* USE_SSSE3 indicates whether to compile with Intel SSSE3 code. */ #undef USE_SSSE3 #if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_SSSE3) && \ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) # define USE_SSSE3 1 #endif /* USE_AVX indicates whether to compile with Intel AVX code. */ #undef USE_AVX #if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_AVX) && \ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) # define USE_AVX 1 #endif /* USE_BMI2 indicates whether to compile with Intel AVX/BMI2 code. */ #undef USE_BMI2 #if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_AVX) && \ defined(HAVE_GCC_INLINE_ASM_BMI2) && \ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) # define USE_BMI2 1 #endif /* USE_SHAEXT indicates whether to compile with Intel SHA Extension code. */ #undef USE_SHAEXT #if defined(HAVE_GCC_INLINE_ASM_SHAEXT) && \ defined(HAVE_GCC_INLINE_ASM_SSE41) && \ defined(ENABLE_SHAEXT_SUPPORT) # define USE_SHAEXT 1 #endif /* USE_NEON indicates whether to enable ARM NEON assembly code. */ #undef USE_NEON #ifdef ENABLE_NEON_SUPPORT # if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) \ && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) \ && defined(HAVE_GCC_INLINE_ASM_NEON) # define USE_NEON 1 # endif #endif /* USE_ARM_CE indicates whether to enable ARMv8 Crypto Extension assembly * code. */ #undef USE_ARM_CE #ifdef ENABLE_ARM_CRYPTO_SUPPORT # if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) \ && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) \ && defined(HAVE_GCC_INLINE_ASM_AARCH32_CRYPTO) # define USE_ARM_CE 1 # elif defined(__AARCH64EL__) \ && defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) \ && defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO) # define USE_ARM_CE 1 # endif #endif /* A macro to test whether P is properly aligned for an u32 type. Note that config.h provides a suitable replacement for uintptr_t if it does not exist in stdint.h. 
*/ /* #if __GNUC__ >= 2 */ /* # define U32_ALIGNED_P(p) (!(((uintptr_t)p) % __alignof__ (u32))) */ /* #else */ /* # define U32_ALIGNED_P(p) (!(((uintptr_t)p) % sizeof (u32))) */ /* #endif */ + +/* Assembly implementations use SystemV ABI, ABI conversion and additional + * stack to store XMM6-XMM15 needed on Win64. */ +#undef ASM_FUNC_ABI +#undef ASM_EXTRA_STACK +#if defined(USE_SSSE3) || defined(USE_AVX) || defined(USE_BMI2) || \ + defined(USE_SHAEXT) +# ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS +# define ASM_FUNC_ABI __attribute__((sysv_abi)) +# define ASM_EXTRA_STACK (10 * 16 + sizeof(void *) * 4) +# else +# define ASM_FUNC_ABI +# define ASM_EXTRA_STACK 0 +# endif +#endif + + +#ifdef USE_SSSE3 +unsigned int +_gcry_sha1_transform_amd64_ssse3 (void *state, const unsigned char *data, + size_t nblks) ASM_FUNC_ABI; + +static unsigned int +do_sha1_transform_amd64_ssse3 (void *ctx, const unsigned char *data, + size_t nblks) +{ + SHA1_CONTEXT *hd = ctx; + return _gcry_sha1_transform_amd64_ssse3 (&hd->h0, data, nblks) + + ASM_EXTRA_STACK; +} +#endif + +#ifdef USE_AVX +unsigned int +_gcry_sha1_transform_amd64_avx (void *state, const unsigned char *data, + size_t nblks) ASM_FUNC_ABI; + +static unsigned int +do_sha1_transform_amd64_avx (void *ctx, const unsigned char *data, + size_t nblks) +{ + SHA1_CONTEXT *hd = ctx; + return _gcry_sha1_transform_amd64_avx (&hd->h0, data, nblks) + + ASM_EXTRA_STACK; +} +#endif + +#ifdef USE_BMI2 +unsigned int +_gcry_sha1_transform_amd64_avx_bmi2 (void *state, const unsigned char *data, + size_t nblks) ASM_FUNC_ABI; + +static unsigned int +do_sha1_transform_amd64_avx_bmi2 (void *ctx, const unsigned char *data, + size_t nblks) +{ + SHA1_CONTEXT *hd = ctx; + return _gcry_sha1_transform_amd64_avx_bmi2 (&hd->h0, data, nblks) + + ASM_EXTRA_STACK; +} +#endif + +#ifdef USE_SHAEXT +/* Does not need ASM_FUNC_ABI */ +unsigned int +_gcry_sha1_transform_intel_shaext (void *state, const unsigned char *data, + size_t nblks); + static unsigned int -transform (void *c, const unsigned char *data, size_t nblks); +do_sha1_transform_intel_shaext (void *ctx, const unsigned char *data, + size_t nblks) +{ + SHA1_CONTEXT *hd = ctx; + return _gcry_sha1_transform_intel_shaext (&hd->h0, data, nblks); +} +#endif + +#ifdef USE_NEON +unsigned int +_gcry_sha1_transform_armv7_neon (void *state, const unsigned char *data, + size_t nblks); + +static unsigned int +do_sha1_transform_armv7_neon (void *ctx, const unsigned char *data, + size_t nblks) +{ + SHA1_CONTEXT *hd = ctx; + return _gcry_sha1_transform_armv7_neon (&hd->h0, data, nblks); +} +#endif + +#ifdef USE_ARM_CE +unsigned int +_gcry_sha1_transform_armv8_ce (void *state, const unsigned char *data, + size_t nblks); + +static unsigned int +do_sha1_transform_armv8_ce (void *ctx, const unsigned char *data, + size_t nblks) +{ + SHA1_CONTEXT *hd = ctx; + return _gcry_sha1_transform_armv8_ce (&hd->h0, data, nblks); +} +#endif + + +static unsigned int +do_transform_generic (void *c, const unsigned char *data, size_t nblks); static void sha1_init (void *context, unsigned int flags) { SHA1_CONTEXT *hd = context; unsigned int features = _gcry_get_hw_features (); (void)flags; hd->h0 = 0x67452301; hd->h1 = 0xefcdab89; hd->h2 = 0x98badcfe; hd->h3 = 0x10325476; hd->h4 = 0xc3d2e1f0; hd->bctx.nblocks = 0; hd->bctx.nblocks_high = 0; hd->bctx.count = 0; hd->bctx.blocksize = 64; - hd->bctx.bwrite = transform; + /* Order of feature checks is important here; last match will be + * selected. Keep slower implementations at the top and faster at + * the bottom. 
*/ + hd->bctx.bwrite = do_transform_generic; #ifdef USE_SSSE3 - hd->use_ssse3 = (features & HWF_INTEL_SSSE3) != 0; + if ((features & HWF_INTEL_SSSE3) != 0) + hd->bctx.bwrite = do_sha1_transform_amd64_ssse3; #endif #ifdef USE_AVX /* AVX implementation uses SHLD which is known to be slow on non-Intel CPUs. * Therefore use this implementation on Intel CPUs only. */ - hd->use_avx = (features & HWF_INTEL_AVX) && (features & HWF_INTEL_FAST_SHLD); + if ((features & HWF_INTEL_AVX) && (features & HWF_INTEL_FAST_SHLD)) + hd->bctx.bwrite = do_sha1_transform_amd64_avx; #endif #ifdef USE_BMI2 - hd->use_bmi2 = (features & HWF_INTEL_AVX) && (features & HWF_INTEL_BMI2); + if ((features & HWF_INTEL_AVX) && (features & HWF_INTEL_BMI2)) + hd->bctx.bwrite = do_sha1_transform_amd64_avx_bmi2; #endif #ifdef USE_SHAEXT - hd->use_shaext = (features & HWF_INTEL_SHAEXT) - && (features & HWF_INTEL_SSE4_1); + if ((features & HWF_INTEL_SHAEXT) && (features & HWF_INTEL_SSE4_1)) + hd->bctx.bwrite = do_sha1_transform_intel_shaext; #endif #ifdef USE_NEON - hd->use_neon = (features & HWF_ARM_NEON) != 0; + if ((features & HWF_ARM_NEON) != 0) + hd->bctx.bwrite = do_sha1_transform_armv7_neon; #endif #ifdef USE_ARM_CE - hd->use_arm_ce = (features & HWF_ARM_SHA1) != 0; + if ((features & HWF_ARM_SHA1) != 0) + hd->bctx.bwrite = do_sha1_transform_armv8_ce; #endif + (void)features; } /* * Initialize the context HD. This is used to prepare the use of * _gcry_sha1_mixblock. WARNING: This is a special purpose function * for exclusive use by random-csprng.c. */ void _gcry_sha1_mixblock_init (SHA1_CONTEXT *hd) { sha1_init (hd, 0); } /* Round function macros. */ #define K1 0x5A827999L #define K2 0x6ED9EBA1L #define K3 0x8F1BBCDCL #define K4 0xCA62C1D6L #define F1(x,y,z) ( z ^ ( x & ( y ^ z ) ) ) #define F2(x,y,z) ( x ^ y ^ z ) #define F3(x,y,z) ( ( x & y ) | ( z & ( x | y ) ) ) #define F4(x,y,z) ( x ^ y ^ z ) #define M(i) ( tm = x[ i &0x0f] \ ^ x[(i-14)&0x0f] \ ^ x[(i-8) &0x0f] \ ^ x[(i-3) &0x0f], \ (x[i&0x0f] = rol(tm, 1))) #define R(a,b,c,d,e,f,k,m) do { e += rol( a, 5 ) \ + f( b, c, d ) \ + k \ + m; \ b = rol( b, 30 ); \ } while(0) - -#ifdef USE_NEON -unsigned int -_gcry_sha1_transform_armv7_neon (void *state, const unsigned char *data, - size_t nblks); -#endif - -#ifdef USE_ARM_CE -unsigned int -_gcry_sha1_transform_armv8_ce (void *state, const unsigned char *data, - size_t nblks); -#endif - /* * Transform NBLOCKS of each 64 bytes (16 32-bit words) at DATA. */ static unsigned int -transform_blk (void *ctx, const unsigned char *data) +do_transform_generic (void *ctx, const unsigned char *data, size_t nblks) { SHA1_CONTEXT *hd = ctx; - const u32 *idata = (const void *)data; - register u32 a, b, c, d, e; /* Local copies of the chaining variables. */ - register u32 tm; /* Helper. */ - u32 x[16]; /* The array we work on. */ + + do + { + const u32 *idata = (const void *)data; + u32 a, b, c, d, e; /* Local copies of the chaining variables. */ + u32 tm; /* Helper. */ + u32 x[16]; /* The array we work on. */ #define I(i) (x[i] = buf_get_be32(idata + i)) /* Get the values of the chaining variables. */ a = hd->h0; b = hd->h1; c = hd->h2; d = hd->h3; e = hd->h4; /* Transform. 
*/ R( a, b, c, d, e, F1, K1, I( 0) ); R( e, a, b, c, d, F1, K1, I( 1) ); R( d, e, a, b, c, F1, K1, I( 2) ); R( c, d, e, a, b, F1, K1, I( 3) ); R( b, c, d, e, a, F1, K1, I( 4) ); R( a, b, c, d, e, F1, K1, I( 5) ); R( e, a, b, c, d, F1, K1, I( 6) ); R( d, e, a, b, c, F1, K1, I( 7) ); R( c, d, e, a, b, F1, K1, I( 8) ); R( b, c, d, e, a, F1, K1, I( 9) ); R( a, b, c, d, e, F1, K1, I(10) ); R( e, a, b, c, d, F1, K1, I(11) ); R( d, e, a, b, c, F1, K1, I(12) ); R( c, d, e, a, b, F1, K1, I(13) ); R( b, c, d, e, a, F1, K1, I(14) ); R( a, b, c, d, e, F1, K1, I(15) ); R( e, a, b, c, d, F1, K1, M(16) ); R( d, e, a, b, c, F1, K1, M(17) ); R( c, d, e, a, b, F1, K1, M(18) ); R( b, c, d, e, a, F1, K1, M(19) ); R( a, b, c, d, e, F2, K2, M(20) ); R( e, a, b, c, d, F2, K2, M(21) ); R( d, e, a, b, c, F2, K2, M(22) ); R( c, d, e, a, b, F2, K2, M(23) ); R( b, c, d, e, a, F2, K2, M(24) ); R( a, b, c, d, e, F2, K2, M(25) ); R( e, a, b, c, d, F2, K2, M(26) ); R( d, e, a, b, c, F2, K2, M(27) ); R( c, d, e, a, b, F2, K2, M(28) ); R( b, c, d, e, a, F2, K2, M(29) ); R( a, b, c, d, e, F2, K2, M(30) ); R( e, a, b, c, d, F2, K2, M(31) ); R( d, e, a, b, c, F2, K2, M(32) ); R( c, d, e, a, b, F2, K2, M(33) ); R( b, c, d, e, a, F2, K2, M(34) ); R( a, b, c, d, e, F2, K2, M(35) ); R( e, a, b, c, d, F2, K2, M(36) ); R( d, e, a, b, c, F2, K2, M(37) ); R( c, d, e, a, b, F2, K2, M(38) ); R( b, c, d, e, a, F2, K2, M(39) ); R( a, b, c, d, e, F3, K3, M(40) ); R( e, a, b, c, d, F3, K3, M(41) ); R( d, e, a, b, c, F3, K3, M(42) ); R( c, d, e, a, b, F3, K3, M(43) ); R( b, c, d, e, a, F3, K3, M(44) ); R( a, b, c, d, e, F3, K3, M(45) ); R( e, a, b, c, d, F3, K3, M(46) ); R( d, e, a, b, c, F3, K3, M(47) ); R( c, d, e, a, b, F3, K3, M(48) ); R( b, c, d, e, a, F3, K3, M(49) ); R( a, b, c, d, e, F3, K3, M(50) ); R( e, a, b, c, d, F3, K3, M(51) ); R( d, e, a, b, c, F3, K3, M(52) ); R( c, d, e, a, b, F3, K3, M(53) ); R( b, c, d, e, a, F3, K3, M(54) ); R( a, b, c, d, e, F3, K3, M(55) ); R( e, a, b, c, d, F3, K3, M(56) ); R( d, e, a, b, c, F3, K3, M(57) ); R( c, d, e, a, b, F3, K3, M(58) ); R( b, c, d, e, a, F3, K3, M(59) ); R( a, b, c, d, e, F4, K4, M(60) ); R( e, a, b, c, d, F4, K4, M(61) ); R( d, e, a, b, c, F4, K4, M(62) ); R( c, d, e, a, b, F4, K4, M(63) ); R( b, c, d, e, a, F4, K4, M(64) ); R( a, b, c, d, e, F4, K4, M(65) ); R( e, a, b, c, d, F4, K4, M(66) ); R( d, e, a, b, c, F4, K4, M(67) ); R( c, d, e, a, b, F4, K4, M(68) ); R( b, c, d, e, a, F4, K4, M(69) ); R( a, b, c, d, e, F4, K4, M(70) ); R( e, a, b, c, d, F4, K4, M(71) ); R( d, e, a, b, c, F4, K4, M(72) ); R( c, d, e, a, b, F4, K4, M(73) ); R( b, c, d, e, a, F4, K4, M(74) ); R( a, b, c, d, e, F4, K4, M(75) ); R( e, a, b, c, d, F4, K4, M(76) ); R( d, e, a, b, c, F4, K4, M(77) ); R( c, d, e, a, b, F4, K4, M(78) ); R( b, c, d, e, a, F4, K4, M(79) ); /* Update the chaining variables. */ hd->h0 += a; hd->h1 += b; hd->h2 += c; hd->h3 += d; hd->h4 += e; - return /* burn_stack */ 88+4*sizeof(void*); -} - - -/* Assembly implementations use SystemV ABI, ABI conversion and additional - * stack to store XMM6-XMM15 needed on Win64. 
*/ -#undef ASM_FUNC_ABI -#undef ASM_EXTRA_STACK -#if defined(USE_SSSE3) || defined(USE_AVX) || defined(USE_BMI2) || \ - defined(USE_SHAEXT) -# ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS -# define ASM_FUNC_ABI __attribute__((sysv_abi)) -# define ASM_EXTRA_STACK (10 * 16) -# else -# define ASM_FUNC_ABI -# define ASM_EXTRA_STACK 0 -# endif -#endif - - -#ifdef USE_SSSE3 -unsigned int -_gcry_sha1_transform_amd64_ssse3 (void *state, const unsigned char *data, - size_t nblks) ASM_FUNC_ABI; -#endif - -#ifdef USE_AVX -unsigned int -_gcry_sha1_transform_amd64_avx (void *state, const unsigned char *data, - size_t nblks) ASM_FUNC_ABI; -#endif - -#ifdef USE_BMI2 -unsigned int -_gcry_sha1_transform_amd64_avx_bmi2 (void *state, const unsigned char *data, - size_t nblks) ASM_FUNC_ABI; -#endif - -#ifdef USE_SHAEXT -/* Does not need ASM_FUNC_ABI */ -unsigned int -_gcry_sha1_transform_intel_shaext (void *state, const unsigned char *data, - size_t nblks); -#endif - - -static unsigned int -transform (void *ctx, const unsigned char *data, size_t nblks) -{ - SHA1_CONTEXT *hd = ctx; - unsigned int burn; - -#ifdef USE_SHAEXT - if (hd->use_shaext) - { - burn = _gcry_sha1_transform_intel_shaext (&hd->h0, data, nblks); - burn += burn ? 4 * sizeof(void*) + ASM_EXTRA_STACK : 0; - return burn; - } -#endif -#ifdef USE_BMI2 - if (hd->use_bmi2) - { - burn = _gcry_sha1_transform_amd64_avx_bmi2 (&hd->h0, data, nblks); - burn += burn ? 4 * sizeof(void*) + ASM_EXTRA_STACK : 0; - return burn; - } -#endif -#ifdef USE_AVX - if (hd->use_avx) - { - burn = _gcry_sha1_transform_amd64_avx (&hd->h0, data, nblks); - burn += burn ? 4 * sizeof(void*) + ASM_EXTRA_STACK : 0; - return burn; - } -#endif -#ifdef USE_SSSE3 - if (hd->use_ssse3) - { - burn = _gcry_sha1_transform_amd64_ssse3 (&hd->h0, data, nblks); - burn += burn ? 4 * sizeof(void*) + ASM_EXTRA_STACK : 0; - return burn; - } -#endif -#ifdef USE_ARM_CE - if (hd->use_arm_ce) - { - burn = _gcry_sha1_transform_armv8_ce (&hd->h0, data, nblks); - burn += burn ? 4 * sizeof(void*) : 0; - return burn; - } -#endif -#ifdef USE_NEON - if (hd->use_neon) - { - burn = _gcry_sha1_transform_armv7_neon (&hd->h0, data, nblks); - burn += burn ? 4 * sizeof(void*) : 0; - return burn; - } -#endif - - do - { - burn = transform_blk (hd, data); data += 64; } while (--nblks); -#ifdef ASM_EXTRA_STACK - /* 'transform_blk' is typically inlined and XMM6-XMM15 are stored at - * the prologue of this function. Therefore need to add ASM_EXTRA_STACK to - * here too. - */ - burn += ASM_EXTRA_STACK; -#endif - - return burn; + return 88+4*sizeof(void*); } /* * Apply the SHA-1 transform function on the buffer BLOCKOF64BYTE * which must have a length 64 bytes. BLOCKOF64BYTE must be 32-bit * aligned. Updates the 20 bytes in BLOCKOF64BYTE with its mixed * content. Returns the number of bytes which should be burned on the * stack. You need to use _gcry_sha1_mixblock_init to initialize the * context. * WARNING: This is a special purpose function for exclusive use by * random-csprng.c. */ unsigned int _gcry_sha1_mixblock (SHA1_CONTEXT *hd, void *blockof64byte) { u32 *p = blockof64byte; unsigned int nburn; - nburn = transform (hd, blockof64byte, 1); + nburn = (*hd->bctx.bwrite) (hd, blockof64byte, 1); p[0] = hd->h0; p[1] = hd->h1; p[2] = hd->h2; p[3] = hd->h3; p[4] = hd->h4; return nburn; } /* The routine final terminates the computation and * returns the digest. * The handle is prepared for a new cycle, but adding bytes to the * handle will the destroy the returned buffer. * Returns: 20 bytes representing the digest. 
*/ static void sha1_final(void *context) { SHA1_CONTEXT *hd = context; u32 t, th, msb, lsb; unsigned char *p; unsigned int burn; _gcry_md_block_write (hd, NULL, 0); /* flush */; t = hd->bctx.nblocks; if (sizeof t == sizeof hd->bctx.nblocks) th = hd->bctx.nblocks_high; else th = hd->bctx.nblocks >> 32; /* multiply by 64 to make a byte count */ lsb = t << 6; msb = (th << 6) | (t >> 26); /* add the count */ t = lsb; if( (lsb += hd->bctx.count) < t ) msb++; /* multiply by 8 to make a bit count */ t = lsb; lsb <<= 3; msb <<= 3; msb |= t >> 29; if( hd->bctx.count < 56 ) /* enough room */ { hd->bctx.buf[hd->bctx.count++] = 0x80; /* pad */ while( hd->bctx.count < 56 ) hd->bctx.buf[hd->bctx.count++] = 0; /* pad */ } else /* need one extra block */ { hd->bctx.buf[hd->bctx.count++] = 0x80; /* pad character */ while( hd->bctx.count < 64 ) hd->bctx.buf[hd->bctx.count++] = 0; _gcry_md_block_write(hd, NULL, 0); /* flush */; memset(hd->bctx.buf, 0, 56 ); /* fill next block with zeroes */ } /* append the 64 bit count */ buf_put_be32(hd->bctx.buf + 56, msb); buf_put_be32(hd->bctx.buf + 60, lsb); - burn = transform( hd, hd->bctx.buf, 1 ); + burn = (*hd->bctx.bwrite) ( hd, hd->bctx.buf, 1 ); _gcry_burn_stack (burn); p = hd->bctx.buf; #define X(a) do { buf_put_be32(p, hd->h##a); p += 4; } while(0) X(0); X(1); X(2); X(3); X(4); #undef X } static unsigned char * sha1_read( void *context ) { SHA1_CONTEXT *hd = context; return hd->bctx.buf; } /**************** * Shortcut functions which puts the hash value of the supplied buffer * into outbuf which must have a size of 20 bytes. */ void _gcry_sha1_hash_buffer (void *outbuf, const void *buffer, size_t length) { SHA1_CONTEXT hd; sha1_init (&hd, 0); _gcry_md_block_write (&hd, buffer, length); sha1_final (&hd); memcpy (outbuf, hd.bctx.buf, 20); } /* Variant of the above shortcut function using a multiple buffers. */ void _gcry_sha1_hash_buffers (void *outbuf, const gcry_buffer_t *iov, int iovcnt) { SHA1_CONTEXT hd; sha1_init (&hd, 0); for (;iovcnt > 0; iov++, iovcnt--) _gcry_md_block_write (&hd, (const char*)iov[0].data + iov[0].off, iov[0].len); sha1_final (&hd); memcpy (outbuf, hd.bctx.buf, 20); } /* Self-test section. */ static gpg_err_code_t selftests_sha1 (int extended, selftest_report_func_t report) { const char *what; const char *errtxt; what = "short string"; errtxt = _gcry_hash_selftest_check_one (GCRY_MD_SHA1, 0, "abc", 3, "\xA9\x99\x3E\x36\x47\x06\x81\x6A\xBA\x3E" "\x25\x71\x78\x50\xC2\x6C\x9C\xD0\xD8\x9D", 20); if (errtxt) goto failed; if (extended) { what = "long string"; errtxt = _gcry_hash_selftest_check_one (GCRY_MD_SHA1, 0, "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq", 56, "\x84\x98\x3E\x44\x1C\x3B\xD2\x6E\xBA\xAE" "\x4A\xA1\xF9\x51\x29\xE5\xE5\x46\x70\xF1", 20); if (errtxt) goto failed; what = "one million \"a\""; errtxt = _gcry_hash_selftest_check_one (GCRY_MD_SHA1, 1, NULL, 0, "\x34\xAA\x97\x3C\xD4\xC4\xDA\xA4\xF6\x1E" "\xEB\x2B\xDB\xAD\x27\x31\x65\x34\x01\x6F", 20); if (errtxt) goto failed; } return 0; /* Succeeded. */ failed: if (report) report ("digest", GCRY_MD_SHA1, what, errtxt); return GPG_ERR_SELFTEST_FAILED; } /* Run a full self-test for ALGO and return 0 on success. 
*/ static gpg_err_code_t run_selftests (int algo, int extended, selftest_report_func_t report) { gpg_err_code_t ec; switch (algo) { case GCRY_MD_SHA1: ec = selftests_sha1 (extended, report); break; default: ec = GPG_ERR_DIGEST_ALGO; break; } return ec; } static unsigned char asn[15] = /* Object ID is 1.3.14.3.2.26 */ { 0x30, 0x21, 0x30, 0x09, 0x06, 0x05, 0x2b, 0x0e, 0x03, 0x02, 0x1a, 0x05, 0x00, 0x04, 0x14 }; static gcry_md_oid_spec_t oid_spec_sha1[] = { /* iso.member-body.us.rsadsi.pkcs.pkcs-1.5 (sha1WithRSAEncryption) */ { "1.2.840.113549.1.1.5" }, /* iso.member-body.us.x9-57.x9cm.3 (dsaWithSha1)*/ { "1.2.840.10040.4.3" }, /* from NIST's OIW (sha1) */ { "1.3.14.3.2.26" }, /* from NIST OIW (sha-1WithRSAEncryption) */ { "1.3.14.3.2.29" }, /* iso.member-body.us.ansi-x9-62.signatures.ecdsa-with-sha1 */ { "1.2.840.10045.4.1" }, { NULL }, }; gcry_md_spec_t _gcry_digest_spec_sha1 = { GCRY_MD_SHA1, {0, 1}, "SHA1", asn, DIM (asn), oid_spec_sha1, 20, sha1_init, _gcry_md_block_write, sha1_final, sha1_read, NULL, _gcry_sha1_hash_buffer, _gcry_sha1_hash_buffers, sizeof (SHA1_CONTEXT), run_selftests }; diff --git a/cipher/sha1.h b/cipher/sha1.h index 93ce79b5..acf764ba 100644 --- a/cipher/sha1.h +++ b/cipher/sha1.h @@ -1,41 +1,35 @@ /* sha1.h - SHA-1 context definition * Copyright (C) 1998, 2001, 2002, 2003, 2008 Free Software Foundation, Inc. * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . */ #ifndef GCRY_SHA1_H #define GCRY_SHA1_H #include "hash-common.h" /* We need this here for direct use by random-csprng.c. */ typedef struct { gcry_md_block_ctx_t bctx; u32 h0,h1,h2,h3,h4; - unsigned int use_ssse3:1; - unsigned int use_avx:1; - unsigned int use_bmi2:1; - unsigned int use_shaext:1; - unsigned int use_neon:1; - unsigned int use_arm_ce:1; } SHA1_CONTEXT; void _gcry_sha1_mixblock_init (SHA1_CONTEXT *hd); unsigned int _gcry_sha1_mixblock (SHA1_CONTEXT *hd, void *blockof64byte); #endif /*GCRY_SHA1_H*/ diff --git a/cipher/sha256.c b/cipher/sha256.c index 06959707..e82a9d90 100644 --- a/cipher/sha256.c +++ b/cipher/sha256.c @@ -1,788 +1,769 @@ /* sha256.c - SHA256 hash function * Copyright (C) 2003, 2006, 2008, 2009 Free Software Foundation, Inc. * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . 
*/ /* Test vectors: "abc" SHA224: 23097d22 3405d822 8642a477 bda255b3 2aadbce4 bda0b3f7 e36c9da7 SHA256: ba7816bf 8f01cfea 414140de 5dae2223 b00361a3 96177a9c b410ff61 f20015ad "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq" SHA224: 75388b16 512776cc 5dba5da1 fd890150 b0c6455c b4f58b19 52522525 SHA256: 248d6a61 d20638b8 e5c02693 0c3e6039 a33ce459 64ff2167 f6ecedd4 19db06c1 "a" one million times SHA224: 20794655 980c91d8 bbb4c1ea 97618a4b f03f4258 1948b2ee 4ee7ad67 SHA256: cdc76e5c 9914fb92 81a1c7e2 84d73e67 f1809a48 a497200e 046d39cc c7112cd0 */ #include #include #include #include #include "g10lib.h" #include "bithelp.h" #include "bufhelp.h" #include "cipher.h" #include "hash-common.h" /* USE_SSSE3 indicates whether to compile with Intel SSSE3 code. */ #undef USE_SSSE3 #if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_SSSE3) && \ defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) # define USE_SSSE3 1 #endif /* USE_AVX indicates whether to compile with Intel AVX code. */ #undef USE_AVX #if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_AVX) && \ defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) # define USE_AVX 1 #endif /* USE_AVX2 indicates whether to compile with Intel AVX2/BMI2 code. */ #undef USE_AVX2 #if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_AVX2) && \ defined(HAVE_GCC_INLINE_ASM_BMI2) && \ defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) # define USE_AVX2 1 #endif /* USE_SHAEXT indicates whether to compile with Intel SHA Extension code. */ #undef USE_SHAEXT #if defined(HAVE_GCC_INLINE_ASM_SHAEXT) && \ defined(HAVE_GCC_INLINE_ASM_SSE41) && \ defined(ENABLE_SHAEXT_SUPPORT) # define USE_SHAEXT 1 #endif /* USE_ARM_CE indicates whether to enable ARMv8 Crypto Extension assembly * code. */ #undef USE_ARM_CE #ifdef ENABLE_ARM_CRYPTO_SUPPORT # if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) \ && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) \ && defined(HAVE_GCC_INLINE_ASM_AARCH32_CRYPTO) # define USE_ARM_CE 1 # elif defined(__AARCH64EL__) \ && defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) \ && defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO) # define USE_ARM_CE 1 # endif #endif typedef struct { gcry_md_block_ctx_t bctx; u32 h0,h1,h2,h3,h4,h5,h6,h7; +} SHA256_CONTEXT; + + +/* Assembly implementations use SystemV ABI, ABI conversion and additional + * stack to store XMM6-XMM15 needed on Win64. 
*/ +#undef ASM_FUNC_ABI +#undef ASM_EXTRA_STACK +#if defined(USE_SSSE3) || defined(USE_AVX) || defined(USE_AVX2) || \ + defined(USE_SHAEXT) +# ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS +# define ASM_FUNC_ABI __attribute__((sysv_abi)) +# define ASM_EXTRA_STACK (10 * 16 + sizeof(void *) * 4) +# else +# define ASM_FUNC_ABI +# define ASM_EXTRA_STACK 0 +# endif +#endif + + #ifdef USE_SSSE3 - unsigned int use_ssse3:1; +unsigned int _gcry_sha256_transform_amd64_ssse3(const void *input_data, + u32 state[8], + size_t num_blks) ASM_FUNC_ABI; + +static unsigned int +do_sha256_transform_amd64_ssse3(void *ctx, const unsigned char *data, + size_t nblks) +{ + SHA256_CONTEXT *hd = ctx; + return _gcry_sha256_transform_amd64_ssse3 (data, &hd->h0, nblks) + + ASM_EXTRA_STACK; +} #endif + #ifdef USE_AVX - unsigned int use_avx:1; +unsigned int _gcry_sha256_transform_amd64_avx(const void *input_data, + u32 state[8], + size_t num_blks) ASM_FUNC_ABI; + +static unsigned int +do_sha256_transform_amd64_avx(void *ctx, const unsigned char *data, + size_t nblks) +{ + SHA256_CONTEXT *hd = ctx; + return _gcry_sha256_transform_amd64_avx (data, &hd->h0, nblks) + + ASM_EXTRA_STACK; +} #endif + #ifdef USE_AVX2 - unsigned int use_avx2:1; +unsigned int _gcry_sha256_transform_amd64_avx2(const void *input_data, + u32 state[8], + size_t num_blks) ASM_FUNC_ABI; + +static unsigned int +do_sha256_transform_amd64_avx2(void *ctx, const unsigned char *data, + size_t nblks) +{ + SHA256_CONTEXT *hd = ctx; + return _gcry_sha256_transform_amd64_avx2 (data, &hd->h0, nblks) + + ASM_EXTRA_STACK; +} #endif + #ifdef USE_SHAEXT - unsigned int use_shaext:1; +/* Does not need ASM_FUNC_ABI */ +unsigned int +_gcry_sha256_transform_intel_shaext(u32 state[8], + const unsigned char *input_data, + size_t num_blks); + +static unsigned int +do_sha256_transform_intel_shaext(void *ctx, const unsigned char *data, + size_t nblks) +{ + SHA256_CONTEXT *hd = ctx; + return _gcry_sha256_transform_intel_shaext (&hd->h0, data, nblks); +} #endif + #ifdef USE_ARM_CE - unsigned int use_arm_ce:1; +unsigned int _gcry_sha256_transform_armv8_ce(u32 state[8], + const void *input_data, + size_t num_blks); + +static unsigned int +do_sha256_transform_armv8_ce(void *ctx, const unsigned char *data, + size_t nblks) +{ + SHA256_CONTEXT *hd = ctx; + return _gcry_sha256_transform_armv8_ce (&hd->h0, data, nblks); +} #endif -} SHA256_CONTEXT; static unsigned int -transform (void *c, const unsigned char *data, size_t nblks); +do_transform_generic (void *ctx, const unsigned char *data, size_t nblks); static void sha256_init (void *context, unsigned int flags) { SHA256_CONTEXT *hd = context; unsigned int features = _gcry_get_hw_features (); (void)flags; hd->h0 = 0x6a09e667; hd->h1 = 0xbb67ae85; hd->h2 = 0x3c6ef372; hd->h3 = 0xa54ff53a; hd->h4 = 0x510e527f; hd->h5 = 0x9b05688c; hd->h6 = 0x1f83d9ab; hd->h7 = 0x5be0cd19; hd->bctx.nblocks = 0; hd->bctx.nblocks_high = 0; hd->bctx.count = 0; hd->bctx.blocksize = 64; - hd->bctx.bwrite = transform; + /* Order of feature checks is important here; last match will be + * selected. Keep slower implementations at the top and faster at + * the bottom. */ + hd->bctx.bwrite = do_transform_generic; #ifdef USE_SSSE3 - hd->use_ssse3 = (features & HWF_INTEL_SSSE3) != 0; + if ((features & HWF_INTEL_SSSE3) != 0) + hd->bctx.bwrite = do_sha256_transform_amd64_ssse3; #endif #ifdef USE_AVX /* AVX implementation uses SHLD which is known to be slow on non-Intel CPUs. * Therefore use this implementation on Intel CPUs only. 
*/ - hd->use_avx = (features & HWF_INTEL_AVX) && (features & HWF_INTEL_FAST_SHLD); + if ((features & HWF_INTEL_AVX) && (features & HWF_INTEL_FAST_SHLD)) + hd->bctx.bwrite = do_sha256_transform_amd64_avx; #endif #ifdef USE_AVX2 - hd->use_avx2 = (features & HWF_INTEL_AVX2) && (features & HWF_INTEL_BMI2); + if ((features & HWF_INTEL_AVX2) && (features & HWF_INTEL_BMI2)) + hd->bctx.bwrite = do_sha256_transform_amd64_avx2; #endif #ifdef USE_SHAEXT - hd->use_shaext = (features & HWF_INTEL_SHAEXT) - && (features & HWF_INTEL_SSE4_1); + if ((features & HWF_INTEL_SHAEXT) && (features & HWF_INTEL_SSE4_1)) + hd->bctx.bwrite = do_sha256_transform_intel_shaext; #endif #ifdef USE_ARM_CE - hd->use_arm_ce = (features & HWF_ARM_SHA2) != 0; + if ((features & HWF_ARM_SHA2) != 0) + hd->bctx.bwrite = do_sha256_transform_armv8_ce; #endif (void)features; } static void sha224_init (void *context, unsigned int flags) { SHA256_CONTEXT *hd = context; unsigned int features = _gcry_get_hw_features (); (void)flags; hd->h0 = 0xc1059ed8; hd->h1 = 0x367cd507; hd->h2 = 0x3070dd17; hd->h3 = 0xf70e5939; hd->h4 = 0xffc00b31; hd->h5 = 0x68581511; hd->h6 = 0x64f98fa7; hd->h7 = 0xbefa4fa4; hd->bctx.nblocks = 0; hd->bctx.nblocks_high = 0; hd->bctx.count = 0; hd->bctx.blocksize = 64; - hd->bctx.bwrite = transform; + /* Order of feature checks is important here; last match will be + * selected. Keep slower implementations at the top and faster at + * the bottom. */ + hd->bctx.bwrite = do_transform_generic; #ifdef USE_SSSE3 - hd->use_ssse3 = (features & HWF_INTEL_SSSE3) != 0; + if ((features & HWF_INTEL_SSSE3) != 0) + hd->bctx.bwrite = do_sha256_transform_amd64_ssse3; #endif #ifdef USE_AVX /* AVX implementation uses SHLD which is known to be slow on non-Intel CPUs. * Therefore use this implementation on Intel CPUs only. */ - hd->use_avx = (features & HWF_INTEL_AVX) && (features & HWF_INTEL_FAST_SHLD); + if ((features & HWF_INTEL_AVX) && (features & HWF_INTEL_FAST_SHLD)) + hd->bctx.bwrite = do_sha256_transform_amd64_avx; #endif #ifdef USE_AVX2 - hd->use_avx2 = (features & HWF_INTEL_AVX2) && (features & HWF_INTEL_BMI2); + if ((features & HWF_INTEL_AVX2) && (features & HWF_INTEL_BMI2)) + hd->bctx.bwrite = do_sha256_transform_amd64_avx2; #endif #ifdef USE_SHAEXT - hd->use_shaext = (features & HWF_INTEL_SHAEXT) - && (features & HWF_INTEL_SSE4_1); + if ((features & HWF_INTEL_SHAEXT) && (features & HWF_INTEL_SSE4_1)) + hd->bctx.bwrite = do_sha256_transform_intel_shaext; #endif #ifdef USE_ARM_CE - hd->use_arm_ce = (features & HWF_ARM_SHA2) != 0; + if ((features & HWF_ARM_SHA2) != 0) + hd->bctx.bwrite = do_sha256_transform_armv8_ce; #endif (void)features; } /* Transform the message X which consists of 16 32-bit-words. See FIPS 180-2 for details. */ #define R(a,b,c,d,e,f,g,h,k,w) do \ { \ t1 = (h) + Sum1((e)) + Cho((e),(f),(g)) + (k) + (w); \ t2 = Sum0((a)) + Maj((a),(b),(c)); \ d += t1; \ h = t1 + t2; \ } while (0) /* (4.2) same as SHA-1's F1. 
*/ #define Cho(x, y, z) (z ^ (x & (y ^ z))) /* (4.3) same as SHA-1's F3 */ #define Maj(x, y, z) ((x & y) + (z & (x ^ y))) /* (4.4) */ #define Sum0(x) (ror (x, 2) ^ ror (x, 13) ^ ror (x, 22)) /* (4.5) */ #define Sum1(x) (ror (x, 6) ^ ror (x, 11) ^ ror (x, 25)) /* Message expansion */ #define S0(x) (ror ((x), 7) ^ ror ((x), 18) ^ ((x) >> 3)) /* (4.6) */ #define S1(x) (ror ((x), 17) ^ ror ((x), 19) ^ ((x) >> 10)) /* (4.7) */ #define I(i) ( w[i] = buf_get_be32(data + i * 4) ) #define W(i) ( w[i&0x0f] = S1(w[(i-2) &0x0f]) \ + w[(i-7) &0x0f] \ + S0(w[(i-15)&0x0f]) \ + w[(i-16)&0x0f] ) static unsigned int -transform_blk (void *ctx, const unsigned char *data) +do_transform_generic (void *ctx, const unsigned char *data, size_t nblks) { SHA256_CONTEXT *hd = ctx; static const u32 K[64] = { 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 }; - u32 a,b,c,d,e,f,g,h,t1,t2; - u32 w[16]; - - a = hd->h0; - b = hd->h1; - c = hd->h2; - d = hd->h3; - e = hd->h4; - f = hd->h5; - g = hd->h6; - h = hd->h7; - - R(a, b, c, d, e, f, g, h, K[0], I(0)); - R(h, a, b, c, d, e, f, g, K[1], I(1)); - R(g, h, a, b, c, d, e, f, K[2], I(2)); - R(f, g, h, a, b, c, d, e, K[3], I(3)); - R(e, f, g, h, a, b, c, d, K[4], I(4)); - R(d, e, f, g, h, a, b, c, K[5], I(5)); - R(c, d, e, f, g, h, a, b, K[6], I(6)); - R(b, c, d, e, f, g, h, a, K[7], I(7)); - R(a, b, c, d, e, f, g, h, K[8], I(8)); - R(h, a, b, c, d, e, f, g, K[9], I(9)); - R(g, h, a, b, c, d, e, f, K[10], I(10)); - R(f, g, h, a, b, c, d, e, K[11], I(11)); - R(e, f, g, h, a, b, c, d, K[12], I(12)); - R(d, e, f, g, h, a, b, c, K[13], I(13)); - R(c, d, e, f, g, h, a, b, K[14], I(14)); - R(b, c, d, e, f, g, h, a, K[15], I(15)); - - R(a, b, c, d, e, f, g, h, K[16], W(16)); - R(h, a, b, c, d, e, f, g, K[17], W(17)); - R(g, h, a, b, c, d, e, f, K[18], W(18)); - R(f, g, h, a, b, c, d, e, K[19], W(19)); - R(e, f, g, h, a, b, c, d, K[20], W(20)); - R(d, e, f, g, h, a, b, c, K[21], W(21)); - R(c, d, e, f, g, h, a, b, K[22], W(22)); - R(b, c, d, e, f, g, h, a, K[23], W(23)); - R(a, b, c, d, e, f, g, h, K[24], W(24)); - R(h, a, b, c, d, e, f, g, K[25], W(25)); - R(g, h, a, b, c, d, e, f, K[26], W(26)); - R(f, g, h, a, b, c, d, e, K[27], W(27)); - R(e, f, g, h, a, b, c, d, K[28], W(28)); - R(d, e, f, g, h, a, b, c, K[29], W(29)); - R(c, d, e, f, g, h, a, b, K[30], W(30)); - R(b, c, d, e, f, g, h, a, K[31], W(31)); - - R(a, b, c, d, e, f, g, h, K[32], W(32)); - R(h, a, b, c, d, e, f, g, K[33], W(33)); - R(g, h, a, b, c, d, e, f, K[34], W(34)); - R(f, g, h, a, b, c, d, e, K[35], W(35)); - R(e, f, g, h, a, b, c, d, K[36], W(36)); - R(d, e, f, g, h, a, b, c, K[37], W(37)); - R(c, d, e, f, g, h, a, b, K[38], W(38)); - R(b, c, d, e, f, g, h, a, K[39], W(39)); - R(a, b, c, d, e, f, g, h, K[40], W(40)); - R(h, a, b, c, d, e, f, g, K[41], W(41)); - R(g, h, a, b, c, d, e, f, K[42], W(42)); - 
R(f, g, h, a, b, c, d, e, K[43], W(43)); - R(e, f, g, h, a, b, c, d, K[44], W(44)); - R(d, e, f, g, h, a, b, c, K[45], W(45)); - R(c, d, e, f, g, h, a, b, K[46], W(46)); - R(b, c, d, e, f, g, h, a, K[47], W(47)); - - R(a, b, c, d, e, f, g, h, K[48], W(48)); - R(h, a, b, c, d, e, f, g, K[49], W(49)); - R(g, h, a, b, c, d, e, f, K[50], W(50)); - R(f, g, h, a, b, c, d, e, K[51], W(51)); - R(e, f, g, h, a, b, c, d, K[52], W(52)); - R(d, e, f, g, h, a, b, c, K[53], W(53)); - R(c, d, e, f, g, h, a, b, K[54], W(54)); - R(b, c, d, e, f, g, h, a, K[55], W(55)); - R(a, b, c, d, e, f, g, h, K[56], W(56)); - R(h, a, b, c, d, e, f, g, K[57], W(57)); - R(g, h, a, b, c, d, e, f, K[58], W(58)); - R(f, g, h, a, b, c, d, e, K[59], W(59)); - R(e, f, g, h, a, b, c, d, K[60], W(60)); - R(d, e, f, g, h, a, b, c, K[61], W(61)); - R(c, d, e, f, g, h, a, b, K[62], W(62)); - R(b, c, d, e, f, g, h, a, K[63], W(63)); - - hd->h0 += a; - hd->h1 += b; - hd->h2 += c; - hd->h3 += d; - hd->h4 += e; - hd->h5 += f; - hd->h6 += g; - hd->h7 += h; - - return /*burn_stack*/ 26*4+32; -} -#undef S0 -#undef S1 -#undef R - - -/* Assembly implementations use SystemV ABI, ABI conversion and additional - * stack to store XMM6-XMM15 needed on Win64. */ -#undef ASM_FUNC_ABI -#undef ASM_EXTRA_STACK -#if defined(USE_SSSE3) || defined(USE_AVX) || defined(USE_AVX2) || \ - defined(USE_SHAEXT) -# ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS -# define ASM_FUNC_ABI __attribute__((sysv_abi)) -# define ASM_EXTRA_STACK (10 * 16) -# else -# define ASM_FUNC_ABI -# define ASM_EXTRA_STACK 0 -# endif -#endif - - -#ifdef USE_SSSE3 -unsigned int _gcry_sha256_transform_amd64_ssse3(const void *input_data, - u32 state[8], - size_t num_blks) ASM_FUNC_ABI; -#endif - -#ifdef USE_AVX -unsigned int _gcry_sha256_transform_amd64_avx(const void *input_data, - u32 state[8], - size_t num_blks) ASM_FUNC_ABI; -#endif - -#ifdef USE_AVX2 -unsigned int _gcry_sha256_transform_amd64_avx2(const void *input_data, - u32 state[8], - size_t num_blks) ASM_FUNC_ABI; -#endif - -#ifdef USE_SHAEXT -/* Does not need ASM_FUNC_ABI */ -unsigned int -_gcry_sha256_transform_intel_shaext(u32 state[8], - const unsigned char *input_data, - size_t num_blks); -#endif - -#ifdef USE_ARM_CE -unsigned int _gcry_sha256_transform_armv8_ce(u32 state[8], - const void *input_data, - size_t num_blks); -#endif - -static unsigned int -transform (void *ctx, const unsigned char *data, size_t nblks) -{ - SHA256_CONTEXT *hd = ctx; - unsigned int burn; - -#ifdef USE_SHAEXT - if (hd->use_shaext) - { - burn = _gcry_sha256_transform_intel_shaext (&hd->h0, data, nblks); - burn += burn ? 4 * sizeof(void*) + ASM_EXTRA_STACK : 0; - return burn; - } -#endif - -#ifdef USE_AVX2 - if (hd->use_avx2) - { - burn = _gcry_sha256_transform_amd64_avx2 (data, &hd->h0, nblks); - burn += burn ? 4 * sizeof(void*) + ASM_EXTRA_STACK : 0; - return burn; - } -#endif - -#ifdef USE_AVX - if (hd->use_avx) - { - burn = _gcry_sha256_transform_amd64_avx (data, &hd->h0, nblks); - burn += burn ? 4 * sizeof(void*) + ASM_EXTRA_STACK : 0; - return burn; - } -#endif - -#ifdef USE_SSSE3 - if (hd->use_ssse3) + do { - burn = _gcry_sha256_transform_amd64_ssse3 (data, &hd->h0, nblks); - burn += burn ? 4 * sizeof(void*) + ASM_EXTRA_STACK : 0; - return burn; - } -#endif -#ifdef USE_ARM_CE - if (hd->use_arm_ce) - { - burn = _gcry_sha256_transform_armv8_ce (&hd->h0, data, nblks); - burn += burn ? 
4 * sizeof(void*) : 0; - return burn; - } -#endif + u32 a,b,c,d,e,f,g,h,t1,t2; + u32 w[16]; + + a = hd->h0; + b = hd->h1; + c = hd->h2; + d = hd->h3; + e = hd->h4; + f = hd->h5; + g = hd->h6; + h = hd->h7; + + R(a, b, c, d, e, f, g, h, K[0], I(0)); + R(h, a, b, c, d, e, f, g, K[1], I(1)); + R(g, h, a, b, c, d, e, f, K[2], I(2)); + R(f, g, h, a, b, c, d, e, K[3], I(3)); + R(e, f, g, h, a, b, c, d, K[4], I(4)); + R(d, e, f, g, h, a, b, c, K[5], I(5)); + R(c, d, e, f, g, h, a, b, K[6], I(6)); + R(b, c, d, e, f, g, h, a, K[7], I(7)); + R(a, b, c, d, e, f, g, h, K[8], I(8)); + R(h, a, b, c, d, e, f, g, K[9], I(9)); + R(g, h, a, b, c, d, e, f, K[10], I(10)); + R(f, g, h, a, b, c, d, e, K[11], I(11)); + R(e, f, g, h, a, b, c, d, K[12], I(12)); + R(d, e, f, g, h, a, b, c, K[13], I(13)); + R(c, d, e, f, g, h, a, b, K[14], I(14)); + R(b, c, d, e, f, g, h, a, K[15], I(15)); + + R(a, b, c, d, e, f, g, h, K[16], W(16)); + R(h, a, b, c, d, e, f, g, K[17], W(17)); + R(g, h, a, b, c, d, e, f, K[18], W(18)); + R(f, g, h, a, b, c, d, e, K[19], W(19)); + R(e, f, g, h, a, b, c, d, K[20], W(20)); + R(d, e, f, g, h, a, b, c, K[21], W(21)); + R(c, d, e, f, g, h, a, b, K[22], W(22)); + R(b, c, d, e, f, g, h, a, K[23], W(23)); + R(a, b, c, d, e, f, g, h, K[24], W(24)); + R(h, a, b, c, d, e, f, g, K[25], W(25)); + R(g, h, a, b, c, d, e, f, K[26], W(26)); + R(f, g, h, a, b, c, d, e, K[27], W(27)); + R(e, f, g, h, a, b, c, d, K[28], W(28)); + R(d, e, f, g, h, a, b, c, K[29], W(29)); + R(c, d, e, f, g, h, a, b, K[30], W(30)); + R(b, c, d, e, f, g, h, a, K[31], W(31)); + + R(a, b, c, d, e, f, g, h, K[32], W(32)); + R(h, a, b, c, d, e, f, g, K[33], W(33)); + R(g, h, a, b, c, d, e, f, K[34], W(34)); + R(f, g, h, a, b, c, d, e, K[35], W(35)); + R(e, f, g, h, a, b, c, d, K[36], W(36)); + R(d, e, f, g, h, a, b, c, K[37], W(37)); + R(c, d, e, f, g, h, a, b, K[38], W(38)); + R(b, c, d, e, f, g, h, a, K[39], W(39)); + R(a, b, c, d, e, f, g, h, K[40], W(40)); + R(h, a, b, c, d, e, f, g, K[41], W(41)); + R(g, h, a, b, c, d, e, f, K[42], W(42)); + R(f, g, h, a, b, c, d, e, K[43], W(43)); + R(e, f, g, h, a, b, c, d, K[44], W(44)); + R(d, e, f, g, h, a, b, c, K[45], W(45)); + R(c, d, e, f, g, h, a, b, K[46], W(46)); + R(b, c, d, e, f, g, h, a, K[47], W(47)); + + R(a, b, c, d, e, f, g, h, K[48], W(48)); + R(h, a, b, c, d, e, f, g, K[49], W(49)); + R(g, h, a, b, c, d, e, f, K[50], W(50)); + R(f, g, h, a, b, c, d, e, K[51], W(51)); + R(e, f, g, h, a, b, c, d, K[52], W(52)); + R(d, e, f, g, h, a, b, c, K[53], W(53)); + R(c, d, e, f, g, h, a, b, K[54], W(54)); + R(b, c, d, e, f, g, h, a, K[55], W(55)); + R(a, b, c, d, e, f, g, h, K[56], W(56)); + R(h, a, b, c, d, e, f, g, K[57], W(57)); + R(g, h, a, b, c, d, e, f, K[58], W(58)); + R(f, g, h, a, b, c, d, e, K[59], W(59)); + R(e, f, g, h, a, b, c, d, K[60], W(60)); + R(d, e, f, g, h, a, b, c, K[61], W(61)); + R(c, d, e, f, g, h, a, b, K[62], W(62)); + R(b, c, d, e, f, g, h, a, K[63], W(63)); + + hd->h0 += a; + hd->h1 += b; + hd->h2 += c; + hd->h3 += d; + hd->h4 += e; + hd->h5 += f; + hd->h6 += g; + hd->h7 += h; - do - { - burn = transform_blk (hd, data); data += 64; } while (--nblks); -#ifdef ASM_EXTRA_STACK - /* 'transform_blk' is typically inlined and XMM6-XMM15 are stored at - * the prologue of this function. Therefore need to add ASM_EXTRA_STACK to - * here too. - */ - burn += ASM_EXTRA_STACK; -#endif - - return burn; + return 26*4 + 32 + 3 * sizeof(void*); } +#undef S0 +#undef S1 +#undef R + /* The routine finally terminates the computation and returns the digest. 
The handle is prepared for a new cycle, but adding bytes to the handle will the destroy the returned buffer. Returns: 32 bytes with the message the digest. */ static void sha256_final(void *context) { SHA256_CONTEXT *hd = context; u32 t, th, msb, lsb; byte *p; unsigned int burn; _gcry_md_block_write (hd, NULL, 0); /* flush */; t = hd->bctx.nblocks; if (sizeof t == sizeof hd->bctx.nblocks) th = hd->bctx.nblocks_high; else th = hd->bctx.nblocks >> 32; /* multiply by 64 to make a byte count */ lsb = t << 6; msb = (th << 6) | (t >> 26); /* add the count */ t = lsb; if ((lsb += hd->bctx.count) < t) msb++; /* multiply by 8 to make a bit count */ t = lsb; lsb <<= 3; msb <<= 3; msb |= t >> 29; if (hd->bctx.count < 56) { /* enough room */ hd->bctx.buf[hd->bctx.count++] = 0x80; /* pad */ while (hd->bctx.count < 56) hd->bctx.buf[hd->bctx.count++] = 0; /* pad */ } else { /* need one extra block */ hd->bctx.buf[hd->bctx.count++] = 0x80; /* pad character */ while (hd->bctx.count < 64) hd->bctx.buf[hd->bctx.count++] = 0; _gcry_md_block_write (hd, NULL, 0); /* flush */; memset (hd->bctx.buf, 0, 56 ); /* fill next block with zeroes */ } /* append the 64 bit count */ buf_put_be32(hd->bctx.buf + 56, msb); buf_put_be32(hd->bctx.buf + 60, lsb); - burn = transform (hd, hd->bctx.buf, 1); + burn = (*hd->bctx.bwrite) (hd, hd->bctx.buf, 1); _gcry_burn_stack (burn); p = hd->bctx.buf; #define X(a) do { buf_put_be32(p, hd->h##a); p += 4; } while(0) X(0); X(1); X(2); X(3); X(4); X(5); X(6); X(7); #undef X } static byte * sha256_read (void *context) { SHA256_CONTEXT *hd = context; return hd->bctx.buf; } /* Shortcut functions which puts the hash value of the supplied buffer * into outbuf which must have a size of 32 bytes. */ void _gcry_sha256_hash_buffer (void *outbuf, const void *buffer, size_t length) { SHA256_CONTEXT hd; sha256_init (&hd, 0); _gcry_md_block_write (&hd, buffer, length); sha256_final (&hd); memcpy (outbuf, hd.bctx.buf, 32); } /* Variant of the above shortcut function using multiple buffers. */ void _gcry_sha256_hash_buffers (void *outbuf, const gcry_buffer_t *iov, int iovcnt) { SHA256_CONTEXT hd; sha256_init (&hd, 0); for (;iovcnt > 0; iov++, iovcnt--) _gcry_md_block_write (&hd, (const char*)iov[0].data + iov[0].off, iov[0].len); sha256_final (&hd); memcpy (outbuf, hd.bctx.buf, 32); } /* Shortcut functions which puts the hash value of the supplied buffer * into outbuf which must have a size of 28 bytes. */ static void _gcry_sha224_hash_buffer (void *outbuf, const void *buffer, size_t length) { SHA256_CONTEXT hd; sha224_init (&hd, 0); _gcry_md_block_write (&hd, buffer, length); sha256_final (&hd); memcpy (outbuf, hd.bctx.buf, 28); } /* Variant of the above shortcut function using multiple buffers. */ static void _gcry_sha224_hash_buffers (void *outbuf, const gcry_buffer_t *iov, int iovcnt) { SHA256_CONTEXT hd; sha224_init (&hd, 0); for (;iovcnt > 0; iov++, iovcnt--) _gcry_md_block_write (&hd, (const char*)iov[0].data + iov[0].off, iov[0].len); sha256_final (&hd); memcpy (outbuf, hd.bctx.buf, 28); } /* Self-test section. 
*/ static gpg_err_code_t selftests_sha224 (int extended, selftest_report_func_t report) { const char *what; const char *errtxt; what = "short string"; errtxt = _gcry_hash_selftest_check_one (GCRY_MD_SHA224, 0, "abc", 3, "\x23\x09\x7d\x22\x34\x05\xd8\x22\x86\x42\xa4\x77\xbd\xa2\x55\xb3" "\x2a\xad\xbc\xe4\xbd\xa0\xb3\xf7\xe3\x6c\x9d\xa7", 28); if (errtxt) goto failed; if (extended) { what = "long string"; errtxt = _gcry_hash_selftest_check_one (GCRY_MD_SHA224, 0, "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq", 56, "\x75\x38\x8b\x16\x51\x27\x76\xcc\x5d\xba\x5d\xa1\xfd\x89\x01\x50" "\xb0\xc6\x45\x5c\xb4\xf5\x8b\x19\x52\x52\x25\x25", 28); if (errtxt) goto failed; what = "one million \"a\""; errtxt = _gcry_hash_selftest_check_one (GCRY_MD_SHA224, 1, NULL, 0, "\x20\x79\x46\x55\x98\x0c\x91\xd8\xbb\xb4\xc1\xea\x97\x61\x8a\x4b" "\xf0\x3f\x42\x58\x19\x48\xb2\xee\x4e\xe7\xad\x67", 28); if (errtxt) goto failed; } return 0; /* Succeeded. */ failed: if (report) report ("digest", GCRY_MD_SHA224, what, errtxt); return GPG_ERR_SELFTEST_FAILED; } static gpg_err_code_t selftests_sha256 (int extended, selftest_report_func_t report) { const char *what; const char *errtxt; what = "short string"; errtxt = _gcry_hash_selftest_check_one (GCRY_MD_SHA256, 0, "abc", 3, "\xba\x78\x16\xbf\x8f\x01\xcf\xea\x41\x41\x40\xde\x5d\xae\x22\x23" "\xb0\x03\x61\xa3\x96\x17\x7a\x9c\xb4\x10\xff\x61\xf2\x00\x15\xad", 32); if (errtxt) goto failed; if (extended) { what = "long string"; errtxt = _gcry_hash_selftest_check_one (GCRY_MD_SHA256, 0, "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq", 56, "\x24\x8d\x6a\x61\xd2\x06\x38\xb8\xe5\xc0\x26\x93\x0c\x3e\x60\x39" "\xa3\x3c\xe4\x59\x64\xff\x21\x67\xf6\xec\xed\xd4\x19\xdb\x06\xc1", 32); if (errtxt) goto failed; what = "one million \"a\""; errtxt = _gcry_hash_selftest_check_one (GCRY_MD_SHA256, 1, NULL, 0, "\xcd\xc7\x6e\x5c\x99\x14\xfb\x92\x81\xa1\xc7\xe2\x84\xd7\x3e\x67" "\xf1\x80\x9a\x48\xa4\x97\x20\x0e\x04\x6d\x39\xcc\xc7\x11\x2c\xd0", 32); if (errtxt) goto failed; } return 0; /* Succeeded. */ failed: if (report) report ("digest", GCRY_MD_SHA256, what, errtxt); return GPG_ERR_SELFTEST_FAILED; } /* Run a full self-test for ALGO and return 0 on success. 
*/ static gpg_err_code_t run_selftests (int algo, int extended, selftest_report_func_t report) { gpg_err_code_t ec; switch (algo) { case GCRY_MD_SHA224: ec = selftests_sha224 (extended, report); break; case GCRY_MD_SHA256: ec = selftests_sha256 (extended, report); break; default: ec = GPG_ERR_DIGEST_ALGO; break; } return ec; } static byte asn224[19] = /* Object ID is 2.16.840.1.101.3.4.2.4 */ { 0x30, 0x2D, 0x30, 0x0d, 0x06, 0x09, 0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x04, 0x05, 0x00, 0x04, 0x1C }; static gcry_md_oid_spec_t oid_spec_sha224[] = { /* From RFC3874, Section 4 */ { "2.16.840.1.101.3.4.2.4" }, { NULL }, }; static byte asn256[19] = /* Object ID is 2.16.840.1.101.3.4.2.1 */ { 0x30, 0x31, 0x30, 0x0d, 0x06, 0x09, 0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x01, 0x05, 0x00, 0x04, 0x20 }; static gcry_md_oid_spec_t oid_spec_sha256[] = { /* According to the OpenPGP draft rfc2440-bis06 */ { "2.16.840.1.101.3.4.2.1" }, /* PKCS#1 sha256WithRSAEncryption */ { "1.2.840.113549.1.1.11" }, { NULL }, }; gcry_md_spec_t _gcry_digest_spec_sha224 = { GCRY_MD_SHA224, {0, 1}, "SHA224", asn224, DIM (asn224), oid_spec_sha224, 28, sha224_init, _gcry_md_block_write, sha256_final, sha256_read, NULL, _gcry_sha224_hash_buffer, _gcry_sha224_hash_buffers, sizeof (SHA256_CONTEXT), run_selftests }; gcry_md_spec_t _gcry_digest_spec_sha256 = { GCRY_MD_SHA256, {0, 1}, "SHA256", asn256, DIM (asn256), oid_spec_sha256, 32, sha256_init, _gcry_md_block_write, sha256_final, sha256_read, NULL, _gcry_sha256_hash_buffer, _gcry_sha256_hash_buffers, sizeof (SHA256_CONTEXT), run_selftests }; diff --git a/cipher/sha512-armv7-neon.S b/cipher/sha512-armv7-neon.S index a9d12724..6596f2cd 100644 --- a/cipher/sha512-armv7-neon.S +++ b/cipher/sha512-armv7-neon.S @@ -1,449 +1,450 @@ /* sha512-armv7-neon.S - ARM/NEON assembly implementation of SHA-512 transform * * Copyright (C) 2013 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . 
*/ #include #if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) && \ defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \ defined(HAVE_GCC_INLINE_ASM_NEON) .text .syntax unified .fpu neon .arm /* structure of SHA512_CONTEXT */ #define hd_a 0 #define hd_b ((hd_a) + 8) #define hd_c ((hd_b) + 8) #define hd_d ((hd_c) + 8) #define hd_e ((hd_d) + 8) #define hd_f ((hd_e) + 8) #define hd_g ((hd_f) + 8) /* register macros */ #define RK %r2 #define RA d0 #define RB d1 #define RC d2 #define RD d3 #define RE d4 #define RF d5 #define RG d6 #define RH d7 #define RT0 d8 #define RT1 d9 #define RT2 d10 #define RT3 d11 #define RT4 d12 #define RT5 d13 #define RT6 d14 #define RT7 d15 #define RT01q q4 #define RT23q q5 #define RT45q q6 #define RT67q q7 #define RW0 d16 #define RW1 d17 #define RW2 d18 #define RW3 d19 #define RW4 d20 #define RW5 d21 #define RW6 d22 #define RW7 d23 #define RW8 d24 #define RW9 d25 #define RW10 d26 #define RW11 d27 #define RW12 d28 #define RW13 d29 #define RW14 d30 #define RW15 d31 #define RW01q q8 #define RW23q q9 #define RW45q q10 #define RW67q q11 #define RW89q q12 #define RW1011q q13 #define RW1213q q14 #define RW1415q q15 /*********************************************************************** * ARM assembly implementation of sha512 transform ***********************************************************************/ #define rounds2_0_63(ra, rb, rc, rd, re, rf, rg, rh, rw0, rw1, rw01q, rw2, rw23q, rw1415q, rw9, rw10, interleave_op, arg1) \ /* t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[t]; */ \ vshr.u64 RT2, re, #14; \ vshl.u64 RT3, re, #64 - 14; \ interleave_op(arg1); \ vshr.u64 RT4, re, #18; \ vshl.u64 RT5, re, #64 - 18; \ vld1.64 {RT0}, [RK]!; \ veor.64 RT23q, RT23q, RT45q; \ vshr.u64 RT4, re, #41; \ vshl.u64 RT5, re, #64 - 41; \ vadd.u64 RT0, RT0, rw0; \ veor.64 RT23q, RT23q, RT45q; \ vmov.64 RT7, re; \ veor.64 RT1, RT2, RT3; \ vbsl.64 RT7, rf, rg; \ \ vadd.u64 RT1, RT1, rh; \ vshr.u64 RT2, ra, #28; \ vshl.u64 RT3, ra, #64 - 28; \ vadd.u64 RT1, RT1, RT0; \ vshr.u64 RT4, ra, #34; \ vshl.u64 RT5, ra, #64 - 34; \ vadd.u64 RT1, RT1, RT7; \ \ /* h = Sum0 (a) + Maj (a, b, c); */ \ veor.64 RT23q, RT23q, RT45q; \ vshr.u64 RT4, ra, #39; \ vshl.u64 RT5, ra, #64 - 39; \ veor.64 RT0, ra, rb; \ veor.64 RT23q, RT23q, RT45q; \ vbsl.64 RT0, rc, rb; \ vadd.u64 rd, rd, RT1; /* d+=t1; */ \ veor.64 rh, RT2, RT3; \ \ /* t1 = g + Sum1 (d) + Ch (d, e, f) + k[t] + w[t]; */ \ vshr.u64 RT2, rd, #14; \ vshl.u64 RT3, rd, #64 - 14; \ vadd.u64 rh, rh, RT0; \ vshr.u64 RT4, rd, #18; \ vshl.u64 RT5, rd, #64 - 18; \ vadd.u64 rh, rh, RT1; /* h+=t1; */ \ vld1.64 {RT0}, [RK]!; \ veor.64 RT23q, RT23q, RT45q; \ vshr.u64 RT4, rd, #41; \ vshl.u64 RT5, rd, #64 - 41; \ vadd.u64 RT0, RT0, rw1; \ veor.64 RT23q, RT23q, RT45q; \ vmov.64 RT7, rd; \ veor.64 RT1, RT2, RT3; \ vbsl.64 RT7, re, rf; \ \ vadd.u64 RT1, RT1, rg; \ vshr.u64 RT2, rh, #28; \ vshl.u64 RT3, rh, #64 - 28; \ vadd.u64 RT1, RT1, RT0; \ vshr.u64 RT4, rh, #34; \ vshl.u64 RT5, rh, #64 - 34; \ vadd.u64 RT1, RT1, RT7; \ \ /* g = Sum0 (h) + Maj (h, a, b); */ \ veor.64 RT23q, RT23q, RT45q; \ vshr.u64 RT4, rh, #39; \ vshl.u64 RT5, rh, #64 - 39; \ veor.64 RT0, rh, ra; \ veor.64 RT23q, RT23q, RT45q; \ vbsl.64 RT0, rb, ra; \ vadd.u64 rc, rc, RT1; /* c+=t1; */ \ veor.64 rg, RT2, RT3; \ \ /* w[0] += S1 (w[14]) + w[9] + S0 (w[1]); */ \ /* w[1] += S1 (w[15]) + w[10] + S0 (w[2]); */ \ \ /**** S0(w[1:2]) */ \ \ /* w[0:1] += w[9:10] */ \ /* RT23q = rw1:rw2 */ \ vext.u64 RT23q, rw01q, rw23q, #1; \ vadd.u64 rw0, rw9; \ vadd.u64 rg, rg, RT0; \ vadd.u64 rw1, rw10;\ vadd.u64 rg, rg, 
RT1; /* g+=t1; */ \ \ vshr.u64 RT45q, RT23q, #1; \ vshl.u64 RT67q, RT23q, #64 - 1; \ vshr.u64 RT01q, RT23q, #8; \ veor.u64 RT45q, RT45q, RT67q; \ vshl.u64 RT67q, RT23q, #64 - 8; \ veor.u64 RT45q, RT45q, RT01q; \ vshr.u64 RT01q, RT23q, #7; \ veor.u64 RT45q, RT45q, RT67q; \ \ /**** S1(w[14:15]) */ \ vshr.u64 RT23q, rw1415q, #6; \ veor.u64 RT01q, RT01q, RT45q; \ vshr.u64 RT45q, rw1415q, #19; \ vshl.u64 RT67q, rw1415q, #64 - 19; \ veor.u64 RT23q, RT23q, RT45q; \ vshr.u64 RT45q, rw1415q, #61; \ veor.u64 RT23q, RT23q, RT67q; \ vshl.u64 RT67q, rw1415q, #64 - 61; \ veor.u64 RT23q, RT23q, RT45q; \ vadd.u64 rw01q, RT01q; /* w[0:1] += S(w[1:2]) */ \ veor.u64 RT01q, RT23q, RT67q; #define vadd_RT01q(rw01q) \ /* w[0:1] += S(w[14:15]) */ \ vadd.u64 rw01q, RT01q; #define dummy(_) /*_*/ #define rounds2_64_79(ra, rb, rc, rd, re, rf, rg, rh, rw0, rw1, interleave_op1, arg1, interleave_op2, arg2) \ /* t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[t]; */ \ vshr.u64 RT2, re, #14; \ vshl.u64 RT3, re, #64 - 14; \ interleave_op1(arg1); \ vshr.u64 RT4, re, #18; \ vshl.u64 RT5, re, #64 - 18; \ interleave_op2(arg2); \ vld1.64 {RT0}, [RK]!; \ veor.64 RT23q, RT23q, RT45q; \ vshr.u64 RT4, re, #41; \ vshl.u64 RT5, re, #64 - 41; \ vadd.u64 RT0, RT0, rw0; \ veor.64 RT23q, RT23q, RT45q; \ vmov.64 RT7, re; \ veor.64 RT1, RT2, RT3; \ vbsl.64 RT7, rf, rg; \ \ vadd.u64 RT1, RT1, rh; \ vshr.u64 RT2, ra, #28; \ vshl.u64 RT3, ra, #64 - 28; \ vadd.u64 RT1, RT1, RT0; \ vshr.u64 RT4, ra, #34; \ vshl.u64 RT5, ra, #64 - 34; \ vadd.u64 RT1, RT1, RT7; \ \ /* h = Sum0 (a) + Maj (a, b, c); */ \ veor.64 RT23q, RT23q, RT45q; \ vshr.u64 RT4, ra, #39; \ vshl.u64 RT5, ra, #64 - 39; \ veor.64 RT0, ra, rb; \ veor.64 RT23q, RT23q, RT45q; \ vbsl.64 RT0, rc, rb; \ vadd.u64 rd, rd, RT1; /* d+=t1; */ \ veor.64 rh, RT2, RT3; \ \ /* t1 = g + Sum1 (d) + Ch (d, e, f) + k[t] + w[t]; */ \ vshr.u64 RT2, rd, #14; \ vshl.u64 RT3, rd, #64 - 14; \ vadd.u64 rh, rh, RT0; \ vshr.u64 RT4, rd, #18; \ vshl.u64 RT5, rd, #64 - 18; \ vadd.u64 rh, rh, RT1; /* h+=t1; */ \ vld1.64 {RT0}, [RK]!; \ veor.64 RT23q, RT23q, RT45q; \ vshr.u64 RT4, rd, #41; \ vshl.u64 RT5, rd, #64 - 41; \ vadd.u64 RT0, RT0, rw1; \ veor.64 RT23q, RT23q, RT45q; \ vmov.64 RT7, rd; \ veor.64 RT1, RT2, RT3; \ vbsl.64 RT7, re, rf; \ \ vadd.u64 RT1, RT1, rg; \ vshr.u64 RT2, rh, #28; \ vshl.u64 RT3, rh, #64 - 28; \ vadd.u64 RT1, RT1, RT0; \ vshr.u64 RT4, rh, #34; \ vshl.u64 RT5, rh, #64 - 34; \ vadd.u64 RT1, RT1, RT7; \ \ /* g = Sum0 (h) + Maj (h, a, b); */ \ veor.64 RT23q, RT23q, RT45q; \ vshr.u64 RT4, rh, #39; \ vshl.u64 RT5, rh, #64 - 39; \ veor.64 RT0, rh, ra; \ veor.64 RT23q, RT23q, RT45q; \ vbsl.64 RT0, rb, ra; \ vadd.u64 rc, rc, RT1; /* c+=t1; */ \ veor.64 rg, RT2, RT3; #define vadd_rg_RT0(rg) \ vadd.u64 rg, rg, RT0; #define vadd_rg_RT1(rg) \ vadd.u64 rg, rg, RT1; /* g+=t1; */ .align 3 .globl _gcry_sha512_transform_armv7_neon .type _gcry_sha512_transform_armv7_neon,%function; _gcry_sha512_transform_armv7_neon: /* Input: * %r0: SHA512_CONTEXT * %r1: data * %r2: u64 k[] constants * %r3: nblks */ push {%lr}; mov %lr, #0; /* Load context to d0-d7 */ vld1.64 {RA-RD}, [%r0]!; vld1.64 {RE-RH}, [%r0]; sub %r0, #(4*8); /* Load input to w[16], d16-d31 */ /* NOTE: Assumes that on ARMv7 unaligned accesses are always allowed. 
*/ vld1.64 {RW0-RW3}, [%r1]!; vld1.64 {RW4-RW7}, [%r1]!; vld1.64 {RW8-RW11}, [%r1]!; vld1.64 {RW12-RW15}, [%r1]!; #ifdef __ARMEL__ /* byteswap */ vrev64.8 RW01q, RW01q; vrev64.8 RW23q, RW23q; vrev64.8 RW45q, RW45q; vrev64.8 RW67q, RW67q; vrev64.8 RW89q, RW89q; vrev64.8 RW1011q, RW1011q; vrev64.8 RW1213q, RW1213q; vrev64.8 RW1415q, RW1415q; #endif /* EABI says that d8-d15 must be preserved by callee. */ vpush {RT0-RT7}; .Loop: rounds2_0_63(RA, RB, RC, RD, RE, RF, RG, RH, RW0, RW1, RW01q, RW2, RW23q, RW1415q, RW9, RW10, dummy, _); b .Lenter_rounds; .Loop_rounds: rounds2_0_63(RA, RB, RC, RD, RE, RF, RG, RH, RW0, RW1, RW01q, RW2, RW23q, RW1415q, RW9, RW10, vadd_RT01q, RW1415q); .Lenter_rounds: rounds2_0_63(RG, RH, RA, RB, RC, RD, RE, RF, RW2, RW3, RW23q, RW4, RW45q, RW01q, RW11, RW12, vadd_RT01q, RW01q); rounds2_0_63(RE, RF, RG, RH, RA, RB, RC, RD, RW4, RW5, RW45q, RW6, RW67q, RW23q, RW13, RW14, vadd_RT01q, RW23q); rounds2_0_63(RC, RD, RE, RF, RG, RH, RA, RB, RW6, RW7, RW67q, RW8, RW89q, RW45q, RW15, RW0, vadd_RT01q, RW45q); rounds2_0_63(RA, RB, RC, RD, RE, RF, RG, RH, RW8, RW9, RW89q, RW10, RW1011q, RW67q, RW1, RW2, vadd_RT01q, RW67q); rounds2_0_63(RG, RH, RA, RB, RC, RD, RE, RF, RW10, RW11, RW1011q, RW12, RW1213q, RW89q, RW3, RW4, vadd_RT01q, RW89q); add %lr, #16; rounds2_0_63(RE, RF, RG, RH, RA, RB, RC, RD, RW12, RW13, RW1213q, RW14, RW1415q, RW1011q, RW5, RW6, vadd_RT01q, RW1011q); cmp %lr, #64; rounds2_0_63(RC, RD, RE, RF, RG, RH, RA, RB, RW14, RW15, RW1415q, RW0, RW01q, RW1213q, RW7, RW8, vadd_RT01q, RW1213q); bne .Loop_rounds; subs %r3, #1; rounds2_64_79(RA, RB, RC, RD, RE, RF, RG, RH, RW0, RW1, vadd_RT01q, RW1415q, dummy, _); rounds2_64_79(RG, RH, RA, RB, RC, RD, RE, RF, RW2, RW3, vadd_rg_RT0, RG, vadd_rg_RT1, RG); beq .Lhandle_tail; vld1.64 {RW0-RW3}, [%r1]!; rounds2_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW4, RW5, vadd_rg_RT0, RE, vadd_rg_RT1, RE); rounds2_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW6, RW7, vadd_rg_RT0, RC, vadd_rg_RT1, RC); #ifdef __ARMEL__ vrev64.8 RW01q, RW01q; vrev64.8 RW23q, RW23q; #endif vld1.64 {RW4-RW7}, [%r1]!; rounds2_64_79(RA, RB, RC, RD, RE, RF, RG, RH, RW8, RW9, vadd_rg_RT0, RA, vadd_rg_RT1, RA); rounds2_64_79(RG, RH, RA, RB, RC, RD, RE, RF, RW10, RW11, vadd_rg_RT0, RG, vadd_rg_RT1, RG); #ifdef __ARMEL__ vrev64.8 RW45q, RW45q; vrev64.8 RW67q, RW67q; #endif vld1.64 {RW8-RW11}, [%r1]!; rounds2_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW12, RW13, vadd_rg_RT0, RE, vadd_rg_RT1, RE); rounds2_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW14, RW15, vadd_rg_RT0, RC, vadd_rg_RT1, RC); #ifdef __ARMEL__ vrev64.8 RW89q, RW89q; vrev64.8 RW1011q, RW1011q; #endif vld1.64 {RW12-RW15}, [%r1]!; vadd_rg_RT0(RA); vadd_rg_RT1(RA); /* Load context */ vld1.64 {RT0-RT3}, [%r0]!; vld1.64 {RT4-RT7}, [%r0]; sub %r0, #(4*8); #ifdef __ARMEL__ vrev64.8 RW1213q, RW1213q; vrev64.8 RW1415q, RW1415q; #endif vadd.u64 RA, RT0; vadd.u64 RB, RT1; vadd.u64 RC, RT2; vadd.u64 RD, RT3; vadd.u64 RE, RT4; vadd.u64 RF, RT5; vadd.u64 RG, RT6; vadd.u64 RH, RT7; /* Store the first half of context */ vst1.64 {RA-RD}, [%r0]!; sub RK, $(8*80); vst1.64 {RE-RH}, [%r0]; /* Store the last half of context */ mov %lr, #0; sub %r0, #(4*8); b .Loop; .ltorg .Lhandle_tail: rounds2_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW4, RW5, vadd_rg_RT0, RE, vadd_rg_RT1, RE); rounds2_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW6, RW7, vadd_rg_RT0, RC, vadd_rg_RT1, RC); rounds2_64_79(RA, RB, RC, RD, RE, RF, RG, RH, RW8, RW9, vadd_rg_RT0, RA, vadd_rg_RT1, RA); rounds2_64_79(RG, RH, RA, RB, RC, RD, RE, RF, RW10, RW11, vadd_rg_RT0, RG, vadd_rg_RT1, 
RG); rounds2_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW12, RW13, vadd_rg_RT0, RE, vadd_rg_RT1, RE); rounds2_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW14, RW15, vadd_rg_RT0, RC, vadd_rg_RT1, RC); /* Load context to d16-d23 */ vld1.64 {RW0-RW3}, [%r0]!; vadd_rg_RT0(RA); vld1.64 {RW4-RW7}, [%r0]; vadd_rg_RT1(RA); sub %r0, #(4*8); vadd.u64 RA, RW0; vadd.u64 RB, RW1; vadd.u64 RC, RW2; vadd.u64 RD, RW3; vadd.u64 RE, RW4; vadd.u64 RF, RW5; vadd.u64 RG, RW6; vadd.u64 RH, RW7; /* Store the first half of context */ vst1.64 {RA-RD}, [%r0]!; /* Clear used registers */ /* d16-d31 */ veor.u64 RW01q, RW01q; veor.u64 RW23q, RW23q; veor.u64 RW45q, RW45q; veor.u64 RW67q, RW67q; vst1.64 {RE-RH}, [%r0]; /* Store the last half of context */ veor.u64 RW89q, RW89q; veor.u64 RW1011q, RW1011q; veor.u64 RW1213q, RW1213q; veor.u64 RW1415q, RW1415q; /* d8-d15 */ vpop {RT0-RT7}; /* d0-d7 (q0-q3) */ veor.u64 %q0, %q0; veor.u64 %q1, %q1; veor.u64 %q2, %q2; veor.u64 %q3, %q3; + eor %r0, %r0; pop {%pc}; .size _gcry_sha512_transform_armv7_neon,.-_gcry_sha512_transform_armv7_neon; #endif diff --git a/cipher/sha512.c b/cipher/sha512.c index 9405de80..721f3405 100644 --- a/cipher/sha512.c +++ b/cipher/sha512.c @@ -1,991 +1,951 @@ /* sha512.c - SHA384 and SHA512 hash functions * Copyright (C) 2003, 2008, 2009 Free Software Foundation, Inc. * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser general Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . */ /* Test vectors from FIPS-180-2: * * "abc" * 384: * CB00753F 45A35E8B B5A03D69 9AC65007 272C32AB 0EDED163 * 1A8B605A 43FF5BED 8086072B A1E7CC23 58BAECA1 34C825A7 * 512: * DDAF35A1 93617ABA CC417349 AE204131 12E6FA4E 89A97EA2 0A9EEEE6 4B55D39A * 2192992A 274FC1A8 36BA3C23 A3FEEBBD 454D4423 643CE80E 2A9AC94F A54CA49F * * "abcdefghbcdefghicdefghijdefghijkefghijklfghijklmghijklmnhijklmnoijklmnopjklmnopqklmnopqrlmnopqrsmnopqrstnopqrstu" * 384: * 09330C33 F71147E8 3D192FC7 82CD1B47 53111B17 3B3B05D2 * 2FA08086 E3B0F712 FCC7C71A 557E2DB9 66C3E9FA 91746039 * 512: * 8E959B75 DAE313DA 8CF4F728 14FC143F 8F7779C6 EB9F7FA1 7299AEAD B6889018 * 501D289E 4900F7E4 331B99DE C4B5433A C7D329EE B6DD2654 5E96E55B 874BE909 * * "a" x 1000000 * 384: * 9D0E1809 716474CB 086E834E 310A4A1C ED149E9C 00F24852 * 7972CEC5 704C2A5B 07B8B3DC 38ECC4EB AE97DDD8 7F3D8985 * 512: * E718483D 0CE76964 4E2E42C7 BC15B463 8E1F98B1 3B204428 5632A803 AFA973EB * DE0FF244 877EA60A 4CB0432C E577C31B EB009C5C 2C49AA2E 4EADB217 AD8CC09B */ #include #include #include "g10lib.h" #include "bithelp.h" #include "bufhelp.h" #include "cipher.h" #include "hash-common.h" /* USE_ARM_NEON_ASM indicates whether to enable ARM NEON assembly code. */ #undef USE_ARM_NEON_ASM #ifdef ENABLE_NEON_SUPPORT # if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) \ && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) \ && defined(HAVE_GCC_INLINE_ASM_NEON) # define USE_ARM_NEON_ASM 1 # endif #endif /*ENABLE_NEON_SUPPORT*/ /* USE_ARM_ASM indicates whether to enable ARM assembly code. 
*/ #undef USE_ARM_ASM #if defined(__ARMEL__) && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) # define USE_ARM_ASM 1 #endif /* USE_SSSE3 indicates whether to compile with Intel SSSE3 code. */ #undef USE_SSSE3 #if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_SSSE3) && \ defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) # define USE_SSSE3 1 #endif /* USE_AVX indicates whether to compile with Intel AVX code. */ #undef USE_AVX #if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_AVX) && \ defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) # define USE_AVX 1 #endif /* USE_AVX2 indicates whether to compile with Intel AVX2/rorx code. */ #undef USE_AVX2 #if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_AVX2) && \ defined(HAVE_GCC_INLINE_ASM_BMI2) && \ defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) # define USE_AVX2 1 #endif typedef struct { u64 h0, h1, h2, h3, h4, h5, h6, h7; } SHA512_STATE; typedef struct { gcry_md_block_ctx_t bctx; SHA512_STATE state; +} SHA512_CONTEXT; + + +static const u64 k[] = + { + U64_C(0x428a2f98d728ae22), U64_C(0x7137449123ef65cd), + U64_C(0xb5c0fbcfec4d3b2f), U64_C(0xe9b5dba58189dbbc), + U64_C(0x3956c25bf348b538), U64_C(0x59f111f1b605d019), + U64_C(0x923f82a4af194f9b), U64_C(0xab1c5ed5da6d8118), + U64_C(0xd807aa98a3030242), U64_C(0x12835b0145706fbe), + U64_C(0x243185be4ee4b28c), U64_C(0x550c7dc3d5ffb4e2), + U64_C(0x72be5d74f27b896f), U64_C(0x80deb1fe3b1696b1), + U64_C(0x9bdc06a725c71235), U64_C(0xc19bf174cf692694), + U64_C(0xe49b69c19ef14ad2), U64_C(0xefbe4786384f25e3), + U64_C(0x0fc19dc68b8cd5b5), U64_C(0x240ca1cc77ac9c65), + U64_C(0x2de92c6f592b0275), U64_C(0x4a7484aa6ea6e483), + U64_C(0x5cb0a9dcbd41fbd4), U64_C(0x76f988da831153b5), + U64_C(0x983e5152ee66dfab), U64_C(0xa831c66d2db43210), + U64_C(0xb00327c898fb213f), U64_C(0xbf597fc7beef0ee4), + U64_C(0xc6e00bf33da88fc2), U64_C(0xd5a79147930aa725), + U64_C(0x06ca6351e003826f), U64_C(0x142929670a0e6e70), + U64_C(0x27b70a8546d22ffc), U64_C(0x2e1b21385c26c926), + U64_C(0x4d2c6dfc5ac42aed), U64_C(0x53380d139d95b3df), + U64_C(0x650a73548baf63de), U64_C(0x766a0abb3c77b2a8), + U64_C(0x81c2c92e47edaee6), U64_C(0x92722c851482353b), + U64_C(0xa2bfe8a14cf10364), U64_C(0xa81a664bbc423001), + U64_C(0xc24b8b70d0f89791), U64_C(0xc76c51a30654be30), + U64_C(0xd192e819d6ef5218), U64_C(0xd69906245565a910), + U64_C(0xf40e35855771202a), U64_C(0x106aa07032bbd1b8), + U64_C(0x19a4c116b8d2d0c8), U64_C(0x1e376c085141ab53), + U64_C(0x2748774cdf8eeb99), U64_C(0x34b0bcb5e19b48a8), + U64_C(0x391c0cb3c5c95a63), U64_C(0x4ed8aa4ae3418acb), + U64_C(0x5b9cca4f7763e373), U64_C(0x682e6ff3d6b2b8a3), + U64_C(0x748f82ee5defb2fc), U64_C(0x78a5636f43172f60), + U64_C(0x84c87814a1f0ab72), U64_C(0x8cc702081a6439ec), + U64_C(0x90befffa23631e28), U64_C(0xa4506cebde82bde9), + U64_C(0xbef9a3f7b2c67915), U64_C(0xc67178f2e372532b), + U64_C(0xca273eceea26619c), U64_C(0xd186b8c721c0c207), + U64_C(0xeada7dd6cde0eb1e), U64_C(0xf57d4f7fee6ed178), + U64_C(0x06f067aa72176fba), U64_C(0x0a637dc5a2c898a6), + U64_C(0x113f9804bef90dae), U64_C(0x1b710b35131c471b), + U64_C(0x28db77f523047d84), U64_C(0x32caab7b40c72493), + U64_C(0x3c9ebe0a15c9bebc), U64_C(0x431d67c49c100d4c), + U64_C(0x4cc5d4becb3e42b6), U64_C(0x597f299cfc657e2a), + U64_C(0x5fcb6fab3ad6faec), U64_C(0x6c44198c4a475817) + }; 
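/* Illustrative sketch (editor's addition, not part of this patch): the
 * comments in the NEON macros above and the generic transform below both
 * express the same SHA-512 step, t1 = h + Sum1(e) + Ch(e,f,g) + k[t] + w[t]
 * and t2 = Sum0(a) + Maj(a,b,c), plus the message-schedule update
 * w[t] = S1(w[t-2]) + w[t-7] + S0(w[t-15]) + w[t-16].  The standalone scalar
 * routine below restates one 80-round compression of a single 128-byte block
 * in plain C99; sha512_compress_sketch, rotr64 and the K80 parameter are
 * illustrative names only (K80 is assumed to hold the same values as the
 * k[] table above) and are not libgcrypt interfaces. */
#include <stdint.h>

static uint64_t
rotr64 (uint64_t x, unsigned int n)
{
  return (x >> n) | (x << (64 - n));
}

static void
sha512_compress_sketch (uint64_t h[8], const unsigned char blk[128],
                        const uint64_t K80[80])
{
  uint64_t w[80], a, b, c, d, e, f, g, hh;
  int t;

  /* Load the 16 message words big-endian. */
  for (t = 0; t < 16; t++)
    w[t] = ((uint64_t) blk[8*t+0] << 56) | ((uint64_t) blk[8*t+1] << 48)
         | ((uint64_t) blk[8*t+2] << 40) | ((uint64_t) blk[8*t+3] << 32)
         | ((uint64_t) blk[8*t+4] << 24) | ((uint64_t) blk[8*t+5] << 16)
         | ((uint64_t) blk[8*t+6] <<  8) |  (uint64_t) blk[8*t+7];

  /* Message schedule: w[t] = S1(w[t-2]) + w[t-7] + S0(w[t-15]) + w[t-16]. */
  for (t = 16; t < 80; t++)
    w[t] = (rotr64 (w[t-2], 19) ^ rotr64 (w[t-2], 61) ^ (w[t-2] >> 6))
         + w[t-7]
         + (rotr64 (w[t-15], 1) ^ rotr64 (w[t-15], 8) ^ (w[t-15] >> 7))
         + w[t-16];

  a = h[0]; b = h[1]; c = h[2]; d = h[3];
  e = h[4]; f = h[5]; g = h[6]; hh = h[7];

  for (t = 0; t < 80; t++)
    {
      /* t1 = h + Sum1(e) + Ch(e,f,g) + k[t] + w[t] */
      uint64_t t1 = hh
        + (rotr64 (e, 14) ^ rotr64 (e, 18) ^ rotr64 (e, 41))
        + ((e & f) ^ (~e & g))
        + K80[t] + w[t];
      /* t2 = Sum0(a) + Maj(a,b,c) */
      uint64_t t2 = (rotr64 (a, 28) ^ rotr64 (a, 34) ^ rotr64 (a, 39))
        + ((a & b) ^ (a & c) ^ (b & c));

      hh = g; g = f; f = e; e = d + t1;
      d  = c; c = b; b = a; a = t1 + t2;
    }

  /* Update chaining variables. */
  h[0] += a; h[1] += b; h[2] += c; h[3] += d;
  h[4] += e; h[5] += f; h[6] += g; h[7] += hh;
}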
+ + +/* AMD64 assembly implementations use SystemV ABI, ABI conversion and additional + * stack to store XMM6-XMM15 needed on Win64. */ +#undef ASM_FUNC_ABI +#undef ASM_EXTRA_STACK +#if defined(USE_SSSE3) || defined(USE_AVX) || defined(USE_AVX2) +# ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS +# define ASM_FUNC_ABI __attribute__((sysv_abi)) +# define ASM_EXTRA_STACK (10 * 16 + 4 * sizeof(void *)) +# else +# define ASM_FUNC_ABI +# define ASM_EXTRA_STACK 0 +# endif +#endif + + #ifdef USE_ARM_NEON_ASM - unsigned int use_neon:1; +unsigned int _gcry_sha512_transform_armv7_neon (SHA512_STATE *hd, + const unsigned char *data, + const u64 k[], size_t num_blks); + +static unsigned int +do_sha512_transform_armv7_neon(void *ctx, const unsigned char *data, + size_t nblks) +{ + SHA512_CONTEXT *hd = ctx; + return _gcry_sha512_transform_armv7_neon (&hd->state, data, k, nblks); +} #endif + #ifdef USE_SSSE3 - unsigned int use_ssse3:1; +unsigned int _gcry_sha512_transform_amd64_ssse3(const void *input_data, + void *state, + size_t num_blks) ASM_FUNC_ABI; + +static unsigned int +do_sha512_transform_amd64_ssse3(void *ctx, const unsigned char *data, + size_t nblks) +{ + SHA512_CONTEXT *hd = ctx; + return _gcry_sha512_transform_amd64_ssse3 (data, &hd->state, nblks) + + ASM_EXTRA_STACK; +} #endif + #ifdef USE_AVX - unsigned int use_avx:1; +unsigned int _gcry_sha512_transform_amd64_avx(const void *input_data, + void *state, + size_t num_blks) ASM_FUNC_ABI; + +static unsigned int +do_sha512_transform_amd64_avx(void *ctx, const unsigned char *data, + size_t nblks) +{ + SHA512_CONTEXT *hd = ctx; + return _gcry_sha512_transform_amd64_avx (data, &hd->state, nblks) + + ASM_EXTRA_STACK; +} #endif + #ifdef USE_AVX2 - unsigned int use_avx2:1; +unsigned int _gcry_sha512_transform_amd64_avx2(const void *input_data, + void *state, + size_t num_blks) ASM_FUNC_ABI; + +static unsigned int +do_sha512_transform_amd64_avx2(void *ctx, const unsigned char *data, + size_t nblks) +{ + SHA512_CONTEXT *hd = ctx; + return _gcry_sha512_transform_amd64_avx2 (data, &hd->state, nblks) + + ASM_EXTRA_STACK; +} #endif -} SHA512_CONTEXT; + + +#ifdef USE_ARM_ASM +unsigned int _gcry_sha512_transform_arm (SHA512_STATE *hd, + const unsigned char *data, + const u64 k[], size_t num_blks); static unsigned int -transform (void *context, const unsigned char *data, size_t nblks); +do_transform_generic (void *context, const unsigned char *data, size_t nblks) +{ + SHA512_CONTEXT *hd = context; + return _gcry_sha512_transform_armv7_neon (&hd->state, data, k, nblks); +} +#else +static unsigned int +do_transform_generic (void *context, const unsigned char *data, size_t nblks); +#endif + static void sha512_init (void *context, unsigned int flags) { SHA512_CONTEXT *ctx = context; SHA512_STATE *hd = &ctx->state; unsigned int features = _gcry_get_hw_features (); (void)flags; + (void)k; hd->h0 = U64_C(0x6a09e667f3bcc908); hd->h1 = U64_C(0xbb67ae8584caa73b); hd->h2 = U64_C(0x3c6ef372fe94f82b); hd->h3 = U64_C(0xa54ff53a5f1d36f1); hd->h4 = U64_C(0x510e527fade682d1); hd->h5 = U64_C(0x9b05688c2b3e6c1f); hd->h6 = U64_C(0x1f83d9abfb41bd6b); hd->h7 = U64_C(0x5be0cd19137e2179); ctx->bctx.nblocks = 0; ctx->bctx.nblocks_high = 0; ctx->bctx.count = 0; ctx->bctx.blocksize = 128; - ctx->bctx.bwrite = transform; + /* Order of feature checks is important here; last match will be + * selected. Keep slower implementations at the top and faster at + * the bottom. 
*/ + ctx->bctx.bwrite = do_transform_generic; #ifdef USE_ARM_NEON_ASM - ctx->use_neon = (features & HWF_ARM_NEON) != 0; + if ((features & HWF_ARM_NEON) != 0) + ctx->bctx.bwrite = do_sha512_transform_armv7_neon; #endif #ifdef USE_SSSE3 - ctx->use_ssse3 = (features & HWF_INTEL_SSSE3) != 0; + if ((features & HWF_INTEL_SSSE3) != 0) + ctx->bctx.bwrite = do_sha512_transform_amd64_ssse3; #endif #ifdef USE_AVX - ctx->use_avx = (features & HWF_INTEL_AVX) && (features & HWF_INTEL_FAST_SHLD); + if ((features & HWF_INTEL_AVX) && (features & HWF_INTEL_FAST_SHLD)) + ctx->bctx.bwrite = do_sha512_transform_amd64_avx; #endif #ifdef USE_AVX2 - ctx->use_avx2 = (features & HWF_INTEL_AVX2) && (features & HWF_INTEL_BMI2); + if ((features & HWF_INTEL_AVX2) && (features & HWF_INTEL_BMI2)) + ctx->bctx.bwrite = do_sha512_transform_amd64_avx2; #endif - (void)features; } static void sha384_init (void *context, unsigned int flags) { SHA512_CONTEXT *ctx = context; SHA512_STATE *hd = &ctx->state; unsigned int features = _gcry_get_hw_features (); (void)flags; hd->h0 = U64_C(0xcbbb9d5dc1059ed8); hd->h1 = U64_C(0x629a292a367cd507); hd->h2 = U64_C(0x9159015a3070dd17); hd->h3 = U64_C(0x152fecd8f70e5939); hd->h4 = U64_C(0x67332667ffc00b31); hd->h5 = U64_C(0x8eb44a8768581511); hd->h6 = U64_C(0xdb0c2e0d64f98fa7); hd->h7 = U64_C(0x47b5481dbefa4fa4); ctx->bctx.nblocks = 0; ctx->bctx.nblocks_high = 0; ctx->bctx.count = 0; ctx->bctx.blocksize = 128; - ctx->bctx.bwrite = transform; + /* Order of feature checks is important here; last match will be + * selected. Keep slower implementations at the top and faster at + * the bottom. */ + ctx->bctx.bwrite = do_transform_generic; #ifdef USE_ARM_NEON_ASM - ctx->use_neon = (features & HWF_ARM_NEON) != 0; + if ((features & HWF_ARM_NEON) != 0) + ctx->bctx.bwrite = do_sha512_transform_armv7_neon; #endif #ifdef USE_SSSE3 - ctx->use_ssse3 = (features & HWF_INTEL_SSSE3) != 0; + if ((features & HWF_INTEL_SSSE3) != 0) + ctx->bctx.bwrite = do_sha512_transform_amd64_ssse3; #endif #ifdef USE_AVX - ctx->use_avx = (features & HWF_INTEL_AVX) && (features & HWF_INTEL_FAST_SHLD); + if ((features & HWF_INTEL_AVX) && (features & HWF_INTEL_FAST_SHLD)) + ctx->bctx.bwrite = do_sha512_transform_amd64_avx; #endif #ifdef USE_AVX2 - ctx->use_avx2 = (features & HWF_INTEL_AVX2) && (features & HWF_INTEL_BMI2); + if ((features & HWF_INTEL_AVX2) && (features & HWF_INTEL_BMI2)) + ctx->bctx.bwrite = do_sha512_transform_amd64_avx2; #endif - (void)features; } -static const u64 k[] = - { - U64_C(0x428a2f98d728ae22), U64_C(0x7137449123ef65cd), - U64_C(0xb5c0fbcfec4d3b2f), U64_C(0xe9b5dba58189dbbc), - U64_C(0x3956c25bf348b538), U64_C(0x59f111f1b605d019), - U64_C(0x923f82a4af194f9b), U64_C(0xab1c5ed5da6d8118), - U64_C(0xd807aa98a3030242), U64_C(0x12835b0145706fbe), - U64_C(0x243185be4ee4b28c), U64_C(0x550c7dc3d5ffb4e2), - U64_C(0x72be5d74f27b896f), U64_C(0x80deb1fe3b1696b1), - U64_C(0x9bdc06a725c71235), U64_C(0xc19bf174cf692694), - U64_C(0xe49b69c19ef14ad2), U64_C(0xefbe4786384f25e3), - U64_C(0x0fc19dc68b8cd5b5), U64_C(0x240ca1cc77ac9c65), - U64_C(0x2de92c6f592b0275), U64_C(0x4a7484aa6ea6e483), - U64_C(0x5cb0a9dcbd41fbd4), U64_C(0x76f988da831153b5), - U64_C(0x983e5152ee66dfab), U64_C(0xa831c66d2db43210), - U64_C(0xb00327c898fb213f), U64_C(0xbf597fc7beef0ee4), - U64_C(0xc6e00bf33da88fc2), U64_C(0xd5a79147930aa725), - U64_C(0x06ca6351e003826f), U64_C(0x142929670a0e6e70), - U64_C(0x27b70a8546d22ffc), U64_C(0x2e1b21385c26c926), - U64_C(0x4d2c6dfc5ac42aed), U64_C(0x53380d139d95b3df), - U64_C(0x650a73548baf63de), 
U64_C(0x766a0abb3c77b2a8), - U64_C(0x81c2c92e47edaee6), U64_C(0x92722c851482353b), - U64_C(0xa2bfe8a14cf10364), U64_C(0xa81a664bbc423001), - U64_C(0xc24b8b70d0f89791), U64_C(0xc76c51a30654be30), - U64_C(0xd192e819d6ef5218), U64_C(0xd69906245565a910), - U64_C(0xf40e35855771202a), U64_C(0x106aa07032bbd1b8), - U64_C(0x19a4c116b8d2d0c8), U64_C(0x1e376c085141ab53), - U64_C(0x2748774cdf8eeb99), U64_C(0x34b0bcb5e19b48a8), - U64_C(0x391c0cb3c5c95a63), U64_C(0x4ed8aa4ae3418acb), - U64_C(0x5b9cca4f7763e373), U64_C(0x682e6ff3d6b2b8a3), - U64_C(0x748f82ee5defb2fc), U64_C(0x78a5636f43172f60), - U64_C(0x84c87814a1f0ab72), U64_C(0x8cc702081a6439ec), - U64_C(0x90befffa23631e28), U64_C(0xa4506cebde82bde9), - U64_C(0xbef9a3f7b2c67915), U64_C(0xc67178f2e372532b), - U64_C(0xca273eceea26619c), U64_C(0xd186b8c721c0c207), - U64_C(0xeada7dd6cde0eb1e), U64_C(0xf57d4f7fee6ed178), - U64_C(0x06f067aa72176fba), U64_C(0x0a637dc5a2c898a6), - U64_C(0x113f9804bef90dae), U64_C(0x1b710b35131c471b), - U64_C(0x28db77f523047d84), U64_C(0x32caab7b40c72493), - U64_C(0x3c9ebe0a15c9bebc), U64_C(0x431d67c49c100d4c), - U64_C(0x4cc5d4becb3e42b6), U64_C(0x597f299cfc657e2a), - U64_C(0x5fcb6fab3ad6faec), U64_C(0x6c44198c4a475817) - }; - #ifndef USE_ARM_ASM static inline u64 ROTR (u64 x, u64 n) { return ((x >> n) | (x << (64 - n))); } static inline u64 Ch (u64 x, u64 y, u64 z) { return ((x & y) ^ ( ~x & z)); } static inline u64 Maj (u64 x, u64 y, u64 z) { return ((x & y) ^ (x & z) ^ (y & z)); } static inline u64 Sum0 (u64 x) { return (ROTR (x, 28) ^ ROTR (x, 34) ^ ROTR (x, 39)); } static inline u64 Sum1 (u64 x) { return (ROTR (x, 14) ^ ROTR (x, 18) ^ ROTR (x, 41)); } /**************** * Transform the message W which consists of 16 64-bit-words */ static unsigned int -transform_blk (SHA512_STATE *hd, const unsigned char *data) -{ - u64 a, b, c, d, e, f, g, h; - u64 w[16]; - int t; - - /* get values from the chaining vars */ - a = hd->h0; - b = hd->h1; - c = hd->h2; - d = hd->h3; - e = hd->h4; - f = hd->h5; - g = hd->h6; - h = hd->h7; - - for ( t = 0; t < 16; t++ ) - w[t] = buf_get_be64(data + t * 8); - -#define S0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7)) -#define S1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6)) - - for (t = 0; t < 80 - 16; ) - { - u64 t1, t2; - - /* Performance on a AMD Athlon(tm) Dual Core Processor 4050e - with gcc 4.3.3 using gcry_md_hash_buffer of each 10000 bytes - initialized to 0,1,2,3...255,0,... and 1000 iterations: - - Not unrolled with macros: 440ms - Unrolled with macros: 350ms - Unrolled with inline: 330ms - */ -#if 0 /* Not unrolled. */ - t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[t%16]; - w[t%16] += S1 (w[(t - 2)%16]) + w[(t - 7)%16] + S0 (w[(t - 15)%16]); - t2 = Sum0 (a) + Maj (a, b, c); - h = g; - g = f; - f = e; - e = d + t1; - d = c; - c = b; - b = a; - a = t1 + t2; - t++; -#else /* Unrolled to interweave the chain variables. 
*/ - t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[0]; - w[0] += S1 (w[14]) + w[9] + S0 (w[1]); - t2 = Sum0 (a) + Maj (a, b, c); - d += t1; - h = t1 + t2; - - t1 = g + Sum1 (d) + Ch (d, e, f) + k[t+1] + w[1]; - w[1] += S1 (w[15]) + w[10] + S0 (w[2]); - t2 = Sum0 (h) + Maj (h, a, b); - c += t1; - g = t1 + t2; - - t1 = f + Sum1 (c) + Ch (c, d, e) + k[t+2] + w[2]; - w[2] += S1 (w[0]) + w[11] + S0 (w[3]); - t2 = Sum0 (g) + Maj (g, h, a); - b += t1; - f = t1 + t2; - - t1 = e + Sum1 (b) + Ch (b, c, d) + k[t+3] + w[3]; - w[3] += S1 (w[1]) + w[12] + S0 (w[4]); - t2 = Sum0 (f) + Maj (f, g, h); - a += t1; - e = t1 + t2; - - t1 = d + Sum1 (a) + Ch (a, b, c) + k[t+4] + w[4]; - w[4] += S1 (w[2]) + w[13] + S0 (w[5]); - t2 = Sum0 (e) + Maj (e, f, g); - h += t1; - d = t1 + t2; - - t1 = c + Sum1 (h) + Ch (h, a, b) + k[t+5] + w[5]; - w[5] += S1 (w[3]) + w[14] + S0 (w[6]); - t2 = Sum0 (d) + Maj (d, e, f); - g += t1; - c = t1 + t2; - - t1 = b + Sum1 (g) + Ch (g, h, a) + k[t+6] + w[6]; - w[6] += S1 (w[4]) + w[15] + S0 (w[7]); - t2 = Sum0 (c) + Maj (c, d, e); - f += t1; - b = t1 + t2; - - t1 = a + Sum1 (f) + Ch (f, g, h) + k[t+7] + w[7]; - w[7] += S1 (w[5]) + w[0] + S0 (w[8]); - t2 = Sum0 (b) + Maj (b, c, d); - e += t1; - a = t1 + t2; - - t1 = h + Sum1 (e) + Ch (e, f, g) + k[t+8] + w[8]; - w[8] += S1 (w[6]) + w[1] + S0 (w[9]); - t2 = Sum0 (a) + Maj (a, b, c); - d += t1; - h = t1 + t2; - - t1 = g + Sum1 (d) + Ch (d, e, f) + k[t+9] + w[9]; - w[9] += S1 (w[7]) + w[2] + S0 (w[10]); - t2 = Sum0 (h) + Maj (h, a, b); - c += t1; - g = t1 + t2; - - t1 = f + Sum1 (c) + Ch (c, d, e) + k[t+10] + w[10]; - w[10] += S1 (w[8]) + w[3] + S0 (w[11]); - t2 = Sum0 (g) + Maj (g, h, a); - b += t1; - f = t1 + t2; - - t1 = e + Sum1 (b) + Ch (b, c, d) + k[t+11] + w[11]; - w[11] += S1 (w[9]) + w[4] + S0 (w[12]); - t2 = Sum0 (f) + Maj (f, g, h); - a += t1; - e = t1 + t2; - - t1 = d + Sum1 (a) + Ch (a, b, c) + k[t+12] + w[12]; - w[12] += S1 (w[10]) + w[5] + S0 (w[13]); - t2 = Sum0 (e) + Maj (e, f, g); - h += t1; - d = t1 + t2; - - t1 = c + Sum1 (h) + Ch (h, a, b) + k[t+13] + w[13]; - w[13] += S1 (w[11]) + w[6] + S0 (w[14]); - t2 = Sum0 (d) + Maj (d, e, f); - g += t1; - c = t1 + t2; - - t1 = b + Sum1 (g) + Ch (g, h, a) + k[t+14] + w[14]; - w[14] += S1 (w[12]) + w[7] + S0 (w[15]); - t2 = Sum0 (c) + Maj (c, d, e); - f += t1; - b = t1 + t2; - - t1 = a + Sum1 (f) + Ch (f, g, h) + k[t+15] + w[15]; - w[15] += S1 (w[13]) + w[8] + S0 (w[0]); - t2 = Sum0 (b) + Maj (b, c, d); - e += t1; - a = t1 + t2; - - t += 16; -#endif - } - - for (; t < 80; ) - { - u64 t1, t2; - -#if 0 /* Not unrolled. */ - t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[t%16]; - t2 = Sum0 (a) + Maj (a, b, c); - h = g; - g = f; - f = e; - e = d + t1; - d = c; - c = b; - b = a; - a = t1 + t2; - t++; -#else /* Unrolled to interweave the chain variables. 
*/ - t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[0]; - t2 = Sum0 (a) + Maj (a, b, c); - d += t1; - h = t1 + t2; - - t1 = g + Sum1 (d) + Ch (d, e, f) + k[t+1] + w[1]; - t2 = Sum0 (h) + Maj (h, a, b); - c += t1; - g = t1 + t2; - - t1 = f + Sum1 (c) + Ch (c, d, e) + k[t+2] + w[2]; - t2 = Sum0 (g) + Maj (g, h, a); - b += t1; - f = t1 + t2; - - t1 = e + Sum1 (b) + Ch (b, c, d) + k[t+3] + w[3]; - t2 = Sum0 (f) + Maj (f, g, h); - a += t1; - e = t1 + t2; - - t1 = d + Sum1 (a) + Ch (a, b, c) + k[t+4] + w[4]; - t2 = Sum0 (e) + Maj (e, f, g); - h += t1; - d = t1 + t2; - - t1 = c + Sum1 (h) + Ch (h, a, b) + k[t+5] + w[5]; - t2 = Sum0 (d) + Maj (d, e, f); - g += t1; - c = t1 + t2; - - t1 = b + Sum1 (g) + Ch (g, h, a) + k[t+6] + w[6]; - t2 = Sum0 (c) + Maj (c, d, e); - f += t1; - b = t1 + t2; - - t1 = a + Sum1 (f) + Ch (f, g, h) + k[t+7] + w[7]; - t2 = Sum0 (b) + Maj (b, c, d); - e += t1; - a = t1 + t2; - - t1 = h + Sum1 (e) + Ch (e, f, g) + k[t+8] + w[8]; - t2 = Sum0 (a) + Maj (a, b, c); - d += t1; - h = t1 + t2; - - t1 = g + Sum1 (d) + Ch (d, e, f) + k[t+9] + w[9]; - t2 = Sum0 (h) + Maj (h, a, b); - c += t1; - g = t1 + t2; - - t1 = f + Sum1 (c) + Ch (c, d, e) + k[t+10] + w[10]; - t2 = Sum0 (g) + Maj (g, h, a); - b += t1; - f = t1 + t2; - - t1 = e + Sum1 (b) + Ch (b, c, d) + k[t+11] + w[11]; - t2 = Sum0 (f) + Maj (f, g, h); - a += t1; - e = t1 + t2; - - t1 = d + Sum1 (a) + Ch (a, b, c) + k[t+12] + w[12]; - t2 = Sum0 (e) + Maj (e, f, g); - h += t1; - d = t1 + t2; - - t1 = c + Sum1 (h) + Ch (h, a, b) + k[t+13] + w[13]; - t2 = Sum0 (d) + Maj (d, e, f); - g += t1; - c = t1 + t2; - - t1 = b + Sum1 (g) + Ch (g, h, a) + k[t+14] + w[14]; - t2 = Sum0 (c) + Maj (c, d, e); - f += t1; - b = t1 + t2; - - t1 = a + Sum1 (f) + Ch (f, g, h) + k[t+15] + w[15]; - t2 = Sum0 (b) + Maj (b, c, d); - e += t1; - a = t1 + t2; - - t += 16; -#endif - } - - /* Update chaining vars. */ - hd->h0 += a; - hd->h1 += b; - hd->h2 += c; - hd->h3 += d; - hd->h4 += e; - hd->h5 += f; - hd->h6 += g; - hd->h7 += h; - - return /* burn_stack */ (8 + 16) * sizeof(u64) + sizeof(u32) + - 3 * sizeof(void*); -} -#endif /*!USE_ARM_ASM*/ - -/* AMD64 assembly implementations use SystemV ABI, ABI conversion and additional - * stack to store XMM6-XMM15 needed on Win64. 
*/ -#undef ASM_FUNC_ABI -#undef ASM_EXTRA_STACK -#if defined(USE_SSSE3) || defined(USE_AVX) || defined(USE_AVX2) -# ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS -# define ASM_FUNC_ABI __attribute__((sysv_abi)) -# define ASM_EXTRA_STACK (10 * 16) -# else -# define ASM_FUNC_ABI -# define ASM_EXTRA_STACK 0 -# endif -#endif - - -#ifdef USE_ARM_NEON_ASM -void _gcry_sha512_transform_armv7_neon (SHA512_STATE *hd, - const unsigned char *data, - const u64 k[], size_t num_blks); -#endif - -#ifdef USE_ARM_ASM -unsigned int _gcry_sha512_transform_arm (SHA512_STATE *hd, - const unsigned char *data, - const u64 k[], size_t num_blks); -#endif - -#ifdef USE_SSSE3 -unsigned int _gcry_sha512_transform_amd64_ssse3(const void *input_data, - void *state, - size_t num_blks) ASM_FUNC_ABI; -#endif - -#ifdef USE_AVX -unsigned int _gcry_sha512_transform_amd64_avx(const void *input_data, - void *state, - size_t num_blks) ASM_FUNC_ABI; -#endif - -#ifdef USE_AVX2 -unsigned int _gcry_sha512_transform_amd64_avx2(const void *input_data, - void *state, - size_t num_blks) ASM_FUNC_ABI; -#endif - - -static unsigned int -transform (void *context, const unsigned char *data, size_t nblks) +do_transform_generic (void *context, const unsigned char *data, size_t nblks) { SHA512_CONTEXT *ctx = context; - unsigned int burn; - -#ifdef USE_AVX2 - if (ctx->use_avx2) - return _gcry_sha512_transform_amd64_avx2 (data, &ctx->state, nblks) - + 4 * sizeof(void*) + ASM_EXTRA_STACK; -#endif - -#ifdef USE_AVX - if (ctx->use_avx) - return _gcry_sha512_transform_amd64_avx (data, &ctx->state, nblks) - + 4 * sizeof(void*) + ASM_EXTRA_STACK; -#endif - -#ifdef USE_SSSE3 - if (ctx->use_ssse3) - return _gcry_sha512_transform_amd64_ssse3 (data, &ctx->state, nblks) - + 4 * sizeof(void*) + ASM_EXTRA_STACK; -#endif + SHA512_STATE *hd = &ctx->state; -#ifdef USE_ARM_NEON_ASM - if (ctx->use_neon) + do { - _gcry_sha512_transform_armv7_neon (&ctx->state, data, k, nblks); + u64 a, b, c, d, e, f, g, h; + u64 w[16]; + int t; + + /* get values from the chaining vars */ + a = hd->h0; + b = hd->h1; + c = hd->h2; + d = hd->h3; + e = hd->h4; + f = hd->h5; + g = hd->h6; + h = hd->h7; + + for ( t = 0; t < 16; t++ ) + w[t] = buf_get_be64(data + t * 8); - /* _gcry_sha512_transform_armv7_neon does not store sensitive data - * to stack. 
*/ - return /* no burn_stack */ 0; - } -#endif +#define S0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7)) +#define S1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6)) + + for (t = 0; t < 80 - 16; ) + { + u64 t1, t2; + + t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[0]; + w[0] += S1 (w[14]) + w[9] + S0 (w[1]); + t2 = Sum0 (a) + Maj (a, b, c); + d += t1; + h = t1 + t2; + + t1 = g + Sum1 (d) + Ch (d, e, f) + k[t+1] + w[1]; + w[1] += S1 (w[15]) + w[10] + S0 (w[2]); + t2 = Sum0 (h) + Maj (h, a, b); + c += t1; + g = t1 + t2; + + t1 = f + Sum1 (c) + Ch (c, d, e) + k[t+2] + w[2]; + w[2] += S1 (w[0]) + w[11] + S0 (w[3]); + t2 = Sum0 (g) + Maj (g, h, a); + b += t1; + f = t1 + t2; + + t1 = e + Sum1 (b) + Ch (b, c, d) + k[t+3] + w[3]; + w[3] += S1 (w[1]) + w[12] + S0 (w[4]); + t2 = Sum0 (f) + Maj (f, g, h); + a += t1; + e = t1 + t2; + + t1 = d + Sum1 (a) + Ch (a, b, c) + k[t+4] + w[4]; + w[4] += S1 (w[2]) + w[13] + S0 (w[5]); + t2 = Sum0 (e) + Maj (e, f, g); + h += t1; + d = t1 + t2; + + t1 = c + Sum1 (h) + Ch (h, a, b) + k[t+5] + w[5]; + w[5] += S1 (w[3]) + w[14] + S0 (w[6]); + t2 = Sum0 (d) + Maj (d, e, f); + g += t1; + c = t1 + t2; + + t1 = b + Sum1 (g) + Ch (g, h, a) + k[t+6] + w[6]; + w[6] += S1 (w[4]) + w[15] + S0 (w[7]); + t2 = Sum0 (c) + Maj (c, d, e); + f += t1; + b = t1 + t2; + + t1 = a + Sum1 (f) + Ch (f, g, h) + k[t+7] + w[7]; + w[7] += S1 (w[5]) + w[0] + S0 (w[8]); + t2 = Sum0 (b) + Maj (b, c, d); + e += t1; + a = t1 + t2; + + t1 = h + Sum1 (e) + Ch (e, f, g) + k[t+8] + w[8]; + w[8] += S1 (w[6]) + w[1] + S0 (w[9]); + t2 = Sum0 (a) + Maj (a, b, c); + d += t1; + h = t1 + t2; + + t1 = g + Sum1 (d) + Ch (d, e, f) + k[t+9] + w[9]; + w[9] += S1 (w[7]) + w[2] + S0 (w[10]); + t2 = Sum0 (h) + Maj (h, a, b); + c += t1; + g = t1 + t2; + + t1 = f + Sum1 (c) + Ch (c, d, e) + k[t+10] + w[10]; + w[10] += S1 (w[8]) + w[3] + S0 (w[11]); + t2 = Sum0 (g) + Maj (g, h, a); + b += t1; + f = t1 + t2; + + t1 = e + Sum1 (b) + Ch (b, c, d) + k[t+11] + w[11]; + w[11] += S1 (w[9]) + w[4] + S0 (w[12]); + t2 = Sum0 (f) + Maj (f, g, h); + a += t1; + e = t1 + t2; + + t1 = d + Sum1 (a) + Ch (a, b, c) + k[t+12] + w[12]; + w[12] += S1 (w[10]) + w[5] + S0 (w[13]); + t2 = Sum0 (e) + Maj (e, f, g); + h += t1; + d = t1 + t2; + + t1 = c + Sum1 (h) + Ch (h, a, b) + k[t+13] + w[13]; + w[13] += S1 (w[11]) + w[6] + S0 (w[14]); + t2 = Sum0 (d) + Maj (d, e, f); + g += t1; + c = t1 + t2; + + t1 = b + Sum1 (g) + Ch (g, h, a) + k[t+14] + w[14]; + w[14] += S1 (w[12]) + w[7] + S0 (w[15]); + t2 = Sum0 (c) + Maj (c, d, e); + f += t1; + b = t1 + t2; + + t1 = a + Sum1 (f) + Ch (f, g, h) + k[t+15] + w[15]; + w[15] += S1 (w[13]) + w[8] + S0 (w[0]); + t2 = Sum0 (b) + Maj (b, c, d); + e += t1; + a = t1 + t2; + + t += 16; + } + + for (; t < 80; ) + { + u64 t1, t2; + + t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[0]; + t2 = Sum0 (a) + Maj (a, b, c); + d += t1; + h = t1 + t2; + + t1 = g + Sum1 (d) + Ch (d, e, f) + k[t+1] + w[1]; + t2 = Sum0 (h) + Maj (h, a, b); + c += t1; + g = t1 + t2; + + t1 = f + Sum1 (c) + Ch (c, d, e) + k[t+2] + w[2]; + t2 = Sum0 (g) + Maj (g, h, a); + b += t1; + f = t1 + t2; + + t1 = e + Sum1 (b) + Ch (b, c, d) + k[t+3] + w[3]; + t2 = Sum0 (f) + Maj (f, g, h); + a += t1; + e = t1 + t2; + + t1 = d + Sum1 (a) + Ch (a, b, c) + k[t+4] + w[4]; + t2 = Sum0 (e) + Maj (e, f, g); + h += t1; + d = t1 + t2; + + t1 = c + Sum1 (h) + Ch (h, a, b) + k[t+5] + w[5]; + t2 = Sum0 (d) + Maj (d, e, f); + g += t1; + c = t1 + t2; + + t1 = b + Sum1 (g) + Ch (g, h, a) + k[t+6] + w[6]; + t2 = Sum0 (c) + Maj (c, d, e); + f += t1; + b = t1 + t2; + + t1 = a + Sum1 (f) 
+ Ch (f, g, h) + k[t+7] + w[7]; + t2 = Sum0 (b) + Maj (b, c, d); + e += t1; + a = t1 + t2; + + t1 = h + Sum1 (e) + Ch (e, f, g) + k[t+8] + w[8]; + t2 = Sum0 (a) + Maj (a, b, c); + d += t1; + h = t1 + t2; + + t1 = g + Sum1 (d) + Ch (d, e, f) + k[t+9] + w[9]; + t2 = Sum0 (h) + Maj (h, a, b); + c += t1; + g = t1 + t2; + + t1 = f + Sum1 (c) + Ch (c, d, e) + k[t+10] + w[10]; + t2 = Sum0 (g) + Maj (g, h, a); + b += t1; + f = t1 + t2; + + t1 = e + Sum1 (b) + Ch (b, c, d) + k[t+11] + w[11]; + t2 = Sum0 (f) + Maj (f, g, h); + a += t1; + e = t1 + t2; + + t1 = d + Sum1 (a) + Ch (a, b, c) + k[t+12] + w[12]; + t2 = Sum0 (e) + Maj (e, f, g); + h += t1; + d = t1 + t2; + + t1 = c + Sum1 (h) + Ch (h, a, b) + k[t+13] + w[13]; + t2 = Sum0 (d) + Maj (d, e, f); + g += t1; + c = t1 + t2; + + t1 = b + Sum1 (g) + Ch (g, h, a) + k[t+14] + w[14]; + t2 = Sum0 (c) + Maj (c, d, e); + f += t1; + b = t1 + t2; + + t1 = a + Sum1 (f) + Ch (f, g, h) + k[t+15] + w[15]; + t2 = Sum0 (b) + Maj (b, c, d); + e += t1; + a = t1 + t2; + + t += 16; + } + + /* Update chaining vars. */ + hd->h0 += a; + hd->h1 += b; + hd->h2 += c; + hd->h3 += d; + hd->h4 += e; + hd->h5 += f; + hd->h6 += g; + hd->h7 += h; -#ifdef USE_ARM_ASM - burn = _gcry_sha512_transform_arm (&ctx->state, data, k, nblks); -#else - do - { - burn = transform_blk (&ctx->state, data) + 3 * sizeof(void*); data += 128; } while (--nblks); -#ifdef ASM_EXTRA_STACK - /* 'transform_blk' is typically inlined and XMM6-XMM15 are stored at - * the prologue of this function. Therefore need to add ASM_EXTRA_STACK to - * here too. - */ - burn += ASM_EXTRA_STACK; -#endif -#endif - - return burn; + return (8 + 16) * sizeof(u64) + sizeof(u32) + 3 * sizeof(void*); } +#endif /*!USE_ARM_ASM*/ /* The routine final terminates the computation and * returns the digest. * The handle is prepared for a new cycle, but adding bytes to the * handle will the destroy the returned buffer. * Returns: 64 bytes representing the digest. When used for sha384, * we take the leftmost 48 of those bytes. */ static void sha512_final (void *context) { SHA512_CONTEXT *hd = context; unsigned int stack_burn_depth; u64 t, th, msb, lsb; byte *p; _gcry_md_block_write (context, NULL, 0); /* flush */ ; t = hd->bctx.nblocks; /* if (sizeof t == sizeof hd->bctx.nblocks) */ th = hd->bctx.nblocks_high; /* else */ /* th = hd->bctx.nblocks >> 64; In case we ever use u128 */ /* multiply by 128 to make a byte count */ lsb = t << 7; msb = (th << 7) | (t >> 57); /* add the count */ t = lsb; if ((lsb += hd->bctx.count) < t) msb++; /* multiply by 8 to make a bit count */ t = lsb; lsb <<= 3; msb <<= 3; msb |= t >> 61; if (hd->bctx.count < 112) { /* enough room */ hd->bctx.buf[hd->bctx.count++] = 0x80; /* pad */ while (hd->bctx.count < 112) hd->bctx.buf[hd->bctx.count++] = 0; /* pad */ } else { /* need one extra block */ hd->bctx.buf[hd->bctx.count++] = 0x80; /* pad character */ while (hd->bctx.count < 128) hd->bctx.buf[hd->bctx.count++] = 0; _gcry_md_block_write (context, NULL, 0); /* flush */ ; memset (hd->bctx.buf, 0, 112); /* fill next block with zeroes */ } /* append the 128 bit count */ buf_put_be64(hd->bctx.buf + 112, msb); buf_put_be64(hd->bctx.buf + 120, lsb); - stack_burn_depth = transform (hd, hd->bctx.buf, 1); + stack_burn_depth = (*hd->bctx.bwrite) (hd, hd->bctx.buf, 1); _gcry_burn_stack (stack_burn_depth); p = hd->bctx.buf; #define X(a) do { buf_put_be64(p, hd->state.h##a); p += 8; } while (0) X (0); X (1); X (2); X (3); X (4); X (5); /* Note that these last two chunks are included even for SHA384. We just ignore them. 
*/ X (6); X (7); #undef X } static byte * sha512_read (void *context) { SHA512_CONTEXT *hd = (SHA512_CONTEXT *) context; return hd->bctx.buf; } /* Shortcut functions which puts the hash value of the supplied buffer * into outbuf which must have a size of 64 bytes. */ void _gcry_sha512_hash_buffer (void *outbuf, const void *buffer, size_t length) { SHA512_CONTEXT hd; sha512_init (&hd, 0); _gcry_md_block_write (&hd, buffer, length); sha512_final (&hd); memcpy (outbuf, hd.bctx.buf, 64); } /* Variant of the above shortcut function using multiple buffers. */ void _gcry_sha512_hash_buffers (void *outbuf, const gcry_buffer_t *iov, int iovcnt) { SHA512_CONTEXT hd; sha512_init (&hd, 0); for (;iovcnt > 0; iov++, iovcnt--) _gcry_md_block_write (&hd, (const char*)iov[0].data + iov[0].off, iov[0].len); sha512_final (&hd); memcpy (outbuf, hd.bctx.buf, 64); } /* Shortcut functions which puts the hash value of the supplied buffer * into outbuf which must have a size of 48 bytes. */ static void _gcry_sha384_hash_buffer (void *outbuf, const void *buffer, size_t length) { SHA512_CONTEXT hd; sha384_init (&hd, 0); _gcry_md_block_write (&hd, buffer, length); sha512_final (&hd); memcpy (outbuf, hd.bctx.buf, 48); } /* Variant of the above shortcut function using multiple buffers. */ static void _gcry_sha384_hash_buffers (void *outbuf, const gcry_buffer_t *iov, int iovcnt) { SHA512_CONTEXT hd; sha384_init (&hd, 0); for (;iovcnt > 0; iov++, iovcnt--) _gcry_md_block_write (&hd, (const char*)iov[0].data + iov[0].off, iov[0].len); sha512_final (&hd); memcpy (outbuf, hd.bctx.buf, 48); } /* Self-test section. */ static gpg_err_code_t selftests_sha384 (int extended, selftest_report_func_t report) { const char *what; const char *errtxt; what = "short string"; errtxt = _gcry_hash_selftest_check_one (GCRY_MD_SHA384, 0, "abc", 3, "\xcb\x00\x75\x3f\x45\xa3\x5e\x8b\xb5\xa0\x3d\x69\x9a\xc6\x50\x07" "\x27\x2c\x32\xab\x0e\xde\xd1\x63\x1a\x8b\x60\x5a\x43\xff\x5b\xed" "\x80\x86\x07\x2b\xa1\xe7\xcc\x23\x58\xba\xec\xa1\x34\xc8\x25\xa7", 48); if (errtxt) goto failed; if (extended) { what = "long string"; errtxt = _gcry_hash_selftest_check_one (GCRY_MD_SHA384, 0, "abcdefghbcdefghicdefghijdefghijkefghijklfghijklmghijklmn" "hijklmnoijklmnopjklmnopqklmnopqrlmnopqrsmnopqrstnopqrstu", 112, "\x09\x33\x0C\x33\xF7\x11\x47\xE8\x3D\x19\x2F\xC7\x82\xCD\x1B\x47" "\x53\x11\x1B\x17\x3B\x3B\x05\xD2\x2F\xA0\x80\x86\xE3\xB0\xF7\x12" "\xFC\xC7\xC7\x1A\x55\x7E\x2D\xB9\x66\xC3\xE9\xFA\x91\x74\x60\x39", 48); if (errtxt) goto failed; what = "one million \"a\""; errtxt = _gcry_hash_selftest_check_one (GCRY_MD_SHA384, 1, NULL, 0, "\x9D\x0E\x18\x09\x71\x64\x74\xCB\x08\x6E\x83\x4E\x31\x0A\x4A\x1C" "\xED\x14\x9E\x9C\x00\xF2\x48\x52\x79\x72\xCE\xC5\x70\x4C\x2A\x5B" "\x07\xB8\xB3\xDC\x38\xEC\xC4\xEB\xAE\x97\xDD\xD8\x7F\x3D\x89\x85", 48); if (errtxt) goto failed; } return 0; /* Succeeded. 
*/ failed: if (report) report ("digest", GCRY_MD_SHA384, what, errtxt); return GPG_ERR_SELFTEST_FAILED; } static gpg_err_code_t selftests_sha512 (int extended, selftest_report_func_t report) { const char *what; const char *errtxt; what = "short string"; errtxt = _gcry_hash_selftest_check_one (GCRY_MD_SHA512, 0, "abc", 3, "\xDD\xAF\x35\xA1\x93\x61\x7A\xBA\xCC\x41\x73\x49\xAE\x20\x41\x31" "\x12\xE6\xFA\x4E\x89\xA9\x7E\xA2\x0A\x9E\xEE\xE6\x4B\x55\xD3\x9A" "\x21\x92\x99\x2A\x27\x4F\xC1\xA8\x36\xBA\x3C\x23\xA3\xFE\xEB\xBD" "\x45\x4D\x44\x23\x64\x3C\xE8\x0E\x2A\x9A\xC9\x4F\xA5\x4C\xA4\x9F", 64); if (errtxt) goto failed; if (extended) { what = "long string"; errtxt = _gcry_hash_selftest_check_one (GCRY_MD_SHA512, 0, "abcdefghbcdefghicdefghijdefghijkefghijklfghijklmghijklmn" "hijklmnoijklmnopjklmnopqklmnopqrlmnopqrsmnopqrstnopqrstu", 112, "\x8E\x95\x9B\x75\xDA\xE3\x13\xDA\x8C\xF4\xF7\x28\x14\xFC\x14\x3F" "\x8F\x77\x79\xC6\xEB\x9F\x7F\xA1\x72\x99\xAE\xAD\xB6\x88\x90\x18" "\x50\x1D\x28\x9E\x49\x00\xF7\xE4\x33\x1B\x99\xDE\xC4\xB5\x43\x3A" "\xC7\xD3\x29\xEE\xB6\xDD\x26\x54\x5E\x96\xE5\x5B\x87\x4B\xE9\x09", 64); if (errtxt) goto failed; what = "one million \"a\""; errtxt = _gcry_hash_selftest_check_one (GCRY_MD_SHA512, 1, NULL, 0, "\xE7\x18\x48\x3D\x0C\xE7\x69\x64\x4E\x2E\x42\xC7\xBC\x15\xB4\x63" "\x8E\x1F\x98\xB1\x3B\x20\x44\x28\x56\x32\xA8\x03\xAF\xA9\x73\xEB" "\xDE\x0F\xF2\x44\x87\x7E\xA6\x0A\x4C\xB0\x43\x2C\xE5\x77\xC3\x1B" "\xEB\x00\x9C\x5C\x2C\x49\xAA\x2E\x4E\xAD\xB2\x17\xAD\x8C\xC0\x9B", 64); if (errtxt) goto failed; } return 0; /* Succeeded. */ failed: if (report) report ("digest", GCRY_MD_SHA512, what, errtxt); return GPG_ERR_SELFTEST_FAILED; } /* Run a full self-test for ALGO and return 0 on success. */ static gpg_err_code_t run_selftests (int algo, int extended, selftest_report_func_t report) { gpg_err_code_t ec; switch (algo) { case GCRY_MD_SHA384: ec = selftests_sha384 (extended, report); break; case GCRY_MD_SHA512: ec = selftests_sha512 (extended, report); break; default: ec = GPG_ERR_DIGEST_ALGO; break; } return ec; } static byte sha512_asn[] = /* Object ID is 2.16.840.1.101.3.4.2.3 */ { 0x30, 0x51, 0x30, 0x0d, 0x06, 0x09, 0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x03, 0x05, 0x00, 0x04, 0x40 }; static gcry_md_oid_spec_t oid_spec_sha512[] = { { "2.16.840.1.101.3.4.2.3" }, /* PKCS#1 sha512WithRSAEncryption */ { "1.2.840.113549.1.1.13" }, { NULL } }; gcry_md_spec_t _gcry_digest_spec_sha512 = { GCRY_MD_SHA512, {0, 1}, "SHA512", sha512_asn, DIM (sha512_asn), oid_spec_sha512, 64, sha512_init, _gcry_md_block_write, sha512_final, sha512_read, NULL, _gcry_sha512_hash_buffer, _gcry_sha512_hash_buffers, sizeof (SHA512_CONTEXT), run_selftests }; static byte sha384_asn[] = /* Object ID is 2.16.840.1.101.3.4.2.2 */ { 0x30, 0x41, 0x30, 0x0d, 0x06, 0x09, 0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x02, 0x05, 0x00, 0x04, 0x30 }; static gcry_md_oid_spec_t oid_spec_sha384[] = { { "2.16.840.1.101.3.4.2.2" }, /* PKCS#1 sha384WithRSAEncryption */ { "1.2.840.113549.1.1.12" }, /* SHA384WithECDSA: RFC 7427 (A.3.3.) */ { "1.2.840.10045.4.3.3" }, { NULL }, }; gcry_md_spec_t _gcry_digest_spec_sha384 = { GCRY_MD_SHA384, {0, 1}, "SHA384", sha384_asn, DIM (sha384_asn), oid_spec_sha384, 48, sha384_init, _gcry_md_block_write, sha512_final, sha512_read, NULL, _gcry_sha384_hash_buffer, _gcry_sha384_hash_buffers, sizeof (SHA512_CONTEXT), run_selftests };
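/* Usage sketch (editor's addition, not part of this patch): with the
 * transform now selected through bctx.bwrite at init time, callers keep
 * using the unchanged public API.  The FIPS-180-2 "abc" vector quoted at
 * the top of sha512.c can be checked with gcry_md_hash_buffer(); the test
 * program itself is illustrative and assumes libgcrypt is installed and
 * linked with -lgcrypt. */
#include <stdio.h>
#include <string.h>
#include <gcrypt.h>

int
main (void)
{
  static const unsigned char expected[64] =
    "\xdd\xaf\x35\xa1\x93\x61\x7a\xba\xcc\x41\x73\x49\xae\x20\x41\x31"
    "\x12\xe6\xfa\x4e\x89\xa9\x7e\xa2\x0a\x9e\xee\xe6\x4b\x55\xd3\x9a"
    "\x21\x92\x99\x2a\x27\x4f\xc1\xa8\x36\xba\x3c\x23\xa3\xfe\xeb\xbd"
    "\x45\x4d\x44\x23\x64\x3c\xe8\x0e\x2a\x9a\xc9\x4f\xa5\x4c\xa4\x9f";
  unsigned char digest[64];

  if (!gcry_check_version (NULL))   /* initialize libgcrypt */
    return 1;

  gcry_md_hash_buffer (GCRY_MD_SHA512, digest, "abc", 3);
  printf ("SHA-512(\"abc\") %s\n",
          memcmp (digest, expected, 64) == 0 ? "matches" : "MISMATCH");
  return 0;
}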