diff --git a/cipher/asm-inline-s390x.h b/cipher/asm-inline-s390x.h index 82007531..9a7bf453 100644 --- a/cipher/asm-inline-s390x.h +++ b/cipher/asm-inline-s390x.h @@ -1,114 +1,139 @@ /* asm-inline-s390x.h - Common macros for zSeries inline assembly * * Copyright (C) 2020 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . */ #ifndef GCRY_ASM_INLINE_S390X_H #define GCRY_ASM_INLINE_S390X_H #include +#define ALWAYS_INLINE inline __attribute__((always_inline)) + typedef unsigned int u128_t __attribute__ ((mode (TI))); enum kmxx_functions_e { KM_FUNCTION_AES_128 = 18, KM_FUNCTION_AES_192 = 19, KM_FUNCTION_AES_256 = 20, KM_FUNCTION_XTS_AES_128 = 50, KM_FUNCTION_XTS_AES_256 = 52, KMID_FUNCTION_SHA1 = 1, KMID_FUNCTION_SHA256 = 2, KMID_FUNCTION_SHA512 = 3, KMID_FUNCTION_SHA3_224 = 32, KMID_FUNCTION_SHA3_256 = 33, KMID_FUNCTION_SHA3_384 = 34, KMID_FUNCTION_SHA3_512 = 35, KMID_FUNCTION_SHAKE128 = 36, KMID_FUNCTION_SHAKE256 = 37, KMID_FUNCTION_GHASH = 65, }; enum kmxx_function_flags_e { KM_ENCRYPT = 0 << 7, KM_DECRYPT = 1 << 7, KMF_LCFB_16 = 16 << 24, KMA_LPC = 1 << 8, KMA_LAAD = 1 << 9, KMA_HS = 1 << 10, }; -static inline u128_t km_function_to_mask(enum kmxx_functions_e func) +static ALWAYS_INLINE u128_t km_function_to_mask(enum kmxx_functions_e func) { return (u128_t)1 << (127 - func); } static inline u128_t kimd_query(void) { static u128_t function_codes = 0; static int initialized = 0; register unsigned long reg0 asm("0") = 0; register void *reg1 asm("1") = &function_codes; u128_t r1; if (initialized) return function_codes; asm volatile ("0: .insn rre,0xb93e << 16, 0, %[r1]\n\t" " brc 1,0b\n\t" : [r1] "=a" (r1) : [reg0] "r" (reg0), [reg1] "r" (reg1) : "cc", "memory"); initialized = 1; return function_codes; } -static inline void kimd_execute(unsigned int func, void *param_block, - const void *src, size_t src_len) +static inline u128_t klmd_query(void) +{ + static u128_t function_codes = 0; + static int initialized = 0; + register unsigned long reg0 asm("0") = 0; + register void *reg1 asm("1") = &function_codes; + u128_t r1; + + if (initialized) + return function_codes; + + asm volatile ("0: .insn rre,0xb93f << 16, 0, %[r1]\n\t" + " brc 1,0b\n\t" + : [r1] "=a" (r1) + : [reg0] "r" (reg0), [reg1] "r" (reg1) + : "cc", "memory"); + + initialized = 1; + return function_codes; +} + +static ALWAYS_INLINE void +kimd_execute(unsigned int func, void *param_block, const void *src, + size_t src_len) { register unsigned long reg0 asm("0") = func; register byte *reg1 asm("1") = param_block; u128_t r1 = ((u128_t)(uintptr_t)src << 64) | (u64)src_len; asm volatile ("0: .insn rre,0xb93e << 16, 0, %[r1]\n\t" " brc 1,0b\n\t" : [r1] "+a" (r1) : [func] "r" (reg0), [param_ptr] "r" (reg1) : "cc", "memory"); } -static inline void klmd_execute(unsigned int func, void *param_block, - const void *src, size_t src_len) +static ALWAYS_INLINE void +klmd_execute(unsigned int func, void *param_block, const void *src, + 
size_t src_len) { register unsigned long reg0 asm("0") = func; register byte *reg1 asm("1") = param_block; u128_t r1 = ((u128_t)(uintptr_t)src << 64) | (u64)src_len; asm volatile ("0: .insn rre,0xb93f << 16, 0, %[r1]\n\t" " brc 1,0b\n\t" : [r1] "+a" (r1) : [func] "r" (reg0), [param_ptr] "r" (reg1) : "cc", "memory"); } #endif /* GCRY_ASM_INLINE_S390X_H */ diff --git a/cipher/rijndael-s390x.c b/cipher/rijndael-s390x.c index 5ab019f9..aea65c5a 100644 --- a/cipher/rijndael-s390x.c +++ b/cipher/rijndael-s390x.c @@ -1,1156 +1,1155 @@ /* Rijndael (AES) for GnuPG - s390x/zSeries AES implementation * Copyright (C) 2020 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . */ #include #include "rijndael-internal.h" #include "cipher-internal.h" #include "bufhelp.h" #ifdef USE_S390X_CRYPTO #include "asm-inline-s390x.h" -#define ALWAYS_INLINE inline __attribute__((always_inline)) #define NO_INLINE __attribute__((noinline)) struct aes_s390x_gcm_params_s { u32 reserved[3]; u32 counter_value; u64 tag[2]; u64 hash_subkey[2]; u64 total_aad_length; u64 total_cipher_length; u32 initial_counter_value[4]; u64 key[4]; }; #define DECL_QUERY_FUNC(instruction, opcode) \ static u128_t instruction ##_query(void) \ { \ static u128_t function_codes = 0; \ static int initialized = 0; \ register unsigned long reg0 asm("0") = 0; \ register void *reg1 asm("1") = &function_codes; \ u128_t r1, r2; \ \ if (initialized) \ return function_codes; \ \ asm volatile ("0: .insn rre," #opcode " << 16, %[r1], %[r2]\n\t" \ " brc 1,0b\n\t" \ : [r1] "=a" (r1), [r2] "=a" (r2) \ : [reg0] "r" (reg0), [reg1] "r" (reg1) \ : "cc", "memory"); \ \ initialized = 1; \ return function_codes; \ } #define DECL_EXECUTE_FUNC(instruction, opcode, param_const) \ static ALWAYS_INLINE size_t \ instruction ##_execute(unsigned int func, param_const void *param_block, \ void *dst, const void *src, size_t src_len) \ { \ register unsigned long reg0 asm("0") = func; \ register param_const byte *reg1 asm("1") = param_block; \ u128_t r1 = ((u128_t)(uintptr_t)dst << 64); \ u128_t r2 = ((u128_t)(uintptr_t)src << 64) | (u64)src_len; \ \ asm volatile ("0: .insn rre," #opcode " << 16, %[r1], %[r2]\n\t" \ " brc 1,0b\n\t" \ : [r1] "+a" (r1), [r2] "+a" (r2) \ : [func] "r" (reg0), [param_ptr] "r" (reg1) \ : "cc", "memory"); \ \ return (u64)r2; \ } DECL_QUERY_FUNC(km, 0xb92e); DECL_QUERY_FUNC(kmc, 0xb92f); DECL_QUERY_FUNC(kmac, 0xb91e); DECL_QUERY_FUNC(kmf, 0xb92a); DECL_QUERY_FUNC(kmo, 0xb92b); DECL_EXECUTE_FUNC(km, 0xb92e, const); DECL_EXECUTE_FUNC(kmc, 0xb92f, ); DECL_EXECUTE_FUNC(kmac, 0xb91e, ); DECL_EXECUTE_FUNC(kmf, 0xb92a, ); DECL_EXECUTE_FUNC(kmo, 0xb92b, ); static u128_t kma_query(void) { static u128_t function_codes = 0; static int initialized = 0; register unsigned long reg0 asm("0") = 0; register void *reg1 asm("1") = &function_codes; u128_t r1, r2, r3; if (initialized) return function_codes; asm volatile ("0: .insn rrf,0xb929 << 16, %[r1], %[r2], %[r3], 
0\n\t" " brc 1,0b\n\t" : [r1] "=a" (r1), [r2] "=a" (r2), [r3] "=a" (r3) : [reg0] "r" (reg0), [reg1] "r" (reg1) : "cc", "memory"); initialized = 1; return function_codes; } static ALWAYS_INLINE void kma_execute(unsigned int func, void *param_block, byte *dst, const byte *src, size_t src_len, const byte *aad, size_t aad_len) { register unsigned long reg0 asm("0") = func; register byte *reg1 asm("1") = param_block; u128_t r1 = ((u128_t)(uintptr_t)dst << 64); u128_t r2 = ((u128_t)(uintptr_t)src << 64) | (u64)src_len; u128_t r3 = ((u128_t)(uintptr_t)aad << 64) | (u64)aad_len; asm volatile ("0: .insn rrf,0xb929 << 16, %[r1], %[r2], %[r3], 0\n\t" " brc 1,0b\n\t" : [r1] "+a" (r1), [r2] "+a" (r2), [r3] "+a" (r3), [func] "+r" (reg0) : [param_ptr] "r" (reg1) : "cc", "memory"); } unsigned int _gcry_aes_s390x_encrypt(const RIJNDAEL_context *ctx, unsigned char *dst, const unsigned char *src) { km_execute (ctx->km_func | KM_ENCRYPT, ctx->keyschenc, dst, src, BLOCKSIZE); return 0; } unsigned int _gcry_aes_s390x_decrypt(const RIJNDAEL_context *ctx, unsigned char *dst, const unsigned char *src) { km_execute (ctx->km_func | KM_DECRYPT, ctx->keyschenc, dst, src, BLOCKSIZE); return 0; } static void aes_s390x_cbc_enc(void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int cbc_mac) { RIJNDAEL_context *ctx = context; byte *out = outbuf_arg; const byte *in = inbuf_arg; u128_t params[3]; /* Prepare parameter block. */ memcpy (¶ms[0], iv, BLOCKSIZE); memcpy (¶ms[1], ctx->keyschenc, 32); if (cbc_mac) { kmac_execute (ctx->kmac_func | KM_ENCRYPT, ¶ms, NULL, in, nblocks * BLOCKSIZE); memcpy (out, ¶ms[0], BLOCKSIZE); } else { kmc_execute (ctx->kmc_func | KM_ENCRYPT, ¶ms, out, in, nblocks * BLOCKSIZE); } /* Update IV with OCV. */ memcpy (iv, ¶ms[0], BLOCKSIZE); wipememory (¶ms, sizeof(params)); } static void aes_s390x_cbc_dec(void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks) { RIJNDAEL_context *ctx = context; byte *out = outbuf_arg; const byte *in = inbuf_arg; u128_t params[3]; /* Prepare parameter block (ICV & key). */ memcpy (¶ms[0], iv, BLOCKSIZE); memcpy (¶ms[1], ctx->keyschenc, 32); kmc_execute (ctx->kmc_func | KM_DECRYPT, ¶ms, out, in, nblocks * BLOCKSIZE); /* Update IV with OCV. */ memcpy (iv, ¶ms[0], BLOCKSIZE); wipememory (¶ms, sizeof(params)); } static void aes_s390x_cfb128_enc(void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks) { RIJNDAEL_context *ctx = context; byte *out = outbuf_arg; const byte *in = inbuf_arg; unsigned int function; u128_t params[3]; /* Prepare parameter block. */ memcpy (¶ms[0], iv, BLOCKSIZE); memcpy (¶ms[1], ctx->keyschenc, 32); function = ctx->kmf_func | KM_ENCRYPT | KMF_LCFB_16; kmf_execute (function, ¶ms, out, in, nblocks * BLOCKSIZE); /* Update IV with OCV. */ memcpy (iv, ¶ms[0], BLOCKSIZE); wipememory (¶ms, sizeof(params)); } static void aes_s390x_cfb128_dec(void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks) { RIJNDAEL_context *ctx = context; u128_t blocks[64]; byte *out = outbuf_arg; const byte *in = inbuf_arg; size_t max_blocks_used = 0; /* AES128-CFB128 decryption speed using KMF was observed to be the same as * the KMF encryption, ~1.03 cpb. Expection was to see similar performance * as for AES128-CBC decryption as decryption for both modes should be * parallalizeble (CBC shows ~0.22 cpb). 
Therefore there is quite a bit * of room for improvement and implementation below using KM instruction * shows ~0.70 cpb speed, ~30% improvement over KMF instruction. */ while (nblocks >= 64) { /* Copy IV to encrypt buffer, copy (nblocks - 1) input blocks to * encrypt buffer and update IV. */ asm volatile ("mvc 0(16, %[blocks]), 0(%[iv])\n\t" "mvc 16(240, %[blocks]), 0(%[in])\n\t" "mvc 256(256, %[blocks]), 240(%[in])\n\t" "mvc 512(256, %[blocks]), 496(%[in])\n\t" "mvc 768(256, %[blocks]), 752(%[in])\n\t" "mvc 0(16, %[iv]), 1008(%[in])\n\t" : : [in] "a" (in), [out] "a" (out), [blocks] "a" (blocks), [iv] "a" (iv) : "memory"); /* Perform encryption of temporary buffer. */ km_execute (ctx->km_func | KM_ENCRYPT, ctx->keyschenc, blocks, blocks, 64 * BLOCKSIZE); /* Xor encrypt buffer with input blocks and store to output blocks. */ asm volatile ("xc 0(256, %[blocks]), 0(%[in])\n\t" "xc 256(256, %[blocks]), 256(%[in])\n\t" "xc 512(256, %[blocks]), 512(%[in])\n\t" "xc 768(256, %[blocks]), 768(%[in])\n\t" "mvc 0(256, %[out]), 0(%[blocks])\n\t" "mvc 256(256, %[out]), 256(%[blocks])\n\t" "mvc 512(256, %[out]), 512(%[blocks])\n\t" "mvc 768(256, %[out]), 768(%[blocks])\n\t" : : [in] "a" (in), [out] "a" (out), [blocks] "a" (blocks) : "memory"); max_blocks_used = 64; in += 64 * BLOCKSIZE; out += 64 * BLOCKSIZE; nblocks -= 64; } if (nblocks) { unsigned int pos = 0; size_t in_nblocks = nblocks; size_t num_in = 0; max_blocks_used = max_blocks_used < nblocks ? nblocks : max_blocks_used; /* Copy IV to encrypt buffer. */ asm volatile ("mvc 0(16, %[blocks]), 0(%[iv])\n\t" : : [blocks] "a" (blocks), [iv] "a" (iv) : "memory"); pos += 1; #define CFB_MOVE_BLOCKS(block_oper, move_nbytes) \ block_oper (in_nblocks - 1 >= move_nbytes / BLOCKSIZE) \ { \ unsigned int move_nblocks = move_nbytes / BLOCKSIZE; \ asm volatile ("mvc 0(" #move_nbytes ", %[blocks_x]), 0(%[in])\n\t" \ : \ : [blocks_x] "a" (&blocks[pos]), [in] "a" (in) \ : "memory"); \ num_in += move_nblocks; \ in += move_nblocks * BLOCKSIZE; \ pos += move_nblocks; \ in_nblocks -= move_nblocks; \ } /* Copy (nblocks - 1) input blocks to encrypt buffer. */ CFB_MOVE_BLOCKS(while, 256); CFB_MOVE_BLOCKS(if, 128); CFB_MOVE_BLOCKS(if, 64); CFB_MOVE_BLOCKS(if, 32); CFB_MOVE_BLOCKS(if, 16); #undef CFB_MOVE_BLOCKS /* Update IV. */ asm volatile ("mvc 0(16, %[iv]), 0(%[in])\n\t" : : [iv] "a" (iv), [in] "a" (in) : "memory"); num_in += 1; in += BLOCKSIZE; /* Perform encryption of temporary buffer. */ km_execute (ctx->km_func | KM_ENCRYPT, ctx->keyschenc, blocks, blocks, nblocks * BLOCKSIZE); /* Xor encrypt buffer with input blocks and store to output blocks. 
*/ pos = 0; in -= nblocks * BLOCKSIZE; #define CFB_XOR_BLOCKS(block_oper, xor_nbytes) \ block_oper (nblocks >= xor_nbytes / BLOCKSIZE) \ { \ unsigned int xor_nblocks = xor_nbytes / BLOCKSIZE; \ asm volatile ("xc 0(" #xor_nbytes ", %[blocks_x]), 0(%[in])\n\t" \ "mvc 0(" #xor_nbytes ", %[out]), 0(%[blocks_x])\n\t" \ : \ : [blocks_x] "a" (&blocks[pos]), [out] "a" (out), \ [in] "a" (in) \ : "memory"); \ out += xor_nblocks * BLOCKSIZE; \ in += xor_nblocks * BLOCKSIZE; \ nblocks -= xor_nblocks; \ pos += xor_nblocks; \ } CFB_XOR_BLOCKS(while, 256); CFB_XOR_BLOCKS(if, 128); CFB_XOR_BLOCKS(if, 64); CFB_XOR_BLOCKS(if, 32); CFB_XOR_BLOCKS(if, 16); #undef CFB_XOR_BLOCKS } if (max_blocks_used) wipememory (&blocks, max_blocks_used * BLOCKSIZE); } static void aes_s390x_ofb_enc(void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks) { RIJNDAEL_context *ctx = context; byte *out = outbuf_arg; const byte *in = inbuf_arg; unsigned int function; u128_t params[3]; /* Prepare parameter block. */ memcpy (¶ms[0], iv, BLOCKSIZE); memcpy (¶ms[1], ctx->keyschenc, 32); function = ctx->kmo_func | KM_ENCRYPT; kmo_execute (function, ¶ms, out, in, nblocks * BLOCKSIZE); /* Update IV with OCV. */ memcpy (iv, ¶ms[0], BLOCKSIZE); wipememory (¶ms, sizeof(params)); } static void aes_s390x_ctr128_enc(void *context, unsigned char *ctr, void *outbuf_arg, const void *inbuf_arg, size_t nblocks) { RIJNDAEL_context *ctx = context; byte *out = outbuf_arg; const byte *in = inbuf_arg; unsigned int function; struct aes_s390x_gcm_params_s params; memset (¶ms.hash_subkey, 0, sizeof(params.hash_subkey)); memcpy (¶ms.key, ctx->keyschenc, 32); function = ctx->kma_func | KM_DECRYPT | KMA_HS | KMA_LAAD; while (nblocks) { u64 to_overflow = (u64)0xFFFFFFFFU + 1 - buf_get_be32 (ctr + 12); u64 ncurr = nblocks > to_overflow ? to_overflow : nblocks; /* Prepare parameter block. */ memset (¶ms.reserved, 0, sizeof(params.reserved)); buf_put_be32 (¶ms.counter_value, buf_get_be32(ctr + 12) - 1); memcpy (¶ms.initial_counter_value, ctr, 16); params.initial_counter_value[3] = params.counter_value; memset (¶ms.tag, 0, sizeof(params.tag)); params.total_aad_length = 0; params.total_cipher_length = 0; /* Update counter. */ cipher_block_add (ctr, ncurr, BLOCKSIZE); if (ncurr == (u64)0xFFFFFFFFU + 1) cipher_block_add (ctr, 1, BLOCKSIZE); /* Perform CTR using KMA-GCM. */ kma_execute (function, ¶ms, out, in, ncurr * BLOCKSIZE, NULL, 0); out += ncurr * BLOCKSIZE; in += ncurr * BLOCKSIZE; nblocks -= ncurr; } wipememory (¶ms, sizeof(params)); } static size_t aes_s390x_gcm_crypt(gcry_cipher_hd_t c, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int encrypt) { RIJNDAEL_context *ctx = (void *)&c->context.c; byte *out = outbuf_arg; const byte *in = inbuf_arg; byte *ctr = c->u_ctr.ctr; unsigned int function; struct aes_s390x_gcm_params_s params; function = ctx->kma_func | (encrypt ? KM_ENCRYPT : KM_DECRYPT) | KMA_HS | KMA_LAAD; /* Prepare parameter block. */ memset (¶ms.reserved, 0, sizeof(params.reserved)); buf_put_be32 (¶ms.counter_value, buf_get_be32(ctr + 12) - 1); memcpy (¶ms.tag, c->u_mode.gcm.u_tag.tag, 16); memcpy (¶ms.hash_subkey, c->u_mode.gcm.u_ghash_key.key, 16); params.total_aad_length = 0; params.total_cipher_length = 0; memcpy (¶ms.initial_counter_value, ctr, 12); params.initial_counter_value[3] = params.counter_value; memcpy (¶ms.key, ctx->keyschenc, 32); /* Update counter (CTR32). */ buf_put_be32(ctr + 12, buf_get_be32(ctr + 12) + nblocks); /* Perform KMA-GCM. 
*/ kma_execute (function, ¶ms, out, in, nblocks * BLOCKSIZE, NULL, 0); /* Update tag. */ memcpy (c->u_mode.gcm.u_tag.tag, ¶ms.tag, 16); wipememory (¶ms, sizeof(params)); return 0; } static void aes_s390x_xts_crypt(void *context, unsigned char *tweak, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int encrypt) { RIJNDAEL_context *ctx = context; byte *out = outbuf_arg; const byte *in = inbuf_arg; unsigned int function; u128_t params[3]; u128_t *params_tweak; if (ctx->rounds < 12) { memcpy (¶ms[0], ctx->keyschenc, 16); params_tweak = ¶ms[1]; memcpy (params_tweak, tweak, BLOCKSIZE); } else if (ctx->rounds == 12) { BUG(); /* KM-XTS-AES-192 not defined. */ } else { memcpy (¶ms[0], ctx->keyschenc, 32); params_tweak = ¶ms[2]; memcpy (params_tweak, tweak, BLOCKSIZE); } function = ctx->km_func_xts | (encrypt ? KM_ENCRYPT : KM_DECRYPT); km_execute (function, ¶ms, out, in, nblocks * BLOCKSIZE); /* Update tweak with XTSP. */ memcpy (tweak, params_tweak, BLOCKSIZE); wipememory (¶ms, sizeof(params)); } static NO_INLINE void aes_s390x_ocb_prepare_Ls (gcry_cipher_hd_t c, u64 blkn, const void *Ls[64], const void ***pl) { unsigned int n = 64 - (blkn % 64); int i; /* Prepare L pointers. */ *pl = &Ls[(63 + n) % 64]; for (i = 0; i < 64; i += 8, n = (n + 8) % 64) { static const int lastL[8] = { 3, 4, 3, 5, 3, 4, 3, 0 }; Ls[(0 + n) % 64] = c->u_mode.ocb.L[0]; Ls[(1 + n) % 64] = c->u_mode.ocb.L[1]; Ls[(2 + n) % 64] = c->u_mode.ocb.L[0]; Ls[(3 + n) % 64] = c->u_mode.ocb.L[2]; Ls[(4 + n) % 64] = c->u_mode.ocb.L[0]; Ls[(5 + n) % 64] = c->u_mode.ocb.L[1]; Ls[(6 + n) % 64] = c->u_mode.ocb.L[0]; Ls[(7 + n) % 64] = c->u_mode.ocb.L[lastL[i / 8]]; } } static NO_INLINE void aes_s390x_ocb_checksum (unsigned char *checksum, const void *plainbuf_arg, size_t nblks) { const char *plainbuf = plainbuf_arg; u64 tmp0[2]; u64 tmp1[2] = { 0, 0 }; u64 tmp2[2] = { 0, 0 }; u64 tmp3[2] = { 0, 0 }; cipher_block_cpy (tmp0, checksum, BLOCKSIZE); if (nblks >= 4) { while (nblks >= 4) { /* Checksum_i = Checksum_{i-1} xor P_i */ cipher_block_xor_1 (tmp0, plainbuf + 0 * BLOCKSIZE, BLOCKSIZE); cipher_block_xor_1 (tmp1, plainbuf + 1 * BLOCKSIZE, BLOCKSIZE); cipher_block_xor_1 (tmp2, plainbuf + 2 * BLOCKSIZE, BLOCKSIZE); cipher_block_xor_1 (tmp3, plainbuf + 3 * BLOCKSIZE, BLOCKSIZE); plainbuf += 4 * BLOCKSIZE; nblks -= 4; } cipher_block_xor_1 (tmp0, tmp1, BLOCKSIZE); cipher_block_xor_1 (tmp2, tmp3, BLOCKSIZE); cipher_block_xor_1 (tmp0, tmp2, BLOCKSIZE); wipememory (tmp1, sizeof(tmp1)); wipememory (tmp2, sizeof(tmp2)); wipememory (tmp3, sizeof(tmp3)); } while (nblks > 0) { /* Checksum_i = Checksum_{i-1} xor P_i */ cipher_block_xor_1 (tmp0, plainbuf, BLOCKSIZE); plainbuf += BLOCKSIZE; nblks--; } cipher_block_cpy (checksum, tmp0, BLOCKSIZE); wipememory (tmp0, sizeof(tmp0)); } static NO_INLINE size_t aes_s390x_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg, const void *inbuf_arg, size_t nblocks_arg) { RIJNDAEL_context *ctx = (void *)&c->context.c; unsigned char *outbuf = outbuf_arg; const unsigned char *inbuf = inbuf_arg; size_t nblocks = nblocks_arg; u128_t blocks[64]; u128_t offset; size_t max_blocks_used = 0; u64 blkn = c->u_mode.ocb.data_nblocks; unsigned int function = ctx->km_func | KM_ENCRYPT; const void *Ls[64]; const void **pl; aes_s390x_ocb_prepare_Ls (c, blkn, Ls, &pl); /* Checksumming could be done inline in OCB_INPUT macros, but register * pressure becomes too heavy and performance would end up being worse. 
* For decryption, checksumming is part of OCB_OUTPUT macros as * output handling is less demanding and can handle the additional * computation. */ aes_s390x_ocb_checksum (c->u_ctr.ctr, inbuf_arg, nblocks_arg); cipher_block_cpy (&offset, &c->u_iv.iv, BLOCKSIZE); #define OCB_INPUT(n) \ cipher_block_xor_2dst (&blocks[n], &offset, Ls[n], BLOCKSIZE); \ cipher_block_xor (outbuf + (n) * BLOCKSIZE, inbuf + (n) * BLOCKSIZE, \ &offset, BLOCKSIZE) #define OCB_INPUT_4(n) \ OCB_INPUT((n) + 0); OCB_INPUT((n) + 1); OCB_INPUT((n) + 2); \ OCB_INPUT((n) + 3) #define OCB_INPUT_16(n) \ OCB_INPUT_4((n) + 0); OCB_INPUT_4((n) + 4); OCB_INPUT_4((n) + 8); \ OCB_INPUT_4((n) + 12); #define OCB_OUTPUT(n) \ cipher_block_xor_1 (outbuf + (n) * BLOCKSIZE, &blocks[n], BLOCKSIZE) #define OCB_OUTPUT_4(n) \ OCB_OUTPUT((n) + 0); OCB_OUTPUT((n) + 1); OCB_OUTPUT((n) + 2); \ OCB_OUTPUT((n) + 3) #define OCB_OUTPUT_16(n) \ OCB_OUTPUT_4((n) + 0); OCB_OUTPUT_4((n) + 4); OCB_OUTPUT_4((n) + 8); \ OCB_OUTPUT_4((n) + 12); while (nblocks >= 64) { blkn += 64; *pl = ocb_get_l(c, blkn - blkn % 64); OCB_INPUT_16(0); OCB_INPUT_16(16); OCB_INPUT_16(32); OCB_INPUT_16(48); km_execute (function, ctx->keyschenc, outbuf, outbuf, 64 * BLOCKSIZE); asm volatile ("xc 0(256, %[out]), 0(%[blocks])\n\t" "xc 256(256, %[out]), 256(%[blocks])\n\t" "xc 512(256, %[out]), 512(%[blocks])\n\t" "xc 768(256, %[out]), 768(%[blocks])\n\t" : : [out] "a" (outbuf), [blocks] "a" (blocks) : "memory"); max_blocks_used = 64; inbuf += 64 * BLOCKSIZE; outbuf += 64 * BLOCKSIZE; nblocks -= 64; } if (nblocks) { unsigned int pos = 0; max_blocks_used = max_blocks_used < nblocks ? nblocks : max_blocks_used; blkn += nblocks; *pl = ocb_get_l(c, blkn - blkn % 64); while (nblocks >= 16) { OCB_INPUT_16(pos + 0); pos += 16; nblocks -= 16; } while (nblocks >= 4) { OCB_INPUT_4(pos + 0); pos += 4; nblocks -= 4; } if (nblocks >= 2) { OCB_INPUT(pos + 0); OCB_INPUT(pos + 1); pos += 2; nblocks -= 2; } if (nblocks >= 1) { OCB_INPUT(pos + 0); pos += 1; nblocks -= 1; } nblocks = pos; pos = 0; km_execute (function, ctx->keyschenc, outbuf, outbuf, nblocks * BLOCKSIZE); while (nblocks >= 16) { OCB_OUTPUT_16(pos + 0); pos += 16; nblocks -= 16; } while (nblocks >= 4) { OCB_OUTPUT_4(pos + 0); pos += 4; nblocks -= 4; } if (nblocks >= 2) { OCB_OUTPUT(pos + 0); OCB_OUTPUT(pos + 1); pos += 2; nblocks -= 2; } if (nblocks >= 1) { OCB_OUTPUT(pos + 0); pos += 1; nblocks -= 1; } } #undef OCB_INPUT #undef OCB_INPUT_4 #undef OCB_INPUT_16 #undef OCB_OUTPUT #undef OCB_OUTPUT_4 #undef OCB_OUTPUT_16 c->u_mode.ocb.data_nblocks = blkn; cipher_block_cpy (&c->u_iv.iv, &offset, BLOCKSIZE); if (max_blocks_used) wipememory (&blocks, max_blocks_used * BLOCKSIZE); return 0; } static NO_INLINE size_t aes_s390x_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg, const void *inbuf_arg, size_t nblocks_arg) { RIJNDAEL_context *ctx = (void *)&c->context.c; unsigned char *outbuf = outbuf_arg; const unsigned char *inbuf = inbuf_arg; size_t nblocks = nblocks_arg; u128_t blocks[64]; u128_t offset; size_t max_blocks_used = 0; u64 blkn = c->u_mode.ocb.data_nblocks; unsigned int function = ctx->km_func | KM_DECRYPT; const void *Ls[64]; const void **pl; aes_s390x_ocb_prepare_Ls (c, blkn, Ls, &pl); cipher_block_cpy (&offset, &c->u_iv.iv, BLOCKSIZE); #define OCB_INPUT(n) \ cipher_block_xor_2dst (&blocks[n], &offset, Ls[n], BLOCKSIZE); \ cipher_block_xor (outbuf + (n) * BLOCKSIZE, inbuf + (n) * BLOCKSIZE, \ &offset, BLOCKSIZE) #define OCB_INPUT_4(n) \ OCB_INPUT((n) + 0); OCB_INPUT((n) + 1); OCB_INPUT((n) + 2); \ OCB_INPUT((n) + 3) #define 
OCB_INPUT_16(n) \ OCB_INPUT_4((n) + 0); OCB_INPUT_4((n) + 4); OCB_INPUT_4((n) + 8); \ OCB_INPUT_4((n) + 12); #define OCB_OUTPUT(n) \ cipher_block_xor_1 (&blocks[n], outbuf + (n) * BLOCKSIZE, BLOCKSIZE); \ cipher_block_xor_1 (c->u_ctr.ctr, &blocks[n], BLOCKSIZE); \ cipher_block_cpy (outbuf + (n) * BLOCKSIZE, &blocks[n], BLOCKSIZE); #define OCB_OUTPUT_4(n) \ OCB_OUTPUT((n) + 0); OCB_OUTPUT((n) + 1); OCB_OUTPUT((n) + 2); \ OCB_OUTPUT((n) + 3) #define OCB_OUTPUT_16(n) \ OCB_OUTPUT_4((n) + 0); OCB_OUTPUT_4((n) + 4); OCB_OUTPUT_4((n) + 8); \ OCB_OUTPUT_4((n) + 12); while (nblocks >= 64) { blkn += 64; *pl = ocb_get_l(c, blkn - blkn % 64); OCB_INPUT_16(0); OCB_INPUT_16(16); OCB_INPUT_16(32); OCB_INPUT_16(48); km_execute (function, ctx->keyschenc, outbuf, outbuf, 64 * BLOCKSIZE); asm volatile ("xc 0(256, %[out]), 0(%[blocks])\n\t" "xc 256(256, %[out]), 256(%[blocks])\n\t" "xc 512(256, %[out]), 512(%[blocks])\n\t" "xc 768(256, %[out]), 768(%[blocks])\n\t" : : [out] "a" (outbuf), [blocks] "a" (blocks) : "memory"); max_blocks_used = 64; inbuf += 64 * BLOCKSIZE; outbuf += 64 * BLOCKSIZE; nblocks -= 64; } if (nblocks) { unsigned int pos = 0; max_blocks_used = max_blocks_used < nblocks ? nblocks : max_blocks_used; blkn += nblocks; *pl = ocb_get_l(c, blkn - blkn % 64); while (nblocks >= 16) { OCB_INPUT_16(pos + 0); pos += 16; nblocks -= 16; } while (nblocks >= 4) { OCB_INPUT_4(pos + 0); pos += 4; nblocks -= 4; } if (nblocks >= 2) { OCB_INPUT(pos + 0); OCB_INPUT(pos + 1); pos += 2; nblocks -= 2; } if (nblocks >= 1) { OCB_INPUT(pos + 0); pos += 1; nblocks -= 1; } nblocks = pos; pos = 0; km_execute (function, ctx->keyschenc, outbuf, outbuf, nblocks * BLOCKSIZE); while (nblocks >= 16) { OCB_OUTPUT_16(pos + 0); pos += 16; nblocks -= 16; } while (nblocks >= 4) { OCB_OUTPUT_4(pos + 0); pos += 4; nblocks -= 4; } if (nblocks >= 2) { OCB_OUTPUT(pos + 0); OCB_OUTPUT(pos + 1); pos += 2; nblocks -= 2; } if (nblocks >= 1) { OCB_OUTPUT(pos + 0); pos += 1; nblocks -= 1; } } #undef OCB_INPUT #undef OCB_INPUT_4 #undef OCB_INPUT_16 #undef OCB_OUTPUT #undef OCB_OUTPUT_4 #undef OCB_OUTPUT_16 c->u_mode.ocb.data_nblocks = blkn; cipher_block_cpy (&c->u_iv.iv, &offset, BLOCKSIZE); if (max_blocks_used) wipememory (&blocks, max_blocks_used * BLOCKSIZE); return 0; } static size_t aes_s390x_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, const void *inbuf_arg, size_t nblocks_arg, int encrypt) { if (encrypt) return aes_s390x_ocb_enc (c, outbuf_arg, inbuf_arg, nblocks_arg); else return aes_s390x_ocb_dec (c, outbuf_arg, inbuf_arg, nblocks_arg); } static size_t aes_s390x_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks_arg) { RIJNDAEL_context *ctx = (void *)&c->context.c; const unsigned char *abuf = abuf_arg; u128_t blocks[64]; u128_t offset; size_t max_blocks_used = 0; u64 blkn = c->u_mode.ocb.aad_nblocks; unsigned int function = ctx->km_func | KM_ENCRYPT; const void *Ls[64]; const void **pl; aes_s390x_ocb_prepare_Ls (c, blkn, Ls, &pl); cipher_block_cpy (&offset, c->u_mode.ocb.aad_offset, BLOCKSIZE); #define OCB_INPUT(n) \ cipher_block_xor_2dst (&blocks[n], &offset, Ls[n], BLOCKSIZE); \ cipher_block_xor_1 (&blocks[n], abuf + (n) * BLOCKSIZE, BLOCKSIZE) #define OCB_INPUT_4(n) \ OCB_INPUT((n) + 0); OCB_INPUT((n) + 1); OCB_INPUT((n) + 2); \ OCB_INPUT((n) + 3) #define OCB_INPUT_16(n) \ OCB_INPUT_4((n) + 0); OCB_INPUT_4((n) + 4); OCB_INPUT_4((n) + 8); \ OCB_INPUT_4((n) + 12); while (nblocks_arg >= 64) { blkn += 64; *pl = ocb_get_l(c, blkn - blkn % 64); OCB_INPUT_16(0); OCB_INPUT_16(16); OCB_INPUT_16(32); OCB_INPUT_16(48); 
km_execute (function, ctx->keyschenc, blocks, blocks, 64 * BLOCKSIZE); aes_s390x_ocb_checksum (c->u_mode.ocb.aad_sum, blocks, 64); max_blocks_used = 64; abuf += 64 * BLOCKSIZE; nblocks_arg -= 64; } if (nblocks_arg > 0) { size_t nblocks = nblocks_arg; unsigned int pos = 0; max_blocks_used = max_blocks_used < nblocks ? nblocks : max_blocks_used; blkn += nblocks; *pl = ocb_get_l(c, blkn - blkn % 64); while (nblocks >= 16) { OCB_INPUT_16(pos + 0); pos += 16; nblocks -= 16; } while (nblocks >= 4) { OCB_INPUT_4(pos + 0); pos += 4; nblocks -= 4; } if (nblocks >= 2) { OCB_INPUT(pos + 0); OCB_INPUT(pos + 1); pos += 2; nblocks -= 2; } if (nblocks >= 1) { OCB_INPUT(pos + 0); pos += 1; nblocks -= 1; } nblocks = pos; nblocks_arg -= pos; pos = 0; km_execute (function, ctx->keyschenc, blocks, blocks, nblocks * BLOCKSIZE); aes_s390x_ocb_checksum (c->u_mode.ocb.aad_sum, blocks, nblocks); } #undef OCB_INPUT #undef OCB_INPUT_4 #undef OCB_INPUT_16 c->u_mode.ocb.aad_nblocks = blkn; cipher_block_cpy (c->u_mode.ocb.aad_offset, &offset, BLOCKSIZE); if (max_blocks_used) wipememory (&blocks, max_blocks_used * BLOCKSIZE); return 0; } int _gcry_aes_s390x_setup_acceleration(RIJNDAEL_context *ctx, unsigned int keylen, unsigned int hwfeatures, cipher_bulk_ops_t *bulk_ops) { unsigned int func; unsigned int func_xts; u128_t func_mask; u128_t func_xts_mask; if (!(hwfeatures & HWF_S390X_MSA)) return 0; switch (keylen) { default: case 16: func = KM_FUNCTION_AES_128; func_xts = KM_FUNCTION_XTS_AES_128; func_mask = km_function_to_mask(KM_FUNCTION_AES_128); func_xts_mask = km_function_to_mask(KM_FUNCTION_XTS_AES_128); break; case 24: func = KM_FUNCTION_AES_192; func_xts = 0; func_mask = km_function_to_mask(KM_FUNCTION_AES_192); func_xts_mask = 0; /* XTS-AES192 not available. */ break; case 32: func = KM_FUNCTION_AES_256; func_xts = KM_FUNCTION_XTS_AES_256; func_mask = km_function_to_mask(KM_FUNCTION_AES_256); func_xts_mask = km_function_to_mask(KM_FUNCTION_AES_256); break; } /* Query KM for supported algorithms and check if acceleration for * requested key-length is available. */ if (!(km_query () & func_mask)) return 0; ctx->km_func = func; /* Query KM for supported XTS algorithms. */ if (km_query () & func_xts_mask) ctx->km_func_xts = func_xts; /* Query KMC for supported algorithms. */ if (kmc_query () & func_mask) ctx->kmc_func = func; /* Query KMAC for supported algorithms. */ if (kmac_query () & func_mask) ctx->kmac_func = func; if (hwfeatures & HWF_S390X_MSA_4) { /* Query KMF for supported algorithms. */ if (kmf_query () & func_mask) ctx->kmf_func = func; /* Query KMO for supported algorithms. */ if (kmo_query () & func_mask) ctx->kmo_func = func; } if (hwfeatures & HWF_S390X_MSA_8) { /* Query KMA for supported algorithms. */ if (kma_query () & func_mask) ctx->kma_func = func; } /* Setup zSeries bulk encryption/decryption routines. */ if (ctx->km_func) { bulk_ops->ocb_crypt = aes_s390x_ocb_crypt; bulk_ops->ocb_auth = aes_s390x_ocb_auth; /* CFB128 decryption uses KM instruction, instead of KMF. */ bulk_ops->cfb_dec = aes_s390x_cfb128_dec; } if (ctx->km_func_xts) { bulk_ops->xts_crypt = aes_s390x_xts_crypt; } if (ctx->kmc_func) { if(ctx->kmac_func) { /* Either KMC or KMAC used depending on 'cbc_mac' parameter. 
*/ bulk_ops->cbc_enc = aes_s390x_cbc_enc; } bulk_ops->cbc_dec = aes_s390x_cbc_dec; } if (ctx->kmf_func) { bulk_ops->cfb_enc = aes_s390x_cfb128_enc; } if (ctx->kmo_func) { bulk_ops->ofb_enc = aes_s390x_ofb_enc; } if (ctx->kma_func) { bulk_ops->ctr_enc = aes_s390x_ctr128_enc; if (kimd_query () & km_function_to_mask (KMID_FUNCTION_GHASH)) { /* KIMD based GHASH implementation is required with AES-GCM * acceleration. */ bulk_ops->gcm_crypt = aes_s390x_gcm_crypt; } } return 1; } void _gcry_aes_s390x_setkey(RIJNDAEL_context *ctx, const byte *key) { unsigned int keylen = 16 + (ctx->rounds - 10) * 4; memcpy (ctx->keyschenc, key, keylen); } void _gcry_aes_s390x_prepare_decryption(RIJNDAEL_context *ctx) { /* Do nothing. */ (void)ctx; } #endif /* USE_S390X_CRYPTO */ diff --git a/cipher/sha1.c b/cipher/sha1.c index d3ee982b..287bd826 100644 --- a/cipher/sha1.c +++ b/cipher/sha1.c @@ -1,709 +1,765 @@ /* sha1.c - SHA1 hash function * Copyright (C) 1998, 2001, 2002, 2003, 2008 Free Software Foundation, Inc. * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . */ /* Test vectors: * * "abc" * A999 3E36 4706 816A BA3E 2571 7850 C26C 9CD0 D89D * * "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq" * 8498 3E44 1C3B D26E BAAE 4AA1 F951 29E5 E546 70F1 */ #include #include #include #include #ifdef HAVE_STDINT_H # include #endif #include "g10lib.h" #include "bithelp.h" #include "bufhelp.h" #include "cipher.h" #include "sha1.h" /* USE_SSSE3 indicates whether to compile with Intel SSSE3 code. */ #undef USE_SSSE3 #if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_SSSE3) && \ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) # define USE_SSSE3 1 #endif /* USE_AVX indicates whether to compile with Intel AVX code. */ #undef USE_AVX #if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_AVX) && \ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) # define USE_AVX 1 #endif /* USE_BMI2 indicates whether to compile with Intel AVX/BMI2 code. */ #undef USE_BMI2 #if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_AVX) && \ defined(HAVE_GCC_INLINE_ASM_BMI2) && \ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) # define USE_BMI2 1 #endif /* USE_AVX2 indicates whether to compile with Intel AVX2/BMI2 code. */ #undef USE_AVX2 #if defined(USE_BMI2) && defined(HAVE_GCC_INLINE_ASM_AVX2) # define USE_AVX2 1 #endif /* USE_SHAEXT indicates whether to compile with Intel SHA Extension code. */ #undef USE_SHAEXT #if defined(HAVE_GCC_INLINE_ASM_SHAEXT) && \ defined(HAVE_GCC_INLINE_ASM_SSE41) && \ defined(ENABLE_SHAEXT_SUPPORT) # define USE_SHAEXT 1 #endif /* USE_NEON indicates whether to enable ARM NEON assembly code. 
*/ #undef USE_NEON #ifdef ENABLE_NEON_SUPPORT # if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) \ && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) \ && defined(HAVE_GCC_INLINE_ASM_NEON) # define USE_NEON 1 # endif #endif /* USE_ARM_CE indicates whether to enable ARMv8 Crypto Extension assembly * code. */ #undef USE_ARM_CE #ifdef ENABLE_ARM_CRYPTO_SUPPORT # if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) \ && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) \ && defined(HAVE_GCC_INLINE_ASM_AARCH32_CRYPTO) # define USE_ARM_CE 1 # elif defined(__AARCH64EL__) \ && defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) \ && defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO) # define USE_ARM_CE 1 # endif #endif + /* A macro to test whether P is properly aligned for an u32 type. Note that config.h provides a suitable replacement for uintptr_t if it does not exist in stdint.h. */ /* #if __GNUC__ >= 2 */ /* # define U32_ALIGNED_P(p) (!(((uintptr_t)p) % __alignof__ (u32))) */ /* #else */ /* # define U32_ALIGNED_P(p) (!(((uintptr_t)p) % sizeof (u32))) */ /* #endif */ /* Assembly implementations use SystemV ABI, ABI conversion and additional * stack to store XMM6-XMM15 needed on Win64. */ #undef ASM_FUNC_ABI #undef ASM_EXTRA_STACK #if defined(USE_SSSE3) || defined(USE_AVX) || defined(USE_BMI2) || \ defined(USE_SHAEXT) # ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS # define ASM_FUNC_ABI __attribute__((sysv_abi)) # define ASM_EXTRA_STACK (10 * 16 + sizeof(void *) * 4) # else # define ASM_FUNC_ABI # define ASM_EXTRA_STACK 0 # endif #endif #ifdef USE_SSSE3 unsigned int _gcry_sha1_transform_amd64_ssse3 (void *state, const unsigned char *data, size_t nblks) ASM_FUNC_ABI; static unsigned int do_sha1_transform_amd64_ssse3 (void *ctx, const unsigned char *data, size_t nblks) { SHA1_CONTEXT *hd = ctx; return _gcry_sha1_transform_amd64_ssse3 (&hd->h0, data, nblks) + ASM_EXTRA_STACK; } #endif #ifdef USE_AVX unsigned int _gcry_sha1_transform_amd64_avx (void *state, const unsigned char *data, size_t nblks) ASM_FUNC_ABI; static unsigned int do_sha1_transform_amd64_avx (void *ctx, const unsigned char *data, size_t nblks) { SHA1_CONTEXT *hd = ctx; return _gcry_sha1_transform_amd64_avx (&hd->h0, data, nblks) + ASM_EXTRA_STACK; } #endif #ifdef USE_BMI2 unsigned int _gcry_sha1_transform_amd64_avx_bmi2 (void *state, const unsigned char *data, size_t nblks) ASM_FUNC_ABI; static unsigned int do_sha1_transform_amd64_avx_bmi2 (void *ctx, const unsigned char *data, size_t nblks) { SHA1_CONTEXT *hd = ctx; return _gcry_sha1_transform_amd64_avx_bmi2 (&hd->h0, data, nblks) + ASM_EXTRA_STACK; } #ifdef USE_AVX2 unsigned int _gcry_sha1_transform_amd64_avx2_bmi2 (void *state, const unsigned char *data, size_t nblks) ASM_FUNC_ABI; static unsigned int do_sha1_transform_amd64_avx2_bmi2 (void *ctx, const unsigned char *data, size_t nblks) { SHA1_CONTEXT *hd = ctx; /* AVX2/BMI2 function only handles pair of blocks so nblks needs to be * multiple of 2 and function does not handle zero nblks. Use AVX/BMI2 * code to handle these cases. 
*/ if (nblks <= 1) return do_sha1_transform_amd64_avx_bmi2 (ctx, data, nblks); if (nblks & 1) { (void)_gcry_sha1_transform_amd64_avx_bmi2 (&hd->h0, data, 1); nblks--; data += 64; } return _gcry_sha1_transform_amd64_avx2_bmi2 (&hd->h0, data, nblks) + ASM_EXTRA_STACK; } #endif /* USE_AVX2 */ #endif /* USE_BMI2 */ #ifdef USE_SHAEXT /* Does not need ASM_FUNC_ABI */ unsigned int _gcry_sha1_transform_intel_shaext (void *state, const unsigned char *data, size_t nblks); static unsigned int do_sha1_transform_intel_shaext (void *ctx, const unsigned char *data, size_t nblks) { SHA1_CONTEXT *hd = ctx; return _gcry_sha1_transform_intel_shaext (&hd->h0, data, nblks); } #endif #ifdef USE_NEON unsigned int _gcry_sha1_transform_armv7_neon (void *state, const unsigned char *data, size_t nblks); static unsigned int do_sha1_transform_armv7_neon (void *ctx, const unsigned char *data, size_t nblks) { SHA1_CONTEXT *hd = ctx; return _gcry_sha1_transform_armv7_neon (&hd->h0, data, nblks); } #endif #ifdef USE_ARM_CE unsigned int _gcry_sha1_transform_armv8_ce (void *state, const unsigned char *data, size_t nblks); static unsigned int do_sha1_transform_armv8_ce (void *ctx, const unsigned char *data, size_t nblks) { SHA1_CONTEXT *hd = ctx; return _gcry_sha1_transform_armv8_ce (&hd->h0, data, nblks); } #endif +#ifdef SHA1_USE_S390X_CRYPTO +#include "asm-inline-s390x.h" + +static unsigned int +do_sha1_transform_s390x (void *ctx, const unsigned char *data, size_t nblks) +{ + SHA1_CONTEXT *hd = ctx; + + kimd_execute (KMID_FUNCTION_SHA1, &hd->h0, data, nblks * 64); + return 0; +} + +static unsigned int +do_sha1_final_s390x (void *ctx, const unsigned char *data, size_t datalen, + u32 len_msb, u32 len_lsb) +{ + SHA1_CONTEXT *hd = ctx; + + /* Make sure that 'final_len' is positioned at correct offset relative + * to 'h0'. This is because we are passing 'h0' pointer as start of + * parameter block to 'klmd' instruction. */ + + gcry_assert (offsetof (SHA1_CONTEXT, final_len_msb) + - offsetof (SHA1_CONTEXT, h0) == 5 * sizeof(u32)); + gcry_assert (offsetof (SHA1_CONTEXT, final_len_lsb) + - offsetof (SHA1_CONTEXT, final_len_msb) == 1 * sizeof(u32)); + + hd->final_len_msb = len_msb; + hd->final_len_lsb = len_lsb; + + klmd_execute (KMID_FUNCTION_SHA1, &hd->h0, data, datalen); + return 0; +} +#endif + static unsigned int do_transform_generic (void *c, const unsigned char *data, size_t nblks); static void sha1_init (void *context, unsigned int flags) { SHA1_CONTEXT *hd = context; unsigned int features = _gcry_get_hw_features (); (void)flags; hd->h0 = 0x67452301; hd->h1 = 0xefcdab89; hd->h2 = 0x98badcfe; hd->h3 = 0x10325476; hd->h4 = 0xc3d2e1f0; hd->bctx.nblocks = 0; hd->bctx.nblocks_high = 0; hd->bctx.count = 0; hd->bctx.blocksize_shift = _gcry_ctz(64); /* Order of feature checks is important here; last match will be * selected. Keep slower implementations at the top and faster at * the bottom. */ hd->bctx.bwrite = do_transform_generic; #ifdef USE_SSSE3 if ((features & HWF_INTEL_SSSE3) != 0) hd->bctx.bwrite = do_sha1_transform_amd64_ssse3; #endif #ifdef USE_AVX /* AVX implementation uses SHLD which is known to be slow on non-Intel CPUs. * Therefore use this implementation on Intel CPUs only. 
*/ if ((features & HWF_INTEL_AVX) && (features & HWF_INTEL_FAST_SHLD)) hd->bctx.bwrite = do_sha1_transform_amd64_avx; #endif #ifdef USE_BMI2 if ((features & HWF_INTEL_AVX) && (features & HWF_INTEL_BMI2)) hd->bctx.bwrite = do_sha1_transform_amd64_avx_bmi2; #endif #ifdef USE_AVX2 if ((features & HWF_INTEL_AVX2) && (features & HWF_INTEL_AVX) && (features & HWF_INTEL_BMI2)) hd->bctx.bwrite = do_sha1_transform_amd64_avx2_bmi2; #endif #ifdef USE_SHAEXT if ((features & HWF_INTEL_SHAEXT) && (features & HWF_INTEL_SSE4_1)) hd->bctx.bwrite = do_sha1_transform_intel_shaext; #endif #ifdef USE_NEON if ((features & HWF_ARM_NEON) != 0) hd->bctx.bwrite = do_sha1_transform_armv7_neon; #endif #ifdef USE_ARM_CE if ((features & HWF_ARM_SHA1) != 0) hd->bctx.bwrite = do_sha1_transform_armv8_ce; #endif +#ifdef SHA1_USE_S390X_CRYPTO + hd->use_s390x_crypto = 0; + if ((features & HWF_S390X_MSA) != 0) + { + if ((kimd_query () & km_function_to_mask (KMID_FUNCTION_SHA1)) && + (klmd_query () & km_function_to_mask (KMID_FUNCTION_SHA1))) + { + hd->bctx.bwrite = do_sha1_transform_s390x; + hd->use_s390x_crypto = 1; + } + } +#endif (void)features; } /* * Initialize the context HD. This is used to prepare the use of * _gcry_sha1_mixblock. WARNING: This is a special purpose function * for exclusive use by random-csprng.c. */ void _gcry_sha1_mixblock_init (SHA1_CONTEXT *hd) { sha1_init (hd, 0); } /* Round function macros. */ #define K1 0x5A827999L #define K2 0x6ED9EBA1L #define K3 0x8F1BBCDCL #define K4 0xCA62C1D6L #define F1(x,y,z) ( z ^ ( x & ( y ^ z ) ) ) #define F2(x,y,z) ( x ^ y ^ z ) #define F3(x,y,z) ( ( x & y ) | ( z & ( x | y ) ) ) #define F4(x,y,z) ( x ^ y ^ z ) #define M(i) ( tm = x[ i &0x0f] \ ^ x[(i-14)&0x0f] \ ^ x[(i-8) &0x0f] \ ^ x[(i-3) &0x0f], \ (x[i&0x0f] = rol(tm, 1))) #define R(a,b,c,d,e,f,k,m) do { e += rol( a, 5 ) \ + f( b, c, d ) \ + k \ + m; \ b = rol( b, 30 ); \ } while(0) /* * Transform NBLOCKS of each 64 bytes (16 32-bit words) at DATA. */ static unsigned int do_transform_generic (void *ctx, const unsigned char *data, size_t nblks) { SHA1_CONTEXT *hd = ctx; do { const u32 *idata = (const void *)data; u32 a, b, c, d, e; /* Local copies of the chaining variables. */ u32 tm; /* Helper. */ u32 x[16]; /* The array we work on. */ #define I(i) (x[i] = buf_get_be32(idata + i)) /* Get the values of the chaining variables. */ a = hd->h0; b = hd->h1; c = hd->h2; d = hd->h3; e = hd->h4; /* Transform. 
*/ R( a, b, c, d, e, F1, K1, I( 0) ); R( e, a, b, c, d, F1, K1, I( 1) ); R( d, e, a, b, c, F1, K1, I( 2) ); R( c, d, e, a, b, F1, K1, I( 3) ); R( b, c, d, e, a, F1, K1, I( 4) ); R( a, b, c, d, e, F1, K1, I( 5) ); R( e, a, b, c, d, F1, K1, I( 6) ); R( d, e, a, b, c, F1, K1, I( 7) ); R( c, d, e, a, b, F1, K1, I( 8) ); R( b, c, d, e, a, F1, K1, I( 9) ); R( a, b, c, d, e, F1, K1, I(10) ); R( e, a, b, c, d, F1, K1, I(11) ); R( d, e, a, b, c, F1, K1, I(12) ); R( c, d, e, a, b, F1, K1, I(13) ); R( b, c, d, e, a, F1, K1, I(14) ); R( a, b, c, d, e, F1, K1, I(15) ); R( e, a, b, c, d, F1, K1, M(16) ); R( d, e, a, b, c, F1, K1, M(17) ); R( c, d, e, a, b, F1, K1, M(18) ); R( b, c, d, e, a, F1, K1, M(19) ); R( a, b, c, d, e, F2, K2, M(20) ); R( e, a, b, c, d, F2, K2, M(21) ); R( d, e, a, b, c, F2, K2, M(22) ); R( c, d, e, a, b, F2, K2, M(23) ); R( b, c, d, e, a, F2, K2, M(24) ); R( a, b, c, d, e, F2, K2, M(25) ); R( e, a, b, c, d, F2, K2, M(26) ); R( d, e, a, b, c, F2, K2, M(27) ); R( c, d, e, a, b, F2, K2, M(28) ); R( b, c, d, e, a, F2, K2, M(29) ); R( a, b, c, d, e, F2, K2, M(30) ); R( e, a, b, c, d, F2, K2, M(31) ); R( d, e, a, b, c, F2, K2, M(32) ); R( c, d, e, a, b, F2, K2, M(33) ); R( b, c, d, e, a, F2, K2, M(34) ); R( a, b, c, d, e, F2, K2, M(35) ); R( e, a, b, c, d, F2, K2, M(36) ); R( d, e, a, b, c, F2, K2, M(37) ); R( c, d, e, a, b, F2, K2, M(38) ); R( b, c, d, e, a, F2, K2, M(39) ); R( a, b, c, d, e, F3, K3, M(40) ); R( e, a, b, c, d, F3, K3, M(41) ); R( d, e, a, b, c, F3, K3, M(42) ); R( c, d, e, a, b, F3, K3, M(43) ); R( b, c, d, e, a, F3, K3, M(44) ); R( a, b, c, d, e, F3, K3, M(45) ); R( e, a, b, c, d, F3, K3, M(46) ); R( d, e, a, b, c, F3, K3, M(47) ); R( c, d, e, a, b, F3, K3, M(48) ); R( b, c, d, e, a, F3, K3, M(49) ); R( a, b, c, d, e, F3, K3, M(50) ); R( e, a, b, c, d, F3, K3, M(51) ); R( d, e, a, b, c, F3, K3, M(52) ); R( c, d, e, a, b, F3, K3, M(53) ); R( b, c, d, e, a, F3, K3, M(54) ); R( a, b, c, d, e, F3, K3, M(55) ); R( e, a, b, c, d, F3, K3, M(56) ); R( d, e, a, b, c, F3, K3, M(57) ); R( c, d, e, a, b, F3, K3, M(58) ); R( b, c, d, e, a, F3, K3, M(59) ); R( a, b, c, d, e, F4, K4, M(60) ); R( e, a, b, c, d, F4, K4, M(61) ); R( d, e, a, b, c, F4, K4, M(62) ); R( c, d, e, a, b, F4, K4, M(63) ); R( b, c, d, e, a, F4, K4, M(64) ); R( a, b, c, d, e, F4, K4, M(65) ); R( e, a, b, c, d, F4, K4, M(66) ); R( d, e, a, b, c, F4, K4, M(67) ); R( c, d, e, a, b, F4, K4, M(68) ); R( b, c, d, e, a, F4, K4, M(69) ); R( a, b, c, d, e, F4, K4, M(70) ); R( e, a, b, c, d, F4, K4, M(71) ); R( d, e, a, b, c, F4, K4, M(72) ); R( c, d, e, a, b, F4, K4, M(73) ); R( b, c, d, e, a, F4, K4, M(74) ); R( a, b, c, d, e, F4, K4, M(75) ); R( e, a, b, c, d, F4, K4, M(76) ); R( d, e, a, b, c, F4, K4, M(77) ); R( c, d, e, a, b, F4, K4, M(78) ); R( b, c, d, e, a, F4, K4, M(79) ); /* Update the chaining variables. */ hd->h0 += a; hd->h1 += b; hd->h2 += c; hd->h3 += d; hd->h4 += e; data += 64; } while (--nblks); return 88+4*sizeof(void*); } /* * Apply the SHA-1 transform function on the buffer BLOCKOF64BYTE * which must have a length 64 bytes. BLOCKOF64BYTE must be 32-bit * aligned. Updates the 20 bytes in BLOCKOF64BYTE with its mixed * content. Returns the number of bytes which should be burned on the * stack. You need to use _gcry_sha1_mixblock_init to initialize the * context. * WARNING: This is a special purpose function for exclusive use by * random-csprng.c. 
*/ unsigned int _gcry_sha1_mixblock (SHA1_CONTEXT *hd, void *blockof64byte) { u32 *p = blockof64byte; unsigned int nburn; nburn = (*hd->bctx.bwrite) (hd, blockof64byte, 1); p[0] = hd->h0; p[1] = hd->h1; p[2] = hd->h2; p[3] = hd->h3; p[4] = hd->h4; return nburn; } /* The routine final terminates the computation and * returns the digest. * The handle is prepared for a new cycle, but adding bytes to the * handle will the destroy the returned buffer. * Returns: 20 bytes representing the digest. */ static void sha1_final(void *context) { SHA1_CONTEXT *hd = context; u32 t, th, msb, lsb; unsigned char *p; unsigned int burn; t = hd->bctx.nblocks; if (sizeof t == sizeof hd->bctx.nblocks) th = hd->bctx.nblocks_high; else th = hd->bctx.nblocks >> 32; /* multiply by 64 to make a byte count */ lsb = t << 6; msb = (th << 6) | (t >> 26); /* add the count */ t = lsb; if( (lsb += hd->bctx.count) < t ) msb++; /* multiply by 8 to make a bit count */ t = lsb; lsb <<= 3; msb <<= 3; msb |= t >> 29; - if (hd->bctx.count < 56) /* enough room */ + if (0) + { } +#ifdef SHA1_USE_S390X_CRYPTO + else if (hd->use_s390x_crypto) + { + burn = do_sha1_final_s390x (hd, hd->bctx.buf, hd->bctx.count, msb, lsb); + } +#endif + else if (hd->bctx.count < 56) /* enough room */ { hd->bctx.buf[hd->bctx.count++] = 0x80; /* pad */ if (hd->bctx.count < 56) memset (&hd->bctx.buf[hd->bctx.count], 0, 56 - hd->bctx.count); hd->bctx.count = 56; /* append the 64 bit count */ buf_put_be32(hd->bctx.buf + 56, msb); buf_put_be32(hd->bctx.buf + 60, lsb); burn = (*hd->bctx.bwrite) ( hd, hd->bctx.buf, 1 ); } else /* need one extra block */ { hd->bctx.buf[hd->bctx.count++] = 0x80; /* pad character */ /* fill pad and next block with zeroes */ memset (&hd->bctx.buf[hd->bctx.count], 0, 64 - hd->bctx.count + 56); hd->bctx.count = 64 + 56; /* append the 64 bit count */ buf_put_be32(hd->bctx.buf + 64 + 56, msb); buf_put_be32(hd->bctx.buf + 64 + 60, lsb); burn = (*hd->bctx.bwrite) ( hd, hd->bctx.buf, 2 ); } p = hd->bctx.buf; #define X(a) do { buf_put_be32(p, hd->h##a); p += 4; } while(0) X(0); X(1); X(2); X(3); X(4); #undef X _gcry_burn_stack (burn); } static unsigned char * sha1_read( void *context ) { SHA1_CONTEXT *hd = context; return hd->bctx.buf; } /**************** * Shortcut functions which puts the hash value of the supplied buffer * into outbuf which must have a size of 20 bytes. */ void _gcry_sha1_hash_buffer (void *outbuf, const void *buffer, size_t length) { SHA1_CONTEXT hd; sha1_init (&hd, 0); _gcry_md_block_write (&hd, buffer, length); sha1_final (&hd); memcpy (outbuf, hd.bctx.buf, 20); } /* Variant of the above shortcut function using a multiple buffers. */ void _gcry_sha1_hash_buffers (void *outbuf, const gcry_buffer_t *iov, int iovcnt) { SHA1_CONTEXT hd; sha1_init (&hd, 0); for (;iovcnt > 0; iov++, iovcnt--) _gcry_md_block_write (&hd, (const char*)iov[0].data + iov[0].off, iov[0].len); sha1_final (&hd); memcpy (outbuf, hd.bctx.buf, 20); } /* Self-test section. 
*/ static gpg_err_code_t selftests_sha1 (int extended, selftest_report_func_t report) { const char *what; const char *errtxt; what = "short string"; errtxt = _gcry_hash_selftest_check_one (GCRY_MD_SHA1, 0, "abc", 3, "\xA9\x99\x3E\x36\x47\x06\x81\x6A\xBA\x3E" "\x25\x71\x78\x50\xC2\x6C\x9C\xD0\xD8\x9D", 20); if (errtxt) goto failed; if (extended) { what = "long string"; errtxt = _gcry_hash_selftest_check_one (GCRY_MD_SHA1, 0, "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq", 56, "\x84\x98\x3E\x44\x1C\x3B\xD2\x6E\xBA\xAE" "\x4A\xA1\xF9\x51\x29\xE5\xE5\x46\x70\xF1", 20); if (errtxt) goto failed; what = "one million \"a\""; errtxt = _gcry_hash_selftest_check_one (GCRY_MD_SHA1, 1, NULL, 0, "\x34\xAA\x97\x3C\xD4\xC4\xDA\xA4\xF6\x1E" "\xEB\x2B\xDB\xAD\x27\x31\x65\x34\x01\x6F", 20); if (errtxt) goto failed; } return 0; /* Succeeded. */ failed: if (report) report ("digest", GCRY_MD_SHA1, what, errtxt); return GPG_ERR_SELFTEST_FAILED; } /* Run a full self-test for ALGO and return 0 on success. */ static gpg_err_code_t run_selftests (int algo, int extended, selftest_report_func_t report) { gpg_err_code_t ec; switch (algo) { case GCRY_MD_SHA1: ec = selftests_sha1 (extended, report); break; default: ec = GPG_ERR_DIGEST_ALGO; break; } return ec; } static unsigned char asn[15] = /* Object ID is 1.3.14.3.2.26 */ { 0x30, 0x21, 0x30, 0x09, 0x06, 0x05, 0x2b, 0x0e, 0x03, 0x02, 0x1a, 0x05, 0x00, 0x04, 0x14 }; static gcry_md_oid_spec_t oid_spec_sha1[] = { /* iso.member-body.us.rsadsi.pkcs.pkcs-1.5 (sha1WithRSAEncryption) */ { "1.2.840.113549.1.1.5" }, /* iso.member-body.us.x9-57.x9cm.3 (dsaWithSha1)*/ { "1.2.840.10040.4.3" }, /* from NIST's OIW (sha1) */ { "1.3.14.3.2.26" }, /* from NIST OIW (sha-1WithRSAEncryption) */ { "1.3.14.3.2.29" }, /* iso.member-body.us.ansi-x9-62.signatures.ecdsa-with-sha1 */ { "1.2.840.10045.4.1" }, { NULL }, }; gcry_md_spec_t _gcry_digest_spec_sha1 = { GCRY_MD_SHA1, {0, 1}, "SHA1", asn, DIM (asn), oid_spec_sha1, 20, sha1_init, _gcry_md_block_write, sha1_final, sha1_read, NULL, _gcry_sha1_hash_buffer, _gcry_sha1_hash_buffers, sizeof (SHA1_CONTEXT), run_selftests }; diff --git a/cipher/sha1.h b/cipher/sha1.h index acf764ba..a3597658 100644 --- a/cipher/sha1.h +++ b/cipher/sha1.h @@ -1,35 +1,47 @@ /* sha1.h - SHA-1 context definition * Copyright (C) 1998, 2001, 2002, 2003, 2008 Free Software Foundation, Inc. * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . */ #ifndef GCRY_SHA1_H #define GCRY_SHA1_H #include "hash-common.h" + +/* SHA1_USE_S390X_CRYPTO indicates whether to enable zSeries code. */ +#undef SHA1_USE_S390X_CRYPTO +#if defined(HAVE_GCC_INLINE_ASM_S390X) +# define SHA1_USE_S390X_CRYPTO 1 +#endif /* SHA1_USE_S390X_CRYPTO */ + + /* We need this here for direct use by random-csprng.c. */ typedef struct { gcry_md_block_ctx_t bctx; u32 h0,h1,h2,h3,h4; +#ifdef SHA1_USE_S390X_CRYPTO + u32 final_len_msb, final_len_lsb; /* needs to be right after h4. 
*/
+  int use_s390x_crypto;
+#endif
} SHA1_CONTEXT;

void _gcry_sha1_mixblock_init (SHA1_CONTEXT *hd);
unsigned int _gcry_sha1_mixblock (SHA1_CONTEXT *hd, void *blockof64byte);

#endif /*GCRY_SHA1_H*/
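
Note on the KIMD/KLMD usage added by this patch: KIMD digests only complete 64-byte blocks, while KLMD pads the final partial block and mixes in the total message bit length that sha1.c stores in final_len_msb/final_len_lsb directly after h4. Below is a minimal sketch of driving the two wrappers for a one-shot SHA-1, assuming it is compiled inside the libgcrypt tree on an s390x target with the MSA facility; sha1_cpacf() and struct sha1_cpacf_param_s are illustrative names, not part of the patch.

#include <config.h>
#include <stddef.h>
#include <string.h>
#include "g10lib.h"
#include "asm-inline-s390x.h"

struct sha1_cpacf_param_s
{
  u32 h[5];       /* ICV H0..H4, updated in place by KIMD/KLMD.     */
  u32 mbl_msb;    /* Total message bit length; consumed by KLMD     */
  u32 mbl_lsb;    /* together with the implicit final padding.      */
};

static int
sha1_cpacf (byte *digest, const void *msg, size_t len)
{
  struct sha1_cpacf_param_s param =
    {
      { 0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476, 0xc3d2e1f0 }, 0, 0
    };
  const byte *p = msg;
  size_t full = len & ~(size_t)63;   /* Whole 64-byte blocks.  */

  /* Both instructions must advertise SHA-1 before they may be used.  */
  if (!(kimd_query () & km_function_to_mask (KMID_FUNCTION_SHA1))
      || !(klmd_query () & km_function_to_mask (KMID_FUNCTION_SHA1)))
    return -1;

  /* KIMD digests the complete blocks; KLMD pads the tail and folds in
   * the 64-bit message bit length stored right after H4, which is the
   * layout do_sha1_final_s390x() asserts for SHA1_CONTEXT.  */
  kimd_execute (KMID_FUNCTION_SHA1, &param, p, full);
  param.mbl_msb = (u32)(((u64)len * 8) >> 32);
  param.mbl_lsb = (u32)((u64)len * 8);
  klmd_execute (KMID_FUNCTION_SHA1, &param, p + full, len - full);

  /* z/Architecture is big-endian, so H0..H4 already lie in digest order.  */
  memcpy (digest, param.h, 20);
  return 0;
}

The gcry_assert checks in do_sha1_final_s390x exist precisely because KLMD reads the bit-length words at a fixed offset behind H0..H4 in this parameter block layout.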
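
aes_s390x_ctr128_enc in this patch drives CTR mode through the KMA (GCM) instruction, whose counter handling only advances the low 32 bits of the counter block, so the loop cuts each request at every 32-bit rollover and lets software carry into the upper 96 bits with cipher_block_add. A portable sketch of just that splitting rule follows; process_chunk() stands in for one kma_execute() call and is an illustrative assumption, not part of the patch.

#include <stddef.h>
#include <stdint.h>

/* Split 'nblocks' of CTR work so that no chunk crosses a 32-bit rollover
 * of the low counter word; process_chunk() stands in for one KMA call. */
static void
ctr32_split (uint32_t ctr_low32, size_t nblocks,
             void (*process_chunk) (uint32_t start_low32, size_t nblks))
{
  while (nblocks)
    {
      /* Blocks remaining until the low 32-bit word wraps back to zero. */
      uint64_t to_overflow = (uint64_t)0xFFFFFFFFu + 1 - ctr_low32;
      size_t ncurr = nblocks > to_overflow ? (size_t)to_overflow : nblocks;

      process_chunk (ctr_low32, ncurr);

      /* Software updates the full 128-bit counter (cipher_block_add in
       * the patch); on a wrap the carry reaches the upper 96 bits, which
       * the per-chunk worker never touches. */
      ctr_low32 += (uint32_t)ncurr;
      nblocks -= ncurr;
    }
}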
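
The comment in aes_s390x_cfb128_dec explains why decryption bypasses KMF: every CFB keystream block is E_k of the previous ciphertext block (or of the IV), so all keystream blocks can be produced in one parallel ECB pass with KM and then XORed into the ciphertext. A portable sketch of that transformation follows; block_encrypt_ecb() is a stand-in for the bulk KM call and is an assumption for illustration, not part of the patch.

#include <stddef.h>
#include <string.h>

#define BLK 16

/* Stand-in for the parallel ECB pass done with the KM instruction. */
extern void block_encrypt_ecb (const void *key, unsigned char *dst,
                               const unsigned char *src, size_t nbytes);

static void
cfb128_dec_via_ecb (const void *key, unsigned char iv[BLK],
                    unsigned char *out, const unsigned char *in,
                    size_t nblocks, unsigned char *tmp /* nblocks * BLK */)
{
  size_t i;

  if (!nblocks)
    return;

  /* Keystream inputs: tmp = [ IV, C_0, ..., C_(n-2) ]; the last
   * ciphertext block becomes the IV for the next call. */
  memcpy (tmp, iv, BLK);
  memcpy (tmp + BLK, in, (nblocks - 1) * BLK);
  memcpy (iv, in + (nblocks - 1) * BLK, BLK);

  /* All keystream blocks are independent, so one bulk ECB pass suffices;
   * this is what makes CFB decryption parallelizable. */
  block_encrypt_ecb (key, tmp, tmp, nblocks * BLK);

  /* P_i = C_i xor E_k(C_(i-1)), with C_(-1) = IV. */
  for (i = 0; i < nblocks * BLK; i++)
    out[i] = in[i] ^ tmp[i];
}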