diff --git a/cipher/chacha20.c b/cipher/chacha20.c
index 3518d95a..f0a2697e 100644
--- a/cipher/chacha20.c
+++ b/cipher/chacha20.c
@@ -1,1398 +1,1403 @@
/* chacha20.c - Bernstein's ChaCha20 cipher
 * Copyright (C) 2014,2017-2019 Jussi Kivilinna
 *
 * This file is part of Libgcrypt.
 *
 * Libgcrypt is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * Libgcrypt is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, see <http://www.gnu.org/licenses/>.
 *
 * For a description of the algorithm, see:
 *   http://cr.yp.to/chacha.html
 */

/*
 * Based on D. J. Bernstein reference implementation at
 * http://cr.yp.to/chacha.html:
 *
 * chacha-regs.c version 20080118
 * D. J. Bernstein
 * Public domain.
 */

#include <config.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "types.h"
#include "g10lib.h"
#include "cipher.h"
#include "cipher-internal.h"
#include "bufhelp.h"


#define CHACHA20_MIN_KEY_SIZE 16   /* Bytes. */
#define CHACHA20_MAX_KEY_SIZE 32   /* Bytes. */
#define CHACHA20_BLOCK_SIZE   64   /* Bytes. */
#define CHACHA20_MIN_IV_SIZE   8   /* Bytes. */
#define CHACHA20_MAX_IV_SIZE  12   /* Bytes. */
#define CHACHA20_CTR_SIZE     16   /* Bytes. */


/* USE_SSSE3 indicates whether to compile with Intel SSSE3 code. */
#undef USE_SSSE3
#if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_SSSE3) && \
   (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
    defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
# define USE_SSSE3 1
#endif

/* USE_AVX2 indicates whether to compile with Intel AVX2 code. */
#undef USE_AVX2
#if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_AVX2) && \
   (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
    defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
# define USE_AVX2 1
#endif

/* USE_AVX512 indicates whether to compile with Intel AVX512 code. */
#undef USE_AVX512
#if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_AVX512) && \
   (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
    defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
# define USE_AVX512 1
#endif

/* USE_ARMV7_NEON indicates whether to enable ARMv7 NEON assembly code. */
#undef USE_ARMV7_NEON
#ifdef ENABLE_NEON_SUPPORT
# if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) \
     && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) \
     && defined(HAVE_GCC_INLINE_ASM_NEON)
#  define USE_ARMV7_NEON 1
# endif
#endif

/* USE_AARCH64_SIMD indicates whether to enable ARMv8 SIMD assembly
 * code. */
#undef USE_AARCH64_SIMD
#ifdef ENABLE_NEON_SUPPORT
# if defined(__AARCH64EL__) \
     && defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) \
     && defined(HAVE_GCC_INLINE_ASM_AARCH64_NEON)
#  define USE_AARCH64_SIMD 1
# endif
#endif

/* USE_PPC_VEC indicates whether to enable PowerPC vector
 * accelerated code. */
#undef USE_PPC_VEC
#ifdef ENABLE_PPC_CRYPTO_SUPPORT
# if defined(HAVE_COMPATIBLE_CC_PPC_ALTIVEC) && \
     defined(HAVE_GCC_INLINE_ASM_PPC_ALTIVEC)
#  if __GNUC__ >= 4
#   define USE_PPC_VEC 1
#  endif
# endif
#endif

/* USE_S390X_VX indicates whether to enable zSeries code.
*/ #undef USE_S390X_VX #if defined (__s390x__) && __GNUC__ >= 4 && __ARCH__ >= 9 # if defined(HAVE_GCC_INLINE_ASM_S390X_VX) # define USE_S390X_VX 1 # endif /* USE_S390X_VX */ #endif /* Assembly implementations use SystemV ABI, ABI conversion and additional * stack to store XMM6-XMM15 needed on Win64. */ #undef ASM_FUNC_ABI #undef ASM_EXTRA_STACK #if defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS) # define ASM_FUNC_ABI __attribute__((sysv_abi)) #else # define ASM_FUNC_ABI #endif typedef struct CHACHA20_context_s { u32 input[16]; unsigned char pad[CHACHA20_BLOCK_SIZE]; unsigned int unused; /* bytes in the pad. */ unsigned int use_ssse3:1; unsigned int use_avx2:1; unsigned int use_avx512:1; unsigned int use_neon:1; unsigned int use_ppc:1; unsigned int use_p10:1; unsigned int use_s390x:1; } CHACHA20_context_t; #ifdef USE_SSSE3 unsigned int _gcry_chacha20_amd64_ssse3_blocks4(u32 *state, byte *dst, const byte *src, size_t nblks) ASM_FUNC_ABI; unsigned int _gcry_chacha20_amd64_ssse3_blocks1(u32 *state, byte *dst, const byte *src, size_t nblks) ASM_FUNC_ABI; unsigned int _gcry_chacha20_poly1305_amd64_ssse3_blocks4( u32 *state, byte *dst, const byte *src, size_t nblks, void *poly1305_state, const byte *poly1305_src) ASM_FUNC_ABI; unsigned int _gcry_chacha20_poly1305_amd64_ssse3_blocks1( u32 *state, byte *dst, const byte *src, size_t nblks, void *poly1305_state, const byte *poly1305_src) ASM_FUNC_ABI; #endif /* USE_SSSE3 */ #ifdef USE_AVX2 unsigned int _gcry_chacha20_amd64_avx2_blocks8(u32 *state, byte *dst, const byte *src, size_t nblks) ASM_FUNC_ABI; unsigned int _gcry_chacha20_poly1305_amd64_avx2_blocks8( u32 *state, byte *dst, const byte *src, size_t nblks, void *poly1305_state, const byte *poly1305_src) ASM_FUNC_ABI; #endif /* USE_AVX2 */ #ifdef USE_AVX512 unsigned int _gcry_chacha20_amd64_avx512_blocks16(u32 *state, byte *dst, const byte *src, size_t nblks) ASM_FUNC_ABI; #endif /* USE_AVX2 */ #ifdef USE_PPC_VEC #ifndef WORDS_BIGENDIAN unsigned int _gcry_chacha20_p10le_8x(u32 *state, byte *dst, const byte *src, size_t len); #endif unsigned int _gcry_chacha20_ppc8_blocks4(u32 *state, byte *dst, const byte *src, size_t nblks); unsigned int _gcry_chacha20_ppc8_blocks1(u32 *state, byte *dst, const byte *src, size_t nblks); #undef USE_PPC_VEC_POLY1305 #if SIZEOF_UNSIGNED_LONG == 8 #define USE_PPC_VEC_POLY1305 1 unsigned int _gcry_chacha20_poly1305_ppc8_blocks4( u32 *state, byte *dst, const byte *src, size_t nblks, POLY1305_STATE *st, const byte *poly1305_src); #endif /* SIZEOF_UNSIGNED_LONG == 8 */ #endif /* USE_PPC_VEC */ #ifdef USE_S390X_VX unsigned int _gcry_chacha20_s390x_vx_blocks8(u32 *state, byte *dst, const byte *src, size_t nblks); unsigned int _gcry_chacha20_s390x_vx_blocks4_2_1(u32 *state, byte *dst, const byte *src, size_t nblks); #undef USE_S390X_VX_POLY1305 #if SIZEOF_UNSIGNED_LONG == 8 #define USE_S390X_VX_POLY1305 1 unsigned int _gcry_chacha20_poly1305_s390x_vx_blocks8( u32 *state, byte *dst, const byte *src, size_t nblks, POLY1305_STATE *st, const byte *poly1305_src); unsigned int _gcry_chacha20_poly1305_s390x_vx_blocks4_2_1( u32 *state, byte *dst, const byte *src, size_t nblks, POLY1305_STATE *st, const byte *poly1305_src); #endif /* SIZEOF_UNSIGNED_LONG == 8 */ #endif /* USE_S390X_VX */ #ifdef USE_ARMV7_NEON unsigned int _gcry_chacha20_armv7_neon_blocks4(u32 *state, byte *dst, const byte *src, size_t nblks); #endif /* USE_ARMV7_NEON */ #ifdef USE_AARCH64_SIMD unsigned int _gcry_chacha20_aarch64_blocks4(u32 *state, byte *dst, const byte *src, size_t nblks); unsigned int 
_gcry_chacha20_poly1305_aarch64_blocks4( u32 *state, byte *dst, const byte *src, size_t nblks, void *poly1305_state, const byte *poly1305_src); #endif /* USE_AARCH64_SIMD */ static const char *selftest (void); #define ROTATE(v,c) (rol(v,c)) #define XOR(v,w) ((v) ^ (w)) #define PLUS(v,w) ((u32)((v) + (w))) #define PLUSONE(v) (PLUS((v),1)) #define QUARTERROUND(a,b,c,d) \ a = PLUS(a,b); d = ROTATE(XOR(d,a),16); \ c = PLUS(c,d); b = ROTATE(XOR(b,c),12); \ a = PLUS(a,b); d = ROTATE(XOR(d,a), 8); \ c = PLUS(c,d); b = ROTATE(XOR(b,c), 7); #define BUF_XOR_LE32(dst, src, offset, x) \ buf_put_le32((dst) + (offset), buf_get_le32((src) + (offset)) ^ (x)) static unsigned int do_chacha20_blocks (u32 *input, byte *dst, const byte *src, size_t nblks) { u32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; unsigned int i; while (nblks) { x0 = input[0]; x1 = input[1]; x2 = input[2]; x3 = input[3]; x4 = input[4]; x5 = input[5]; x6 = input[6]; x7 = input[7]; x8 = input[8]; x9 = input[9]; x10 = input[10]; x11 = input[11]; x12 = input[12]; x13 = input[13]; x14 = input[14]; x15 = input[15]; for (i = 20; i > 0; i -= 2) { QUARTERROUND(x0, x4, x8, x12) QUARTERROUND(x1, x5, x9, x13) QUARTERROUND(x2, x6, x10, x14) QUARTERROUND(x3, x7, x11, x15) QUARTERROUND(x0, x5, x10, x15) QUARTERROUND(x1, x6, x11, x12) QUARTERROUND(x2, x7, x8, x13) QUARTERROUND(x3, x4, x9, x14) } x0 = PLUS(x0, input[0]); x1 = PLUS(x1, input[1]); x2 = PLUS(x2, input[2]); x3 = PLUS(x3, input[3]); x4 = PLUS(x4, input[4]); x5 = PLUS(x5, input[5]); x6 = PLUS(x6, input[6]); x7 = PLUS(x7, input[7]); x8 = PLUS(x8, input[8]); x9 = PLUS(x9, input[9]); x10 = PLUS(x10, input[10]); x11 = PLUS(x11, input[11]); x12 = PLUS(x12, input[12]); x13 = PLUS(x13, input[13]); x14 = PLUS(x14, input[14]); x15 = PLUS(x15, input[15]); input[12] = PLUSONE(input[12]); input[13] = PLUS(input[13], !input[12]); BUF_XOR_LE32(dst, src, 0, x0); BUF_XOR_LE32(dst, src, 4, x1); BUF_XOR_LE32(dst, src, 8, x2); BUF_XOR_LE32(dst, src, 12, x3); BUF_XOR_LE32(dst, src, 16, x4); BUF_XOR_LE32(dst, src, 20, x5); BUF_XOR_LE32(dst, src, 24, x6); BUF_XOR_LE32(dst, src, 28, x7); BUF_XOR_LE32(dst, src, 32, x8); BUF_XOR_LE32(dst, src, 36, x9); BUF_XOR_LE32(dst, src, 40, x10); BUF_XOR_LE32(dst, src, 44, x11); BUF_XOR_LE32(dst, src, 48, x12); BUF_XOR_LE32(dst, src, 52, x13); BUF_XOR_LE32(dst, src, 56, x14); BUF_XOR_LE32(dst, src, 60, x15); src += CHACHA20_BLOCK_SIZE; dst += CHACHA20_BLOCK_SIZE; nblks--; } /* burn_stack */ return (17 * sizeof(u32) + 6 * sizeof(void *)); } static unsigned int chacha20_blocks (CHACHA20_context_t *ctx, byte *dst, const byte *src, size_t nblks) { #ifdef USE_SSSE3 if (ctx->use_ssse3) { return _gcry_chacha20_amd64_ssse3_blocks1(ctx->input, dst, src, nblks); } #endif #ifdef USE_PPC_VEC if (ctx->use_ppc) { return _gcry_chacha20_ppc8_blocks1(ctx->input, dst, src, nblks); } #endif #ifdef USE_S390X_VX if (ctx->use_s390x) { return _gcry_chacha20_s390x_vx_blocks4_2_1(ctx->input, dst, src, nblks); } #endif return do_chacha20_blocks (ctx->input, dst, src, nblks); } static void chacha20_keysetup (CHACHA20_context_t *ctx, const byte *key, unsigned int keylen) { static const char sigma[16] = "expand 32-byte k"; static const char tau[16] = "expand 16-byte k"; const char *constants; ctx->input[4] = buf_get_le32(key + 0); ctx->input[5] = buf_get_le32(key + 4); ctx->input[6] = buf_get_le32(key + 8); ctx->input[7] = buf_get_le32(key + 12); if (keylen == CHACHA20_MAX_KEY_SIZE) /* 256 bits */ { key += 16; constants = sigma; } else /* 128 bits */ { constants = tau; } 
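  /* For reference: after this point the 16-word ChaCha20 state is laid
   * out as
   *   input[0..3]   constant words loaded from sigma/tau below,
   *   input[4..11]  key words (a 128-bit key is simply used twice),
   *   input[12..15] block counter and nonce, filled in by chacha20_ivsetup.
   */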
ctx->input[8] = buf_get_le32(key + 0); ctx->input[9] = buf_get_le32(key + 4); ctx->input[10] = buf_get_le32(key + 8); ctx->input[11] = buf_get_le32(key + 12); ctx->input[0] = buf_get_le32(constants + 0); ctx->input[1] = buf_get_le32(constants + 4); ctx->input[2] = buf_get_le32(constants + 8); ctx->input[3] = buf_get_le32(constants + 12); } static void chacha20_ivsetup (CHACHA20_context_t * ctx, const byte *iv, size_t ivlen) { if (ivlen == CHACHA20_CTR_SIZE) { ctx->input[12] = buf_get_le32 (iv + 0); ctx->input[13] = buf_get_le32 (iv + 4); ctx->input[14] = buf_get_le32 (iv + 8); ctx->input[15] = buf_get_le32 (iv + 12); } else if (ivlen == CHACHA20_MAX_IV_SIZE) { ctx->input[12] = 0; ctx->input[13] = buf_get_le32 (iv + 0); ctx->input[14] = buf_get_le32 (iv + 4); ctx->input[15] = buf_get_le32 (iv + 8); } else if (ivlen == CHACHA20_MIN_IV_SIZE) { ctx->input[12] = 0; ctx->input[13] = 0; ctx->input[14] = buf_get_le32 (iv + 0); ctx->input[15] = buf_get_le32 (iv + 4); } else { ctx->input[12] = 0; ctx->input[13] = 0; ctx->input[14] = 0; ctx->input[15] = 0; } } static void chacha20_setiv (void *context, const byte *iv, size_t ivlen) { CHACHA20_context_t *ctx = (CHACHA20_context_t *) context; /* draft-nir-cfrg-chacha20-poly1305-02 defines 96-bit and 64-bit nonce. */ if (iv && ivlen != CHACHA20_MAX_IV_SIZE && ivlen != CHACHA20_MIN_IV_SIZE && ivlen != CHACHA20_CTR_SIZE) log_info ("WARNING: chacha20_setiv: bad ivlen=%u\n", (u32) ivlen); if (iv && (ivlen == CHACHA20_MAX_IV_SIZE || ivlen == CHACHA20_MIN_IV_SIZE || ivlen == CHACHA20_CTR_SIZE)) chacha20_ivsetup (ctx, iv, ivlen); else chacha20_ivsetup (ctx, NULL, 0); /* Reset the unused pad bytes counter. */ ctx->unused = 0; } static gcry_err_code_t chacha20_do_setkey (CHACHA20_context_t *ctx, const byte *key, unsigned int keylen) { static int initialized; static const char *selftest_failed; unsigned int features = _gcry_get_hw_features (); if (!initialized) { initialized = 1; selftest_failed = selftest (); if (selftest_failed) log_error ("CHACHA20 selftest failed (%s)\n", selftest_failed); } if (selftest_failed) return GPG_ERR_SELFTEST_FAILED; if (keylen != CHACHA20_MAX_KEY_SIZE && keylen != CHACHA20_MIN_KEY_SIZE) return GPG_ERR_INV_KEYLEN; #ifdef USE_SSSE3 ctx->use_ssse3 = (features & HWF_INTEL_SSSE3) != 0; #endif #ifdef USE_AVX512 ctx->use_avx512 = (features & HWF_INTEL_AVX512) != 0; #endif #ifdef USE_AVX2 ctx->use_avx2 = (features & HWF_INTEL_AVX2) != 0; #endif #ifdef USE_ARMV7_NEON ctx->use_neon = (features & HWF_ARM_NEON) != 0; #endif #ifdef USE_AARCH64_SIMD ctx->use_neon = (features & HWF_ARM_NEON) != 0; #endif #ifdef USE_PPC_VEC ctx->use_ppc = (features & HWF_PPC_ARCH_2_07) != 0; # ifndef WORDS_BIGENDIAN ctx->use_p10 = (features & HWF_PPC_ARCH_3_10) != 0; +# ifdef ENABLE_FORCE_SOFT_HWFEATURES + /* HWF_PPC_ARCH_3_10 above is used as soft HW-feature indicator for P10. + * Actual implementation works with HWF_PPC_ARCH_3_00 also. */ + ctx->use_p10 |= (features & HWF_PPC_ARCH_3_00) != 0; +# endif # endif #endif #ifdef USE_S390X_VX ctx->use_s390x = (features & HWF_S390X_VX) != 0; #endif (void)features; chacha20_keysetup (ctx, key, keylen); /* We default to a zero nonce. 
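 * Passing a NULL IV to chacha20_setiv below makes chacha20_ivsetup clear
 * the counter and nonce words input[12..15] until the caller sets an IV.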
*/ chacha20_setiv (ctx, NULL, 0); return 0; } static gcry_err_code_t chacha20_setkey (void *context, const byte *key, unsigned int keylen, cipher_bulk_ops_t *bulk_ops) { CHACHA20_context_t *ctx = (CHACHA20_context_t *) context; gcry_err_code_t rc = chacha20_do_setkey (ctx, key, keylen); (void)bulk_ops; _gcry_burn_stack (4 + sizeof (void *) + 4 * sizeof (void *)); return rc; } static unsigned int do_chacha20_encrypt_stream_tail (CHACHA20_context_t *ctx, byte *outbuf, const byte *inbuf, size_t length) { static const unsigned char zero_pad[CHACHA20_BLOCK_SIZE] = { 0, }; unsigned int nburn, burn = 0; #ifdef USE_AVX512 if (ctx->use_avx512 && length >= CHACHA20_BLOCK_SIZE * 16) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nblocks -= nblocks % 16; nburn = _gcry_chacha20_amd64_avx512_blocks16(ctx->input, outbuf, inbuf, nblocks); burn = nburn > burn ? nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; } #endif #ifdef USE_AVX2 if (ctx->use_avx2 && length >= CHACHA20_BLOCK_SIZE * 8) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nblocks -= nblocks % 8; nburn = _gcry_chacha20_amd64_avx2_blocks8(ctx->input, outbuf, inbuf, nblocks); burn = nburn > burn ? nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; } #endif #ifdef USE_SSSE3 if (ctx->use_ssse3 && length >= CHACHA20_BLOCK_SIZE * 4) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nblocks -= nblocks % 4; nburn = _gcry_chacha20_amd64_ssse3_blocks4(ctx->input, outbuf, inbuf, nblocks); burn = nburn > burn ? nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; } #endif #ifdef USE_ARMV7_NEON if (ctx->use_neon && length >= CHACHA20_BLOCK_SIZE * 4) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nblocks -= nblocks % 4; nburn = _gcry_chacha20_armv7_neon_blocks4(ctx->input, outbuf, inbuf, nblocks); burn = nburn > burn ? nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; } #endif #ifdef USE_AARCH64_SIMD if (ctx->use_neon && length >= CHACHA20_BLOCK_SIZE * 4) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nblocks -= nblocks % 4; nburn = _gcry_chacha20_aarch64_blocks4(ctx->input, outbuf, inbuf, nblocks); burn = nburn > burn ? nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; } #endif #ifdef USE_PPC_VEC if (ctx->use_ppc && length >= CHACHA20_BLOCK_SIZE * 4) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nblocks -= nblocks % 4; #ifndef WORDS_BIGENDIAN /* * A workaround to skip counter overflow. This is rare. */ if (ctx->use_p10 && nblocks >= 8 && ((u64)ctx->input[12] + nblocks) <= 0xffffffffU) { size_t len = nblocks * CHACHA20_BLOCK_SIZE; nburn = _gcry_chacha20_p10le_8x(ctx->input, outbuf, inbuf, len); } else #endif { nburn = _gcry_chacha20_ppc8_blocks4(ctx->input, outbuf, inbuf, nblocks); } burn = nburn > burn ? nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; } #endif #ifdef USE_S390X_VX if (ctx->use_s390x && length >= CHACHA20_BLOCK_SIZE * 8) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nblocks -= nblocks % 8; nburn = _gcry_chacha20_s390x_vx_blocks8(ctx->input, outbuf, inbuf, nblocks); burn = nburn > burn ? 
burn ?
nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; } #endif if (length >= CHACHA20_BLOCK_SIZE) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nburn = chacha20_blocks(ctx, outbuf, inbuf, nblocks); burn = nburn > burn ? nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; } if (length > 0) { nburn = chacha20_blocks(ctx, ctx->pad, zero_pad, 1); burn = nburn > burn ? nburn : burn; buf_xor (outbuf, inbuf, ctx->pad, length); ctx->unused = CHACHA20_BLOCK_SIZE - length; } if (burn) burn += 5 * sizeof(void *); return burn; } static void chacha20_encrypt_stream (void *context, byte *outbuf, const byte *inbuf, size_t length) { CHACHA20_context_t *ctx = (CHACHA20_context_t *) context; unsigned int nburn, burn = 0; if (!length) return; if (ctx->unused) { unsigned char *p = ctx->pad; size_t n; gcry_assert (ctx->unused < CHACHA20_BLOCK_SIZE); n = ctx->unused; if (n > length) n = length; buf_xor (outbuf, inbuf, p + CHACHA20_BLOCK_SIZE - ctx->unused, n); length -= n; outbuf += n; inbuf += n; ctx->unused -= n; if (!length) return; gcry_assert (!ctx->unused); } nburn = do_chacha20_encrypt_stream_tail (ctx, outbuf, inbuf, length); burn = nburn > burn ? nburn : burn; if (burn) _gcry_burn_stack (burn); } gcry_err_code_t _gcry_chacha20_poly1305_encrypt(gcry_cipher_hd_t c, byte *outbuf, const byte *inbuf, size_t length) { CHACHA20_context_t *ctx = (void *) &c->context.c; unsigned int nburn, burn = 0; byte *authptr = NULL; if (!length) return 0; if (ctx->unused) { unsigned char *p = ctx->pad; size_t n; gcry_assert (ctx->unused < CHACHA20_BLOCK_SIZE); n = ctx->unused; if (n > length) n = length; buf_xor (outbuf, inbuf, p + CHACHA20_BLOCK_SIZE - ctx->unused, n); nburn = _gcry_poly1305_update_burn (&c->u_mode.poly1305.ctx, outbuf, n); burn = nburn > burn ? nburn : burn; length -= n; outbuf += n; inbuf += n; ctx->unused -= n; if (!length) { if (burn) _gcry_burn_stack (burn); return 0; } gcry_assert (!ctx->unused); } gcry_assert (c->u_mode.poly1305.ctx.leftover == 0); if (0) { } #ifdef USE_AVX512 else if (ctx->use_avx512) { /* Skip stitched chacha20-poly1305 for AVX512. */ authptr = NULL; } #endif #ifdef USE_AVX2 else if (ctx->use_avx2 && length >= CHACHA20_BLOCK_SIZE * 8) { nburn = _gcry_chacha20_amd64_avx2_blocks8(ctx->input, outbuf, inbuf, 8); burn = nburn > burn ? nburn : burn; authptr = outbuf; length -= 8 * CHACHA20_BLOCK_SIZE; outbuf += 8 * CHACHA20_BLOCK_SIZE; inbuf += 8 * CHACHA20_BLOCK_SIZE; } #endif #ifdef USE_SSSE3 else if (ctx->use_ssse3 && length >= CHACHA20_BLOCK_SIZE * 4) { nburn = _gcry_chacha20_amd64_ssse3_blocks4(ctx->input, outbuf, inbuf, 4); burn = nburn > burn ? nburn : burn; authptr = outbuf; length -= 4 * CHACHA20_BLOCK_SIZE; outbuf += 4 * CHACHA20_BLOCK_SIZE; inbuf += 4 * CHACHA20_BLOCK_SIZE; } else if (ctx->use_ssse3 && length >= CHACHA20_BLOCK_SIZE * 2) { nburn = _gcry_chacha20_amd64_ssse3_blocks1(ctx->input, outbuf, inbuf, 2); burn = nburn > burn ? nburn : burn; authptr = outbuf; length -= 2 * CHACHA20_BLOCK_SIZE; outbuf += 2 * CHACHA20_BLOCK_SIZE; inbuf += 2 * CHACHA20_BLOCK_SIZE; } else if (ctx->use_ssse3 && length >= CHACHA20_BLOCK_SIZE) { nburn = _gcry_chacha20_amd64_ssse3_blocks1(ctx->input, outbuf, inbuf, 1); burn = nburn > burn ? 
burn ?
nburn : burn; authptr = outbuf; length -= 1 * CHACHA20_BLOCK_SIZE; outbuf += 1 * CHACHA20_BLOCK_SIZE; inbuf += 1 * CHACHA20_BLOCK_SIZE; } #endif #ifdef USE_AARCH64_SIMD else if (ctx->use_neon && length >= CHACHA20_BLOCK_SIZE * 4) { nburn = _gcry_chacha20_aarch64_blocks4(ctx->input, outbuf, inbuf, 4); burn = nburn > burn ? nburn : burn; authptr = outbuf; length -= 4 * CHACHA20_BLOCK_SIZE; outbuf += 4 * CHACHA20_BLOCK_SIZE; inbuf += 4 * CHACHA20_BLOCK_SIZE; } #endif #ifdef USE_PPC_VEC_POLY1305 else if (ctx->use_ppc && ctx->use_p10) { /* Skip stitched chacha20-poly1305 for P10. */ authptr = NULL; } else if (ctx->use_ppc && length >= CHACHA20_BLOCK_SIZE * 4) { nburn = _gcry_chacha20_ppc8_blocks4(ctx->input, outbuf, inbuf, 4); burn = nburn > burn ? nburn : burn; authptr = outbuf; length -= 4 * CHACHA20_BLOCK_SIZE; outbuf += 4 * CHACHA20_BLOCK_SIZE; inbuf += 4 * CHACHA20_BLOCK_SIZE; } #endif #ifdef USE_S390X_VX_POLY1305 else if (ctx->use_s390x && length >= 2 * CHACHA20_BLOCK_SIZE * 8) { nburn = _gcry_chacha20_s390x_vx_blocks8(ctx->input, outbuf, inbuf, 8); burn = nburn > burn ? nburn : burn; authptr = outbuf; length -= 8 * CHACHA20_BLOCK_SIZE; outbuf += 8 * CHACHA20_BLOCK_SIZE; inbuf += 8 * CHACHA20_BLOCK_SIZE; } else if (ctx->use_s390x && length >= CHACHA20_BLOCK_SIZE * 4) { nburn = _gcry_chacha20_s390x_vx_blocks4_2_1(ctx->input, outbuf, inbuf, 4); burn = nburn > burn ? nburn : burn; authptr = outbuf; length -= 4 * CHACHA20_BLOCK_SIZE; outbuf += 4 * CHACHA20_BLOCK_SIZE; inbuf += 4 * CHACHA20_BLOCK_SIZE; } else if (ctx->use_s390x && length >= CHACHA20_BLOCK_SIZE * 2) { nburn = _gcry_chacha20_s390x_vx_blocks4_2_1(ctx->input, outbuf, inbuf, 2); burn = nburn > burn ? nburn : burn; authptr = outbuf; length -= 2 * CHACHA20_BLOCK_SIZE; outbuf += 2 * CHACHA20_BLOCK_SIZE; inbuf += 2 * CHACHA20_BLOCK_SIZE; } else if (ctx->use_s390x && length >= CHACHA20_BLOCK_SIZE) { nburn = _gcry_chacha20_s390x_vx_blocks4_2_1(ctx->input, outbuf, inbuf, 1); burn = nburn > burn ? nburn : burn; authptr = outbuf; length -= 1 * CHACHA20_BLOCK_SIZE; outbuf += 1 * CHACHA20_BLOCK_SIZE; inbuf += 1 * CHACHA20_BLOCK_SIZE; } #endif if (authptr) { size_t authoffset = outbuf - authptr; #ifdef USE_AVX2 if (ctx->use_avx2 && length >= 8 * CHACHA20_BLOCK_SIZE && authoffset >= 8 * CHACHA20_BLOCK_SIZE) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nblocks -= nblocks % 8; nburn = _gcry_chacha20_poly1305_amd64_avx2_blocks8( ctx->input, outbuf, inbuf, nblocks, &c->u_mode.poly1305.ctx.state, authptr); burn = nburn > burn ? nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; authptr += nblocks * CHACHA20_BLOCK_SIZE; } #endif #ifdef USE_SSSE3 if (ctx->use_ssse3) { if (length >= 4 * CHACHA20_BLOCK_SIZE && authoffset >= 4 * CHACHA20_BLOCK_SIZE) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nblocks -= nblocks % 4; nburn = _gcry_chacha20_poly1305_amd64_ssse3_blocks4( ctx->input, outbuf, inbuf, nblocks, &c->u_mode.poly1305.ctx.state, authptr); burn = nburn > burn ? nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; authptr += nblocks * CHACHA20_BLOCK_SIZE; } if (length >= CHACHA20_BLOCK_SIZE && authoffset >= CHACHA20_BLOCK_SIZE) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nburn = _gcry_chacha20_poly1305_amd64_ssse3_blocks1( ctx->input, outbuf, inbuf, nblocks, &c->u_mode.poly1305.ctx.state, authptr); burn = nburn > burn ? 
burn ?
nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; authptr += nblocks * CHACHA20_BLOCK_SIZE; } } #endif #ifdef USE_AARCH64_SIMD if (ctx->use_neon && length >= 4 * CHACHA20_BLOCK_SIZE && authoffset >= 4 * CHACHA20_BLOCK_SIZE) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nblocks -= nblocks % 4; nburn = _gcry_chacha20_poly1305_aarch64_blocks4( ctx->input, outbuf, inbuf, nblocks, &c->u_mode.poly1305.ctx.state, authptr); burn = nburn > burn ? nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; authptr += nblocks * CHACHA20_BLOCK_SIZE; } #endif #ifdef USE_PPC_VEC_POLY1305 if (ctx->use_ppc && length >= 4 * CHACHA20_BLOCK_SIZE && authoffset >= 4 * CHACHA20_BLOCK_SIZE) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nblocks -= nblocks % 4; nburn = _gcry_chacha20_poly1305_ppc8_blocks4( ctx->input, outbuf, inbuf, nblocks, &c->u_mode.poly1305.ctx.state, authptr); burn = nburn > burn ? nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; authptr += nblocks * CHACHA20_BLOCK_SIZE; } #endif #ifdef USE_S390X_VX_POLY1305 if (ctx->use_s390x) { if (length >= 8 * CHACHA20_BLOCK_SIZE && authoffset >= 8 * CHACHA20_BLOCK_SIZE) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nblocks -= nblocks % 8; burn = _gcry_chacha20_poly1305_s390x_vx_blocks8( ctx->input, outbuf, inbuf, nblocks, &c->u_mode.poly1305.ctx.state, authptr); burn = nburn > burn ? nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; authptr += nblocks * CHACHA20_BLOCK_SIZE; } if (length >= CHACHA20_BLOCK_SIZE && authoffset >= CHACHA20_BLOCK_SIZE) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; burn = _gcry_chacha20_poly1305_s390x_vx_blocks4_2_1( ctx->input, outbuf, inbuf, nblocks, &c->u_mode.poly1305.ctx.state, authptr); burn = nburn > burn ? nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; authptr += nblocks * CHACHA20_BLOCK_SIZE; } } #endif if (authoffset > 0) { _gcry_poly1305_update (&c->u_mode.poly1305.ctx, authptr, authoffset); authptr += authoffset; authoffset = 0; } gcry_assert(authptr == outbuf); } while (length) { size_t currlen = length; /* Since checksumming is done after encryption, process input in 24KiB * chunks to keep data loaded in L1 cache for checksumming. However * only do splitting if input is large enough so that last chunks does * not end up being short. */ if (currlen > 32 * 1024) currlen = 24 * 1024; nburn = do_chacha20_encrypt_stream_tail (ctx, outbuf, inbuf, currlen); burn = nburn > burn ? nburn : burn; nburn = _gcry_poly1305_update_burn (&c->u_mode.poly1305.ctx, outbuf, currlen); burn = nburn > burn ? nburn : burn; outbuf += currlen; inbuf += currlen; length -= currlen; } if (burn) _gcry_burn_stack (burn); return 0; } gcry_err_code_t _gcry_chacha20_poly1305_decrypt(gcry_cipher_hd_t c, byte *outbuf, const byte *inbuf, size_t length) { CHACHA20_context_t *ctx = (void *) &c->context.c; unsigned int nburn, burn = 0; int skip_stitched = 0; if (!length) return 0; if (ctx->unused) { unsigned char *p = ctx->pad; size_t n; gcry_assert (ctx->unused < CHACHA20_BLOCK_SIZE); n = ctx->unused; if (n > length) n = length; nburn = _gcry_poly1305_update_burn (&c->u_mode.poly1305.ctx, inbuf, n); burn = nburn > burn ? 
burn ?
nburn : burn; buf_xor (outbuf, inbuf, p + CHACHA20_BLOCK_SIZE - ctx->unused, n); length -= n; outbuf += n; inbuf += n; ctx->unused -= n; if (!length) { if (burn) _gcry_burn_stack (burn); return 0; } gcry_assert (!ctx->unused); } gcry_assert (c->u_mode.poly1305.ctx.leftover == 0); #ifdef USE_AVX512 if (ctx->use_avx512) { /* Skip stitched chacha20-poly1305 for AVX512. */ skip_stitched = 1; } #endif #ifdef USE_PPC_VEC_POLY1305 if (ctx->use_ppc && ctx->use_p10) { /* Skip stitched chacha20-poly1305 for P10. */ skip_stitched = 1; } #endif #ifdef USE_AVX2 if (!skip_stitched && ctx->use_avx2 && length >= 8 * CHACHA20_BLOCK_SIZE) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nblocks -= nblocks % 8; nburn = _gcry_chacha20_poly1305_amd64_avx2_blocks8( ctx->input, outbuf, inbuf, nblocks, &c->u_mode.poly1305.ctx.state, inbuf); burn = nburn > burn ? nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; } #endif #ifdef USE_SSSE3 if (!skip_stitched && ctx->use_ssse3) { if (length >= 4 * CHACHA20_BLOCK_SIZE) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nblocks -= nblocks % 4; nburn = _gcry_chacha20_poly1305_amd64_ssse3_blocks4( ctx->input, outbuf, inbuf, nblocks, &c->u_mode.poly1305.ctx.state, inbuf); burn = nburn > burn ? nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; } if (length >= CHACHA20_BLOCK_SIZE) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nburn = _gcry_chacha20_poly1305_amd64_ssse3_blocks1( ctx->input, outbuf, inbuf, nblocks, &c->u_mode.poly1305.ctx.state, inbuf); burn = nburn > burn ? nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; } } #endif #ifdef USE_AARCH64_SIMD if (!skip_stitched && ctx->use_neon && length >= 4 * CHACHA20_BLOCK_SIZE) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nblocks -= nblocks % 4; nburn = _gcry_chacha20_poly1305_aarch64_blocks4( ctx->input, outbuf, inbuf, nblocks, &c->u_mode.poly1305.ctx.state, inbuf); burn = nburn > burn ? nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; } #endif #ifdef USE_PPC_VEC_POLY1305 /* skip stitch for p10 */ if (!skip_stitched && ctx->use_ppc && length >= 4 * CHACHA20_BLOCK_SIZE) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nblocks -= nblocks % 4; nburn = _gcry_chacha20_poly1305_ppc8_blocks4( ctx->input, outbuf, inbuf, nblocks, &c->u_mode.poly1305.ctx.state, inbuf); burn = nburn > burn ? nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; } #endif #ifdef USE_S390X_VX_POLY1305 if (!skip_stitched && ctx->use_s390x) { if (length >= 8 * CHACHA20_BLOCK_SIZE) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nblocks -= nblocks % 8; nburn = _gcry_chacha20_poly1305_s390x_vx_blocks8( ctx->input, outbuf, inbuf, nblocks, &c->u_mode.poly1305.ctx.state, inbuf); burn = nburn > burn ? nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; } if (length >= CHACHA20_BLOCK_SIZE) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nburn = _gcry_chacha20_poly1305_s390x_vx_blocks4_2_1( ctx->input, outbuf, inbuf, nblocks, &c->u_mode.poly1305.ctx.state, inbuf); burn = nburn > burn ? 
burn ?
nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; } } #endif while (length) { size_t currlen = length; /* Since checksumming is done before decryption, process input in 24KiB * chunks to keep data loaded in L1 cache for decryption. However only * do splitting if input is large enough so that last chunks does not * end up being short. */ if (currlen > 32 * 1024) currlen = 24 * 1024; nburn = _gcry_poly1305_update_burn (&c->u_mode.poly1305.ctx, inbuf, currlen); burn = nburn > burn ? nburn : burn; nburn = do_chacha20_encrypt_stream_tail (ctx, outbuf, inbuf, currlen); burn = nburn > burn ? nburn : burn; outbuf += currlen; inbuf += currlen; length -= currlen; } if (burn) _gcry_burn_stack (burn); return 0; } static const char * selftest (void) { byte ctxbuf[sizeof(CHACHA20_context_t) + 15]; CHACHA20_context_t *ctx; byte scratch[127 + 1]; byte buf[512 + 64 + 4]; int i; /* From draft-strombergson-chacha-test-vectors */ static byte key_1[] = { 0xc4, 0x6e, 0xc1, 0xb1, 0x8c, 0xe8, 0xa8, 0x78, 0x72, 0x5a, 0x37, 0xe7, 0x80, 0xdf, 0xb7, 0x35, 0x1f, 0x68, 0xed, 0x2e, 0x19, 0x4c, 0x79, 0xfb, 0xc6, 0xae, 0xbe, 0xe1, 0xa6, 0x67, 0x97, 0x5d }; static const byte nonce_1[] = { 0x1a, 0xda, 0x31, 0xd5, 0xcf, 0x68, 0x82, 0x21 }; static const byte plaintext_1[127] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, }; static const byte ciphertext_1[127] = { 0xf6, 0x3a, 0x89, 0xb7, 0x5c, 0x22, 0x71, 0xf9, 0x36, 0x88, 0x16, 0x54, 0x2b, 0xa5, 0x2f, 0x06, 0xed, 0x49, 0x24, 0x17, 0x92, 0x30, 0x2b, 0x00, 0xb5, 0xe8, 0xf8, 0x0a, 0xe9, 0xa4, 0x73, 0xaf, 0xc2, 0x5b, 0x21, 0x8f, 0x51, 0x9a, 0xf0, 0xfd, 0xd4, 0x06, 0x36, 0x2e, 0x8d, 0x69, 0xde, 0x7f, 0x54, 0xc6, 0x04, 0xa6, 0xe0, 0x0f, 0x35, 0x3f, 0x11, 0x0f, 0x77, 0x1b, 0xdc, 0xa8, 0xab, 0x92, 0xe5, 0xfb, 0xc3, 0x4e, 0x60, 0xa1, 0xd9, 0xa9, 0xdb, 0x17, 0x34, 0x5b, 0x0a, 0x40, 0x27, 0x36, 0x85, 0x3b, 0xf9, 0x10, 0xb0, 0x60, 0xbd, 0xf1, 0xf8, 0x97, 0xb6, 0x29, 0x0f, 0x01, 0xd1, 0x38, 0xae, 0x2c, 0x4c, 0x90, 0x22, 0x5b, 0xa9, 0xea, 0x14, 0xd5, 0x18, 0xf5, 0x59, 0x29, 0xde, 0xa0, 0x98, 0xca, 0x7a, 0x6c, 0xcf, 0xe6, 0x12, 0x27, 0x05, 0x3c, 0x84, 0xe4, 0x9a, 0x4a, 0x33 }; /* 16-byte alignment required for amd64 implementation. 
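 * ctxbuf is over-allocated by 15 bytes; the cast below rounds the context
 * pointer up to the next 16-byte boundary by adding 15 and masking off the
 * low four address bits.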
*/ ctx = (CHACHA20_context_t *)((uintptr_t)(ctxbuf + 15) & ~(uintptr_t)15); chacha20_setkey (ctx, key_1, sizeof key_1, NULL); chacha20_setiv (ctx, nonce_1, sizeof nonce_1); scratch[sizeof (scratch) - 1] = 0; chacha20_encrypt_stream (ctx, scratch, plaintext_1, sizeof plaintext_1); if (memcmp (scratch, ciphertext_1, sizeof ciphertext_1)) return "ChaCha20 encryption test 1 failed."; if (scratch[sizeof (scratch) - 1]) return "ChaCha20 wrote too much."; chacha20_setkey (ctx, key_1, sizeof (key_1), NULL); chacha20_setiv (ctx, nonce_1, sizeof nonce_1); chacha20_encrypt_stream (ctx, scratch, scratch, sizeof plaintext_1); if (memcmp (scratch, plaintext_1, sizeof plaintext_1)) return "ChaCha20 decryption test 1 failed."; for (i = 0; i < sizeof buf; i++) buf[i] = i; chacha20_setkey (ctx, key_1, sizeof key_1, NULL); chacha20_setiv (ctx, nonce_1, sizeof nonce_1); /*encrypt */ chacha20_encrypt_stream (ctx, buf, buf, sizeof buf); /*decrypt */ chacha20_setkey (ctx, key_1, sizeof key_1, NULL); chacha20_setiv (ctx, nonce_1, sizeof nonce_1); chacha20_encrypt_stream (ctx, buf, buf, 1); chacha20_encrypt_stream (ctx, buf + 1, buf + 1, (sizeof buf) - 1 - 1); chacha20_encrypt_stream (ctx, buf + (sizeof buf) - 1, buf + (sizeof buf) - 1, 1); for (i = 0; i < sizeof buf; i++) if (buf[i] != (byte) i) return "ChaCha20 encryption test 2 failed."; chacha20_setkey (ctx, key_1, sizeof key_1, NULL); chacha20_setiv (ctx, nonce_1, sizeof nonce_1); /* encrypt */ for (i = 0; i < sizeof buf; i++) chacha20_encrypt_stream (ctx, &buf[i], &buf[i], 1); /* decrypt */ chacha20_setkey (ctx, key_1, sizeof key_1, NULL); chacha20_setiv (ctx, nonce_1, sizeof nonce_1); chacha20_encrypt_stream (ctx, buf, buf, sizeof buf); for (i = 0; i < sizeof buf; i++) if (buf[i] != (byte) i) return "ChaCha20 encryption test 3 failed."; return NULL; } gcry_cipher_spec_t _gcry_cipher_spec_chacha20 = { GCRY_CIPHER_CHACHA20, {0, 0}, /* flags */ "CHACHA20", /* name */ NULL, /* aliases */ NULL, /* oids */ 1, /* blocksize in bytes. */ CHACHA20_MAX_KEY_SIZE * 8, /* standard key length in bits. */ sizeof (CHACHA20_context_t), chacha20_setkey, NULL, NULL, chacha20_encrypt_stream, chacha20_encrypt_stream, NULL, NULL, chacha20_setiv }; diff --git a/cipher/poly1305.c b/cipher/poly1305.c index f5f3f85a..b5f5a19b 100644 --- a/cipher/poly1305.c +++ b/cipher/poly1305.c @@ -1,838 +1,846 @@ /* poly1305.c - Poly1305 internals and generic implementation * Copyright (C) 2014,2017,2018 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser general Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . 
*/ #include #include #include #include #include "types.h" #include "g10lib.h" #include "cipher.h" #include "bufhelp.h" #include "poly1305-internal.h" #include "mpi-internal.h" #include "longlong.h" static const char *selftest (void); #undef HAVE_ASM_POLY1305_BLOCKS #undef USE_MPI_64BIT #undef USE_MPI_32BIT #if BYTES_PER_MPI_LIMB == 8 && defined(HAVE_TYPE_U64) # define USE_MPI_64BIT 1 #elif BYTES_PER_MPI_LIMB == 4 # define USE_MPI_32BIT 1 #else # error please implement for this limb size. #endif /* USE_S390X_ASM indicates whether to enable zSeries code. */ #undef USE_S390X_ASM #if BYTES_PER_MPI_LIMB == 8 # if defined (__s390x__) && __GNUC__ >= 4 && __ARCH__ >= 9 # if defined(HAVE_GCC_INLINE_ASM_S390X) # define USE_S390X_ASM 1 # endif /* USE_S390X_ASM */ # endif #endif /* AMD64 Assembly implementations use SystemV ABI, ABI conversion and * additional stack to store XMM6-XMM15 needed on Win64. */ #undef ASM_FUNC_ABI #undef ASM_FUNC_WRAPPER_ATTR #if defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS) # define ASM_FUNC_ABI __attribute__((sysv_abi)) # define ASM_FUNC_WRAPPER_ATTR __attribute__((noinline)) #else # define ASM_FUNC_ABI # define ASM_FUNC_WRAPPER_ATTR #endif #ifdef USE_S390X_ASM #define HAVE_ASM_POLY1305_BLOCKS 1 extern unsigned int _gcry_poly1305_s390x_blocks1(void *state, const byte *buf, size_t len, byte high_pad); static unsigned int poly1305_blocks (poly1305_context_t *ctx, const byte *buf, size_t len, byte high_pad) { return _gcry_poly1305_s390x_blocks1(&ctx->state, buf, len, high_pad); } #endif /* USE_S390X_ASM */ #ifdef POLY1305_USE_AVX512 extern unsigned int _gcry_poly1305_amd64_avx512_blocks(const void *msg, const u64 msg_len, void *hash, const void *key) ASM_FUNC_ABI; ASM_FUNC_WRAPPER_ATTR static unsigned int poly1305_amd64_avx512_blocks(poly1305_context_t *ctx, const byte *buf, size_t len) { POLY1305_STATE *st = &ctx->state; return _gcry_poly1305_amd64_avx512_blocks(buf, len, st->h, st->r); } #endif /* POLY1305_USE_AVX512 */ #ifdef POLY1305_USE_PPC_VEC extern unsigned int gcry_poly1305_p10le_4blocks(unsigned char *key, const byte *m, size_t len); #endif /* POLY1305_USE_PPC_VEC */ static void poly1305_init (poly1305_context_t *ctx, const byte key[POLY1305_KEYLEN]) { POLY1305_STATE *st = &ctx->state; + unsigned int features = _gcry_get_hw_features (); #ifdef POLY1305_USE_AVX512 - ctx->use_avx512 = (_gcry_get_hw_features () & HWF_INTEL_AVX512) != 0; + ctx->use_avx512 = (features & HWF_INTEL_AVX512) != 0; #endif #ifdef POLY1305_USE_PPC_VEC - ctx->use_p10 = (_gcry_get_hw_features () & HWF_PPC_ARCH_3_10) != 0; + ctx->use_p10 = (features & HWF_PPC_ARCH_3_10) != 0; +# ifdef ENABLE_FORCE_SOFT_HWFEATURES + /* HWF_PPC_ARCH_3_10 above is used as soft HW-feature indicator for P10. + * Actual implementation works with HWF_PPC_ARCH_3_00 also. 
*/ + ctx->use_p10 |= (features & HWF_PPC_ARCH_3_00) != 0; +# endif #endif + (void)features; + ctx->leftover = 0; st->h[0] = 0; st->h[1] = 0; st->h[2] = 0; st->h[3] = 0; st->h[4] = 0; st->r[0] = buf_get_le32(key + 0) & 0x0fffffff; st->r[1] = buf_get_le32(key + 4) & 0x0ffffffc; st->r[2] = buf_get_le32(key + 8) & 0x0ffffffc; st->r[3] = buf_get_le32(key + 12) & 0x0ffffffc; st->k[0] = buf_get_le32(key + 16); st->k[1] = buf_get_le32(key + 20); st->k[2] = buf_get_le32(key + 24); st->k[3] = buf_get_le32(key + 28); } #ifdef USE_MPI_64BIT #if defined (__aarch64__) && defined(HAVE_CPU_ARCH_ARM) && __GNUC__ >= 4 /* A += B (armv8/aarch64) */ #define ADD_1305_64(A2, A1, A0, B2, B1, B0) \ __asm__ ("adds %0, %3, %0\n" \ "adcs %1, %4, %1\n" \ "adc %2, %5, %2\n" \ : "+r" (A0), "+r" (A1), "+r" (A2) \ : "r" (B0), "r" (B1), "r" (B2) \ : "cc" ) #endif /* __aarch64__ */ #if defined (__x86_64__) && defined(HAVE_CPU_ARCH_X86) && __GNUC__ >= 4 /* A += B (x86-64) */ #define ADD_1305_64(A2, A1, A0, B2, B1, B0) \ __asm__ ("addq %3, %0\n" \ "adcq %4, %1\n" \ "adcq %5, %2\n" \ : "+r" (A0), "+r" (A1), "+r" (A2) \ : "g" (B0), "g" (B1), "g" (B2) \ : "cc" ) #endif /* __x86_64__ */ #if defined (__powerpc__) && defined(HAVE_CPU_ARCH_PPC) && __GNUC__ >= 4 /* A += B (ppc64) */ #define ADD_1305_64(A2, A1, A0, B2, B1, B0) \ __asm__ ("addc %0, %3, %0\n" \ "adde %1, %4, %1\n" \ "adde %2, %5, %2\n" \ : "+r" (A0), "+r" (A1), "+r" (A2) \ : "r" (B0), "r" (B1), "r" (B2) \ : "cc" ) #endif /* __powerpc__ */ #ifndef ADD_1305_64 /* A += B (generic, mpi) */ # define ADD_1305_64(A2, A1, A0, B2, B1, B0) do { \ u64 carry; \ add_ssaaaa(carry, A0, 0, A0, 0, B0); \ add_ssaaaa(A2, A1, A2, A1, B2, B1); \ add_ssaaaa(A2, A1, A2, A1, 0, carry); \ } while (0) #endif /* H = H * R mod 2¹³⁰-5 */ #define MUL_MOD_1305_64(H2, H1, H0, R1, R0, R1_MULT5) do { \ u64 x0_lo, x0_hi, x1_lo, x1_hi; \ u64 t0_lo, t0_hi, t1_lo, t1_hi; \ \ /* x = a * r (partial mod 2^130-5) */ \ umul_ppmm(x0_hi, x0_lo, H0, R0); /* h0 * r0 */ \ umul_ppmm(x1_hi, x1_lo, H0, R1); /* h0 * r1 */ \ \ umul_ppmm(t0_hi, t0_lo, H1, R1_MULT5); /* h1 * r1 mod 2^130-5 */ \ add_ssaaaa(x0_hi, x0_lo, x0_hi, x0_lo, t0_hi, t0_lo); \ umul_ppmm(t1_hi, t1_lo, H1, R0); /* h1 * r0 */ \ add_ssaaaa(x1_hi, x1_lo, x1_hi, x1_lo, t1_hi, t1_lo); \ \ t1_lo = H2 * R1_MULT5; /* h2 * r1 mod 2^130-5 */ \ t1_hi = H2 * R0; /* h2 * r0 */ \ add_ssaaaa(H0, H1, x1_hi, x1_lo, t1_hi, t1_lo); \ \ /* carry propagation */ \ H2 = H0 & 3; \ H0 = (H0 >> 2) * 5; /* msb mod 2^130-5 */ \ ADD_1305_64(H2, H1, H0, (u64)0, x0_hi, x0_lo); \ } while (0) #ifndef HAVE_ASM_POLY1305_BLOCKS static unsigned int poly1305_blocks_generic (poly1305_context_t *ctx, const byte *buf, size_t len, byte high_pad) { POLY1305_STATE *st = &ctx->state; u64 r0, r1, r1_mult5; u64 h0, h1, h2; u64 m0, m1, m2; m2 = high_pad; h0 = st->h[0] + ((u64)st->h[1] << 32); h1 = st->h[2] + ((u64)st->h[3] << 32); h2 = st->h[4]; r0 = st->r[0] + ((u64)st->r[1] << 32); r1 = st->r[2] + ((u64)st->r[3] << 32); r1_mult5 = (r1 >> 2) + r1; m0 = buf_get_le64(buf + 0); m1 = buf_get_le64(buf + 8); buf += POLY1305_BLOCKSIZE; len -= POLY1305_BLOCKSIZE; while (len >= POLY1305_BLOCKSIZE) { /* a = h + m */ ADD_1305_64(h2, h1, h0, m2, m1, m0); m0 = buf_get_le64(buf + 0); m1 = buf_get_le64(buf + 8); /* h = a * r (partial mod 2^130-5) */ MUL_MOD_1305_64(h2, h1, h0, r1, r0, r1_mult5); buf += POLY1305_BLOCKSIZE; len -= POLY1305_BLOCKSIZE; } /* a = h + m */ ADD_1305_64(h2, h1, h0, m2, m1, m0); /* h = a * r (partial mod 2^130-5) */ MUL_MOD_1305_64(h2, h1, h0, r1, r0, r1_mult5); st->h[0] = h0; st->h[1] = h0 
>> 32; st->h[2] = h1; st->h[3] = h1 >> 32; st->h[4] = h2; return 6 * sizeof (void *) + 18 * sizeof (u64); } static unsigned int poly1305_blocks (poly1305_context_t *ctx, const byte *buf, size_t len, byte high_pad) { #ifdef POLY1305_USE_AVX512 if ((high_pad & ctx->use_avx512) != 0) return poly1305_amd64_avx512_blocks(ctx, buf, len); #endif return poly1305_blocks_generic(ctx, buf, len, high_pad); } #endif /* !HAVE_ASM_POLY1305_BLOCKS */ static unsigned int poly1305_final (poly1305_context_t *ctx, byte mac[POLY1305_TAGLEN]) { POLY1305_STATE *st = &ctx->state; unsigned int burn = 0; u64 u, carry; u64 k0, k1; u64 h0, h1; u64 h2; /* process the remaining block */ if (ctx->leftover) { ctx->buffer[ctx->leftover++] = 1; if (ctx->leftover < POLY1305_BLOCKSIZE) { memset (&ctx->buffer[ctx->leftover], 0, POLY1305_BLOCKSIZE - ctx->leftover); ctx->leftover = POLY1305_BLOCKSIZE; } burn = poly1305_blocks (ctx, ctx->buffer, POLY1305_BLOCKSIZE, 0); } h0 = st->h[0] + ((u64)st->h[1] << 32); h1 = st->h[2] + ((u64)st->h[3] << 32); h2 = st->h[4]; k0 = st->k[0] + ((u64)st->k[1] << 32); k1 = st->k[2] + ((u64)st->k[3] << 32); /* check if h is more than 2^130-5, by adding 5. */ add_ssaaaa(carry, u, 0, h0, 0, 5); add_ssaaaa(carry, u, 0, carry, 0, h1); u = (carry + h2) >> 2; /* u == 0 or 1 */ /* minus 2^130-5 ... (+5) */ u = (-u) & 5; add_ssaaaa(h1, h0, h1, h0, 0, u); /* add high part of key + h */ add_ssaaaa(h1, h0, h1, h0, k1, k0); buf_put_le64(mac + 0, h0); buf_put_le64(mac + 8, h1); /* burn_stack */ return 4 * sizeof (void *) + 7 * sizeof (u64) + burn; } #endif /* USE_MPI_64BIT */ #ifdef USE_MPI_32BIT #ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS /* HI:LO += A * B (arm) */ #define UMUL_ADD_32(HI, LO, A, B) \ __asm__ ("umlal %1, %0, %4, %5" \ : "=r" (HI), "=r" (LO) \ : "0" (HI), "1" (LO), "r" (A), "r" (B) ) /* A += B (arm) */ #ifdef __GCC_ASM_FLAG_OUTPUTS__ # define ADD_1305_32(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0) do { \ u32 __carry; \ __asm__ ("adds %0, %0, %5\n" \ "adcs %1, %1, %6\n" \ "adcs %2, %2, %7\n" \ "adcs %3, %3, %8\n" \ : "+r" (A0), "+r" (A1), "+r" (A2), "+r" (A3), \ "=@cccs" (__carry) \ : "r" (B0), "r" (B1), "r" (B2), "r" (B3) \ : ); \ (A4) += (B4) + __carry; \ } while (0) #else # define ADD_1305_32(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0) do { \ u32 __carry = (B0); \ __asm__ ("adds %0, %0, %2\n" \ "adcs %1, %1, %3\n" \ "rrx %2, %2\n" /* carry to 31th bit */ \ : "+r" (A0), "+r" (A1), "+r" (__carry) \ : "r" (B1), "r" (0) \ : "cc" ); \ __asm__ ("lsls %0, %0, #1\n" /* carry from 31th bit */ \ "adcs %1, %1, %4\n" \ "adcs %2, %2, %5\n" \ "adc %3, %3, %6\n" \ : "+r" (__carry), "+r" (A2), "+r" (A3), "+r" (A4) \ : "r" (B2), "r" (B3), "r" (B4) \ : "cc" ); \ } while (0) #endif #endif /* HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS */ #if defined (__i386__) && defined(HAVE_CPU_ARCH_X86) && __GNUC__ >= 5 /* Note: ADD_1305_32 below does not compile on GCC-4.7 */ /* A += B (i386) */ #define ADD_1305_32(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0) \ __asm__ ("addl %5, %0\n" \ "adcl %6, %1\n" \ "adcl %7, %2\n" \ "adcl %8, %3\n" \ "adcl %9, %4\n" \ : "+r" (A0), "+r" (A1), "+r" (A2), "+r" (A3), "+r" (A4) \ : "g" (B0), "g" (B1), "g" (B2), "g" (B3), "g" (B4) \ : "cc" ) #endif /* __i386__ */ #ifndef UMUL_ADD_32 /* HI:LO += A * B (generic, mpi) */ # define UMUL_ADD_32(HI, LO, A, B) do { \ u32 t_lo, t_hi; \ umul_ppmm(t_hi, t_lo, A, B); \ add_ssaaaa(HI, LO, HI, LO, t_hi, t_lo); \ } while (0) #endif #ifndef ADD_1305_32 /* A += B (generic, mpi) */ # define ADD_1305_32(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0) do { \ u32 carry0, carry1, carry2; \ 
add_ssaaaa(carry0, A0, 0, A0, 0, B0); \ add_ssaaaa(carry1, A1, 0, A1, 0, B1); \ add_ssaaaa(carry1, A1, carry1, A1, 0, carry0); \ add_ssaaaa(carry2, A2, 0, A2, 0, B2); \ add_ssaaaa(carry2, A2, carry2, A2, 0, carry1); \ add_ssaaaa(A4, A3, A4, A3, B4, B3); \ add_ssaaaa(A4, A3, A4, A3, 0, carry2); \ } while (0) #endif /* H = H * R mod 2¹³⁰-5 */ #define MUL_MOD_1305_32(H4, H3, H2, H1, H0, R3, R2, R1, R0, \ R3_MULT5, R2_MULT5, R1_MULT5) do { \ u32 x0_lo, x0_hi, x1_lo, x1_hi, x2_lo, x2_hi, x3_lo, x3_hi; \ u32 t0_lo, t0_hi; \ \ /* x = a * r (partial mod 2^130-5) */ \ umul_ppmm(x0_hi, x0_lo, H0, R0); /* h0 * r0 */ \ umul_ppmm(x1_hi, x1_lo, H0, R1); /* h0 * r1 */ \ umul_ppmm(x2_hi, x2_lo, H0, R2); /* h0 * r2 */ \ umul_ppmm(x3_hi, x3_lo, H0, R3); /* h0 * r3 */ \ \ UMUL_ADD_32(x0_hi, x0_lo, H1, R3_MULT5); /* h1 * r3 mod 2^130-5 */ \ UMUL_ADD_32(x1_hi, x1_lo, H1, R0); /* h1 * r0 */ \ UMUL_ADD_32(x2_hi, x2_lo, H1, R1); /* h1 * r1 */ \ UMUL_ADD_32(x3_hi, x3_lo, H1, R2); /* h1 * r2 */ \ \ UMUL_ADD_32(x0_hi, x0_lo, H2, R2_MULT5); /* h2 * r2 mod 2^130-5 */ \ UMUL_ADD_32(x1_hi, x1_lo, H2, R3_MULT5); /* h2 * r3 mod 2^130-5 */ \ UMUL_ADD_32(x2_hi, x2_lo, H2, R0); /* h2 * r0 */ \ UMUL_ADD_32(x3_hi, x3_lo, H2, R1); /* h2 * r1 */ \ \ UMUL_ADD_32(x0_hi, x0_lo, H3, R1_MULT5); /* h3 * r1 mod 2^130-5 */ \ H1 = x0_hi; \ UMUL_ADD_32(x1_hi, x1_lo, H3, R2_MULT5); /* h3 * r2 mod 2^130-5 */ \ UMUL_ADD_32(x2_hi, x2_lo, H3, R3_MULT5); /* h3 * r3 mod 2^130-5 */ \ UMUL_ADD_32(x3_hi, x3_lo, H3, R0); /* h3 * r0 */ \ \ t0_lo = H4 * R1_MULT5; /* h4 * r1 mod 2^130-5 */ \ t0_hi = H4 * R2_MULT5; /* h4 * r2 mod 2^130-5 */ \ add_ssaaaa(H2, x1_lo, x1_hi, x1_lo, 0, t0_lo); \ add_ssaaaa(H3, x2_lo, x2_hi, x2_lo, 0, t0_hi); \ t0_lo = H4 * R3_MULT5; /* h4 * r3 mod 2^130-5 */ \ t0_hi = H4 * R0; /* h4 * r0 */ \ add_ssaaaa(H4, x3_lo, x3_hi, x3_lo, t0_hi, t0_lo); \ \ /* carry propagation */ \ H0 = (H4 >> 2) * 5; /* msb mod 2^130-5 */ \ H4 = H4 & 3; \ ADD_1305_32(H4, H3, H2, H1, H0, 0, x3_lo, x2_lo, x1_lo, x0_lo); \ } while (0) #ifndef HAVE_ASM_POLY1305_BLOCKS static unsigned int poly1305_blocks (poly1305_context_t *ctx, const byte *buf, size_t len, byte high_pad) { POLY1305_STATE *st = &ctx->state; u32 r1_mult5, r2_mult5, r3_mult5; u32 h0, h1, h2, h3, h4; u32 m0, m1, m2, m3, m4; m4 = high_pad; h0 = st->h[0]; h1 = st->h[1]; h2 = st->h[2]; h3 = st->h[3]; h4 = st->h[4]; r1_mult5 = (st->r[1] >> 2) + st->r[1]; r2_mult5 = (st->r[2] >> 2) + st->r[2]; r3_mult5 = (st->r[3] >> 2) + st->r[3]; while (len >= POLY1305_BLOCKSIZE) { m0 = buf_get_le32(buf + 0); m1 = buf_get_le32(buf + 4); m2 = buf_get_le32(buf + 8); m3 = buf_get_le32(buf + 12); /* a = h + m */ ADD_1305_32(h4, h3, h2, h1, h0, m4, m3, m2, m1, m0); /* h = a * r (partial mod 2^130-5) */ MUL_MOD_1305_32(h4, h3, h2, h1, h0, st->r[3], st->r[2], st->r[1], st->r[0], r3_mult5, r2_mult5, r1_mult5); buf += POLY1305_BLOCKSIZE; len -= POLY1305_BLOCKSIZE; } st->h[0] = h0; st->h[1] = h1; st->h[2] = h2; st->h[3] = h3; st->h[4] = h4; return 6 * sizeof (void *) + 28 * sizeof (u32); } #endif /* !HAVE_ASM_POLY1305_BLOCKS */ static unsigned int poly1305_final (poly1305_context_t *ctx, byte mac[POLY1305_TAGLEN]) { POLY1305_STATE *st = &ctx->state; unsigned int burn = 0; u32 carry, tmp0, tmp1, tmp2, u; u32 h4, h3, h2, h1, h0; /* process the remaining block */ if (ctx->leftover) { ctx->buffer[ctx->leftover++] = 1; if (ctx->leftover < POLY1305_BLOCKSIZE) { memset (&ctx->buffer[ctx->leftover], 0, POLY1305_BLOCKSIZE - ctx->leftover); ctx->leftover = POLY1305_BLOCKSIZE; } burn = poly1305_blocks (ctx, ctx->buffer, 
POLY1305_BLOCKSIZE, 0); } h0 = st->h[0]; h1 = st->h[1]; h2 = st->h[2]; h3 = st->h[3]; h4 = st->h[4]; /* check if h is more than 2^130-5, by adding 5. */ add_ssaaaa(carry, tmp0, 0, h0, 0, 5); add_ssaaaa(carry, tmp0, 0, carry, 0, h1); add_ssaaaa(carry, tmp0, 0, carry, 0, h2); add_ssaaaa(carry, tmp0, 0, carry, 0, h3); u = (carry + h4) >> 2; /* u == 0 or 1 */ /* minus 2^130-5 ... (+5) */ u = (-u) & 5; add_ssaaaa(carry, h0, 0, h0, 0, u); add_ssaaaa(carry, h1, 0, h1, 0, carry); add_ssaaaa(carry, h2, 0, h2, 0, carry); add_ssaaaa(carry, h3, 0, h3, 0, carry); /* add high part of key + h */ add_ssaaaa(tmp0, h0, 0, h0, 0, st->k[0]); add_ssaaaa(tmp1, h1, 0, h1, 0, st->k[1]); add_ssaaaa(tmp1, h1, tmp1, h1, 0, tmp0); add_ssaaaa(tmp2, h2, 0, h2, 0, st->k[2]); add_ssaaaa(tmp2, h2, tmp2, h2, 0, tmp1); add_ssaaaa(carry, h3, 0, h3, 0, st->k[3]); h3 += tmp2; buf_put_le32(mac + 0, h0); buf_put_le32(mac + 4, h1); buf_put_le32(mac + 8, h2); buf_put_le32(mac + 12, h3); /* burn_stack */ return 4 * sizeof (void *) + 10 * sizeof (u32) + burn; } #endif /* USE_MPI_32BIT */ unsigned int _gcry_poly1305_update_burn (poly1305_context_t *ctx, const byte *m, size_t bytes) { unsigned int burn = 0; unsigned int nburn; /* handle leftover */ if (ctx->leftover) { size_t want = (POLY1305_BLOCKSIZE - ctx->leftover); if (want > bytes) want = bytes; buf_cpy (ctx->buffer + ctx->leftover, m, want); bytes -= want; m += want; ctx->leftover += want; if (ctx->leftover < POLY1305_BLOCKSIZE) return 0; nburn = poly1305_blocks (ctx, ctx->buffer, POLY1305_BLOCKSIZE, 1); burn = nburn > burn ? nburn : burn; ctx->leftover = 0; } #ifdef POLY1305_USE_PPC_VEC /* PPC-P10/little-endian: bulk process multiples of eight blocks */ if (ctx->use_p10 && bytes >= POLY1305_BLOCKSIZE * 8) { size_t nblks = bytes / (POLY1305_BLOCKSIZE * 8); size_t len = nblks * (POLY1305_BLOCKSIZE * 8); POLY1305_STATE *st = &ctx->state; nburn = gcry_poly1305_p10le_4blocks ((unsigned char *) st, m, len); burn = nburn > burn ? nburn : burn; m += len; bytes -= len; } #endif /* POLY1305_USE_PPC_VEC */ /* process full blocks */ if (bytes >= POLY1305_BLOCKSIZE) { size_t nblks = bytes / POLY1305_BLOCKSIZE; nburn = poly1305_blocks (ctx, m, nblks * POLY1305_BLOCKSIZE, 1); burn = nburn > burn ? 
burn ?
nburn : burn; m += nblks * POLY1305_BLOCKSIZE; bytes -= nblks * POLY1305_BLOCKSIZE; } /* store leftover */ if (bytes) { buf_cpy (ctx->buffer + ctx->leftover, m, bytes); ctx->leftover += bytes; } return burn; } void _gcry_poly1305_update (poly1305_context_t *ctx, const byte *m, size_t bytes) { unsigned int burn; burn = _gcry_poly1305_update_burn (ctx, m, bytes); if (burn) _gcry_burn_stack (burn); } void _gcry_poly1305_finish (poly1305_context_t *ctx, byte mac[POLY1305_TAGLEN]) { unsigned int burn; burn = poly1305_final (ctx, mac); _gcry_burn_stack (burn); } gcry_err_code_t _gcry_poly1305_init (poly1305_context_t * ctx, const byte * key, size_t keylen) { static int initialized; static const char *selftest_failed; if (!initialized) { initialized = 1; selftest_failed = selftest (); if (selftest_failed) log_error ("Poly1305 selftest failed (%s)\n", selftest_failed); } if (keylen != POLY1305_KEYLEN) return GPG_ERR_INV_KEYLEN; if (selftest_failed) return GPG_ERR_SELFTEST_FAILED; poly1305_init (ctx, key); return 0; } static void poly1305_auth (byte mac[POLY1305_TAGLEN], const byte * m, size_t bytes, const byte * key) { poly1305_context_t ctx; memset (&ctx, 0, sizeof (ctx)); _gcry_poly1305_init (&ctx, key, POLY1305_KEYLEN); _gcry_poly1305_update (&ctx, m, bytes); _gcry_poly1305_finish (&ctx, mac); wipememory (&ctx, sizeof (ctx)); } static const char * selftest (void) { /* example from nacl */ static const byte nacl_key[POLY1305_KEYLEN] = { 0xee, 0xa6, 0xa7, 0x25, 0x1c, 0x1e, 0x72, 0x91, 0x6d, 0x11, 0xc2, 0xcb, 0x21, 0x4d, 0x3c, 0x25, 0x25, 0x39, 0x12, 0x1d, 0x8e, 0x23, 0x4e, 0x65, 0x2d, 0x65, 0x1f, 0xa4, 0xc8, 0xcf, 0xf8, 0x80, }; static const byte nacl_msg[131] = { 0x8e, 0x99, 0x3b, 0x9f, 0x48, 0x68, 0x12, 0x73, 0xc2, 0x96, 0x50, 0xba, 0x32, 0xfc, 0x76, 0xce, 0x48, 0x33, 0x2e, 0xa7, 0x16, 0x4d, 0x96, 0xa4, 0x47, 0x6f, 0xb8, 0xc5, 0x31, 0xa1, 0x18, 0x6a, 0xc0, 0xdf, 0xc1, 0x7c, 0x98, 0xdc, 0xe8, 0x7b, 0x4d, 0xa7, 0xf0, 0x11, 0xec, 0x48, 0xc9, 0x72, 0x71, 0xd2, 0xc2, 0x0f, 0x9b, 0x92, 0x8f, 0xe2, 0x27, 0x0d, 0x6f, 0xb8, 0x63, 0xd5, 0x17, 0x38, 0xb4, 0x8e, 0xee, 0xe3, 0x14, 0xa7, 0xcc, 0x8a, 0xb9, 0x32, 0x16, 0x45, 0x48, 0xe5, 0x26, 0xae, 0x90, 0x22, 0x43, 0x68, 0x51, 0x7a, 0xcf, 0xea, 0xbd, 0x6b, 0xb3, 0x73, 0x2b, 0xc0, 0xe9, 0xda, 0x99, 0x83, 0x2b, 0x61, 0xca, 0x01, 0xb6, 0xde, 0x56, 0x24, 0x4a, 0x9e, 0x88, 0xd5, 0xf9, 0xb3, 0x79, 0x73, 0xf6, 0x22, 0xa4, 0x3d, 0x14, 0xa6, 0x59, 0x9b, 0x1f, 0x65, 0x4c, 0xb4, 0x5a, 0x74, 0xe3, 0x55, 0xa5 }; static const byte nacl_mac[16] = { 0xf3, 0xff, 0xc7, 0x70, 0x3f, 0x94, 0x00, 0xe5, 0x2a, 0x7d, 0xfb, 0x4b, 0x3d, 0x33, 0x05, 0xd9 }; /* generates a final value of (2^130 - 2) == 3 */ static const byte wrap_key[POLY1305_KEYLEN] = { 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, }; static const byte wrap_msg[16] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; static const byte wrap_mac[16] = { 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, }; /* mac of the macs of messages of length 0 to 256, where the key and messages * have all their values set to the length */ static const byte total_key[POLY1305_KEYLEN] = { 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xff, 0xfe, 0xfd, 0xfc, 0xfb, 0xfa, 0xf9, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; static const byte total_mac[16] = { 0x64, 
0xaf, 0xe2, 0xe8, 0xd6, 0xad, 0x7b, 0xbd, 0xd2, 0x87, 0xf9, 0x7c, 0x44, 0x62, 0x3d, 0x39 }; poly1305_context_t ctx; poly1305_context_t total_ctx; byte all_key[POLY1305_KEYLEN]; byte all_msg[256]; byte mac[16]; size_t i, j; memset (&ctx, 0, sizeof (ctx)); memset (&total_ctx, 0, sizeof (total_ctx)); memset (mac, 0, sizeof (mac)); poly1305_auth (mac, nacl_msg, sizeof (nacl_msg), nacl_key); if (memcmp (nacl_mac, mac, sizeof (nacl_mac)) != 0) return "Poly1305 test 1 failed."; /* SSE2/AVX have a 32 byte block size, but also support 64 byte blocks, so * make sure everything still works varying between them */ memset (mac, 0, sizeof (mac)); _gcry_poly1305_init (&ctx, nacl_key, POLY1305_KEYLEN); _gcry_poly1305_update (&ctx, nacl_msg + 0, 32); _gcry_poly1305_update (&ctx, nacl_msg + 32, 64); _gcry_poly1305_update (&ctx, nacl_msg + 96, 16); _gcry_poly1305_update (&ctx, nacl_msg + 112, 8); _gcry_poly1305_update (&ctx, nacl_msg + 120, 4); _gcry_poly1305_update (&ctx, nacl_msg + 124, 2); _gcry_poly1305_update (&ctx, nacl_msg + 126, 1); _gcry_poly1305_update (&ctx, nacl_msg + 127, 1); _gcry_poly1305_update (&ctx, nacl_msg + 128, 1); _gcry_poly1305_update (&ctx, nacl_msg + 129, 1); _gcry_poly1305_update (&ctx, nacl_msg + 130, 1); _gcry_poly1305_finish (&ctx, mac); if (memcmp (nacl_mac, mac, sizeof (nacl_mac)) != 0) return "Poly1305 test 2 failed."; memset (mac, 0, sizeof (mac)); poly1305_auth (mac, wrap_msg, sizeof (wrap_msg), wrap_key); if (memcmp (wrap_mac, mac, sizeof (nacl_mac)) != 0) return "Poly1305 test 3 failed."; _gcry_poly1305_init (&total_ctx, total_key, POLY1305_KEYLEN); for (i = 0; i < 256; i++) { /* set key and message to 'i,i,i..' */ for (j = 0; j < sizeof (all_key); j++) all_key[j] = i; for (j = 0; j < i; j++) all_msg[j] = i; poly1305_auth (mac, all_msg, i, all_key); _gcry_poly1305_update (&total_ctx, mac, 16); } _gcry_poly1305_finish (&total_ctx, mac); if (memcmp (total_mac, mac, sizeof (total_mac)) != 0) return "Poly1305 test 4 failed."; return NULL; } diff --git a/cipher/rijndael.c b/cipher/rijndael.c index dddcbc54..7e75ddd2 100644 --- a/cipher/rijndael.c +++ b/cipher/rijndael.c @@ -1,2026 +1,2032 @@ /* Rijndael (AES) for GnuPG * Copyright (C) 2000, 2001, 2002, 2003, 2007, * 2008, 2011, 2012 Free Software Foundation, Inc. * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . ******************************************************************* * The code here is based on the optimized implementation taken from * http://www.esat.kuleuven.ac.be/~rijmen/rijndael/ on Oct 2, 2000, * which carries this notice: *------------------------------------------ * rijndael-alg-fst.c v2.3 April '2000 * * Optimised ANSI C code * * authors: v1.0: Antoon Bosselaers * v2.0: Vincent Rijmen * v2.3: Paulo Barreto * * This code is placed in the public domain. 
*------------------------------------------ * * The SP800-38a document is available at: * http://csrc.nist.gov/publications/nistpubs/800-38a/sp800-38a.pdf * */ #include #include #include #include /* for memcmp() */ #include "types.h" /* for byte and u32 typedefs */ #include "g10lib.h" #include "cipher.h" #include "bufhelp.h" #include "rijndael-internal.h" #include "./cipher-internal.h" #ifdef USE_AMD64_ASM /* AMD64 assembly implementations of AES */ extern unsigned int _gcry_aes_amd64_encrypt_block(const void *keysched_enc, unsigned char *out, const unsigned char *in, int rounds, const void *encT); extern unsigned int _gcry_aes_amd64_decrypt_block(const void *keysched_dec, unsigned char *out, const unsigned char *in, int rounds, const void *decT); #endif /*USE_AMD64_ASM*/ #ifdef USE_AESNI /* AES-NI (AMD64 & i386) accelerated implementations of AES */ extern void _gcry_aes_aesni_do_setkey(RIJNDAEL_context *ctx, const byte *key); extern void _gcry_aes_aesni_prepare_decryption(RIJNDAEL_context *ctx); extern unsigned int _gcry_aes_aesni_encrypt (const RIJNDAEL_context *ctx, unsigned char *dst, const unsigned char *src); extern unsigned int _gcry_aes_aesni_decrypt (const RIJNDAEL_context *ctx, unsigned char *dst, const unsigned char *src); extern void _gcry_aes_aesni_cfb_enc (void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks); extern void _gcry_aes_aesni_cbc_enc (void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int cbc_mac); extern void _gcry_aes_aesni_ctr_enc (void *context, unsigned char *ctr, void *outbuf_arg, const void *inbuf_arg, size_t nblocks); extern void _gcry_aes_aesni_ctr32le_enc (void *context, unsigned char *ctr, void *outbuf_arg, const void *inbuf_arg, size_t nblocks); extern void _gcry_aes_aesni_cfb_dec (void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks); extern void _gcry_aes_aesni_cbc_dec (void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks); extern size_t _gcry_aes_aesni_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int encrypt); extern size_t _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks); extern void _gcry_aes_aesni_xts_crypt (void *context, unsigned char *tweak, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int encrypt); #endif #ifdef USE_VAES /* VAES (AMD64) accelerated implementation of AES */ extern void _gcry_aes_vaes_cfb_dec (void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks); extern void _gcry_aes_vaes_cbc_dec (void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks); extern void _gcry_aes_vaes_ctr_enc (void *context, unsigned char *ctr, void *outbuf_arg, const void *inbuf_arg, size_t nblocks); extern void _gcry_aes_vaes_ctr32le_enc (void *context, unsigned char *ctr, void *outbuf_arg, const void *inbuf_arg, size_t nblocks); extern size_t _gcry_aes_vaes_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int encrypt); extern void _gcry_aes_vaes_xts_crypt (void *context, unsigned char *tweak, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int encrypt); #endif #ifdef USE_SSSE3 /* SSSE3 (AMD64) vector permutation implementation of AES */ extern void _gcry_aes_ssse3_do_setkey(RIJNDAEL_context *ctx, const byte *key); extern void _gcry_aes_ssse3_prepare_decryption(RIJNDAEL_context *ctx); extern unsigned int 
_gcry_aes_ssse3_encrypt (const RIJNDAEL_context *ctx, unsigned char *dst, const unsigned char *src); extern unsigned int _gcry_aes_ssse3_decrypt (const RIJNDAEL_context *ctx, unsigned char *dst, const unsigned char *src); extern void _gcry_aes_ssse3_cfb_enc (void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks); extern void _gcry_aes_ssse3_cbc_enc (void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int cbc_mac); extern void _gcry_aes_ssse3_ctr_enc (void *context, unsigned char *ctr, void *outbuf_arg, const void *inbuf_arg, size_t nblocks); extern void _gcry_aes_ssse3_cfb_dec (void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks); extern void _gcry_aes_ssse3_cbc_dec (void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks); extern size_t _gcry_aes_ssse3_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int encrypt); extern size_t _gcry_aes_ssse3_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks); #endif #ifdef USE_PADLOCK extern unsigned int _gcry_aes_padlock_encrypt (const RIJNDAEL_context *ctx, unsigned char *bx, const unsigned char *ax); extern unsigned int _gcry_aes_padlock_decrypt (const RIJNDAEL_context *ctx, unsigned char *bx, const unsigned char *ax); extern void _gcry_aes_padlock_prepare_decryption (RIJNDAEL_context *ctx); #endif #ifdef USE_ARM_ASM /* ARM assembly implementations of AES */ extern unsigned int _gcry_aes_arm_encrypt_block(const void *keysched_enc, unsigned char *out, const unsigned char *in, int rounds, const void *encT); extern unsigned int _gcry_aes_arm_decrypt_block(const void *keysched_dec, unsigned char *out, const unsigned char *in, int rounds, const void *decT); #endif /*USE_ARM_ASM*/ #ifdef USE_ARM_CE /* ARMv8 Crypto Extension implementations of AES */ extern void _gcry_aes_armv8_ce_setkey(RIJNDAEL_context *ctx, const byte *key); extern void _gcry_aes_armv8_ce_prepare_decryption(RIJNDAEL_context *ctx); extern unsigned int _gcry_aes_armv8_ce_encrypt(const RIJNDAEL_context *ctx, unsigned char *dst, const unsigned char *src); extern unsigned int _gcry_aes_armv8_ce_decrypt(const RIJNDAEL_context *ctx, unsigned char *dst, const unsigned char *src); extern void _gcry_aes_armv8_ce_cfb_enc (void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks); extern void _gcry_aes_armv8_ce_cbc_enc (void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int cbc_mac); extern void _gcry_aes_armv8_ce_ctr_enc (void *context, unsigned char *ctr, void *outbuf_arg, const void *inbuf_arg, size_t nblocks); extern void _gcry_aes_armv8_ce_ctr32le_enc (void *context, unsigned char *ctr, void *outbuf_arg, const void *inbuf_arg, size_t nblocks); extern void _gcry_aes_armv8_ce_cfb_dec (void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks); extern void _gcry_aes_armv8_ce_cbc_dec (void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks); extern size_t _gcry_aes_armv8_ce_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int encrypt); extern size_t _gcry_aes_armv8_ce_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks); extern void _gcry_aes_armv8_ce_xts_crypt (void *context, unsigned char *tweak, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int encrypt); #endif /*USE_ARM_ASM*/ #ifdef 
USE_PPC_CRYPTO /* PowerPC Crypto implementations of AES */ extern void _gcry_aes_ppc8_setkey(RIJNDAEL_context *ctx, const byte *key); extern void _gcry_aes_ppc8_prepare_decryption(RIJNDAEL_context *ctx); extern unsigned int _gcry_aes_ppc8_encrypt(const RIJNDAEL_context *ctx, unsigned char *dst, const unsigned char *src); extern unsigned int _gcry_aes_ppc8_decrypt(const RIJNDAEL_context *ctx, unsigned char *dst, const unsigned char *src); extern void _gcry_aes_ppc8_cfb_enc (void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks); extern void _gcry_aes_ppc8_cbc_enc (void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int cbc_mac); extern void _gcry_aes_ppc8_ctr_enc (void *context, unsigned char *ctr, void *outbuf_arg, const void *inbuf_arg, size_t nblocks); extern void _gcry_aes_ppc8_cfb_dec (void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks); extern void _gcry_aes_ppc8_cbc_dec (void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks); extern size_t _gcry_aes_ppc8_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int encrypt); extern size_t _gcry_aes_ppc8_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks); extern void _gcry_aes_ppc8_xts_crypt (void *context, unsigned char *tweak, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int encrypt); #endif /*USE_PPC_CRYPTO*/ #ifdef USE_PPC_CRYPTO_WITH_PPC9LE /* Power9 little-endian crypto implementations of AES */ extern unsigned int _gcry_aes_ppc9le_encrypt(const RIJNDAEL_context *ctx, unsigned char *dst, const unsigned char *src); extern unsigned int _gcry_aes_ppc9le_decrypt(const RIJNDAEL_context *ctx, unsigned char *dst, const unsigned char *src); extern void _gcry_aes_ppc9le_cfb_enc (void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks); extern void _gcry_aes_ppc9le_cbc_enc (void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int cbc_mac); extern void _gcry_aes_ppc9le_ctr_enc (void *context, unsigned char *ctr, void *outbuf_arg, const void *inbuf_arg, size_t nblocks); extern void _gcry_aes_ppc9le_cfb_dec (void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks); extern void _gcry_aes_ppc9le_cbc_dec (void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks); extern size_t _gcry_aes_ppc9le_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int encrypt); extern size_t _gcry_aes_ppc9le_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks); extern void _gcry_aes_ppc9le_xts_crypt (void *context, unsigned char *tweak, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int encrypt); extern size_t _gcry_aes_p10le_gcm_crypt (gcry_cipher_hd_t c, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int encrypt); #endif /*USE_PPC_CRYPTO_WITH_PPC9LE*/ #ifdef USE_S390X_CRYPTO /* zSeries crypto implementations of AES */ extern int _gcry_aes_s390x_setup_acceleration(RIJNDAEL_context *ctx, unsigned int keylen, unsigned int hwfeatures, cipher_bulk_ops_t *bulk_ops); extern void _gcry_aes_s390x_setkey(RIJNDAEL_context *ctx, const byte *key); extern void _gcry_aes_s390x_prepare_decryption(RIJNDAEL_context *ctx); extern unsigned int _gcry_aes_s390x_encrypt(const RIJNDAEL_context *ctx, unsigned char *dst, const unsigned char *src); extern unsigned int 
_gcry_aes_s390x_decrypt(const RIJNDAEL_context *ctx, unsigned char *dst, const unsigned char *src); #endif /*USE_S390X_CRYPTO*/ static unsigned int do_encrypt (const RIJNDAEL_context *ctx, unsigned char *bx, const unsigned char *ax); static unsigned int do_decrypt (const RIJNDAEL_context *ctx, unsigned char *bx, const unsigned char *ax); static void _gcry_aes_cfb_enc (void *context, unsigned char *iv, void *outbuf, const void *inbuf, size_t nblocks); static void _gcry_aes_cfb_dec (void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks); static void _gcry_aes_cbc_enc (void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int cbc_mac); static void _gcry_aes_cbc_dec (void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks); static void _gcry_aes_ctr_enc (void *context, unsigned char *ctr, void *outbuf_arg, const void *inbuf_arg, size_t nblocks); static size_t _gcry_aes_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int encrypt); static size_t _gcry_aes_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks); static void _gcry_aes_xts_crypt (void *context, unsigned char *tweak, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int encrypt); /* All the numbers. */ #include "rijndael-tables.h" /* Function prototypes. */ static const char *selftest(void); static void prepare_decryption(RIJNDAEL_context *ctx); /* Prefetching for encryption/decryption tables. */ static inline void prefetch_table(const volatile byte *tab, size_t len) { size_t i; for (i = 0; len - i >= 8 * 32; i += 8 * 32) { (void)tab[i + 0 * 32]; (void)tab[i + 1 * 32]; (void)tab[i + 2 * 32]; (void)tab[i + 3 * 32]; (void)tab[i + 4 * 32]; (void)tab[i + 5 * 32]; (void)tab[i + 6 * 32]; (void)tab[i + 7 * 32]; } for (; i < len; i += 32) { (void)tab[i]; } (void)tab[len - 1]; } static void prefetch_enc(void) { /* Modify counters to trigger copy-on-write and unsharing if physical pages * of look-up table are shared between processes. Modifying counters also * causes checksums for pages to change and hint same-page merging algorithm * that these pages are frequently changing. */ enc_tables.counter_head++; enc_tables.counter_tail++; /* Prefetch look-up tables to cache. */ prefetch_table((const void *)&enc_tables, sizeof(enc_tables)); } static void prefetch_dec(void) { /* Modify counters to trigger copy-on-write and unsharing if physical pages * of look-up table are shared between processes. Modifying counters also * causes checksums for pages to change and hint same-page merging algorithm * that these pages are frequently changing. */ dec_tables.counter_head++; dec_tables.counter_tail++; /* Prefetch look-up tables to cache. */ prefetch_table((const void *)&dec_tables, sizeof(dec_tables)); } /* Perform the key setup. */ static gcry_err_code_t do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen, cipher_bulk_ops_t *bulk_ops) { static int initialized = 0; static const char *selftest_failed = 0; void (*hw_setkey)(RIJNDAEL_context *ctx, const byte *key) = NULL; int rounds; int i,j, r, t, rconpointer = 0; int KC; unsigned int hwfeatures; /* The on-the-fly self tests are only run in non-fips mode. In fips mode explicit self-tests are required. Actually the on-the-fly self-tests are not fully thread-safe and it might happen that a failed self-test won't get noticed in another thread. FIXME: We might want to have a central registry of succeeded self-tests. 
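Note that a failed on-the-fly self-test is latched in the static selftest_failed pointer, so every later key setup in this process fails with GPG_ERR_SELFTEST_FAILED.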
*/ if (!fips_mode () && !initialized) { initialized = 1; selftest_failed = selftest (); if (selftest_failed) log_error ("%s\n", selftest_failed ); } if (selftest_failed) return GPG_ERR_SELFTEST_FAILED; if( keylen == 128/8 ) { rounds = 10; KC = 4; } else if ( keylen == 192/8 ) { rounds = 12; KC = 6; } else if ( keylen == 256/8 ) { rounds = 14; KC = 8; } else return GPG_ERR_INV_KEYLEN; ctx->rounds = rounds; hwfeatures = _gcry_get_hw_features (); ctx->decryption_prepared = 0; /* Setup default bulk encryption routines. */ memset (bulk_ops, 0, sizeof(*bulk_ops)); bulk_ops->cfb_enc = _gcry_aes_cfb_enc; bulk_ops->cfb_dec = _gcry_aes_cfb_dec; bulk_ops->cbc_enc = _gcry_aes_cbc_enc; bulk_ops->cbc_dec = _gcry_aes_cbc_dec; bulk_ops->ctr_enc = _gcry_aes_ctr_enc; bulk_ops->ocb_crypt = _gcry_aes_ocb_crypt; bulk_ops->ocb_auth = _gcry_aes_ocb_auth; bulk_ops->xts_crypt = _gcry_aes_xts_crypt; (void)hwfeatures; if (0) { ; } #ifdef USE_AESNI else if (hwfeatures & HWF_INTEL_AESNI) { hw_setkey = _gcry_aes_aesni_do_setkey; ctx->encrypt_fn = _gcry_aes_aesni_encrypt; ctx->decrypt_fn = _gcry_aes_aesni_decrypt; ctx->prefetch_enc_fn = NULL; ctx->prefetch_dec_fn = NULL; ctx->prepare_decryption = _gcry_aes_aesni_prepare_decryption; ctx->use_avx = !!(hwfeatures & HWF_INTEL_AVX); ctx->use_avx2 = !!(hwfeatures & HWF_INTEL_AVX2); /* Setup AES-NI bulk encryption routines. */ bulk_ops->cfb_enc = _gcry_aes_aesni_cfb_enc; bulk_ops->cfb_dec = _gcry_aes_aesni_cfb_dec; bulk_ops->cbc_enc = _gcry_aes_aesni_cbc_enc; bulk_ops->cbc_dec = _gcry_aes_aesni_cbc_dec; bulk_ops->ctr_enc = _gcry_aes_aesni_ctr_enc; bulk_ops->ctr32le_enc = _gcry_aes_aesni_ctr32le_enc; bulk_ops->ocb_crypt = _gcry_aes_aesni_ocb_crypt; bulk_ops->ocb_auth = _gcry_aes_aesni_ocb_auth; bulk_ops->xts_crypt = _gcry_aes_aesni_xts_crypt; #ifdef USE_VAES if ((hwfeatures & HWF_INTEL_VAES_VPCLMUL) && (hwfeatures & HWF_INTEL_AVX2)) { /* Setup VAES bulk encryption routines. */ bulk_ops->cfb_dec = _gcry_aes_vaes_cfb_dec; bulk_ops->cbc_dec = _gcry_aes_vaes_cbc_dec; bulk_ops->ctr_enc = _gcry_aes_vaes_ctr_enc; bulk_ops->ctr32le_enc = _gcry_aes_vaes_ctr32le_enc; bulk_ops->ocb_crypt = _gcry_aes_vaes_ocb_crypt; bulk_ops->xts_crypt = _gcry_aes_vaes_xts_crypt; } #endif } #endif #ifdef USE_PADLOCK else if ((hwfeatures & HWF_PADLOCK_AES) && keylen == 128/8) { ctx->encrypt_fn = _gcry_aes_padlock_encrypt; ctx->decrypt_fn = _gcry_aes_padlock_decrypt; ctx->prefetch_enc_fn = NULL; ctx->prefetch_dec_fn = NULL; ctx->prepare_decryption = _gcry_aes_padlock_prepare_decryption; memcpy (ctx->padlockkey, key, keylen); } #endif #ifdef USE_SSSE3 else if (hwfeatures & HWF_INTEL_SSSE3) { hw_setkey = _gcry_aes_ssse3_do_setkey; ctx->encrypt_fn = _gcry_aes_ssse3_encrypt; ctx->decrypt_fn = _gcry_aes_ssse3_decrypt; ctx->prefetch_enc_fn = NULL; ctx->prefetch_dec_fn = NULL; ctx->prepare_decryption = _gcry_aes_ssse3_prepare_decryption; /* Setup SSSE3 bulk encryption routines. 
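Note that this vector permutation path provides CFB, CBC, CTR and OCB helpers only; XTS keeps the generic _gcry_aes_xts_crypt handler installed above.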
*/ bulk_ops->cfb_enc = _gcry_aes_ssse3_cfb_enc; bulk_ops->cfb_dec = _gcry_aes_ssse3_cfb_dec; bulk_ops->cbc_enc = _gcry_aes_ssse3_cbc_enc; bulk_ops->cbc_dec = _gcry_aes_ssse3_cbc_dec; bulk_ops->ctr_enc = _gcry_aes_ssse3_ctr_enc; bulk_ops->ocb_crypt = _gcry_aes_ssse3_ocb_crypt; bulk_ops->ocb_auth = _gcry_aes_ssse3_ocb_auth; } #endif #ifdef USE_ARM_CE else if (hwfeatures & HWF_ARM_AES) { hw_setkey = _gcry_aes_armv8_ce_setkey; ctx->encrypt_fn = _gcry_aes_armv8_ce_encrypt; ctx->decrypt_fn = _gcry_aes_armv8_ce_decrypt; ctx->prefetch_enc_fn = NULL; ctx->prefetch_dec_fn = NULL; ctx->prepare_decryption = _gcry_aes_armv8_ce_prepare_decryption; /* Setup ARM-CE bulk encryption routines. */ bulk_ops->cfb_enc = _gcry_aes_armv8_ce_cfb_enc; bulk_ops->cfb_dec = _gcry_aes_armv8_ce_cfb_dec; bulk_ops->cbc_enc = _gcry_aes_armv8_ce_cbc_enc; bulk_ops->cbc_dec = _gcry_aes_armv8_ce_cbc_dec; bulk_ops->ctr_enc = _gcry_aes_armv8_ce_ctr_enc; bulk_ops->ctr32le_enc = _gcry_aes_armv8_ce_ctr32le_enc; bulk_ops->ocb_crypt = _gcry_aes_armv8_ce_ocb_crypt; bulk_ops->ocb_auth = _gcry_aes_armv8_ce_ocb_auth; bulk_ops->xts_crypt = _gcry_aes_armv8_ce_xts_crypt; } #endif #ifdef USE_PPC_CRYPTO_WITH_PPC9LE else if ((hwfeatures & HWF_PPC_VCRYPTO) && (hwfeatures & HWF_PPC_ARCH_3_00)) { hw_setkey = _gcry_aes_ppc8_setkey; ctx->encrypt_fn = _gcry_aes_ppc9le_encrypt; ctx->decrypt_fn = _gcry_aes_ppc9le_decrypt; ctx->prefetch_enc_fn = NULL; ctx->prefetch_dec_fn = NULL; ctx->prepare_decryption = _gcry_aes_ppc8_prepare_decryption; /* Setup PPC9LE bulk encryption routines. */ bulk_ops->cfb_enc = _gcry_aes_ppc9le_cfb_enc; bulk_ops->cfb_dec = _gcry_aes_ppc9le_cfb_dec; bulk_ops->cbc_enc = _gcry_aes_ppc9le_cbc_enc; bulk_ops->cbc_dec = _gcry_aes_ppc9le_cbc_dec; bulk_ops->ctr_enc = _gcry_aes_ppc9le_ctr_enc; bulk_ops->ocb_crypt = _gcry_aes_ppc9le_ocb_crypt; bulk_ops->ocb_auth = _gcry_aes_ppc9le_ocb_auth; bulk_ops->xts_crypt = _gcry_aes_ppc9le_xts_crypt; if (hwfeatures & HWF_PPC_ARCH_3_10) /* for P10 */ bulk_ops->gcm_crypt = _gcry_aes_p10le_gcm_crypt; +# ifdef ENABLE_FORCE_SOFT_HWFEATURES + /* HWF_PPC_ARCH_3_10 above is used as soft HW-feature indicator for P10. + * Actual implementation works with HWF_PPC_ARCH_3_00 also. */ + if (hwfeatures & HWF_PPC_ARCH_3_00) + bulk_ops->gcm_crypt = _gcry_aes_p10le_gcm_crypt; +# endif } #endif #ifdef USE_PPC_CRYPTO else if (hwfeatures & HWF_PPC_VCRYPTO) { hw_setkey = _gcry_aes_ppc8_setkey; ctx->encrypt_fn = _gcry_aes_ppc8_encrypt; ctx->decrypt_fn = _gcry_aes_ppc8_decrypt; ctx->prefetch_enc_fn = NULL; ctx->prefetch_dec_fn = NULL; ctx->prepare_decryption = _gcry_aes_ppc8_prepare_decryption; /* Setup PPC8 bulk encryption routines. 
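This plain POWER8 vcrypto branch is only reached when HWF_PPC_VCRYPTO is available but the POWER9 (ARCH 3.00) branch above did not match.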
*/ bulk_ops->cfb_enc = _gcry_aes_ppc8_cfb_enc; bulk_ops->cfb_dec = _gcry_aes_ppc8_cfb_dec; bulk_ops->cbc_enc = _gcry_aes_ppc8_cbc_enc; bulk_ops->cbc_dec = _gcry_aes_ppc8_cbc_dec; bulk_ops->ctr_enc = _gcry_aes_ppc8_ctr_enc; bulk_ops->ocb_crypt = _gcry_aes_ppc8_ocb_crypt; bulk_ops->ocb_auth = _gcry_aes_ppc8_ocb_auth; bulk_ops->xts_crypt = _gcry_aes_ppc8_xts_crypt; } #endif #ifdef USE_S390X_CRYPTO else if (_gcry_aes_s390x_setup_acceleration (ctx, keylen, hwfeatures, bulk_ops)) { hw_setkey = _gcry_aes_s390x_setkey; ctx->encrypt_fn = _gcry_aes_s390x_encrypt; ctx->decrypt_fn = _gcry_aes_s390x_decrypt; ctx->prefetch_enc_fn = NULL; ctx->prefetch_dec_fn = NULL; ctx->prepare_decryption = _gcry_aes_s390x_prepare_decryption; } #endif else { ctx->encrypt_fn = do_encrypt; ctx->decrypt_fn = do_decrypt; ctx->prefetch_enc_fn = prefetch_enc; ctx->prefetch_dec_fn = prefetch_dec; ctx->prepare_decryption = prepare_decryption; } /* NB: We don't yet support Padlock hardware key generation. */ if (hw_setkey) { hw_setkey (ctx, key); } else { const byte *sbox = ((const byte *)encT) + 1; union { PROPERLY_ALIGNED_TYPE dummy; byte data[MAXKC][4]; u32 data32[MAXKC]; } tkk[2]; #define k tkk[0].data #define k_u32 tkk[0].data32 #define tk tkk[1].data #define tk_u32 tkk[1].data32 #define W (ctx->keyschenc) #define W_u32 (ctx->keyschenc32) prefetch_enc(); for (i = 0; i < keylen; i++) { k[i >> 2][i & 3] = key[i]; } for (j = KC-1; j >= 0; j--) { tk_u32[j] = k_u32[j]; } r = 0; t = 0; /* Copy values into round key array. */ for (j = 0; (j < KC) && (r < rounds + 1); ) { for (; (j < KC) && (t < 4); j++, t++) { W_u32[r][t] = le_bswap32(tk_u32[j]); } if (t == 4) { r++; t = 0; } } while (r < rounds + 1) { /* While not enough round key material calculated calculate new values. */ tk[0][0] ^= sbox[tk[KC-1][1] * 4]; tk[0][1] ^= sbox[tk[KC-1][2] * 4]; tk[0][2] ^= sbox[tk[KC-1][3] * 4]; tk[0][3] ^= sbox[tk[KC-1][0] * 4]; tk[0][0] ^= rcon[rconpointer++]; if (KC != 8) { for (j = 1; j < KC; j++) { tk_u32[j] ^= tk_u32[j-1]; } } else { for (j = 1; j < KC/2; j++) { tk_u32[j] ^= tk_u32[j-1]; } tk[KC/2][0] ^= sbox[tk[KC/2 - 1][0] * 4]; tk[KC/2][1] ^= sbox[tk[KC/2 - 1][1] * 4]; tk[KC/2][2] ^= sbox[tk[KC/2 - 1][2] * 4]; tk[KC/2][3] ^= sbox[tk[KC/2 - 1][3] * 4]; for (j = KC/2 + 1; j < KC; j++) { tk_u32[j] ^= tk_u32[j-1]; } } /* Copy values into round key array. */ for (j = 0; (j < KC) && (r < rounds + 1); ) { for (; (j < KC) && (t < 4); j++, t++) { W_u32[r][t] = le_bswap32(tk_u32[j]); } if (t == 4) { r++; t = 0; } } } #undef W #undef tk #undef k #undef W_u32 #undef tk_u32 #undef k_u32 wipememory(&tkk, sizeof(tkk)); } return 0; } static gcry_err_code_t rijndael_setkey (void *context, const byte *key, const unsigned keylen, cipher_bulk_ops_t *bulk_ops) { RIJNDAEL_context *ctx = context; return do_setkey (ctx, key, keylen, bulk_ops); } /* Make a decryption key from an encryption key. 
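This builds the equivalent inverse cipher key schedule: the first and last round keys are copied as-is, while each inner round key word is passed through InvMixColumns. That transform is computed per byte as decT[sbox[b]], so the forward S-box cancels the inverse S-box folded into decT and only InvMixColumns remains.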
*/ static void prepare_decryption( RIJNDAEL_context *ctx ) { const byte *sbox = ((const byte *)encT) + 1; int r; prefetch_enc(); prefetch_dec(); ctx->keyschdec32[0][0] = ctx->keyschenc32[0][0]; ctx->keyschdec32[0][1] = ctx->keyschenc32[0][1]; ctx->keyschdec32[0][2] = ctx->keyschenc32[0][2]; ctx->keyschdec32[0][3] = ctx->keyschenc32[0][3]; for (r = 1; r < ctx->rounds; r++) { u32 *wi = ctx->keyschenc32[r]; u32 *wo = ctx->keyschdec32[r]; u32 wt; wt = wi[0]; wo[0] = rol(decT[sbox[(byte)(wt >> 0) * 4]], 8 * 0) ^ rol(decT[sbox[(byte)(wt >> 8) * 4]], 8 * 1) ^ rol(decT[sbox[(byte)(wt >> 16) * 4]], 8 * 2) ^ rol(decT[sbox[(byte)(wt >> 24) * 4]], 8 * 3); wt = wi[1]; wo[1] = rol(decT[sbox[(byte)(wt >> 0) * 4]], 8 * 0) ^ rol(decT[sbox[(byte)(wt >> 8) * 4]], 8 * 1) ^ rol(decT[sbox[(byte)(wt >> 16) * 4]], 8 * 2) ^ rol(decT[sbox[(byte)(wt >> 24) * 4]], 8 * 3); wt = wi[2]; wo[2] = rol(decT[sbox[(byte)(wt >> 0) * 4]], 8 * 0) ^ rol(decT[sbox[(byte)(wt >> 8) * 4]], 8 * 1) ^ rol(decT[sbox[(byte)(wt >> 16) * 4]], 8 * 2) ^ rol(decT[sbox[(byte)(wt >> 24) * 4]], 8 * 3); wt = wi[3]; wo[3] = rol(decT[sbox[(byte)(wt >> 0) * 4]], 8 * 0) ^ rol(decT[sbox[(byte)(wt >> 8) * 4]], 8 * 1) ^ rol(decT[sbox[(byte)(wt >> 16) * 4]], 8 * 2) ^ rol(decT[sbox[(byte)(wt >> 24) * 4]], 8 * 3); } ctx->keyschdec32[r][0] = ctx->keyschenc32[r][0]; ctx->keyschdec32[r][1] = ctx->keyschenc32[r][1]; ctx->keyschdec32[r][2] = ctx->keyschenc32[r][2]; ctx->keyschdec32[r][3] = ctx->keyschenc32[r][3]; } #if !defined(USE_ARM_ASM) && !defined(USE_AMD64_ASM) /* Encrypt one block. A and B may be the same. */ static unsigned int do_encrypt_fn (const RIJNDAEL_context *ctx, unsigned char *b, const unsigned char *a) { #define rk (ctx->keyschenc32) const byte *sbox = ((const byte *)encT) + 1; int rounds = ctx->rounds; int r; u32 sa[4]; u32 sb[4]; sb[0] = buf_get_le32(a + 0); sb[1] = buf_get_le32(a + 4); sb[2] = buf_get_le32(a + 8); sb[3] = buf_get_le32(a + 12); sa[0] = sb[0] ^ rk[0][0]; sa[1] = sb[1] ^ rk[0][1]; sa[2] = sb[2] ^ rk[0][2]; sa[3] = sb[3] ^ rk[0][3]; sb[0] = rol(encT[(byte)(sa[0] >> (0 * 8))], (0 * 8)); sb[3] = rol(encT[(byte)(sa[0] >> (1 * 8))], (1 * 8)); sb[2] = rol(encT[(byte)(sa[0] >> (2 * 8))], (2 * 8)); sb[1] = rol(encT[(byte)(sa[0] >> (3 * 8))], (3 * 8)); sa[0] = rk[1][0] ^ sb[0]; sb[1] ^= rol(encT[(byte)(sa[1] >> (0 * 8))], (0 * 8)); sa[0] ^= rol(encT[(byte)(sa[1] >> (1 * 8))], (1 * 8)); sb[3] ^= rol(encT[(byte)(sa[1] >> (2 * 8))], (2 * 8)); sb[2] ^= rol(encT[(byte)(sa[1] >> (3 * 8))], (3 * 8)); sa[1] = rk[1][1] ^ sb[1]; sb[2] ^= rol(encT[(byte)(sa[2] >> (0 * 8))], (0 * 8)); sa[1] ^= rol(encT[(byte)(sa[2] >> (1 * 8))], (1 * 8)); sa[0] ^= rol(encT[(byte)(sa[2] >> (2 * 8))], (2 * 8)); sb[3] ^= rol(encT[(byte)(sa[2] >> (3 * 8))], (3 * 8)); sa[2] = rk[1][2] ^ sb[2]; sb[3] ^= rol(encT[(byte)(sa[3] >> (0 * 8))], (0 * 8)); sa[2] ^= rol(encT[(byte)(sa[3] >> (1 * 8))], (1 * 8)); sa[1] ^= rol(encT[(byte)(sa[3] >> (2 * 8))], (2 * 8)); sa[0] ^= rol(encT[(byte)(sa[3] >> (3 * 8))], (3 * 8)); sa[3] = rk[1][3] ^ sb[3]; for (r = 2; r < rounds; r++) { sb[0] = rol(encT[(byte)(sa[0] >> (0 * 8))], (0 * 8)); sb[3] = rol(encT[(byte)(sa[0] >> (1 * 8))], (1 * 8)); sb[2] = rol(encT[(byte)(sa[0] >> (2 * 8))], (2 * 8)); sb[1] = rol(encT[(byte)(sa[0] >> (3 * 8))], (3 * 8)); sa[0] = rk[r][0] ^ sb[0]; sb[1] ^= rol(encT[(byte)(sa[1] >> (0 * 8))], (0 * 8)); sa[0] ^= rol(encT[(byte)(sa[1] >> (1 * 8))], (1 * 8)); sb[3] ^= rol(encT[(byte)(sa[1] >> (2 * 8))], (2 * 8)); sb[2] ^= rol(encT[(byte)(sa[1] >> (3 * 8))], (3 * 8)); sa[1] = rk[r][1] ^ sb[1]; sb[2] ^= 
rol(encT[(byte)(sa[2] >> (0 * 8))], (0 * 8)); sa[1] ^= rol(encT[(byte)(sa[2] >> (1 * 8))], (1 * 8)); sa[0] ^= rol(encT[(byte)(sa[2] >> (2 * 8))], (2 * 8)); sb[3] ^= rol(encT[(byte)(sa[2] >> (3 * 8))], (3 * 8)); sa[2] = rk[r][2] ^ sb[2]; sb[3] ^= rol(encT[(byte)(sa[3] >> (0 * 8))], (0 * 8)); sa[2] ^= rol(encT[(byte)(sa[3] >> (1 * 8))], (1 * 8)); sa[1] ^= rol(encT[(byte)(sa[3] >> (2 * 8))], (2 * 8)); sa[0] ^= rol(encT[(byte)(sa[3] >> (3 * 8))], (3 * 8)); sa[3] = rk[r][3] ^ sb[3]; r++; sb[0] = rol(encT[(byte)(sa[0] >> (0 * 8))], (0 * 8)); sb[3] = rol(encT[(byte)(sa[0] >> (1 * 8))], (1 * 8)); sb[2] = rol(encT[(byte)(sa[0] >> (2 * 8))], (2 * 8)); sb[1] = rol(encT[(byte)(sa[0] >> (3 * 8))], (3 * 8)); sa[0] = rk[r][0] ^ sb[0]; sb[1] ^= rol(encT[(byte)(sa[1] >> (0 * 8))], (0 * 8)); sa[0] ^= rol(encT[(byte)(sa[1] >> (1 * 8))], (1 * 8)); sb[3] ^= rol(encT[(byte)(sa[1] >> (2 * 8))], (2 * 8)); sb[2] ^= rol(encT[(byte)(sa[1] >> (3 * 8))], (3 * 8)); sa[1] = rk[r][1] ^ sb[1]; sb[2] ^= rol(encT[(byte)(sa[2] >> (0 * 8))], (0 * 8)); sa[1] ^= rol(encT[(byte)(sa[2] >> (1 * 8))], (1 * 8)); sa[0] ^= rol(encT[(byte)(sa[2] >> (2 * 8))], (2 * 8)); sb[3] ^= rol(encT[(byte)(sa[2] >> (3 * 8))], (3 * 8)); sa[2] = rk[r][2] ^ sb[2]; sb[3] ^= rol(encT[(byte)(sa[3] >> (0 * 8))], (0 * 8)); sa[2] ^= rol(encT[(byte)(sa[3] >> (1 * 8))], (1 * 8)); sa[1] ^= rol(encT[(byte)(sa[3] >> (2 * 8))], (2 * 8)); sa[0] ^= rol(encT[(byte)(sa[3] >> (3 * 8))], (3 * 8)); sa[3] = rk[r][3] ^ sb[3]; } /* Last round is special. */ sb[0] = ((u32)sbox[(byte)(sa[0] >> (0 * 8)) * 4]) << (0 * 8); sb[3] = ((u32)sbox[(byte)(sa[0] >> (1 * 8)) * 4]) << (1 * 8); sb[2] = ((u32)sbox[(byte)(sa[0] >> (2 * 8)) * 4]) << (2 * 8); sb[1] = ((u32)sbox[(byte)(sa[0] >> (3 * 8)) * 4]) << (3 * 8); sa[0] = rk[r][0] ^ sb[0]; sb[1] ^= ((u32)sbox[(byte)(sa[1] >> (0 * 8)) * 4]) << (0 * 8); sa[0] ^= ((u32)sbox[(byte)(sa[1] >> (1 * 8)) * 4]) << (1 * 8); sb[3] ^= ((u32)sbox[(byte)(sa[1] >> (2 * 8)) * 4]) << (2 * 8); sb[2] ^= ((u32)sbox[(byte)(sa[1] >> (3 * 8)) * 4]) << (3 * 8); sa[1] = rk[r][1] ^ sb[1]; sb[2] ^= ((u32)sbox[(byte)(sa[2] >> (0 * 8)) * 4]) << (0 * 8); sa[1] ^= ((u32)sbox[(byte)(sa[2] >> (1 * 8)) * 4]) << (1 * 8); sa[0] ^= ((u32)sbox[(byte)(sa[2] >> (2 * 8)) * 4]) << (2 * 8); sb[3] ^= ((u32)sbox[(byte)(sa[2] >> (3 * 8)) * 4]) << (3 * 8); sa[2] = rk[r][2] ^ sb[2]; sb[3] ^= ((u32)sbox[(byte)(sa[3] >> (0 * 8)) * 4]) << (0 * 8); sa[2] ^= ((u32)sbox[(byte)(sa[3] >> (1 * 8)) * 4]) << (1 * 8); sa[1] ^= ((u32)sbox[(byte)(sa[3] >> (2 * 8)) * 4]) << (2 * 8); sa[0] ^= ((u32)sbox[(byte)(sa[3] >> (3 * 8)) * 4]) << (3 * 8); sa[3] = rk[r][3] ^ sb[3]; buf_put_le32(b + 0, sa[0]); buf_put_le32(b + 4, sa[1]); buf_put_le32(b + 8, sa[2]); buf_put_le32(b + 12, sa[3]); #undef rk return (56 + 2*sizeof(int)); } #endif /*!USE_ARM_ASM && !USE_AMD64_ASM*/ static unsigned int do_encrypt (const RIJNDAEL_context *ctx, unsigned char *bx, const unsigned char *ax) { #ifdef USE_AMD64_ASM return _gcry_aes_amd64_encrypt_block(ctx->keyschenc, bx, ax, ctx->rounds, enc_tables.T); #elif defined(USE_ARM_ASM) return _gcry_aes_arm_encrypt_block(ctx->keyschenc, bx, ax, ctx->rounds, enc_tables.T); #else return do_encrypt_fn (ctx, bx, ax); #endif /* !USE_ARM_ASM && !USE_AMD64_ASM*/ } static unsigned int rijndael_encrypt (void *context, byte *b, const byte *a) { RIJNDAEL_context *ctx = context; if (ctx->prefetch_enc_fn) ctx->prefetch_enc_fn(); return ctx->encrypt_fn (ctx, b, a); } /* Bulk encryption of complete blocks in CFB mode. Caller needs to make sure that IV is aligned on an unsigned long boundary. 
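Per block this computes C_i = E_K(C_{i-1}) ^ P_i, with the IV buffer holding C_{i-1}; the resulting ciphertext is written both to the output and back into the IV for the next block.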
This function is only intended for the bulk encryption feature of cipher.c. */ static void _gcry_aes_cfb_enc (void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks) { RIJNDAEL_context *ctx = context; unsigned char *outbuf = outbuf_arg; const unsigned char *inbuf = inbuf_arg; unsigned int burn_depth = 0; rijndael_cryptfn_t encrypt_fn = ctx->encrypt_fn; if (ctx->prefetch_enc_fn) ctx->prefetch_enc_fn(); for ( ;nblocks; nblocks-- ) { /* Encrypt the IV. */ burn_depth = encrypt_fn (ctx, iv, iv); /* XOR the input with the IV and store input into IV. */ cipher_block_xor_2dst(outbuf, iv, inbuf, BLOCKSIZE); outbuf += BLOCKSIZE; inbuf += BLOCKSIZE; } if (burn_depth) _gcry_burn_stack (burn_depth + 4 * sizeof(void *)); } /* Bulk encryption of complete blocks in CBC mode. Caller needs to make sure that IV is aligned on an unsigned long boundary. This function is only intended for the bulk encryption feature of cipher.c. */ static void _gcry_aes_cbc_enc (void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int cbc_mac) { RIJNDAEL_context *ctx = context; unsigned char *outbuf = outbuf_arg; const unsigned char *inbuf = inbuf_arg; unsigned char *last_iv; unsigned int burn_depth = 0; rijndael_cryptfn_t encrypt_fn = ctx->encrypt_fn; if (ctx->prefetch_enc_fn) ctx->prefetch_enc_fn(); last_iv = iv; for ( ;nblocks; nblocks-- ) { cipher_block_xor(outbuf, inbuf, last_iv, BLOCKSIZE); burn_depth = encrypt_fn (ctx, outbuf, outbuf); last_iv = outbuf; inbuf += BLOCKSIZE; if (!cbc_mac) outbuf += BLOCKSIZE; } if (last_iv != iv) cipher_block_cpy (iv, last_iv, BLOCKSIZE); if (burn_depth) _gcry_burn_stack (burn_depth + 4 * sizeof(void *)); } /* Bulk encryption of complete blocks in CTR mode. Caller needs to make sure that CTR is aligned on a 16 byte boundary if AESNI; the minimum alignment is for an u32. This function is only intended for the bulk encryption feature of cipher.c. CTR is expected to be of size BLOCKSIZE. */ static void _gcry_aes_ctr_enc (void *context, unsigned char *ctr, void *outbuf_arg, const void *inbuf_arg, size_t nblocks) { RIJNDAEL_context *ctx = context; unsigned char *outbuf = outbuf_arg; const unsigned char *inbuf = inbuf_arg; unsigned int burn_depth = 0; union { unsigned char x1[16] ATTR_ALIGNED_16; u32 x32[4]; } tmp; rijndael_cryptfn_t encrypt_fn = ctx->encrypt_fn; if (ctx->prefetch_enc_fn) ctx->prefetch_enc_fn(); for ( ;nblocks; nblocks-- ) { /* Encrypt the counter. */ burn_depth = encrypt_fn (ctx, tmp.x1, ctr); /* XOR the input with the encrypted counter and store in output. */ cipher_block_xor(outbuf, tmp.x1, inbuf, BLOCKSIZE); outbuf += BLOCKSIZE; inbuf += BLOCKSIZE; /* Increment the counter. */ cipher_block_add(ctr, 1, BLOCKSIZE); } wipememory(&tmp, sizeof(tmp)); if (burn_depth) _gcry_burn_stack (burn_depth + 4 * sizeof(void *)); } #if !defined(USE_ARM_ASM) && !defined(USE_AMD64_ASM) /* Decrypt one block. A and B may be the same. 
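The decryption key schedule is walked from rk[rounds] down to rk[0], unrolling two rounds per loop iteration and using the decT table, which fuses InvSubBytes and InvMixColumns.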
*/ static unsigned int do_decrypt_fn (const RIJNDAEL_context *ctx, unsigned char *b, const unsigned char *a) { #define rk (ctx->keyschdec32) int rounds = ctx->rounds; int r; u32 sa[4]; u32 sb[4]; sb[0] = buf_get_le32(a + 0); sb[1] = buf_get_le32(a + 4); sb[2] = buf_get_le32(a + 8); sb[3] = buf_get_le32(a + 12); sa[0] = sb[0] ^ rk[rounds][0]; sa[1] = sb[1] ^ rk[rounds][1]; sa[2] = sb[2] ^ rk[rounds][2]; sa[3] = sb[3] ^ rk[rounds][3]; for (r = rounds - 1; r > 1; r--) { sb[0] = rol(decT[(byte)(sa[0] >> (0 * 8))], (0 * 8)); sb[1] = rol(decT[(byte)(sa[0] >> (1 * 8))], (1 * 8)); sb[2] = rol(decT[(byte)(sa[0] >> (2 * 8))], (2 * 8)); sb[3] = rol(decT[(byte)(sa[0] >> (3 * 8))], (3 * 8)); sa[0] = rk[r][0] ^ sb[0]; sb[1] ^= rol(decT[(byte)(sa[1] >> (0 * 8))], (0 * 8)); sb[2] ^= rol(decT[(byte)(sa[1] >> (1 * 8))], (1 * 8)); sb[3] ^= rol(decT[(byte)(sa[1] >> (2 * 8))], (2 * 8)); sa[0] ^= rol(decT[(byte)(sa[1] >> (3 * 8))], (3 * 8)); sa[1] = rk[r][1] ^ sb[1]; sb[2] ^= rol(decT[(byte)(sa[2] >> (0 * 8))], (0 * 8)); sb[3] ^= rol(decT[(byte)(sa[2] >> (1 * 8))], (1 * 8)); sa[0] ^= rol(decT[(byte)(sa[2] >> (2 * 8))], (2 * 8)); sa[1] ^= rol(decT[(byte)(sa[2] >> (3 * 8))], (3 * 8)); sa[2] = rk[r][2] ^ sb[2]; sb[3] ^= rol(decT[(byte)(sa[3] >> (0 * 8))], (0 * 8)); sa[0] ^= rol(decT[(byte)(sa[3] >> (1 * 8))], (1 * 8)); sa[1] ^= rol(decT[(byte)(sa[3] >> (2 * 8))], (2 * 8)); sa[2] ^= rol(decT[(byte)(sa[3] >> (3 * 8))], (3 * 8)); sa[3] = rk[r][3] ^ sb[3]; r--; sb[0] = rol(decT[(byte)(sa[0] >> (0 * 8))], (0 * 8)); sb[1] = rol(decT[(byte)(sa[0] >> (1 * 8))], (1 * 8)); sb[2] = rol(decT[(byte)(sa[0] >> (2 * 8))], (2 * 8)); sb[3] = rol(decT[(byte)(sa[0] >> (3 * 8))], (3 * 8)); sa[0] = rk[r][0] ^ sb[0]; sb[1] ^= rol(decT[(byte)(sa[1] >> (0 * 8))], (0 * 8)); sb[2] ^= rol(decT[(byte)(sa[1] >> (1 * 8))], (1 * 8)); sb[3] ^= rol(decT[(byte)(sa[1] >> (2 * 8))], (2 * 8)); sa[0] ^= rol(decT[(byte)(sa[1] >> (3 * 8))], (3 * 8)); sa[1] = rk[r][1] ^ sb[1]; sb[2] ^= rol(decT[(byte)(sa[2] >> (0 * 8))], (0 * 8)); sb[3] ^= rol(decT[(byte)(sa[2] >> (1 * 8))], (1 * 8)); sa[0] ^= rol(decT[(byte)(sa[2] >> (2 * 8))], (2 * 8)); sa[1] ^= rol(decT[(byte)(sa[2] >> (3 * 8))], (3 * 8)); sa[2] = rk[r][2] ^ sb[2]; sb[3] ^= rol(decT[(byte)(sa[3] >> (0 * 8))], (0 * 8)); sa[0] ^= rol(decT[(byte)(sa[3] >> (1 * 8))], (1 * 8)); sa[1] ^= rol(decT[(byte)(sa[3] >> (2 * 8))], (2 * 8)); sa[2] ^= rol(decT[(byte)(sa[3] >> (3 * 8))], (3 * 8)); sa[3] = rk[r][3] ^ sb[3]; } sb[0] = rol(decT[(byte)(sa[0] >> (0 * 8))], (0 * 8)); sb[1] = rol(decT[(byte)(sa[0] >> (1 * 8))], (1 * 8)); sb[2] = rol(decT[(byte)(sa[0] >> (2 * 8))], (2 * 8)); sb[3] = rol(decT[(byte)(sa[0] >> (3 * 8))], (3 * 8)); sa[0] = rk[1][0] ^ sb[0]; sb[1] ^= rol(decT[(byte)(sa[1] >> (0 * 8))], (0 * 8)); sb[2] ^= rol(decT[(byte)(sa[1] >> (1 * 8))], (1 * 8)); sb[3] ^= rol(decT[(byte)(sa[1] >> (2 * 8))], (2 * 8)); sa[0] ^= rol(decT[(byte)(sa[1] >> (3 * 8))], (3 * 8)); sa[1] = rk[1][1] ^ sb[1]; sb[2] ^= rol(decT[(byte)(sa[2] >> (0 * 8))], (0 * 8)); sb[3] ^= rol(decT[(byte)(sa[2] >> (1 * 8))], (1 * 8)); sa[0] ^= rol(decT[(byte)(sa[2] >> (2 * 8))], (2 * 8)); sa[1] ^= rol(decT[(byte)(sa[2] >> (3 * 8))], (3 * 8)); sa[2] = rk[1][2] ^ sb[2]; sb[3] ^= rol(decT[(byte)(sa[3] >> (0 * 8))], (0 * 8)); sa[0] ^= rol(decT[(byte)(sa[3] >> (1 * 8))], (1 * 8)); sa[1] ^= rol(decT[(byte)(sa[3] >> (2 * 8))], (2 * 8)); sa[2] ^= rol(decT[(byte)(sa[3] >> (3 * 8))], (3 * 8)); sa[3] = rk[1][3] ^ sb[3]; /* Last round is special. 
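The final round omits InvMixColumns, so it only needs inv_sbox byte substitutions and the XOR with round key rk[0].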
*/ sb[0] = (u32)inv_sbox[(byte)(sa[0] >> (0 * 8))] << (0 * 8); sb[1] = (u32)inv_sbox[(byte)(sa[0] >> (1 * 8))] << (1 * 8); sb[2] = (u32)inv_sbox[(byte)(sa[0] >> (2 * 8))] << (2 * 8); sb[3] = (u32)inv_sbox[(byte)(sa[0] >> (3 * 8))] << (3 * 8); sa[0] = sb[0] ^ rk[0][0]; sb[1] ^= (u32)inv_sbox[(byte)(sa[1] >> (0 * 8))] << (0 * 8); sb[2] ^= (u32)inv_sbox[(byte)(sa[1] >> (1 * 8))] << (1 * 8); sb[3] ^= (u32)inv_sbox[(byte)(sa[1] >> (2 * 8))] << (2 * 8); sa[0] ^= (u32)inv_sbox[(byte)(sa[1] >> (3 * 8))] << (3 * 8); sa[1] = sb[1] ^ rk[0][1]; sb[2] ^= (u32)inv_sbox[(byte)(sa[2] >> (0 * 8))] << (0 * 8); sb[3] ^= (u32)inv_sbox[(byte)(sa[2] >> (1 * 8))] << (1 * 8); sa[0] ^= (u32)inv_sbox[(byte)(sa[2] >> (2 * 8))] << (2 * 8); sa[1] ^= (u32)inv_sbox[(byte)(sa[2] >> (3 * 8))] << (3 * 8); sa[2] = sb[2] ^ rk[0][2]; sb[3] ^= (u32)inv_sbox[(byte)(sa[3] >> (0 * 8))] << (0 * 8); sa[0] ^= (u32)inv_sbox[(byte)(sa[3] >> (1 * 8))] << (1 * 8); sa[1] ^= (u32)inv_sbox[(byte)(sa[3] >> (2 * 8))] << (2 * 8); sa[2] ^= (u32)inv_sbox[(byte)(sa[3] >> (3 * 8))] << (3 * 8); sa[3] = sb[3] ^ rk[0][3]; buf_put_le32(b + 0, sa[0]); buf_put_le32(b + 4, sa[1]); buf_put_le32(b + 8, sa[2]); buf_put_le32(b + 12, sa[3]); #undef rk return (56+2*sizeof(int)); } #endif /*!USE_ARM_ASM && !USE_AMD64_ASM*/ /* Decrypt one block. AX and BX may be the same. */ static unsigned int do_decrypt (const RIJNDAEL_context *ctx, unsigned char *bx, const unsigned char *ax) { #ifdef USE_AMD64_ASM return _gcry_aes_amd64_decrypt_block(ctx->keyschdec, bx, ax, ctx->rounds, dec_tables.T); #elif defined(USE_ARM_ASM) return _gcry_aes_arm_decrypt_block(ctx->keyschdec, bx, ax, ctx->rounds, dec_tables.T); #else return do_decrypt_fn (ctx, bx, ax); #endif /*!USE_ARM_ASM && !USE_AMD64_ASM*/ } static inline void check_decryption_preparation (RIJNDAEL_context *ctx) { if ( !ctx->decryption_prepared ) { ctx->prepare_decryption ( ctx ); ctx->decryption_prepared = 1; } } static unsigned int rijndael_decrypt (void *context, byte *b, const byte *a) { RIJNDAEL_context *ctx = context; check_decryption_preparation (ctx); if (ctx->prefetch_dec_fn) ctx->prefetch_dec_fn(); return ctx->decrypt_fn (ctx, b, a); } /* Bulk decryption of complete blocks in CFB mode. Caller needs to make sure that IV is aligned on an unsigned long boundary. This function is only intended for the bulk encryption feature of cipher.c. */ static void _gcry_aes_cfb_dec (void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks) { RIJNDAEL_context *ctx = context; unsigned char *outbuf = outbuf_arg; const unsigned char *inbuf = inbuf_arg; unsigned int burn_depth = 0; rijndael_cryptfn_t encrypt_fn = ctx->encrypt_fn; if (ctx->prefetch_enc_fn) ctx->prefetch_enc_fn(); for ( ;nblocks; nblocks-- ) { burn_depth = encrypt_fn (ctx, iv, iv); cipher_block_xor_n_copy(outbuf, iv, inbuf, BLOCKSIZE); outbuf += BLOCKSIZE; inbuf += BLOCKSIZE; } if (burn_depth) _gcry_burn_stack (burn_depth + 4 * sizeof(void *)); } /* Bulk decryption of complete blocks in CBC mode. Caller needs to make sure that IV is aligned on an unsigned long boundary. This function is only intended for the bulk encryption feature of cipher.c. 
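Per block this computes P_i = D_K(C_i) ^ C_{i-1}: the raw decryption result goes through SAVEBUF because INBUF and OUTBUF may alias, and the current ciphertext block is copied into IV to serve as C_{i-1} for the next block.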
*/ static void _gcry_aes_cbc_dec (void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks) { RIJNDAEL_context *ctx = context; unsigned char *outbuf = outbuf_arg; const unsigned char *inbuf = inbuf_arg; unsigned int burn_depth = 0; unsigned char savebuf[BLOCKSIZE] ATTR_ALIGNED_16; rijndael_cryptfn_t decrypt_fn = ctx->decrypt_fn; check_decryption_preparation (ctx); if (ctx->prefetch_dec_fn) ctx->prefetch_dec_fn(); for ( ;nblocks; nblocks-- ) { /* INBUF is needed later and it may be identical to OUTBUF, so store the intermediate result to SAVEBUF. */ burn_depth = decrypt_fn (ctx, savebuf, inbuf); cipher_block_xor_n_copy_2(outbuf, savebuf, iv, inbuf, BLOCKSIZE); inbuf += BLOCKSIZE; outbuf += BLOCKSIZE; } wipememory(savebuf, sizeof(savebuf)); if (burn_depth) _gcry_burn_stack (burn_depth + 4 * sizeof(void *)); } /* Bulk encryption/decryption of complete blocks in OCB mode. */ static size_t _gcry_aes_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int encrypt) { RIJNDAEL_context *ctx = (void *)&c->context.c; unsigned char *outbuf = outbuf_arg; const unsigned char *inbuf = inbuf_arg; unsigned int burn_depth = 0; if (encrypt) { union { unsigned char x1[16] ATTR_ALIGNED_16; u32 x32[4]; } l_tmp; rijndael_cryptfn_t encrypt_fn = ctx->encrypt_fn; if (ctx->prefetch_enc_fn) ctx->prefetch_enc_fn(); for ( ;nblocks; nblocks-- ) { u64 i = ++c->u_mode.ocb.data_nblocks; const unsigned char *l = ocb_get_l(c, i); /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ cipher_block_xor_1 (c->u_iv.iv, l, BLOCKSIZE); cipher_block_cpy (l_tmp.x1, inbuf, BLOCKSIZE); /* Checksum_i = Checksum_{i-1} xor P_i */ cipher_block_xor_1 (c->u_ctr.ctr, l_tmp.x1, BLOCKSIZE); /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ cipher_block_xor_1 (l_tmp.x1, c->u_iv.iv, BLOCKSIZE); burn_depth = encrypt_fn (ctx, l_tmp.x1, l_tmp.x1); cipher_block_xor_1 (l_tmp.x1, c->u_iv.iv, BLOCKSIZE); cipher_block_cpy (outbuf, l_tmp.x1, BLOCKSIZE); inbuf += BLOCKSIZE; outbuf += BLOCKSIZE; } } else { union { unsigned char x1[16] ATTR_ALIGNED_16; u32 x32[4]; } l_tmp; rijndael_cryptfn_t decrypt_fn = ctx->decrypt_fn; check_decryption_preparation (ctx); if (ctx->prefetch_dec_fn) ctx->prefetch_dec_fn(); for ( ;nblocks; nblocks-- ) { u64 i = ++c->u_mode.ocb.data_nblocks; const unsigned char *l = ocb_get_l(c, i); /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ cipher_block_xor_1 (c->u_iv.iv, l, BLOCKSIZE); cipher_block_cpy (l_tmp.x1, inbuf, BLOCKSIZE); /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ cipher_block_xor_1 (l_tmp.x1, c->u_iv.iv, BLOCKSIZE); burn_depth = decrypt_fn (ctx, l_tmp.x1, l_tmp.x1); cipher_block_xor_1 (l_tmp.x1, c->u_iv.iv, BLOCKSIZE); /* Checksum_i = Checksum_{i-1} xor P_i */ cipher_block_xor_1 (c->u_ctr.ctr, l_tmp.x1, BLOCKSIZE); cipher_block_cpy (outbuf, l_tmp.x1, BLOCKSIZE); inbuf += BLOCKSIZE; outbuf += BLOCKSIZE; } } if (burn_depth) _gcry_burn_stack (burn_depth + 4 * sizeof(void *)); return 0; } /* Bulk authentication of complete blocks in OCB mode. 
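The offset and running sum live in the cipher handle (u_mode.ocb.aad_offset and aad_sum), so authentication can be resumed across calls; per block Sum_i = Sum_{i-1} ^ E_K(A_i ^ Offset_i).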
*/ static size_t _gcry_aes_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks) { RIJNDAEL_context *ctx = (void *)&c->context.c; const unsigned char *abuf = abuf_arg; unsigned int burn_depth = 0; union { unsigned char x1[16] ATTR_ALIGNED_16; u32 x32[4]; } l_tmp; rijndael_cryptfn_t encrypt_fn = ctx->encrypt_fn; if (ctx->prefetch_enc_fn) ctx->prefetch_enc_fn(); for ( ;nblocks; nblocks-- ) { u64 i = ++c->u_mode.ocb.aad_nblocks; const unsigned char *l = ocb_get_l(c, i); /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ cipher_block_xor_1 (c->u_mode.ocb.aad_offset, l, BLOCKSIZE); /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ cipher_block_xor (l_tmp.x1, c->u_mode.ocb.aad_offset, abuf, BLOCKSIZE); burn_depth = encrypt_fn (ctx, l_tmp.x1, l_tmp.x1); cipher_block_xor_1 (c->u_mode.ocb.aad_sum, l_tmp.x1, BLOCKSIZE); abuf += BLOCKSIZE; } wipememory(&l_tmp, sizeof(l_tmp)); if (burn_depth) _gcry_burn_stack (burn_depth + 4 * sizeof(void *)); return 0; } /* Bulk encryption/decryption of complete blocks in XTS mode. */ static void _gcry_aes_xts_crypt (void *context, unsigned char *tweak, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int encrypt) { RIJNDAEL_context *ctx = context; unsigned char *outbuf = outbuf_arg; const unsigned char *inbuf = inbuf_arg; unsigned int burn_depth = 0; rijndael_cryptfn_t crypt_fn; u64 tweak_lo, tweak_hi, tweak_next_lo, tweak_next_hi, tmp_lo, tmp_hi, carry; if (encrypt) { if (ctx->prefetch_enc_fn) ctx->prefetch_enc_fn(); crypt_fn = ctx->encrypt_fn; } else { check_decryption_preparation (ctx); if (ctx->prefetch_dec_fn) ctx->prefetch_dec_fn(); crypt_fn = ctx->decrypt_fn; } tweak_next_lo = buf_get_le64 (tweak + 0); tweak_next_hi = buf_get_le64 (tweak + 8); while (nblocks) { tweak_lo = tweak_next_lo; tweak_hi = tweak_next_hi; /* Xor-Encrypt/Decrypt-Xor block. */ tmp_lo = buf_get_le64 (inbuf + 0) ^ tweak_lo; tmp_hi = buf_get_le64 (inbuf + 8) ^ tweak_hi; buf_put_le64 (outbuf + 0, tmp_lo); buf_put_le64 (outbuf + 8, tmp_hi); /* Generate next tweak. */ carry = -(tweak_next_hi >> 63) & 0x87; tweak_next_hi = (tweak_next_hi << 1) + (tweak_next_lo >> 63); tweak_next_lo = (tweak_next_lo << 1) ^ carry; burn_depth = crypt_fn (ctx, outbuf, outbuf); buf_put_le64 (outbuf + 0, buf_get_le64 (outbuf + 0) ^ tweak_lo); buf_put_le64 (outbuf + 8, buf_get_le64 (outbuf + 8) ^ tweak_hi); outbuf += GCRY_XTS_BLOCK_LEN; inbuf += GCRY_XTS_BLOCK_LEN; nblocks--; } buf_put_le64 (tweak + 0, tweak_next_lo); buf_put_le64 (tweak + 8, tweak_next_hi); if (burn_depth) _gcry_burn_stack (burn_depth + 5 * sizeof(void *)); } /* Run the self-tests for AES 128. Returns NULL on success. */ static const char* selftest_basic_128 (void) { RIJNDAEL_context *ctx; unsigned char ctxmem[sizeof(*ctx) + 16]; unsigned char scratch[16]; cipher_bulk_ops_t bulk_ops; /* The test vectors are from the AES supplied ones; more or less randomly taken from ecb_tbl.txt (I=42,81,14) */ #if 1 static const unsigned char plaintext_128[16] = { 0x01,0x4B,0xAF,0x22,0x78,0xA6,0x9D,0x33, 0x1D,0x51,0x80,0x10,0x36,0x43,0xE9,0x9A }; static const unsigned char key_128[16] = { 0xE8,0xE9,0xEA,0xEB,0xED,0xEE,0xEF,0xF0, 0xF2,0xF3,0xF4,0xF5,0xF7,0xF8,0xF9,0xFA }; static const unsigned char ciphertext_128[16] = { 0x67,0x43,0xC3,0xD1,0x51,0x9A,0xB4,0xF2, 0xCD,0x9A,0x78,0xAB,0x09,0xA5,0x11,0xBD }; #else /* Test vectors from fips-197, appendix C. 
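These alternative vectors are the AES-128 example from appendix C.1 and are only compiled in when the #if above is flipped for debugging; the #warning below marks such a build.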
*/ # warning debug test vectors in use static const unsigned char plaintext_128[16] = { 0x00,0x11,0x22,0x33,0x44,0x55,0x66,0x77, 0x88,0x99,0xaa,0xbb,0xcc,0xdd,0xee,0xff }; static const unsigned char key_128[16] = { 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, 0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f /* 0x2b, 0x7e, 0x15, 0x16, 0x28, 0xae, 0xd2, 0xa6, */ /* 0xab, 0xf7, 0x15, 0x88, 0x09, 0xcf, 0x4f, 0x3c */ }; static const unsigned char ciphertext_128[16] = { 0x69,0xc4,0xe0,0xd8,0x6a,0x7b,0x04,0x30, 0xd8,0xcd,0xb7,0x80,0x70,0xb4,0xc5,0x5a }; #endif ctx = (void *)(ctxmem + ((16 - ((uintptr_t)ctxmem & 15)) & 15)); rijndael_setkey (ctx, key_128, sizeof (key_128), &bulk_ops); rijndael_encrypt (ctx, scratch, plaintext_128); if (memcmp (scratch, ciphertext_128, sizeof (ciphertext_128))) { return "AES-128 test encryption failed."; } rijndael_decrypt (ctx, scratch, scratch); if (memcmp (scratch, plaintext_128, sizeof (plaintext_128))) return "AES-128 test decryption failed."; return NULL; } /* Run the self-tests for AES 192. Returns NULL on success. */ static const char* selftest_basic_192 (void) { RIJNDAEL_context *ctx; unsigned char ctxmem[sizeof(*ctx) + 16]; unsigned char scratch[16]; cipher_bulk_ops_t bulk_ops; static unsigned char plaintext_192[16] = { 0x76,0x77,0x74,0x75,0xF1,0xF2,0xF3,0xF4, 0xF8,0xF9,0xE6,0xE7,0x77,0x70,0x71,0x72 }; static unsigned char key_192[24] = { 0x04,0x05,0x06,0x07,0x09,0x0A,0x0B,0x0C, 0x0E,0x0F,0x10,0x11,0x13,0x14,0x15,0x16, 0x18,0x19,0x1A,0x1B,0x1D,0x1E,0x1F,0x20 }; static const unsigned char ciphertext_192[16] = { 0x5D,0x1E,0xF2,0x0D,0xCE,0xD6,0xBC,0xBC, 0x12,0x13,0x1A,0xC7,0xC5,0x47,0x88,0xAA }; ctx = (void *)(ctxmem + ((16 - ((uintptr_t)ctxmem & 15)) & 15)); rijndael_setkey (ctx, key_192, sizeof(key_192), &bulk_ops); rijndael_encrypt (ctx, scratch, plaintext_192); if (memcmp (scratch, ciphertext_192, sizeof (ciphertext_192))) { return "AES-192 test encryption failed."; } rijndael_decrypt (ctx, scratch, scratch); if (memcmp (scratch, plaintext_192, sizeof (plaintext_192))) return "AES-192 test decryption failed."; return NULL; } /* Run the self-tests for AES 256. Returns NULL on success. */ static const char* selftest_basic_256 (void) { RIJNDAEL_context *ctx; unsigned char ctxmem[sizeof(*ctx) + 16]; unsigned char scratch[16]; cipher_bulk_ops_t bulk_ops; static unsigned char plaintext_256[16] = { 0x06,0x9A,0x00,0x7F,0xC7,0x6A,0x45,0x9F, 0x98,0xBA,0xF9,0x17,0xFE,0xDF,0x95,0x21 }; static unsigned char key_256[32] = { 0x08,0x09,0x0A,0x0B,0x0D,0x0E,0x0F,0x10, 0x12,0x13,0x14,0x15,0x17,0x18,0x19,0x1A, 0x1C,0x1D,0x1E,0x1F,0x21,0x22,0x23,0x24, 0x26,0x27,0x28,0x29,0x2B,0x2C,0x2D,0x2E }; static const unsigned char ciphertext_256[16] = { 0x08,0x0E,0x95,0x17,0xEB,0x16,0x77,0x71, 0x9A,0xCF,0x72,0x80,0x86,0x04,0x0A,0xE3 }; ctx = (void *)(ctxmem + ((16 - ((uintptr_t)ctxmem & 15)) & 15)); rijndael_setkey (ctx, key_256, sizeof(key_256), &bulk_ops); rijndael_encrypt (ctx, scratch, plaintext_256); if (memcmp (scratch, ciphertext_256, sizeof (ciphertext_256))) { return "AES-256 test encryption failed."; } rijndael_decrypt (ctx, scratch, scratch); if (memcmp (scratch, plaintext_256, sizeof (plaintext_256))) return "AES-256 test decryption failed."; return NULL; } /* Run all the self-tests and return NULL on success. This function is used for the on-the-fly self-tests. */ static const char * selftest (void) { const char *r; if ( (r = selftest_basic_128 ()) || (r = selftest_basic_192 ()) || (r = selftest_basic_256 ()) ) return r; return r; } /* SP800-38a.pdf for AES-128. 
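Known-answer test for the requested mode using the SP 800-38A vectors: each of the four 16-byte blocks is encrypted and decrypted and compared against the expected output of the CFB128-AES128 (F.3.13) or OFB-AES128 example.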
*/ static const char * selftest_fips_128_38a (int requested_mode) { static const struct tv { int mode; const unsigned char key[16]; const unsigned char iv[16]; struct { const unsigned char input[16]; const unsigned char output[16]; } data[4]; } tv[2] = { { GCRY_CIPHER_MODE_CFB, /* F.3.13, CFB128-AES128 */ { 0x2b, 0x7e, 0x15, 0x16, 0x28, 0xae, 0xd2, 0xa6, 0xab, 0xf7, 0x15, 0x88, 0x09, 0xcf, 0x4f, 0x3c }, { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f }, { { { 0x6b, 0xc1, 0xbe, 0xe2, 0x2e, 0x40, 0x9f, 0x96, 0xe9, 0x3d, 0x7e, 0x11, 0x73, 0x93, 0x17, 0x2a }, { 0x3b, 0x3f, 0xd9, 0x2e, 0xb7, 0x2d, 0xad, 0x20, 0x33, 0x34, 0x49, 0xf8, 0xe8, 0x3c, 0xfb, 0x4a } }, { { 0xae, 0x2d, 0x8a, 0x57, 0x1e, 0x03, 0xac, 0x9c, 0x9e, 0xb7, 0x6f, 0xac, 0x45, 0xaf, 0x8e, 0x51 }, { 0xc8, 0xa6, 0x45, 0x37, 0xa0, 0xb3, 0xa9, 0x3f, 0xcd, 0xe3, 0xcd, 0xad, 0x9f, 0x1c, 0xe5, 0x8b } }, { { 0x30, 0xc8, 0x1c, 0x46, 0xa3, 0x5c, 0xe4, 0x11, 0xe5, 0xfb, 0xc1, 0x19, 0x1a, 0x0a, 0x52, 0xef }, { 0x26, 0x75, 0x1f, 0x67, 0xa3, 0xcb, 0xb1, 0x40, 0xb1, 0x80, 0x8c, 0xf1, 0x87, 0xa4, 0xf4, 0xdf } }, { { 0xf6, 0x9f, 0x24, 0x45, 0xdf, 0x4f, 0x9b, 0x17, 0xad, 0x2b, 0x41, 0x7b, 0xe6, 0x6c, 0x37, 0x10 }, { 0xc0, 0x4b, 0x05, 0x35, 0x7c, 0x5d, 0x1c, 0x0e, 0xea, 0xc4, 0xc6, 0x6f, 0x9f, 0xf7, 0xf2, 0xe6 } } } }, { GCRY_CIPHER_MODE_OFB, { 0x2b, 0x7e, 0x15, 0x16, 0x28, 0xae, 0xd2, 0xa6, 0xab, 0xf7, 0x15, 0x88, 0x09, 0xcf, 0x4f, 0x3c }, { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f }, { { { 0x6b, 0xc1, 0xbe, 0xe2, 0x2e, 0x40, 0x9f, 0x96, 0xe9, 0x3d, 0x7e, 0x11, 0x73, 0x93, 0x17, 0x2a }, { 0x3b, 0x3f, 0xd9, 0x2e, 0xb7, 0x2d, 0xad, 0x20, 0x33, 0x34, 0x49, 0xf8, 0xe8, 0x3c, 0xfb, 0x4a } }, { { 0xae, 0x2d, 0x8a, 0x57, 0x1e, 0x03, 0xac, 0x9c, 0x9e, 0xb7, 0x6f, 0xac, 0x45, 0xaf, 0x8e, 0x51 }, { 0x77, 0x89, 0x50, 0x8d, 0x16, 0x91, 0x8f, 0x03, 0xf5, 0x3c, 0x52, 0xda, 0xc5, 0x4e, 0xd8, 0x25 } }, { { 0x30, 0xc8, 0x1c, 0x46, 0xa3, 0x5c, 0xe4, 0x11, 0xe5, 0xfb, 0xc1, 0x19, 0x1a, 0x0a, 0x52, 0xef }, { 0x97, 0x40, 0x05, 0x1e, 0x9c, 0x5f, 0xec, 0xf6, 0x43, 0x44, 0xf7, 0xa8, 0x22, 0x60, 0xed, 0xcc } }, { { 0xf6, 0x9f, 0x24, 0x45, 0xdf, 0x4f, 0x9b, 0x17, 0xad, 0x2b, 0x41, 0x7b, 0xe6, 0x6c, 0x37, 0x10 }, { 0x30, 0x4c, 0x65, 0x28, 0xf6, 0x59, 0xc7, 0x78, 0x66, 0xa5, 0x10, 0xd9, 0xc1, 0xd6, 0xae, 0x5e } }, } } }; unsigned char scratch[16]; gpg_error_t err; int tvi, idx; gcry_cipher_hd_t hdenc = NULL; gcry_cipher_hd_t hddec = NULL; #define Fail(a) do { \ _gcry_cipher_close (hdenc); \ _gcry_cipher_close (hddec); \ return a; \ } while (0) gcry_assert (sizeof tv[0].data[0].input == sizeof scratch); gcry_assert (sizeof tv[0].data[0].output == sizeof scratch); for (tvi=0; tvi < DIM (tv); tvi++) if (tv[tvi].mode == requested_mode) break; if (tvi == DIM (tv)) Fail ("no test data for this mode"); err = _gcry_cipher_open (&hdenc, GCRY_CIPHER_AES, tv[tvi].mode, 0); if (err) Fail ("open"); err = _gcry_cipher_open (&hddec, GCRY_CIPHER_AES, tv[tvi].mode, 0); if (err) Fail ("open"); err = _gcry_cipher_setkey (hdenc, tv[tvi].key, sizeof tv[tvi].key); if (!err) err = _gcry_cipher_setkey (hddec, tv[tvi].key, sizeof tv[tvi].key); if (err) Fail ("set key"); err = _gcry_cipher_setiv (hdenc, tv[tvi].iv, sizeof tv[tvi].iv); if (!err) err = _gcry_cipher_setiv (hddec, tv[tvi].iv, sizeof tv[tvi].iv); if (err) Fail ("set IV"); for (idx=0; idx < DIM (tv[tvi].data); idx++) { err = _gcry_cipher_encrypt (hdenc, scratch, sizeof scratch, tv[tvi].data[idx].input, sizeof tv[tvi].data[idx].input); if 
(err) Fail ("encrypt command"); if (memcmp (scratch, tv[tvi].data[idx].output, sizeof scratch)) Fail ("encrypt mismatch"); err = _gcry_cipher_decrypt (hddec, scratch, sizeof scratch, tv[tvi].data[idx].output, sizeof tv[tvi].data[idx].output); if (err) Fail ("decrypt command"); if (memcmp (scratch, tv[tvi].data[idx].input, sizeof scratch)) Fail ("decrypt mismatch"); } #undef Fail _gcry_cipher_close (hdenc); _gcry_cipher_close (hddec); return NULL; } /* Complete selftest for AES-128 with all modes and driver code. */ static gpg_err_code_t selftest_fips_128 (int extended, selftest_report_func_t report) { const char *what; const char *errtxt; what = "low-level"; errtxt = selftest_basic_128 (); if (errtxt) goto failed; if (extended) { what = "cfb"; errtxt = selftest_fips_128_38a (GCRY_CIPHER_MODE_CFB); if (errtxt) goto failed; what = "ofb"; errtxt = selftest_fips_128_38a (GCRY_CIPHER_MODE_OFB); if (errtxt) goto failed; } return 0; /* Succeeded. */ failed: if (report) report ("cipher", GCRY_CIPHER_AES128, what, errtxt); return GPG_ERR_SELFTEST_FAILED; } /* Complete selftest for AES-192. */ static gpg_err_code_t selftest_fips_192 (int extended, selftest_report_func_t report) { const char *what; const char *errtxt; (void)extended; /* No extended tests available. */ what = "low-level"; errtxt = selftest_basic_192 (); if (errtxt) goto failed; return 0; /* Succeeded. */ failed: if (report) report ("cipher", GCRY_CIPHER_AES192, what, errtxt); return GPG_ERR_SELFTEST_FAILED; } /* Complete selftest for AES-256. */ static gpg_err_code_t selftest_fips_256 (int extended, selftest_report_func_t report) { const char *what; const char *errtxt; (void)extended; /* No extended tests available. */ what = "low-level"; errtxt = selftest_basic_256 (); if (errtxt) goto failed; return 0; /* Succeeded. */ failed: if (report) report ("cipher", GCRY_CIPHER_AES256, what, errtxt); return GPG_ERR_SELFTEST_FAILED; } /* Run a full self-test for ALGO and return 0 on success. 
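Dispatches on GCRY_CIPHER_AES128/192/256 and returns GPG_ERR_CIPHER_ALGO for any other algorithm; an extended run additionally exercises the SP800-38A CFB and OFB vectors, which are only provided for AES-128.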
*/ static gpg_err_code_t run_selftests (int algo, int extended, selftest_report_func_t report) { gpg_err_code_t ec; switch (algo) { case GCRY_CIPHER_AES128: ec = selftest_fips_128 (extended, report); break; case GCRY_CIPHER_AES192: ec = selftest_fips_192 (extended, report); break; case GCRY_CIPHER_AES256: ec = selftest_fips_256 (extended, report); break; default: ec = GPG_ERR_CIPHER_ALGO; break; } return ec; } static const char *rijndael_names[] = { "RIJNDAEL", "AES128", "AES-128", NULL }; static const gcry_cipher_oid_spec_t rijndael_oids[] = { { "2.16.840.1.101.3.4.1.1", GCRY_CIPHER_MODE_ECB }, { "2.16.840.1.101.3.4.1.2", GCRY_CIPHER_MODE_CBC }, { "2.16.840.1.101.3.4.1.3", GCRY_CIPHER_MODE_OFB }, { "2.16.840.1.101.3.4.1.4", GCRY_CIPHER_MODE_CFB }, { "2.16.840.1.101.3.4.1.6", GCRY_CIPHER_MODE_GCM }, { "2.16.840.1.101.3.4.1.7", GCRY_CIPHER_MODE_CCM }, { NULL } }; gcry_cipher_spec_t _gcry_cipher_spec_aes = { GCRY_CIPHER_AES, {0, 1}, "AES", rijndael_names, rijndael_oids, 16, 128, sizeof (RIJNDAEL_context), rijndael_setkey, rijndael_encrypt, rijndael_decrypt, NULL, NULL, run_selftests }; static const char *rijndael192_names[] = { "RIJNDAEL192", "AES-192", NULL }; static const gcry_cipher_oid_spec_t rijndael192_oids[] = { { "2.16.840.1.101.3.4.1.21", GCRY_CIPHER_MODE_ECB }, { "2.16.840.1.101.3.4.1.22", GCRY_CIPHER_MODE_CBC }, { "2.16.840.1.101.3.4.1.23", GCRY_CIPHER_MODE_OFB }, { "2.16.840.1.101.3.4.1.24", GCRY_CIPHER_MODE_CFB }, { "2.16.840.1.101.3.4.1.26", GCRY_CIPHER_MODE_GCM }, { "2.16.840.1.101.3.4.1.27", GCRY_CIPHER_MODE_CCM }, { NULL } }; gcry_cipher_spec_t _gcry_cipher_spec_aes192 = { GCRY_CIPHER_AES192, {0, 1}, "AES192", rijndael192_names, rijndael192_oids, 16, 192, sizeof (RIJNDAEL_context), rijndael_setkey, rijndael_encrypt, rijndael_decrypt, NULL, NULL, run_selftests }; static const char *rijndael256_names[] = { "RIJNDAEL256", "AES-256", NULL }; static const gcry_cipher_oid_spec_t rijndael256_oids[] = { { "2.16.840.1.101.3.4.1.41", GCRY_CIPHER_MODE_ECB }, { "2.16.840.1.101.3.4.1.42", GCRY_CIPHER_MODE_CBC }, { "2.16.840.1.101.3.4.1.43", GCRY_CIPHER_MODE_OFB }, { "2.16.840.1.101.3.4.1.44", GCRY_CIPHER_MODE_CFB }, { "2.16.840.1.101.3.4.1.46", GCRY_CIPHER_MODE_GCM }, { "2.16.840.1.101.3.4.1.47", GCRY_CIPHER_MODE_CCM }, { NULL } }; gcry_cipher_spec_t _gcry_cipher_spec_aes256 = { GCRY_CIPHER_AES256, {0, 1}, "AES256", rijndael256_names, rijndael256_oids, 16, 256, sizeof (RIJNDAEL_context), rijndael_setkey, rijndael_encrypt, rijndael_decrypt, NULL, NULL, run_selftests };