diff --git a/cipher/rijndael-aesni.c b/cipher/rijndael-aesni.c index 50a0745b..e7e61ca8 100644 --- a/cipher/rijndael-aesni.c +++ b/cipher/rijndael-aesni.c @@ -1,3301 +1,3327 @@ /* AES-NI accelerated AES for Libgcrypt * Copyright (C) 2000, 2001, 2002, 2003, 2007, * 2008, 2011, 2012 Free Software Foundation, Inc. * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see <http://www.gnu.org/licenses/>. */ #include <config.h> #include <stdio.h> #include <stdlib.h> #include <string.h> /* for memcmp() */ #include "types.h" /* for byte and u32 typedefs */ #include "g10lib.h" #include "cipher.h" #include "bufhelp.h" #include "cipher-selftest.h" #include "rijndael-internal.h" #include "./cipher-internal.h" #ifdef USE_AESNI #if _GCRY_GCC_VERSION >= 40400 /* 4.4 */ /* Prevent compiler from issuing SSE instructions between asm blocks. */ # pragma GCC target("no-sse") #endif typedef struct u128_s { u32 a, b, c, d; } __attribute__((packed, aligned(1), may_alias)) u128_t; /* Two macros to be called prior to and after the use of AESNI instructions. There should be no external function calls between the use of these macros. Their purpose is to make sure that the SSE registers are cleared and won't reveal any information about the key or the data. */ #ifdef __WIN64__ /* XMM6-XMM15 are callee-saved registers on WIN64. 
*/ # define aesni_prepare_2_6_variable char win64tmp[16] # define aesni_prepare_7_15_variable char win64tmp7_15[16 * 9] # define aesni_prepare() do { } while (0) # define aesni_prepare_2_6() \ do { asm volatile ("movdqu %%xmm6, %0\n\t" \ : "=m" (*win64tmp) \ : \ : "memory"); \ } while (0) # define aesni_prepare_7_15() \ do { asm volatile ("movdqu %%xmm7, 0*16(%0)\n\t" \ "movdqu %%xmm8, 1*16(%0)\n\t" \ "movdqu %%xmm9, 2*16(%0)\n\t" \ "movdqu %%xmm10, 3*16(%0)\n\t" \ "movdqu %%xmm11, 4*16(%0)\n\t" \ "movdqu %%xmm12, 5*16(%0)\n\t" \ "movdqu %%xmm13, 6*16(%0)\n\t" \ "movdqu %%xmm14, 7*16(%0)\n\t" \ "movdqu %%xmm15, 8*16(%0)\n\t" \ : \ : "r" (win64tmp7_15) \ : "memory"); \ } while (0) # define aesni_cleanup() \ do { asm volatile ("pxor %%xmm0, %%xmm0\n\t" \ "pxor %%xmm1, %%xmm1\n" :: ); \ } while (0) # define aesni_cleanup_2_6() \ do { asm volatile ("movdqu %0, %%xmm6\n\t" \ "pxor %%xmm2, %%xmm2\n" \ "pxor %%xmm3, %%xmm3\n" \ "pxor %%xmm4, %%xmm4\n" \ "pxor %%xmm5, %%xmm5\n" \ : \ : "m" (*win64tmp) \ : "memory"); \ } while (0) # define aesni_cleanup_7_15() \ do { asm volatile ("movdqu 0*16(%0), %%xmm7\n\t" \ "movdqu 1*16(%0), %%xmm8\n\t" \ "movdqu 2*16(%0), %%xmm9\n\t" \ "movdqu 3*16(%0), %%xmm10\n\t" \ "movdqu 4*16(%0), %%xmm11\n\t" \ "movdqu 5*16(%0), %%xmm12\n\t" \ "movdqu 6*16(%0), %%xmm13\n\t" \ "movdqu 7*16(%0), %%xmm14\n\t" \ "movdqu 8*16(%0), %%xmm15\n\t" \ : \ : "r" (win64tmp7_15) \ : "memory"); \ } while (0) #else # define aesni_prepare_2_6_variable # define aesni_prepare() do { } while (0) # define aesni_prepare_2_6() do { } while (0) # define aesni_cleanup() \ do { asm volatile ("pxor %%xmm0, %%xmm0\n\t" \ "pxor %%xmm1, %%xmm1\n" :: ); \ } while (0) # define aesni_cleanup_2_6() \ do { asm volatile ("pxor %%xmm2, %%xmm2\n\t" \ "pxor %%xmm3, %%xmm3\n" \ "pxor %%xmm4, %%xmm4\n" \ "pxor %%xmm5, %%xmm5\n" \ "pxor %%xmm6, %%xmm6\n":: ); \ } while (0) # ifdef __x86_64__ # define aesni_prepare_7_15_variable # define aesni_prepare_7_15() do { } while (0) # define 
aesni_cleanup_7_15() \ do { asm volatile ("pxor %%xmm7, %%xmm7\n\t" \ "pxor %%xmm8, %%xmm8\n" \ "pxor %%xmm9, %%xmm9\n" \ "pxor %%xmm10, %%xmm10\n" \ "pxor %%xmm11, %%xmm11\n" \ "pxor %%xmm12, %%xmm12\n" \ "pxor %%xmm13, %%xmm13\n" \ "pxor %%xmm14, %%xmm14\n" \ "pxor %%xmm15, %%xmm15\n":: ); \ } while (0) # endif #endif void _gcry_aes_aesni_do_setkey (RIJNDAEL_context *ctx, const byte *key) { aesni_prepare_2_6_variable; aesni_prepare(); aesni_prepare_2_6(); if (ctx->rounds < 12) { /* 128-bit key */ #define AESKEYGENASSIST_xmm1_xmm2(imm8) \ ".byte 0x66, 0x0f, 0x3a, 0xdf, 0xd1, " #imm8 " \n\t" #define AESKEY_EXPAND128 \ "pshufd $0xff, %%xmm2, %%xmm2\n\t" \ "movdqa %%xmm1, %%xmm3\n\t" \ "pslldq $4, %%xmm3\n\t" \ "pxor %%xmm3, %%xmm1\n\t" \ "pslldq $4, %%xmm3\n\t" \ "pxor %%xmm3, %%xmm1\n\t" \ "pslldq $4, %%xmm3\n\t" \ "pxor %%xmm3, %%xmm2\n\t" \ "pxor %%xmm2, %%xmm1\n\t" asm volatile ("movdqu (%[key]), %%xmm1\n\t" /* xmm1 := key */ "movdqa %%xmm1, (%[ksch])\n\t" /* ksch[0] := xmm1 */ AESKEYGENASSIST_xmm1_xmm2(0x01) AESKEY_EXPAND128 "movdqa %%xmm1, 0x10(%[ksch])\n\t" /* ksch[1] := xmm1 */ AESKEYGENASSIST_xmm1_xmm2(0x02) AESKEY_EXPAND128 "movdqa %%xmm1, 0x20(%[ksch])\n\t" /* ksch[2] := xmm1 */ AESKEYGENASSIST_xmm1_xmm2(0x04) AESKEY_EXPAND128 "movdqa %%xmm1, 0x30(%[ksch])\n\t" /* ksch[3] := xmm1 */ AESKEYGENASSIST_xmm1_xmm2(0x08) AESKEY_EXPAND128 "movdqa %%xmm1, 0x40(%[ksch])\n\t" /* ksch[4] := xmm1 */ AESKEYGENASSIST_xmm1_xmm2(0x10) AESKEY_EXPAND128 "movdqa %%xmm1, 0x50(%[ksch])\n\t" /* ksch[5] := xmm1 */ AESKEYGENASSIST_xmm1_xmm2(0x20) AESKEY_EXPAND128 "movdqa %%xmm1, 0x60(%[ksch])\n\t" /* ksch[6] := xmm1 */ AESKEYGENASSIST_xmm1_xmm2(0x40) AESKEY_EXPAND128 "movdqa %%xmm1, 0x70(%[ksch])\n\t" /* ksch[7] := xmm1 */ AESKEYGENASSIST_xmm1_xmm2(0x80) AESKEY_EXPAND128 "movdqa %%xmm1, 0x80(%[ksch])\n\t" /* ksch[8] := xmm1 */ AESKEYGENASSIST_xmm1_xmm2(0x1b) AESKEY_EXPAND128 "movdqa %%xmm1, 0x90(%[ksch])\n\t" /* ksch[9] := xmm1 */ AESKEYGENASSIST_xmm1_xmm2(0x36) AESKEY_EXPAND128 
"movdqa %%xmm1, 0xa0(%[ksch])\n\t" /* ksch[10] := xmm1 */ : : [key] "r" (key), [ksch] "r" (ctx->keyschenc) : "cc", "memory" ); #undef AESKEYGENASSIST_xmm1_xmm2 #undef AESKEY_EXPAND128 } else if (ctx->rounds == 12) { /* 192-bit key */ #define AESKEYGENASSIST_xmm3_xmm2(imm8) \ ".byte 0x66, 0x0f, 0x3a, 0xdf, 0xd3, " #imm8 " \n\t" #define AESKEY_EXPAND192 \ "pshufd $0x55, %%xmm2, %%xmm2\n\t" \ "movdqu %%xmm1, %%xmm4\n\t" \ "pslldq $4, %%xmm4\n\t" \ "pxor %%xmm4, %%xmm1\n\t" \ "pslldq $4, %%xmm4\n\t" \ "pxor %%xmm4, %%xmm1\n\t" \ "pslldq $4, %%xmm4\n\t" \ "pxor %%xmm4, %%xmm1\n\t" \ "pxor %%xmm2, %%xmm1\n\t" \ "pshufd $0xff, %%xmm1, %%xmm2\n\t" \ "movdqu %%xmm3, %%xmm4\n\t" \ "pslldq $4, %%xmm4\n\t" \ "pxor %%xmm4, %%xmm3\n\t" \ "pxor %%xmm2, %%xmm3\n\t" asm volatile ("movdqu (%[key]), %%xmm1\n\t" /* xmm1 := key[0..15] */ "movq 16(%[key]), %%xmm3\n\t" /* xmm3 := key[16..23] */ "movdqa %%xmm1, (%[ksch])\n\t" /* ksch[0] := xmm1 */ "movdqa %%xmm3, %%xmm5\n\t" AESKEYGENASSIST_xmm3_xmm2(0x01) AESKEY_EXPAND192 "shufpd $0, %%xmm1, %%xmm5\n\t" "movdqa %%xmm5, 0x10(%[ksch])\n\t" /* ksch[1] := xmm5 */ "movdqa %%xmm1, %%xmm6\n\t" "shufpd $1, %%xmm3, %%xmm6\n\t" "movdqa %%xmm6, 0x20(%[ksch])\n\t" /* ksch[2] := xmm6 */ AESKEYGENASSIST_xmm3_xmm2(0x02) AESKEY_EXPAND192 "movdqa %%xmm1, 0x30(%[ksch])\n\t" /* ksch[3] := xmm1 */ "movdqa %%xmm3, %%xmm5\n\t" AESKEYGENASSIST_xmm3_xmm2(0x04) AESKEY_EXPAND192 "shufpd $0, %%xmm1, %%xmm5\n\t" "movdqa %%xmm5, 0x40(%[ksch])\n\t" /* ksch[4] := xmm5 */ "movdqa %%xmm1, %%xmm6\n\t" "shufpd $1, %%xmm3, %%xmm6\n\t" "movdqa %%xmm6, 0x50(%[ksch])\n\t" /* ksch[5] := xmm6 */ AESKEYGENASSIST_xmm3_xmm2(0x08) AESKEY_EXPAND192 "movdqa %%xmm1, 0x60(%[ksch])\n\t" /* ksch[6] := xmm1 */ "movdqa %%xmm3, %%xmm5\n\t" AESKEYGENASSIST_xmm3_xmm2(0x10) AESKEY_EXPAND192 "shufpd $0, %%xmm1, %%xmm5\n\t" "movdqa %%xmm5, 0x70(%[ksch])\n\t" /* ksch[7] := xmm5 */ "movdqa %%xmm1, %%xmm6\n\t" "shufpd $1, %%xmm3, %%xmm6\n\t" "movdqa %%xmm6, 0x80(%[ksch])\n\t" /* ksch[8] := xmm6 */ 
AESKEYGENASSIST_xmm3_xmm2(0x20) AESKEY_EXPAND192 "movdqa %%xmm1, 0x90(%[ksch])\n\t" /* ksch[9] := xmm1 */ "movdqa %%xmm3, %%xmm5\n\t" AESKEYGENASSIST_xmm3_xmm2(0x40) AESKEY_EXPAND192 "shufpd $0, %%xmm1, %%xmm5\n\t" "movdqa %%xmm5, 0xa0(%[ksch])\n\t" /* ksch[10] := xmm5 */ "movdqa %%xmm1, %%xmm6\n\t" "shufpd $1, %%xmm3, %%xmm6\n\t" "movdqa %%xmm6, 0xb0(%[ksch])\n\t" /* ksch[11] := xmm6 */ AESKEYGENASSIST_xmm3_xmm2(0x80) AESKEY_EXPAND192 "movdqa %%xmm1, 0xc0(%[ksch])\n\t" /* ksch[12] := xmm1 */ : : [key] "r" (key), [ksch] "r" (ctx->keyschenc) : "cc", "memory" ); #undef AESKEYGENASSIST_xmm3_xmm2 #undef AESKEY_EXPAND192 } else if (ctx->rounds > 12) { /* 256-bit key */ #define AESKEYGENASSIST_xmm1_xmm2(imm8) \ ".byte 0x66, 0x0f, 0x3a, 0xdf, 0xd1, " #imm8 " \n\t" #define AESKEYGENASSIST_xmm3_xmm2(imm8) \ ".byte 0x66, 0x0f, 0x3a, 0xdf, 0xd3, " #imm8 " \n\t" #define AESKEY_EXPAND256_A \ "pshufd $0xff, %%xmm2, %%xmm2\n\t" \ "movdqa %%xmm1, %%xmm4\n\t" \ "pslldq $4, %%xmm4\n\t" \ "pxor %%xmm4, %%xmm1\n\t" \ "pslldq $4, %%xmm4\n\t" \ "pxor %%xmm4, %%xmm1\n\t" \ "pslldq $4, %%xmm4\n\t" \ "pxor %%xmm4, %%xmm1\n\t" \ "pxor %%xmm2, %%xmm1\n\t" #define AESKEY_EXPAND256_B \ "pshufd $0xaa, %%xmm2, %%xmm2\n\t" \ "movdqa %%xmm3, %%xmm4\n\t" \ "pslldq $4, %%xmm4\n\t" \ "pxor %%xmm4, %%xmm3\n\t" \ "pslldq $4, %%xmm4\n\t" \ "pxor %%xmm4, %%xmm3\n\t" \ "pslldq $4, %%xmm4\n\t" \ "pxor %%xmm4, %%xmm3\n\t" \ "pxor %%xmm2, %%xmm3\n\t" asm volatile ("movdqu (%[key]), %%xmm1\n\t" /* xmm1 := key[0..15] */ "movdqu 16(%[key]), %%xmm3\n\t" /* xmm3 := key[16..31] */ "movdqa %%xmm1, (%[ksch])\n\t" /* ksch[0] := xmm1 */ "movdqa %%xmm3, 0x10(%[ksch])\n\t" /* ksch[1] := xmm3 */ AESKEYGENASSIST_xmm3_xmm2(0x01) AESKEY_EXPAND256_A "movdqa %%xmm1, 0x20(%[ksch])\n\t" /* ksch[2] := xmm1 */ AESKEYGENASSIST_xmm1_xmm2(0x00) AESKEY_EXPAND256_B "movdqa %%xmm3, 0x30(%[ksch])\n\t" /* ksch[3] := xmm3 */ AESKEYGENASSIST_xmm3_xmm2(0x02) AESKEY_EXPAND256_A "movdqa %%xmm1, 0x40(%[ksch])\n\t" /* ksch[4] := xmm1 */ 
AESKEYGENASSIST_xmm1_xmm2(0x00) AESKEY_EXPAND256_B "movdqa %%xmm3, 0x50(%[ksch])\n\t" /* ksch[5] := xmm3 */ AESKEYGENASSIST_xmm3_xmm2(0x04) AESKEY_EXPAND256_A "movdqa %%xmm1, 0x60(%[ksch])\n\t" /* ksch[6] := xmm1 */ AESKEYGENASSIST_xmm1_xmm2(0x00) AESKEY_EXPAND256_B "movdqa %%xmm3, 0x70(%[ksch])\n\t" /* ksch[7] := xmm3 */ AESKEYGENASSIST_xmm3_xmm2(0x08) AESKEY_EXPAND256_A "movdqa %%xmm1, 0x80(%[ksch])\n\t" /* ksch[8] := xmm1 */ AESKEYGENASSIST_xmm1_xmm2(0x00) AESKEY_EXPAND256_B "movdqa %%xmm3, 0x90(%[ksch])\n\t" /* ksch[9] := xmm3 */ AESKEYGENASSIST_xmm3_xmm2(0x10) AESKEY_EXPAND256_A "movdqa %%xmm1, 0xa0(%[ksch])\n\t" /* ksch[10] := xmm1 */ AESKEYGENASSIST_xmm1_xmm2(0x00) AESKEY_EXPAND256_B "movdqa %%xmm3, 0xb0(%[ksch])\n\t" /* ksch[11] := xmm3 */ AESKEYGENASSIST_xmm3_xmm2(0x20) AESKEY_EXPAND256_A "movdqa %%xmm1, 0xc0(%[ksch])\n\t" /* ksch[12] := xmm1 */ AESKEYGENASSIST_xmm1_xmm2(0x00) AESKEY_EXPAND256_B "movdqa %%xmm3, 0xd0(%[ksch])\n\t" /* ksch[13] := xmm3 */ AESKEYGENASSIST_xmm3_xmm2(0x40) AESKEY_EXPAND256_A "movdqa %%xmm1, 0xe0(%[ksch])\n\t" /* ksch[14] := xmm1 */ : : [key] "r" (key), [ksch] "r" (ctx->keyschenc) : "cc", "memory" ); #undef AESKEYGENASSIST_xmm1_xmm2 #undef AESKEYGENASSIST_xmm3_xmm2 #undef AESKEY_EXPAND256_A #undef AESKEY_EXPAND256_B } aesni_cleanup(); aesni_cleanup_2_6(); } /* Make a decryption key from an encryption key. */ -void -_gcry_aes_aesni_prepare_decryption (RIJNDAEL_context *ctx) +static inline void +do_aesni_prepare_decryption (RIJNDAEL_context *ctx) { /* The AES-NI decrypt instructions use the Equivalent Inverse Cipher, thus we can't use the the standard decrypt key preparation. 
*/ u128_t *ekey = (u128_t *)ctx->keyschenc; u128_t *dkey = (u128_t *)ctx->keyschdec; int rr; int r; - aesni_prepare(); - #define DO_AESNI_AESIMC() \ asm volatile ("movdqa %[ekey], %%xmm1\n\t" \ /*"aesimc %%xmm1, %%xmm1\n\t"*/ \ ".byte 0x66, 0x0f, 0x38, 0xdb, 0xc9\n\t" \ "movdqa %%xmm1, %[dkey]" \ : [dkey] "=m" (dkey[r]) \ : [ekey] "m" (ekey[rr]) \ : "memory") dkey[0] = ekey[ctx->rounds]; r=1; rr=ctx->rounds-1; DO_AESNI_AESIMC(); r++; rr--; /* round 1 */ DO_AESNI_AESIMC(); r++; rr--; /* round 2 */ DO_AESNI_AESIMC(); r++; rr--; /* round 3 */ DO_AESNI_AESIMC(); r++; rr--; /* round 4 */ DO_AESNI_AESIMC(); r++; rr--; /* round 5 */ DO_AESNI_AESIMC(); r++; rr--; /* round 6 */ DO_AESNI_AESIMC(); r++; rr--; /* round 7 */ DO_AESNI_AESIMC(); r++; rr--; /* round 8 */ DO_AESNI_AESIMC(); r++; rr--; /* round 9 */ if (ctx->rounds > 10) { DO_AESNI_AESIMC(); r++; rr--; /* round 10 */ DO_AESNI_AESIMC(); r++; rr--; /* round 11 */ if (ctx->rounds > 12) { DO_AESNI_AESIMC(); r++; rr--; /* round 12 */ DO_AESNI_AESIMC(); r++; rr--; /* round 13 */ } } dkey[r] = ekey[0]; #undef DO_AESNI_AESIMC +} +void +_gcry_aes_aesni_prepare_decryption (RIJNDAEL_context *ctx) +{ + aesni_prepare(); + do_aesni_prepare_decryption (ctx); aesni_cleanup(); } /* Encrypt one block using the Intel AES-NI instructions. Block is input * and output through SSE register xmm0. 
*/ static inline void do_aesni_enc (const RIJNDAEL_context *ctx) { #define aesenc_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xc1\n\t" #define aesenclast_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xc1\n\t" asm volatile ("movdqa (%[key]), %%xmm1\n\t" /* xmm1 := key[0] */ "pxor %%xmm1, %%xmm0\n\t" /* xmm0 ^= key[0] */ "movdqa 0x10(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 "movdqa 0x20(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 "movdqa 0x30(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 "movdqa 0x40(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 "movdqa 0x50(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 "movdqa 0x60(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 "movdqa 0x70(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 "movdqa 0x80(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 "movdqa 0x90(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 "movdqa 0xa0(%[key]), %%xmm1\n\t" "cmpl $10, %[rounds]\n\t" "jz .Lenclast%=\n\t" aesenc_xmm1_xmm0 "movdqa 0xb0(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 "movdqa 0xc0(%[key]), %%xmm1\n\t" "cmpl $12, %[rounds]\n\t" "jz .Lenclast%=\n\t" aesenc_xmm1_xmm0 "movdqa 0xd0(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 "movdqa 0xe0(%[key]), %%xmm1\n" ".Lenclast%=:\n\t" aesenclast_xmm1_xmm0 "\n" : : [key] "r" (ctx->keyschenc), [rounds] "r" (ctx->rounds) : "cc", "memory"); #undef aesenc_xmm1_xmm0 #undef aesenclast_xmm1_xmm0 } /* Decrypt one block using the Intel AES-NI instructions. Block is input * and output through SSE register xmm0. 
*/ static inline void do_aesni_dec (const RIJNDAEL_context *ctx) { #define aesdec_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xde, 0xc1\n\t" #define aesdeclast_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdf, 0xc1\n\t" asm volatile ("movdqa (%[key]), %%xmm1\n\t" "pxor %%xmm1, %%xmm0\n\t" /* xmm0 ^= key[0] */ "movdqa 0x10(%[key]), %%xmm1\n\t" aesdec_xmm1_xmm0 "movdqa 0x20(%[key]), %%xmm1\n\t" aesdec_xmm1_xmm0 "movdqa 0x30(%[key]), %%xmm1\n\t" aesdec_xmm1_xmm0 "movdqa 0x40(%[key]), %%xmm1\n\t" aesdec_xmm1_xmm0 "movdqa 0x50(%[key]), %%xmm1\n\t" aesdec_xmm1_xmm0 "movdqa 0x60(%[key]), %%xmm1\n\t" aesdec_xmm1_xmm0 "movdqa 0x70(%[key]), %%xmm1\n\t" aesdec_xmm1_xmm0 "movdqa 0x80(%[key]), %%xmm1\n\t" aesdec_xmm1_xmm0 "movdqa 0x90(%[key]), %%xmm1\n\t" aesdec_xmm1_xmm0 "movdqa 0xa0(%[key]), %%xmm1\n\t" "cmpl $10, %[rounds]\n\t" "jz .Ldeclast%=\n\t" aesdec_xmm1_xmm0 "movdqa 0xb0(%[key]), %%xmm1\n\t" aesdec_xmm1_xmm0 "movdqa 0xc0(%[key]), %%xmm1\n\t" "cmpl $12, %[rounds]\n\t" "jz .Ldeclast%=\n\t" aesdec_xmm1_xmm0 "movdqa 0xd0(%[key]), %%xmm1\n\t" aesdec_xmm1_xmm0 "movdqa 0xe0(%[key]), %%xmm1\n" ".Ldeclast%=:\n\t" aesdeclast_xmm1_xmm0 "\n" : : [key] "r" (ctx->keyschdec), [rounds] "r" (ctx->rounds) : "cc", "memory"); #undef aesdec_xmm1_xmm0 #undef aesdeclast_xmm1_xmm0 } /* Encrypt four blocks using the Intel AES-NI instructions. Blocks are input * and output through SSE registers xmm1 to xmm4. 
*/ static inline void do_aesni_enc_vec4 (const RIJNDAEL_context *ctx) { #define aesenc_xmm0_xmm1 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xc8\n\t" #define aesenc_xmm0_xmm2 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xd0\n\t" #define aesenc_xmm0_xmm3 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xd8\n\t" #define aesenc_xmm0_xmm4 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xe0\n\t" #define aesenclast_xmm0_xmm1 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xc8\n\t" #define aesenclast_xmm0_xmm2 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xd0\n\t" #define aesenclast_xmm0_xmm3 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xd8\n\t" #define aesenclast_xmm0_xmm4 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xe0\n\t" asm volatile ("movdqa (%[key]), %%xmm0\n\t" "pxor %%xmm0, %%xmm1\n\t" /* xmm1 ^= key[0] */ "pxor %%xmm0, %%xmm2\n\t" /* xmm2 ^= key[0] */ "pxor %%xmm0, %%xmm3\n\t" /* xmm3 ^= key[0] */ "pxor %%xmm0, %%xmm4\n\t" /* xmm4 ^= key[0] */ "movdqa 0x10(%[key]), %%xmm0\n\t" aesenc_xmm0_xmm1 aesenc_xmm0_xmm2 aesenc_xmm0_xmm3 aesenc_xmm0_xmm4 "movdqa 0x20(%[key]), %%xmm0\n\t" aesenc_xmm0_xmm1 aesenc_xmm0_xmm2 aesenc_xmm0_xmm3 aesenc_xmm0_xmm4 "movdqa 0x30(%[key]), %%xmm0\n\t" aesenc_xmm0_xmm1 aesenc_xmm0_xmm2 aesenc_xmm0_xmm3 aesenc_xmm0_xmm4 "movdqa 0x40(%[key]), %%xmm0\n\t" aesenc_xmm0_xmm1 aesenc_xmm0_xmm2 aesenc_xmm0_xmm3 aesenc_xmm0_xmm4 "movdqa 0x50(%[key]), %%xmm0\n\t" aesenc_xmm0_xmm1 aesenc_xmm0_xmm2 aesenc_xmm0_xmm3 aesenc_xmm0_xmm4 "movdqa 0x60(%[key]), %%xmm0\n\t" aesenc_xmm0_xmm1 aesenc_xmm0_xmm2 aesenc_xmm0_xmm3 aesenc_xmm0_xmm4 "movdqa 0x70(%[key]), %%xmm0\n\t" aesenc_xmm0_xmm1 aesenc_xmm0_xmm2 aesenc_xmm0_xmm3 aesenc_xmm0_xmm4 "movdqa 0x80(%[key]), %%xmm0\n\t" aesenc_xmm0_xmm1 aesenc_xmm0_xmm2 aesenc_xmm0_xmm3 aesenc_xmm0_xmm4 "movdqa 0x90(%[key]), %%xmm0\n\t" aesenc_xmm0_xmm1 aesenc_xmm0_xmm2 aesenc_xmm0_xmm3 aesenc_xmm0_xmm4 "movdqa 0xa0(%[key]), %%xmm0\n\t" "cmpl $10, %[rounds]\n\t" "jz .Ldeclast%=\n\t" aesenc_xmm0_xmm1 aesenc_xmm0_xmm2 aesenc_xmm0_xmm3 aesenc_xmm0_xmm4 "movdqa 0xb0(%[key]), %%xmm0\n\t" aesenc_xmm0_xmm1 aesenc_xmm0_xmm2 aesenc_xmm0_xmm3 
aesenc_xmm0_xmm4 "movdqa 0xc0(%[key]), %%xmm0\n\t" "cmpl $12, %[rounds]\n\t" "jz .Ldeclast%=\n\t" aesenc_xmm0_xmm1 aesenc_xmm0_xmm2 aesenc_xmm0_xmm3 aesenc_xmm0_xmm4 "movdqa 0xd0(%[key]), %%xmm0\n\t" aesenc_xmm0_xmm1 aesenc_xmm0_xmm2 aesenc_xmm0_xmm3 aesenc_xmm0_xmm4 "movdqa 0xe0(%[key]), %%xmm0\n" ".Ldeclast%=:\n\t" aesenclast_xmm0_xmm1 aesenclast_xmm0_xmm2 aesenclast_xmm0_xmm3 aesenclast_xmm0_xmm4 : /* no output */ : [key] "r" (ctx->keyschenc), [rounds] "r" (ctx->rounds) : "cc", "memory"); #undef aesenc_xmm0_xmm1 #undef aesenc_xmm0_xmm2 #undef aesenc_xmm0_xmm3 #undef aesenc_xmm0_xmm4 #undef aesenclast_xmm0_xmm1 #undef aesenclast_xmm0_xmm2 #undef aesenclast_xmm0_xmm3 #undef aesenclast_xmm0_xmm4 } /* Decrypt four blocks using the Intel AES-NI instructions. Blocks are input * and output through SSE registers xmm1 to xmm4. */ static inline void do_aesni_dec_vec4 (const RIJNDAEL_context *ctx) { #define aesdec_xmm0_xmm1 ".byte 0x66, 0x0f, 0x38, 0xde, 0xc8\n\t" #define aesdec_xmm0_xmm2 ".byte 0x66, 0x0f, 0x38, 0xde, 0xd0\n\t" #define aesdec_xmm0_xmm3 ".byte 0x66, 0x0f, 0x38, 0xde, 0xd8\n\t" #define aesdec_xmm0_xmm4 ".byte 0x66, 0x0f, 0x38, 0xde, 0xe0\n\t" #define aesdeclast_xmm0_xmm1 ".byte 0x66, 0x0f, 0x38, 0xdf, 0xc8\n\t" #define aesdeclast_xmm0_xmm2 ".byte 0x66, 0x0f, 0x38, 0xdf, 0xd0\n\t" #define aesdeclast_xmm0_xmm3 ".byte 0x66, 0x0f, 0x38, 0xdf, 0xd8\n\t" #define aesdeclast_xmm0_xmm4 ".byte 0x66, 0x0f, 0x38, 0xdf, 0xe0\n\t" asm volatile ("movdqa (%[key]), %%xmm0\n\t" "pxor %%xmm0, %%xmm1\n\t" /* xmm1 ^= key[0] */ "pxor %%xmm0, %%xmm2\n\t" /* xmm2 ^= key[0] */ "pxor %%xmm0, %%xmm3\n\t" /* xmm3 ^= key[0] */ "pxor %%xmm0, %%xmm4\n\t" /* xmm4 ^= key[0] */ "movdqa 0x10(%[key]), %%xmm0\n\t" aesdec_xmm0_xmm1 aesdec_xmm0_xmm2 aesdec_xmm0_xmm3 aesdec_xmm0_xmm4 "movdqa 0x20(%[key]), %%xmm0\n\t" aesdec_xmm0_xmm1 aesdec_xmm0_xmm2 aesdec_xmm0_xmm3 aesdec_xmm0_xmm4 "movdqa 0x30(%[key]), %%xmm0\n\t" aesdec_xmm0_xmm1 aesdec_xmm0_xmm2 aesdec_xmm0_xmm3 aesdec_xmm0_xmm4 "movdqa 
0x40(%[key]), %%xmm0\n\t" aesdec_xmm0_xmm1 aesdec_xmm0_xmm2 aesdec_xmm0_xmm3 aesdec_xmm0_xmm4 "movdqa 0x50(%[key]), %%xmm0\n\t" aesdec_xmm0_xmm1 aesdec_xmm0_xmm2 aesdec_xmm0_xmm3 aesdec_xmm0_xmm4 "movdqa 0x60(%[key]), %%xmm0\n\t" aesdec_xmm0_xmm1 aesdec_xmm0_xmm2 aesdec_xmm0_xmm3 aesdec_xmm0_xmm4 "movdqa 0x70(%[key]), %%xmm0\n\t" aesdec_xmm0_xmm1 aesdec_xmm0_xmm2 aesdec_xmm0_xmm3 aesdec_xmm0_xmm4 "movdqa 0x80(%[key]), %%xmm0\n\t" aesdec_xmm0_xmm1 aesdec_xmm0_xmm2 aesdec_xmm0_xmm3 aesdec_xmm0_xmm4 "movdqa 0x90(%[key]), %%xmm0\n\t" aesdec_xmm0_xmm1 aesdec_xmm0_xmm2 aesdec_xmm0_xmm3 aesdec_xmm0_xmm4 "movdqa 0xa0(%[key]), %%xmm0\n\t" "cmpl $10, %[rounds]\n\t" "jz .Ldeclast%=\n\t" aesdec_xmm0_xmm1 aesdec_xmm0_xmm2 aesdec_xmm0_xmm3 aesdec_xmm0_xmm4 "movdqa 0xb0(%[key]), %%xmm0\n\t" aesdec_xmm0_xmm1 aesdec_xmm0_xmm2 aesdec_xmm0_xmm3 aesdec_xmm0_xmm4 "movdqa 0xc0(%[key]), %%xmm0\n\t" "cmpl $12, %[rounds]\n\t" "jz .Ldeclast%=\n\t" aesdec_xmm0_xmm1 aesdec_xmm0_xmm2 aesdec_xmm0_xmm3 aesdec_xmm0_xmm4 "movdqa 0xd0(%[key]), %%xmm0\n\t" aesdec_xmm0_xmm1 aesdec_xmm0_xmm2 aesdec_xmm0_xmm3 aesdec_xmm0_xmm4 "movdqa 0xe0(%[key]), %%xmm0\n" ".Ldeclast%=:\n\t" aesdeclast_xmm0_xmm1 aesdeclast_xmm0_xmm2 aesdeclast_xmm0_xmm3 aesdeclast_xmm0_xmm4 : /* no output */ : [key] "r" (ctx->keyschdec), [rounds] "r" (ctx->rounds) : "cc", "memory"); #undef aesdec_xmm0_xmm1 #undef aesdec_xmm0_xmm2 #undef aesdec_xmm0_xmm3 #undef aesdec_xmm0_xmm4 #undef aesdeclast_xmm0_xmm1 #undef aesdeclast_xmm0_xmm2 #undef aesdeclast_xmm0_xmm3 #undef aesdeclast_xmm0_xmm4 } #ifdef __x86_64__ /* Encrypt eight blocks using the Intel AES-NI instructions. Blocks are input * and output through SSE registers xmm1 to xmm4 and xmm8 to xmm11. 
*/ static inline void do_aesni_enc_vec8 (const RIJNDAEL_context *ctx) { asm volatile ("movdqa (%[key]), %%xmm0\n\t" "pxor %%xmm0, %%xmm1\n\t" /* xmm1 ^= key[0] */ "pxor %%xmm0, %%xmm2\n\t" /* xmm2 ^= key[0] */ "pxor %%xmm0, %%xmm3\n\t" /* xmm3 ^= key[0] */ "pxor %%xmm0, %%xmm4\n\t" /* xmm4 ^= key[0] */ "pxor %%xmm0, %%xmm8\n\t" /* xmm8 ^= key[0] */ "pxor %%xmm0, %%xmm9\n\t" /* xmm9 ^= key[0] */ "pxor %%xmm0, %%xmm10\n\t" /* xmm10 ^= key[0] */ "pxor %%xmm0, %%xmm11\n\t" /* xmm11 ^= key[0] */ "movdqa 0x10(%[key]), %%xmm0\n\t" "cmpl $12, %[rounds]\n\t" "aesenc %%xmm0, %%xmm1\n\t" "aesenc %%xmm0, %%xmm2\n\t" "aesenc %%xmm0, %%xmm3\n\t" "aesenc %%xmm0, %%xmm4\n\t" "aesenc %%xmm0, %%xmm8\n\t" "aesenc %%xmm0, %%xmm9\n\t" "aesenc %%xmm0, %%xmm10\n\t" "aesenc %%xmm0, %%xmm11\n\t" "movdqa 0x20(%[key]), %%xmm0\n\t" "aesenc %%xmm0, %%xmm1\n\t" "aesenc %%xmm0, %%xmm2\n\t" "aesenc %%xmm0, %%xmm3\n\t" "aesenc %%xmm0, %%xmm4\n\t" "aesenc %%xmm0, %%xmm8\n\t" "aesenc %%xmm0, %%xmm9\n\t" "aesenc %%xmm0, %%xmm10\n\t" "aesenc %%xmm0, %%xmm11\n\t" "movdqa 0x30(%[key]), %%xmm0\n\t" "aesenc %%xmm0, %%xmm1\n\t" "aesenc %%xmm0, %%xmm2\n\t" "aesenc %%xmm0, %%xmm3\n\t" "aesenc %%xmm0, %%xmm4\n\t" "aesenc %%xmm0, %%xmm8\n\t" "aesenc %%xmm0, %%xmm9\n\t" "aesenc %%xmm0, %%xmm10\n\t" "aesenc %%xmm0, %%xmm11\n\t" "movdqa 0x40(%[key]), %%xmm0\n\t" "aesenc %%xmm0, %%xmm1\n\t" "aesenc %%xmm0, %%xmm2\n\t" "aesenc %%xmm0, %%xmm3\n\t" "aesenc %%xmm0, %%xmm4\n\t" "aesenc %%xmm0, %%xmm8\n\t" "aesenc %%xmm0, %%xmm9\n\t" "aesenc %%xmm0, %%xmm10\n\t" "aesenc %%xmm0, %%xmm11\n\t" "movdqa 0x50(%[key]), %%xmm0\n\t" "aesenc %%xmm0, %%xmm1\n\t" "aesenc %%xmm0, %%xmm2\n\t" "aesenc %%xmm0, %%xmm3\n\t" "aesenc %%xmm0, %%xmm4\n\t" "aesenc %%xmm0, %%xmm8\n\t" "aesenc %%xmm0, %%xmm9\n\t" "aesenc %%xmm0, %%xmm10\n\t" "aesenc %%xmm0, %%xmm11\n\t" "movdqa 0x60(%[key]), %%xmm0\n\t" "aesenc %%xmm0, %%xmm1\n\t" "aesenc %%xmm0, %%xmm2\n\t" "aesenc %%xmm0, %%xmm3\n\t" "aesenc %%xmm0, %%xmm4\n\t" "aesenc %%xmm0, %%xmm8\n\t" 
"aesenc %%xmm0, %%xmm9\n\t" "aesenc %%xmm0, %%xmm10\n\t" "aesenc %%xmm0, %%xmm11\n\t" "movdqa 0x70(%[key]), %%xmm0\n\t" "aesenc %%xmm0, %%xmm1\n\t" "aesenc %%xmm0, %%xmm2\n\t" "aesenc %%xmm0, %%xmm3\n\t" "aesenc %%xmm0, %%xmm4\n\t" "aesenc %%xmm0, %%xmm8\n\t" "aesenc %%xmm0, %%xmm9\n\t" "aesenc %%xmm0, %%xmm10\n\t" "aesenc %%xmm0, %%xmm11\n\t" "movdqa 0x80(%[key]), %%xmm0\n\t" "aesenc %%xmm0, %%xmm1\n\t" "aesenc %%xmm0, %%xmm2\n\t" "aesenc %%xmm0, %%xmm3\n\t" "aesenc %%xmm0, %%xmm4\n\t" "aesenc %%xmm0, %%xmm8\n\t" "aesenc %%xmm0, %%xmm9\n\t" "aesenc %%xmm0, %%xmm10\n\t" "aesenc %%xmm0, %%xmm11\n\t" "movdqa 0x90(%[key]), %%xmm0\n\t" "aesenc %%xmm0, %%xmm1\n\t" "aesenc %%xmm0, %%xmm2\n\t" "aesenc %%xmm0, %%xmm3\n\t" "aesenc %%xmm0, %%xmm4\n\t" "aesenc %%xmm0, %%xmm8\n\t" "aesenc %%xmm0, %%xmm9\n\t" "aesenc %%xmm0, %%xmm10\n\t" "aesenc %%xmm0, %%xmm11\n\t" "movdqa 0xa0(%[key]), %%xmm0\n\t" "jb .Ldeclast%=\n\t" "aesenc %%xmm0, %%xmm1\n\t" "aesenc %%xmm0, %%xmm2\n\t" "aesenc %%xmm0, %%xmm3\n\t" "aesenc %%xmm0, %%xmm4\n\t" "aesenc %%xmm0, %%xmm8\n\t" "aesenc %%xmm0, %%xmm9\n\t" "aesenc %%xmm0, %%xmm10\n\t" "aesenc %%xmm0, %%xmm11\n\t" "movdqa 0xb0(%[key]), %%xmm0\n\t" "aesenc %%xmm0, %%xmm1\n\t" "aesenc %%xmm0, %%xmm2\n\t" "aesenc %%xmm0, %%xmm3\n\t" "aesenc %%xmm0, %%xmm4\n\t" "aesenc %%xmm0, %%xmm8\n\t" "aesenc %%xmm0, %%xmm9\n\t" "aesenc %%xmm0, %%xmm10\n\t" "aesenc %%xmm0, %%xmm11\n\t" "movdqa 0xc0(%[key]), %%xmm0\n\t" "je .Ldeclast%=\n\t" "aesenc %%xmm0, %%xmm1\n\t" "aesenc %%xmm0, %%xmm2\n\t" "aesenc %%xmm0, %%xmm3\n\t" "aesenc %%xmm0, %%xmm4\n\t" "aesenc %%xmm0, %%xmm8\n\t" "aesenc %%xmm0, %%xmm9\n\t" "aesenc %%xmm0, %%xmm10\n\t" "aesenc %%xmm0, %%xmm11\n\t" "movdqa 0xd0(%[key]), %%xmm0\n\t" "aesenc %%xmm0, %%xmm1\n\t" "aesenc %%xmm0, %%xmm2\n\t" "aesenc %%xmm0, %%xmm3\n\t" "aesenc %%xmm0, %%xmm4\n\t" "aesenc %%xmm0, %%xmm8\n\t" "aesenc %%xmm0, %%xmm9\n\t" "aesenc %%xmm0, %%xmm10\n\t" "aesenc %%xmm0, %%xmm11\n\t" "movdqa 0xe0(%[key]), %%xmm0\n" ".Ldeclast%=:\n\t" 
"aesenclast %%xmm0, %%xmm1\n\t" "aesenclast %%xmm0, %%xmm2\n\t" "aesenclast %%xmm0, %%xmm3\n\t" "aesenclast %%xmm0, %%xmm4\n\t" "aesenclast %%xmm0, %%xmm8\n\t" "aesenclast %%xmm0, %%xmm9\n\t" "aesenclast %%xmm0, %%xmm10\n\t" "aesenclast %%xmm0, %%xmm11\n\t" : /* no output */ : [key] "r" (ctx->keyschenc), [rounds] "r" (ctx->rounds) : "cc", "memory"); } /* Decrypt eight blocks using the Intel AES-NI instructions. Blocks are input * and output through SSE registers xmm1 to xmm4 and xmm8 to xmm11. */ static inline void do_aesni_dec_vec8 (const RIJNDAEL_context *ctx) { asm volatile ("movdqa (%[key]), %%xmm0\n\t" "pxor %%xmm0, %%xmm1\n\t" /* xmm1 ^= key[0] */ "pxor %%xmm0, %%xmm2\n\t" /* xmm2 ^= key[0] */ "pxor %%xmm0, %%xmm3\n\t" /* xmm3 ^= key[0] */ "pxor %%xmm0, %%xmm4\n\t" /* xmm4 ^= key[0] */ "pxor %%xmm0, %%xmm8\n\t" /* xmm8 ^= key[0] */ "pxor %%xmm0, %%xmm9\n\t" /* xmm9 ^= key[0] */ "pxor %%xmm0, %%xmm10\n\t" /* xmm10 ^= key[0] */ "pxor %%xmm0, %%xmm11\n\t" /* xmm11 ^= key[0] */ "movdqa 0x10(%[key]), %%xmm0\n\t" "cmpl $12, %[rounds]\n\t" "aesdec %%xmm0, %%xmm1\n\t" "aesdec %%xmm0, %%xmm2\n\t" "aesdec %%xmm0, %%xmm3\n\t" "aesdec %%xmm0, %%xmm4\n\t" "aesdec %%xmm0, %%xmm8\n\t" "aesdec %%xmm0, %%xmm9\n\t" "aesdec %%xmm0, %%xmm10\n\t" "aesdec %%xmm0, %%xmm11\n\t" "movdqa 0x20(%[key]), %%xmm0\n\t" "aesdec %%xmm0, %%xmm1\n\t" "aesdec %%xmm0, %%xmm2\n\t" "aesdec %%xmm0, %%xmm3\n\t" "aesdec %%xmm0, %%xmm4\n\t" "aesdec %%xmm0, %%xmm8\n\t" "aesdec %%xmm0, %%xmm9\n\t" "aesdec %%xmm0, %%xmm10\n\t" "aesdec %%xmm0, %%xmm11\n\t" "movdqa 0x30(%[key]), %%xmm0\n\t" "aesdec %%xmm0, %%xmm1\n\t" "aesdec %%xmm0, %%xmm2\n\t" "aesdec %%xmm0, %%xmm3\n\t" "aesdec %%xmm0, %%xmm4\n\t" "aesdec %%xmm0, %%xmm8\n\t" "aesdec %%xmm0, %%xmm9\n\t" "aesdec %%xmm0, %%xmm10\n\t" "aesdec %%xmm0, %%xmm11\n\t" "movdqa 0x40(%[key]), %%xmm0\n\t" "aesdec %%xmm0, %%xmm1\n\t" "aesdec %%xmm0, %%xmm2\n\t" "aesdec %%xmm0, %%xmm3\n\t" "aesdec %%xmm0, %%xmm4\n\t" "aesdec %%xmm0, %%xmm8\n\t" "aesdec %%xmm0, 
%%xmm9\n\t" "aesdec %%xmm0, %%xmm10\n\t" "aesdec %%xmm0, %%xmm11\n\t" "movdqa 0x50(%[key]), %%xmm0\n\t" "aesdec %%xmm0, %%xmm1\n\t" "aesdec %%xmm0, %%xmm2\n\t" "aesdec %%xmm0, %%xmm3\n\t" "aesdec %%xmm0, %%xmm4\n\t" "aesdec %%xmm0, %%xmm8\n\t" "aesdec %%xmm0, %%xmm9\n\t" "aesdec %%xmm0, %%xmm10\n\t" "aesdec %%xmm0, %%xmm11\n\t" "movdqa 0x60(%[key]), %%xmm0\n\t" "aesdec %%xmm0, %%xmm1\n\t" "aesdec %%xmm0, %%xmm2\n\t" "aesdec %%xmm0, %%xmm3\n\t" "aesdec %%xmm0, %%xmm4\n\t" "aesdec %%xmm0, %%xmm8\n\t" "aesdec %%xmm0, %%xmm9\n\t" "aesdec %%xmm0, %%xmm10\n\t" "aesdec %%xmm0, %%xmm11\n\t" "movdqa 0x70(%[key]), %%xmm0\n\t" "aesdec %%xmm0, %%xmm1\n\t" "aesdec %%xmm0, %%xmm2\n\t" "aesdec %%xmm0, %%xmm3\n\t" "aesdec %%xmm0, %%xmm4\n\t" "aesdec %%xmm0, %%xmm8\n\t" "aesdec %%xmm0, %%xmm9\n\t" "aesdec %%xmm0, %%xmm10\n\t" "aesdec %%xmm0, %%xmm11\n\t" "movdqa 0x80(%[key]), %%xmm0\n\t" "aesdec %%xmm0, %%xmm1\n\t" "aesdec %%xmm0, %%xmm2\n\t" "aesdec %%xmm0, %%xmm3\n\t" "aesdec %%xmm0, %%xmm4\n\t" "aesdec %%xmm0, %%xmm8\n\t" "aesdec %%xmm0, %%xmm9\n\t" "aesdec %%xmm0, %%xmm10\n\t" "aesdec %%xmm0, %%xmm11\n\t" "movdqa 0x90(%[key]), %%xmm0\n\t" "aesdec %%xmm0, %%xmm1\n\t" "aesdec %%xmm0, %%xmm2\n\t" "aesdec %%xmm0, %%xmm3\n\t" "aesdec %%xmm0, %%xmm4\n\t" "aesdec %%xmm0, %%xmm8\n\t" "aesdec %%xmm0, %%xmm9\n\t" "aesdec %%xmm0, %%xmm10\n\t" "aesdec %%xmm0, %%xmm11\n\t" "movdqa 0xa0(%[key]), %%xmm0\n\t" "jb .Ldeclast%=\n\t" "aesdec %%xmm0, %%xmm1\n\t" "aesdec %%xmm0, %%xmm2\n\t" "aesdec %%xmm0, %%xmm3\n\t" "aesdec %%xmm0, %%xmm4\n\t" "aesdec %%xmm0, %%xmm8\n\t" "aesdec %%xmm0, %%xmm9\n\t" "aesdec %%xmm0, %%xmm10\n\t" "aesdec %%xmm0, %%xmm11\n\t" "movdqa 0xb0(%[key]), %%xmm0\n\t" "aesdec %%xmm0, %%xmm1\n\t" "aesdec %%xmm0, %%xmm2\n\t" "aesdec %%xmm0, %%xmm3\n\t" "aesdec %%xmm0, %%xmm4\n\t" "aesdec %%xmm0, %%xmm8\n\t" "aesdec %%xmm0, %%xmm9\n\t" "aesdec %%xmm0, %%xmm10\n\t" "aesdec %%xmm0, %%xmm11\n\t" "movdqa 0xc0(%[key]), %%xmm0\n\t" "je .Ldeclast%=\n\t" "aesdec %%xmm0, %%xmm1\n\t" 
"aesdec %%xmm0, %%xmm2\n\t" "aesdec %%xmm0, %%xmm3\n\t" "aesdec %%xmm0, %%xmm4\n\t" "aesdec %%xmm0, %%xmm8\n\t" "aesdec %%xmm0, %%xmm9\n\t" "aesdec %%xmm0, %%xmm10\n\t" "aesdec %%xmm0, %%xmm11\n\t" "movdqa 0xd0(%[key]), %%xmm0\n\t" "aesdec %%xmm0, %%xmm1\n\t" "aesdec %%xmm0, %%xmm2\n\t" "aesdec %%xmm0, %%xmm3\n\t" "aesdec %%xmm0, %%xmm4\n\t" "aesdec %%xmm0, %%xmm8\n\t" "aesdec %%xmm0, %%xmm9\n\t" "aesdec %%xmm0, %%xmm10\n\t" "aesdec %%xmm0, %%xmm11\n\t" "movdqa 0xe0(%[key]), %%xmm0\n" ".Ldeclast%=:\n\t" "aesdeclast %%xmm0, %%xmm1\n\t" "aesdeclast %%xmm0, %%xmm2\n\t" "aesdeclast %%xmm0, %%xmm3\n\t" "aesdeclast %%xmm0, %%xmm4\n\t" "aesdeclast %%xmm0, %%xmm8\n\t" "aesdeclast %%xmm0, %%xmm9\n\t" "aesdeclast %%xmm0, %%xmm10\n\t" "aesdeclast %%xmm0, %%xmm11\n\t" : /* no output */ : [key] "r" (ctx->keyschdec), [rounds] "r" (ctx->rounds) : "cc", "memory"); } #endif /* __x86_64__ */ /* Perform a CTR encryption round using the counter CTR and the input block A. Write the result to the output block B and update CTR. CTR needs to be a 16 byte aligned little-endian value. */ static void do_aesni_ctr (const RIJNDAEL_context *ctx, unsigned char *ctr, unsigned char *b, const unsigned char *a) { #define aesenc_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xc1\n\t" #define aesenclast_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xc1\n\t" asm volatile ("movdqa %%xmm5, %%xmm0\n\t" /* xmm0 := CTR (xmm5) */ "pcmpeqd %%xmm1, %%xmm1\n\t" "psrldq $8, %%xmm1\n\t" /* xmm1 = -1 */ "pshufb %%xmm6, %%xmm5\n\t" "psubq %%xmm1, %%xmm5\n\t" /* xmm5++ (big endian) */ /* detect if 64-bit carry handling is needed */ "cmpl $0xffffffff, 8(%[ctr])\n\t" "jne .Lno_carry%=\n\t" "cmpl $0xffffffff, 12(%[ctr])\n\t" "jne .Lno_carry%=\n\t" "pslldq $8, %%xmm1\n\t" /* move lower 64-bit to high */ "psubq %%xmm1, %%xmm5\n\t" /* add carry to upper 64bits */ ".Lno_carry%=:\n\t" "pshufb %%xmm6, %%xmm5\n\t" "movdqa %%xmm5, (%[ctr])\n\t" /* Update CTR (mem). 
*/ "pxor (%[key]), %%xmm0\n\t" /* xmm1 ^= key[0] */ "movdqa 0x10(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 "movdqa 0x20(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 "movdqa 0x30(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 "movdqa 0x40(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 "movdqa 0x50(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 "movdqa 0x60(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 "movdqa 0x70(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 "movdqa 0x80(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 "movdqa 0x90(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 "movdqa 0xa0(%[key]), %%xmm1\n\t" "cmpl $10, %[rounds]\n\t" "jz .Lenclast%=\n\t" aesenc_xmm1_xmm0 "movdqa 0xb0(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 "movdqa 0xc0(%[key]), %%xmm1\n\t" "cmpl $12, %[rounds]\n\t" "jz .Lenclast%=\n\t" aesenc_xmm1_xmm0 "movdqa 0xd0(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 "movdqa 0xe0(%[key]), %%xmm1\n" ".Lenclast%=:\n\t" aesenclast_xmm1_xmm0 "movdqu %[src], %%xmm1\n\t" /* xmm1 := input */ "pxor %%xmm1, %%xmm0\n\t" /* EncCTR ^= input */ "movdqu %%xmm0, %[dst]" /* Store EncCTR. */ : [dst] "=m" (*b) : [src] "m" (*a), [ctr] "r" (ctr), [key] "r" (ctx->keyschenc), [rounds] "g" (ctx->rounds) : "cc", "memory"); #undef aesenc_xmm1_xmm0 #undef aesenclast_xmm1_xmm0 } /* Four blocks at a time variant of do_aesni_ctr. 
*/ static void do_aesni_ctr_4 (const RIJNDAEL_context *ctx, unsigned char *ctr, unsigned char *b, const unsigned char *a) { static const byte bige_addb_const[4][16] __attribute__ ((aligned (16))) = { { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4 } }; const void *bige_addb = bige_addb_const; #define aesenc_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xc1\n\t" #define aesenc_xmm1_xmm2 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xd1\n\t" #define aesenc_xmm1_xmm3 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xd9\n\t" #define aesenc_xmm1_xmm4 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xe1\n\t" #define aesenclast_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xc1\n\t" #define aesenclast_xmm1_xmm2 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xd1\n\t" #define aesenclast_xmm1_xmm3 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xd9\n\t" #define aesenclast_xmm1_xmm4 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xe1\n\t" /* Register usage: [key] keyschedule xmm0 CTR-0 xmm1 temp / round key xmm2 CTR-1 xmm3 CTR-2 xmm4 CTR-3 xmm5 copy of *ctr xmm6 endian swapping mask */ asm volatile (/* detect if 8-bit carry handling is needed */ "cmpb $0xfb, 15(%[ctr])\n\t" "ja .Ladd32bit%=\n\t" "movdqa %%xmm5, %%xmm0\n\t" /* xmm0 := CTR (xmm5) */ "movdqa 0*16(%[addb]), %%xmm2\n\t" /* xmm2 := be(1) */ "movdqa 1*16(%[addb]), %%xmm3\n\t" /* xmm3 := be(2) */ "movdqa 2*16(%[addb]), %%xmm4\n\t" /* xmm4 := be(3) */ "movdqa 3*16(%[addb]), %%xmm5\n\t" /* xmm5 := be(4) */ "paddb %%xmm0, %%xmm2\n\t" /* xmm2 := be(1) + CTR (xmm0) */ "paddb %%xmm0, %%xmm3\n\t" /* xmm3 := be(2) + CTR (xmm0) */ "paddb %%xmm0, %%xmm4\n\t" /* xmm4 := be(3) + CTR (xmm0) */ "paddb %%xmm0, %%xmm5\n\t" /* xmm5 := be(4) + CTR (xmm0) */ "movdqa (%[key]), %%xmm1\n\t" /* xmm1 := key[0] */ "jmp .Lstore_ctr%=\n\t" ".Ladd32bit%=:\n\t" "movdqa %%xmm5, %%xmm0\n\t" /* xmm0, xmm2 := CTR (xmm5) */ "movdqa %%xmm0, %%xmm2\n\t" "pcmpeqd %%xmm1, %%xmm1\n\t" "psrldq $8, 
%%xmm1\n\t" /* xmm1 = -1 */ "pshufb %%xmm6, %%xmm2\n\t" /* xmm2 := le(xmm2) */ "psubq %%xmm1, %%xmm2\n\t" /* xmm2++ */ "movdqa %%xmm2, %%xmm3\n\t" /* xmm3 := xmm2 */ "psubq %%xmm1, %%xmm3\n\t" /* xmm3++ */ "movdqa %%xmm3, %%xmm4\n\t" /* xmm4 := xmm3 */ "psubq %%xmm1, %%xmm4\n\t" /* xmm4++ */ "movdqa %%xmm4, %%xmm5\n\t" /* xmm5 := xmm4 */ "psubq %%xmm1, %%xmm5\n\t" /* xmm5++ */ /* detect if 64-bit carry handling is needed */ "cmpl $0xffffffff, 8(%[ctr])\n\t" "jne .Lno_carry%=\n\t" "movl 12(%[ctr]), %%esi\n\t" "bswapl %%esi\n\t" "cmpl $0xfffffffc, %%esi\n\t" "jb .Lno_carry%=\n\t" /* no carry */ "pslldq $8, %%xmm1\n\t" /* move lower 64-bit to high */ "je .Lcarry_xmm5%=\n\t" /* esi == 0xfffffffc */ "cmpl $0xfffffffe, %%esi\n\t" "jb .Lcarry_xmm4%=\n\t" /* esi == 0xfffffffd */ "je .Lcarry_xmm3%=\n\t" /* esi == 0xfffffffe */ /* esi == 0xffffffff */ "psubq %%xmm1, %%xmm2\n\t" ".Lcarry_xmm3%=:\n\t" "psubq %%xmm1, %%xmm3\n\t" ".Lcarry_xmm4%=:\n\t" "psubq %%xmm1, %%xmm4\n\t" ".Lcarry_xmm5%=:\n\t" "psubq %%xmm1, %%xmm5\n\t" ".Lno_carry%=:\n\t" "movdqa (%[key]), %%xmm1\n\t" /* xmm1 := key[0] */ "pshufb %%xmm6, %%xmm2\n\t" /* xmm2 := be(xmm2) */ "pshufb %%xmm6, %%xmm3\n\t" /* xmm3 := be(xmm3) */ "pshufb %%xmm6, %%xmm4\n\t" /* xmm4 := be(xmm4) */ "pshufb %%xmm6, %%xmm5\n\t" /* xmm5 := be(xmm5) */ ".Lstore_ctr%=:\n\t" "movdqa %%xmm5, (%[ctr])\n\t" /* Update CTR (mem). 
*/ : : [ctr] "r" (ctr), [key] "r" (ctx->keyschenc), [addb] "r" (bige_addb) : "%esi", "cc", "memory"); asm volatile ("pxor %%xmm1, %%xmm0\n\t" /* xmm0 ^= key[0] */ "pxor %%xmm1, %%xmm2\n\t" /* xmm2 ^= key[0] */ "pxor %%xmm1, %%xmm3\n\t" /* xmm3 ^= key[0] */ "pxor %%xmm1, %%xmm4\n\t" /* xmm4 ^= key[0] */ "movdqa 0x10(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 aesenc_xmm1_xmm2 aesenc_xmm1_xmm3 aesenc_xmm1_xmm4 "movdqa 0x20(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 aesenc_xmm1_xmm2 aesenc_xmm1_xmm3 aesenc_xmm1_xmm4 "movdqa 0x30(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 aesenc_xmm1_xmm2 aesenc_xmm1_xmm3 aesenc_xmm1_xmm4 "movdqa 0x40(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 aesenc_xmm1_xmm2 aesenc_xmm1_xmm3 aesenc_xmm1_xmm4 "movdqa 0x50(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 aesenc_xmm1_xmm2 aesenc_xmm1_xmm3 aesenc_xmm1_xmm4 "movdqa 0x60(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 aesenc_xmm1_xmm2 aesenc_xmm1_xmm3 aesenc_xmm1_xmm4 "movdqa 0x70(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 aesenc_xmm1_xmm2 aesenc_xmm1_xmm3 aesenc_xmm1_xmm4 "movdqa 0x80(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 aesenc_xmm1_xmm2 aesenc_xmm1_xmm3 aesenc_xmm1_xmm4 "movdqa 0x90(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 aesenc_xmm1_xmm2 aesenc_xmm1_xmm3 aesenc_xmm1_xmm4 "movdqa 0xa0(%[key]), %%xmm1\n\t" "cmpl $10, %[rounds]\n\t" "jz .Lenclast%=\n\t" aesenc_xmm1_xmm0 aesenc_xmm1_xmm2 aesenc_xmm1_xmm3 aesenc_xmm1_xmm4 "movdqa 0xb0(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 aesenc_xmm1_xmm2 aesenc_xmm1_xmm3 aesenc_xmm1_xmm4 "movdqa 0xc0(%[key]), %%xmm1\n\t" "cmpl $12, %[rounds]\n\t" "jz .Lenclast%=\n\t" aesenc_xmm1_xmm0 aesenc_xmm1_xmm2 aesenc_xmm1_xmm3 aesenc_xmm1_xmm4 "movdqa 0xd0(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 aesenc_xmm1_xmm2 aesenc_xmm1_xmm3 aesenc_xmm1_xmm4 "movdqa 0xe0(%[key]), %%xmm1\n" ".Lenclast%=:\n\t" aesenclast_xmm1_xmm0 aesenclast_xmm1_xmm2 aesenclast_xmm1_xmm3 aesenclast_xmm1_xmm4 : : [key] "r" (ctx->keyschenc), [rounds] "r" (ctx->rounds) : "cc", "memory"); asm volatile ("movdqu (%[src]), %%xmm1\n\t" /* Get block 1. 
*/ "pxor %%xmm1, %%xmm0\n\t" /* EncCTR-1 ^= input */ "movdqu %%xmm0, (%[dst])\n\t" /* Store block 1 */ "movdqu 16(%[src]), %%xmm1\n\t" /* Get block 2. */ "pxor %%xmm1, %%xmm2\n\t" /* EncCTR-2 ^= input */ "movdqu %%xmm2, 16(%[dst])\n\t" /* Store block 2. */ "movdqu 32(%[src]), %%xmm1\n\t" /* Get block 3. */ "pxor %%xmm1, %%xmm3\n\t" /* EncCTR-3 ^= input */ "movdqu %%xmm3, 32(%[dst])\n\t" /* Store block 3. */ "movdqu 48(%[src]), %%xmm1\n\t" /* Get block 4. */ "pxor %%xmm1, %%xmm4\n\t" /* EncCTR-4 ^= input */ "movdqu %%xmm4, 48(%[dst])" /* Store block 4. */ : : [src] "r" (a), [dst] "r" (b) : "memory"); #undef aesenc_xmm1_xmm0 #undef aesenc_xmm1_xmm2 #undef aesenc_xmm1_xmm3 #undef aesenc_xmm1_xmm4 #undef aesenclast_xmm1_xmm0 #undef aesenclast_xmm1_xmm2 #undef aesenclast_xmm1_xmm3 #undef aesenclast_xmm1_xmm4 } #ifdef __x86_64__ /* Eight blocks at a time variant of do_aesni_ctr. */ static void do_aesni_ctr_8 (const RIJNDAEL_context *ctx, unsigned char *ctr, unsigned char *b, const unsigned char *a) { static const byte bige_addb_const[8][16] __attribute__ ((aligned (16))) = { { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8 } }; const void *bige_addb = bige_addb_const; /* Register usage: [key] keyschedule xmm0 CTR-0 xmm1 temp / round key xmm2 CTR-1 xmm3 CTR-2 xmm4 CTR-3 xmm5 copy of *ctr xmm6 endian swapping mask xmm8 CTR-4 xmm9 CTR-5 xmm10 CTR-6 xmm11 CTR-7 xmm12 temp xmm13 temp xmm14 temp xmm15 temp */ asm volatile (/* detect if 8-bit carry handling is needed */ "cmpb $0xf7, 15(%[ctr])\n\t" "ja .Ladd32bit%=\n\t" "movdqa %%xmm5, %%xmm0\n\t" /* xmm0 := CTR (xmm5) */ "movdqa 0*16(%[addb]), %%xmm2\n\t" /* xmm2 := 
be(1) */ "movdqa 1*16(%[addb]), %%xmm3\n\t" /* xmm3 := be(2) */ "movdqa 2*16(%[addb]), %%xmm4\n\t" /* xmm4 := be(3) */ "movdqa 3*16(%[addb]), %%xmm8\n\t" /* xmm8 := be(4) */ "movdqa 4*16(%[addb]), %%xmm9\n\t" /* xmm9 := be(5) */ "movdqa 5*16(%[addb]), %%xmm10\n\t" /* xmm10 := be(6) */ "movdqa 6*16(%[addb]), %%xmm11\n\t" /* xmm11 := be(7) */ "movdqa 7*16(%[addb]), %%xmm5\n\t" /* xmm5 := be(8) */ "movdqa (%[key]), %%xmm1\n\t" /* xmm1 := key[0] */ "paddb %%xmm0, %%xmm2\n\t" /* xmm2 := be(1) + CTR (xmm0) */ "paddb %%xmm0, %%xmm3\n\t" /* xmm3 := be(2) + CTR (xmm0) */ "paddb %%xmm0, %%xmm4\n\t" /* xmm4 := be(3) + CTR (xmm0) */ "paddb %%xmm0, %%xmm8\n\t" /* xmm8 := be(4) + CTR (xmm0) */ "paddb %%xmm0, %%xmm9\n\t" /* xmm9 := be(5) + CTR (xmm0) */ "paddb %%xmm0, %%xmm10\n\t" /* xmm10 := be(6) + CTR (xmm0) */ "paddb %%xmm0, %%xmm11\n\t" /* xmm11 := be(7) + CTR (xmm0) */ "paddb %%xmm0, %%xmm5\n\t" /* xmm5 := be(8) + CTR (xmm0) */ "jmp .Lstore_ctr%=\n\t" ".Ladd32bit%=:\n\t" "movdqa %%xmm5, %%xmm0\n\t" /* xmm0, xmm2 := CTR (xmm5) */ "movdqa %%xmm0, %%xmm2\n\t" "pcmpeqd %%xmm1, %%xmm1\n\t" "psrldq $8, %%xmm1\n\t" /* xmm1 = -1 */ "pshufb %%xmm6, %%xmm2\n\t" /* xmm2 := le(xmm2) */ "psubq %%xmm1, %%xmm2\n\t" /* xmm2++ */ "movdqa %%xmm2, %%xmm3\n\t" /* xmm3 := xmm2 */ "psubq %%xmm1, %%xmm3\n\t" /* xmm3++ */ "movdqa %%xmm3, %%xmm4\n\t" /* xmm4 := xmm3 */ "psubq %%xmm1, %%xmm4\n\t" /* xmm4++ */ "movdqa %%xmm4, %%xmm8\n\t" /* xmm8 := xmm4 */ "psubq %%xmm1, %%xmm8\n\t" /* xmm8++ */ "movdqa %%xmm8, %%xmm9\n\t" /* xmm9 := xmm8 */ "psubq %%xmm1, %%xmm9\n\t" /* xmm9++ */ "movdqa %%xmm9, %%xmm10\n\t" /* xmm10 := xmm9 */ "psubq %%xmm1, %%xmm10\n\t" /* xmm10++ */ "movdqa %%xmm10, %%xmm11\n\t" /* xmm11 := xmm10 */ "psubq %%xmm1, %%xmm11\n\t" /* xmm11++ */ "movdqa %%xmm11, %%xmm5\n\t" /* xmm5 := xmm11 */ "psubq %%xmm1, %%xmm5\n\t" /* xmm5++ */ /* detect if 64-bit carry handling is needed */ "cmpl $0xffffffff, 8(%[ctr])\n\t" "jne .Lno_carry%=\n\t" "movl 12(%[ctr]), %%esi\n\t" "bswapl %%esi\n\t" 
"cmpl $0xfffffff8, %%esi\n\t" "jb .Lno_carry%=\n\t" /* no carry */ "pslldq $8, %%xmm1\n\t" /* move lower 64-bit to high */ "je .Lcarry_xmm5%=\n\t" /* esi == 0xfffffff8 */ "cmpl $0xfffffffa, %%esi\n\t" "jb .Lcarry_xmm11%=\n\t" /* esi == 0xfffffff9 */ "je .Lcarry_xmm10%=\n\t" /* esi == 0xfffffffa */ "cmpl $0xfffffffc, %%esi\n\t" "jb .Lcarry_xmm9%=\n\t" /* esi == 0xfffffffb */ "je .Lcarry_xmm8%=\n\t" /* esi == 0xfffffffc */ "cmpl $0xfffffffe, %%esi\n\t" "jb .Lcarry_xmm4%=\n\t" /* esi == 0xfffffffd */ "je .Lcarry_xmm3%=\n\t" /* esi == 0xfffffffe */ /* esi == 0xffffffff */ "psubq %%xmm1, %%xmm2\n\t" ".Lcarry_xmm3%=:\n\t" "psubq %%xmm1, %%xmm3\n\t" ".Lcarry_xmm4%=:\n\t" "psubq %%xmm1, %%xmm4\n\t" ".Lcarry_xmm8%=:\n\t" "psubq %%xmm1, %%xmm8\n\t" ".Lcarry_xmm9%=:\n\t" "psubq %%xmm1, %%xmm9\n\t" ".Lcarry_xmm10%=:\n\t" "psubq %%xmm1, %%xmm10\n\t" ".Lcarry_xmm11%=:\n\t" "psubq %%xmm1, %%xmm11\n\t" ".Lcarry_xmm5%=:\n\t" "psubq %%xmm1, %%xmm5\n\t" ".Lno_carry%=:\n\t" "movdqa (%[key]), %%xmm1\n\t" /* xmm1 := key[0] */ "pshufb %%xmm6, %%xmm2\n\t" /* xmm2 := be(xmm2) */ "pshufb %%xmm6, %%xmm3\n\t" /* xmm3 := be(xmm3) */ "pshufb %%xmm6, %%xmm4\n\t" /* xmm4 := be(xmm4) */ "pshufb %%xmm6, %%xmm5\n\t" /* xmm5 := be(xmm5) */ "pshufb %%xmm6, %%xmm8\n\t" /* xmm8 := be(xmm8) */ "pshufb %%xmm6, %%xmm9\n\t" /* xmm9 := be(xmm9) */ "pshufb %%xmm6, %%xmm10\n\t" /* xmm10 := be(xmm10) */ "pshufb %%xmm6, %%xmm11\n\t" /* xmm11 := be(xmm11) */ ".Lstore_ctr%=:\n\t" "movdqa %%xmm5, (%[ctr])\n\t" /* Update CTR (mem). 
*/ : : [ctr] "r" (ctr), [key] "r" (ctx->keyschenc), [addb] "r" (bige_addb) : "%esi", "cc", "memory"); asm volatile ("pxor %%xmm1, %%xmm0\n\t" /* xmm0 ^= key[0] */ "pxor %%xmm1, %%xmm2\n\t" /* xmm2 ^= key[0] */ "pxor %%xmm1, %%xmm3\n\t" /* xmm3 ^= key[0] */ "pxor %%xmm1, %%xmm4\n\t" /* xmm4 ^= key[0] */ "pxor %%xmm1, %%xmm8\n\t" /* xmm8 ^= key[0] */ "pxor %%xmm1, %%xmm9\n\t" /* xmm9 ^= key[0] */ "pxor %%xmm1, %%xmm10\n\t" /* xmm10 ^= key[0] */ "pxor %%xmm1, %%xmm11\n\t" /* xmm11 ^= key[0] */ "movdqa 0x10(%[key]), %%xmm1\n\t" "cmpl $12, %[rounds]\n\t" "aesenc %%xmm1, %%xmm0\n\t" "aesenc %%xmm1, %%xmm2\n\t" "aesenc %%xmm1, %%xmm3\n\t" "aesenc %%xmm1, %%xmm4\n\t" "aesenc %%xmm1, %%xmm8\n\t" "aesenc %%xmm1, %%xmm9\n\t" "aesenc %%xmm1, %%xmm10\n\t" "aesenc %%xmm1, %%xmm11\n\t" "movdqa 0x20(%[key]), %%xmm1\n\t" "aesenc %%xmm1, %%xmm0\n\t" "aesenc %%xmm1, %%xmm2\n\t" "aesenc %%xmm1, %%xmm3\n\t" "aesenc %%xmm1, %%xmm4\n\t" "aesenc %%xmm1, %%xmm8\n\t" "aesenc %%xmm1, %%xmm9\n\t" "aesenc %%xmm1, %%xmm10\n\t" "aesenc %%xmm1, %%xmm11\n\t" "movdqa 0x30(%[key]), %%xmm1\n\t" "aesenc %%xmm1, %%xmm0\n\t" "aesenc %%xmm1, %%xmm2\n\t" "aesenc %%xmm1, %%xmm3\n\t" "aesenc %%xmm1, %%xmm4\n\t" "aesenc %%xmm1, %%xmm8\n\t" "aesenc %%xmm1, %%xmm9\n\t" "aesenc %%xmm1, %%xmm10\n\t" "aesenc %%xmm1, %%xmm11\n\t" "movdqa 0x40(%[key]), %%xmm1\n\t" "aesenc %%xmm1, %%xmm0\n\t" "aesenc %%xmm1, %%xmm2\n\t" "aesenc %%xmm1, %%xmm3\n\t" "aesenc %%xmm1, %%xmm4\n\t" "aesenc %%xmm1, %%xmm8\n\t" "aesenc %%xmm1, %%xmm9\n\t" "aesenc %%xmm1, %%xmm10\n\t" "aesenc %%xmm1, %%xmm11\n\t" "movdqa 0x50(%[key]), %%xmm1\n\t" "aesenc %%xmm1, %%xmm0\n\t" "aesenc %%xmm1, %%xmm2\n\t" "aesenc %%xmm1, %%xmm3\n\t" "aesenc %%xmm1, %%xmm4\n\t" "aesenc %%xmm1, %%xmm8\n\t" "aesenc %%xmm1, %%xmm9\n\t" "aesenc %%xmm1, %%xmm10\n\t" "aesenc %%xmm1, %%xmm11\n\t" "movdqa 0x60(%[key]), %%xmm1\n\t" "aesenc %%xmm1, %%xmm0\n\t" "aesenc %%xmm1, %%xmm2\n\t" "aesenc %%xmm1, %%xmm3\n\t" "aesenc %%xmm1, %%xmm4\n\t" "aesenc %%xmm1, %%xmm8\n\t" 
"aesenc %%xmm1, %%xmm9\n\t" "aesenc %%xmm1, %%xmm10\n\t" "aesenc %%xmm1, %%xmm11\n\t" "movdqa 0x70(%[key]), %%xmm1\n\t" "aesenc %%xmm1, %%xmm0\n\t" "aesenc %%xmm1, %%xmm2\n\t" "aesenc %%xmm1, %%xmm3\n\t" "aesenc %%xmm1, %%xmm4\n\t" "aesenc %%xmm1, %%xmm8\n\t" "aesenc %%xmm1, %%xmm9\n\t" "aesenc %%xmm1, %%xmm10\n\t" "aesenc %%xmm1, %%xmm11\n\t" "movdqa 0x80(%[key]), %%xmm1\n\t" "aesenc %%xmm1, %%xmm0\n\t" "aesenc %%xmm1, %%xmm2\n\t" "aesenc %%xmm1, %%xmm3\n\t" "aesenc %%xmm1, %%xmm4\n\t" "aesenc %%xmm1, %%xmm8\n\t" "aesenc %%xmm1, %%xmm9\n\t" "aesenc %%xmm1, %%xmm10\n\t" "aesenc %%xmm1, %%xmm11\n\t" "movdqa 0x90(%[key]), %%xmm1\n\t" "aesenc %%xmm1, %%xmm0\n\t" "aesenc %%xmm1, %%xmm2\n\t" "aesenc %%xmm1, %%xmm3\n\t" "aesenc %%xmm1, %%xmm4\n\t" "aesenc %%xmm1, %%xmm8\n\t" "aesenc %%xmm1, %%xmm9\n\t" "aesenc %%xmm1, %%xmm10\n\t" "aesenc %%xmm1, %%xmm11\n\t" "movdqa 0xa0(%[key]), %%xmm1\n\t" "jb .Lenclast%=\n\t" "aesenc %%xmm1, %%xmm0\n\t" "aesenc %%xmm1, %%xmm2\n\t" "aesenc %%xmm1, %%xmm3\n\t" "aesenc %%xmm1, %%xmm4\n\t" "aesenc %%xmm1, %%xmm8\n\t" "aesenc %%xmm1, %%xmm9\n\t" "aesenc %%xmm1, %%xmm10\n\t" "aesenc %%xmm1, %%xmm11\n\t" "movdqa 0xb0(%[key]), %%xmm1\n\t" "aesenc %%xmm1, %%xmm0\n\t" "aesenc %%xmm1, %%xmm2\n\t" "aesenc %%xmm1, %%xmm3\n\t" "aesenc %%xmm1, %%xmm4\n\t" "aesenc %%xmm1, %%xmm8\n\t" "aesenc %%xmm1, %%xmm9\n\t" "aesenc %%xmm1, %%xmm10\n\t" "aesenc %%xmm1, %%xmm11\n\t" "movdqa 0xc0(%[key]), %%xmm1\n\t" "je .Lenclast%=\n\t" "aesenc %%xmm1, %%xmm0\n\t" "aesenc %%xmm1, %%xmm2\n\t" "aesenc %%xmm1, %%xmm3\n\t" "aesenc %%xmm1, %%xmm4\n\t" "aesenc %%xmm1, %%xmm8\n\t" "aesenc %%xmm1, %%xmm9\n\t" "aesenc %%xmm1, %%xmm10\n\t" "aesenc %%xmm1, %%xmm11\n\t" "movdqa 0xd0(%[key]), %%xmm1\n\t" "aesenc %%xmm1, %%xmm0\n\t" "aesenc %%xmm1, %%xmm2\n\t" "aesenc %%xmm1, %%xmm3\n\t" "aesenc %%xmm1, %%xmm4\n\t" "aesenc %%xmm1, %%xmm8\n\t" "aesenc %%xmm1, %%xmm9\n\t" "aesenc %%xmm1, %%xmm10\n\t" "aesenc %%xmm1, %%xmm11\n\t" "movdqa 0xe0(%[key]), %%xmm1\n" ".Lenclast%=:\n\t" 
"aesenclast %%xmm1, %%xmm0\n\t" /* final round on all eight counter blocks */ "aesenclast %%xmm1, %%xmm2\n\t" "aesenclast %%xmm1, %%xmm3\n\t" "aesenclast %%xmm1, %%xmm4\n\t" "aesenclast %%xmm1, %%xmm8\n\t" "aesenclast %%xmm1, %%xmm9\n\t" "aesenclast %%xmm1, %%xmm10\n\t" "aesenclast %%xmm1, %%xmm11\n\t" : : [key] "r" (ctx->keyschenc), [rounds] "r" (ctx->rounds) : "cc", "memory"); /* XOR the eight encrypted counters with the input blocks and store the results. All eight input blocks are loaded before the first store; xmm1 and xmm12-xmm15 hold the input blocks. */ asm volatile ("movdqu 0*16(%[src]), %%xmm12\n\t" /* Get block 1. */ "movdqu 1*16(%[src]), %%xmm13\n\t" /* Get block 2. */ "movdqu 2*16(%[src]), %%xmm14\n\t" /* Get block 3. */ "movdqu 3*16(%[src]), %%xmm15\n\t" /* Get block 4. */ "movdqu 4*16(%[src]), %%xmm1\n\t" /* Get block 5. */ "pxor %%xmm12, %%xmm0\n\t" /* EncCTR-1 ^= input */ "movdqu 5*16(%[src]), %%xmm12\n\t" /* Get block 6. */ "pxor %%xmm13, %%xmm2\n\t" /* EncCTR-2 ^= input */ "movdqu 6*16(%[src]), %%xmm13\n\t" /* Get block 7. */ "pxor %%xmm14, %%xmm3\n\t" /* EncCTR-3 ^= input */ "movdqu 7*16(%[src]), %%xmm14\n\t" /* Get block 8. */ "pxor %%xmm15, %%xmm4\n\t" /* EncCTR-4 ^= input */ "movdqu %%xmm0, 0*16(%[dst])\n\t" /* Store block 1. */ "pxor %%xmm1, %%xmm8\n\t" /* EncCTR-5 ^= input */ "pxor %%xmm12, %%xmm9\n\t" /* EncCTR-6 ^= input */ "movdqu %%xmm2, 1*16(%[dst])\n\t" /* Store block 2. */ "pxor %%xmm13, %%xmm10\n\t" /* EncCTR-7 ^= input */ "movdqu %%xmm3, 2*16(%[dst])\n\t" /* Store block 3. */ "pxor %%xmm14, %%xmm11\n\t" /* EncCTR-8 ^= input */ "movdqu %%xmm4, 3*16(%[dst])\n\t" /* Store block 4. */ "movdqu %%xmm8, 4*16(%[dst])\n\t" /* Store block 5. */ "movdqu %%xmm9, 5*16(%[dst])\n\t" /* Store block 6. */ "movdqu %%xmm10, 6*16(%[dst])\n\t" /* Store block 7. */ "movdqu %%xmm11, 7*16(%[dst])\n\t" /* Store block 8.
*/ : : [src] "r" (a), [dst] "r" (b) : "memory"); } #endif /* __x86_64__ */ unsigned int _gcry_aes_aesni_encrypt (const RIJNDAEL_context *ctx, unsigned char *dst, const unsigned char *src) { aesni_prepare (); asm volatile ("movdqu %[src], %%xmm0\n\t" : : [src] "m" (*src) : "memory" ); do_aesni_enc (ctx); asm volatile ("movdqu %%xmm0, %[dst]\n\t" : [dst] "=m" (*dst) : : "memory" ); aesni_cleanup (); return 0; } void -_gcry_aes_aesni_cfb_enc (RIJNDAEL_context *ctx, unsigned char *outbuf, - const unsigned char *inbuf, unsigned char *iv, +_gcry_aes_aesni_cfb_enc (RIJNDAEL_context *ctx, unsigned char *iv, + unsigned char *outbuf, const unsigned char *inbuf, size_t nblocks) { aesni_prepare (); asm volatile ("movdqu %[iv], %%xmm0\n\t" : /* No output */ : [iv] "m" (*iv) : "memory" ); for ( ;nblocks; nblocks-- ) { do_aesni_enc (ctx); asm volatile ("movdqu %[inbuf], %%xmm1\n\t" "pxor %%xmm1, %%xmm0\n\t" "movdqu %%xmm0, %[outbuf]\n\t" : [outbuf] "=m" (*outbuf) : [inbuf] "m" (*inbuf) : "memory" ); outbuf += BLOCKSIZE; inbuf += BLOCKSIZE; } asm volatile ("movdqu %%xmm0, %[iv]\n\t" : [iv] "=m" (*iv) : : "memory" ); aesni_cleanup (); } void -_gcry_aes_aesni_cbc_enc (RIJNDAEL_context *ctx, unsigned char *outbuf, - const unsigned char *inbuf, unsigned char *iv, +_gcry_aes_aesni_cbc_enc (RIJNDAEL_context *ctx, unsigned char *iv, + unsigned char *outbuf, const unsigned char *inbuf, size_t nblocks, int cbc_mac) { aesni_prepare_2_6_variable; aesni_prepare (); aesni_prepare_2_6(); asm volatile ("movdqu %[iv], %%xmm5\n\t" : /* No output */ : [iv] "m" (*iv) : "memory" ); for ( ;nblocks; nblocks-- ) { asm volatile ("movdqu %[inbuf], %%xmm0\n\t" "pxor %%xmm5, %%xmm0\n\t" : /* No output */ : [inbuf] "m" (*inbuf) : "memory" ); do_aesni_enc (ctx); asm volatile ("movdqa %%xmm0, %%xmm5\n\t" "movdqu %%xmm0, %[outbuf]\n\t" : [outbuf] "=m" (*outbuf) : : "memory" ); inbuf += BLOCKSIZE; if (!cbc_mac) outbuf += BLOCKSIZE; } asm volatile ("movdqu %%xmm5, %[iv]\n\t" : [iv] "=m" (*iv) : : "memory" ); 
aesni_cleanup (); aesni_cleanup_2_6 (); } void -_gcry_aes_aesni_ctr_enc (RIJNDAEL_context *ctx, unsigned char *outbuf, - const unsigned char *inbuf, unsigned char *ctr, +_gcry_aes_aesni_ctr_enc (RIJNDAEL_context *ctx, unsigned char *ctr, + unsigned char *outbuf, const unsigned char *inbuf, size_t nblocks) { static const unsigned char be_mask[16] __attribute__ ((aligned (16))) = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; aesni_prepare_2_6_variable; aesni_prepare (); aesni_prepare_2_6(); asm volatile ("movdqa %[mask], %%xmm6\n\t" /* Preload mask */ "movdqa %[ctr], %%xmm5\n\t" /* Preload CTR */ : /* No output */ : [mask] "m" (*be_mask), [ctr] "m" (*ctr) : "memory"); #ifdef __x86_64__ if (nblocks >= 8) { aesni_prepare_7_15_variable; aesni_prepare_7_15(); for ( ;nblocks >= 8 ; nblocks -= 8 ) { do_aesni_ctr_8 (ctx, ctr, outbuf, inbuf); outbuf += 8*BLOCKSIZE; inbuf += 8*BLOCKSIZE; } aesni_cleanup_7_15(); } #endif for ( ;nblocks >= 4 ; nblocks -= 4 ) { do_aesni_ctr_4 (ctx, ctr, outbuf, inbuf); outbuf += 4*BLOCKSIZE; inbuf += 4*BLOCKSIZE; } for ( ;nblocks; nblocks-- ) { do_aesni_ctr (ctx, ctr, outbuf, inbuf); outbuf += BLOCKSIZE; inbuf += BLOCKSIZE; } aesni_cleanup (); aesni_cleanup_2_6 (); } unsigned int _gcry_aes_aesni_decrypt (const RIJNDAEL_context *ctx, unsigned char *dst, const unsigned char *src) { aesni_prepare (); asm volatile ("movdqu %[src], %%xmm0\n\t" : : [src] "m" (*src) : "memory" ); do_aesni_dec (ctx); asm volatile ("movdqu %%xmm0, %[dst]\n\t" : [dst] "=m" (*dst) : : "memory" ); aesni_cleanup (); return 0; } void -_gcry_aes_aesni_cfb_dec (RIJNDAEL_context *ctx, unsigned char *outbuf, - const unsigned char *inbuf, unsigned char *iv, +_gcry_aes_aesni_cfb_dec (RIJNDAEL_context *ctx, unsigned char *iv, + unsigned char *outbuf, const unsigned char *inbuf, size_t nblocks) { aesni_prepare_2_6_variable; aesni_prepare (); aesni_prepare_2_6(); asm volatile ("movdqu %[iv], %%xmm6\n\t" : /* No output */ : [iv] "m" (*iv) : "memory" ); /* CFB decryption can 
be parallelized */ #ifdef __x86_64__ if (nblocks >= 8) { aesni_prepare_7_15_variable; aesni_prepare_7_15(); for ( ;nblocks >= 8; nblocks -= 8) { asm volatile ("movdqu %%xmm6, %%xmm1\n\t" /* load input blocks */ "movdqu 0*16(%[inbuf]), %%xmm2\n\t" "movdqu 1*16(%[inbuf]), %%xmm3\n\t" "movdqu 2*16(%[inbuf]), %%xmm4\n\t" "movdqu 3*16(%[inbuf]), %%xmm8\n\t" "movdqu 4*16(%[inbuf]), %%xmm9\n\t" "movdqu 5*16(%[inbuf]), %%xmm10\n\t" "movdqu 6*16(%[inbuf]), %%xmm11\n\t" "movdqu 7*16(%[inbuf]), %%xmm6\n\t" /* update IV */ "movdqa %%xmm2, %%xmm12\n\t" "movdqa %%xmm3, %%xmm13\n\t" "movdqa %%xmm4, %%xmm14\n\t" "movdqa %%xmm8, %%xmm15\n\t" : /* No output */ : [inbuf] "r" (inbuf) : "memory"); do_aesni_enc_vec8 (ctx); asm volatile ( "pxor %%xmm12, %%xmm1\n\t" "movdqu 4*16(%[inbuf]), %%xmm12\n\t" "pxor %%xmm13, %%xmm2\n\t" "movdqu 5*16(%[inbuf]), %%xmm13\n\t" "pxor %%xmm14, %%xmm3\n\t" "movdqu 6*16(%[inbuf]), %%xmm14\n\t" "pxor %%xmm15, %%xmm4\n\t" "movdqu 7*16(%[inbuf]), %%xmm15\n\t" "pxor %%xmm12, %%xmm8\n\t" "movdqu %%xmm1, 0*16(%[outbuf])\n\t" "pxor %%xmm13, %%xmm9\n\t" "movdqu %%xmm2, 1*16(%[outbuf])\n\t" "pxor %%xmm14, %%xmm10\n\t" "movdqu %%xmm3, 2*16(%[outbuf])\n\t" "pxor %%xmm15, %%xmm11\n\t" "movdqu %%xmm4, 3*16(%[outbuf])\n\t" "movdqu %%xmm8, 4*16(%[outbuf])\n\t" "movdqu %%xmm9, 5*16(%[outbuf])\n\t" "movdqu %%xmm10, 6*16(%[outbuf])\n\t" "movdqu %%xmm11, 7*16(%[outbuf])\n\t" : /* No output */ : [inbuf] "r" (inbuf), [outbuf] "r" (outbuf) : "memory"); outbuf += 8*BLOCKSIZE; inbuf += 8*BLOCKSIZE; } aesni_cleanup_7_15(); } #endif for ( ;nblocks >= 4; nblocks -= 4) { asm volatile ("movdqu %%xmm6, %%xmm1\n\t" /* load input blocks */ "movdqu 0*16(%[inbuf]), %%xmm2\n\t" "movdqu 1*16(%[inbuf]), %%xmm3\n\t" "movdqu 2*16(%[inbuf]), %%xmm4\n\t" "movdqu 3*16(%[inbuf]), %%xmm6\n\t" /* update IV */ : /* No output */ : [inbuf] "r" (inbuf) : "memory"); do_aesni_enc_vec4 (ctx); asm volatile ("movdqu 0*16(%[inbuf]), %%xmm5\n\t" "pxor %%xmm5, %%xmm1\n\t" "movdqu %%xmm1, 0*16(%[outbuf])\n\t" 
"movdqu 1*16(%[inbuf]), %%xmm5\n\t" "pxor %%xmm5, %%xmm2\n\t" "movdqu %%xmm2, 1*16(%[outbuf])\n\t" "movdqu 2*16(%[inbuf]), %%xmm5\n\t" "pxor %%xmm5, %%xmm3\n\t" "movdqu %%xmm3, 2*16(%[outbuf])\n\t" "movdqu 3*16(%[inbuf]), %%xmm5\n\t" "pxor %%xmm5, %%xmm4\n\t" "movdqu %%xmm4, 3*16(%[outbuf])\n\t" : /* No output */ : [inbuf] "r" (inbuf), [outbuf] "r" (outbuf) : "memory"); outbuf += 4*BLOCKSIZE; inbuf += 4*BLOCKSIZE; } asm volatile ("movdqu %%xmm6, %%xmm0\n\t" ::: "cc"); for ( ;nblocks; nblocks-- ) { do_aesni_enc (ctx); asm volatile ("movdqa %%xmm0, %%xmm6\n\t" "movdqu %[inbuf], %%xmm0\n\t" "pxor %%xmm0, %%xmm6\n\t" "movdqu %%xmm6, %[outbuf]\n\t" : [outbuf] "=m" (*outbuf) : [inbuf] "m" (*inbuf) : "memory" ); outbuf += BLOCKSIZE; inbuf += BLOCKSIZE; } asm volatile ("movdqu %%xmm0, %[iv]\n\t" : [iv] "=m" (*iv) : : "memory" ); aesni_cleanup (); aesni_cleanup_2_6 (); } void -_gcry_aes_aesni_cbc_dec (RIJNDAEL_context *ctx, unsigned char *outbuf, - const unsigned char *inbuf, unsigned char *iv, - size_t nblocks) +_gcry_aes_aesni_cbc_dec (RIJNDAEL_context *ctx, unsigned char *iv, + unsigned char *outbuf, const unsigned char *inbuf, + size_t nblocks) { aesni_prepare_2_6_variable; aesni_prepare (); aesni_prepare_2_6(); + if ( !ctx->decryption_prepared ) + { + do_aesni_prepare_decryption ( ctx ); + ctx->decryption_prepared = 1; + } + asm volatile ("movdqu %[iv], %%xmm5\n\t" /* use xmm5 as fast IV storage */ : /* No output */ : [iv] "m" (*iv) : "memory"); #ifdef __x86_64__ if (nblocks >= 8) { aesni_prepare_7_15_variable; aesni_prepare_7_15(); for ( ;nblocks >= 8 ; nblocks -= 8 ) { asm volatile ("movdqu 0*16(%[inbuf]), %%xmm1\n\t" /* load input blocks */ "movdqu 1*16(%[inbuf]), %%xmm2\n\t" "movdqu 2*16(%[inbuf]), %%xmm3\n\t" "movdqu 3*16(%[inbuf]), %%xmm4\n\t" "movdqu 4*16(%[inbuf]), %%xmm8\n\t" "movdqu 5*16(%[inbuf]), %%xmm9\n\t" "movdqu 6*16(%[inbuf]), %%xmm10\n\t" "movdqu 7*16(%[inbuf]), %%xmm11\n\t" "movdqa %%xmm1, %%xmm12\n\t" "movdqa %%xmm2, %%xmm13\n\t" "movdqa %%xmm3, 
%%xmm14\n\t" "movdqa %%xmm4, %%xmm15\n\t" : /* No output */ : [inbuf] "r" (inbuf) : "memory"); do_aesni_dec_vec8 (ctx); asm volatile ("pxor %%xmm5, %%xmm1\n\t" /* xor IV with output */ "pxor %%xmm12, %%xmm2\n\t" /* xor IV with output */ "movdqu 4*16(%[inbuf]), %%xmm12\n\t" "pxor %%xmm13, %%xmm3\n\t" /* xor IV with output */ "movdqu 5*16(%[inbuf]), %%xmm13\n\t" "pxor %%xmm14, %%xmm4\n\t" /* xor IV with output */ "movdqu 6*16(%[inbuf]), %%xmm14\n\t" "pxor %%xmm15, %%xmm8\n\t" /* xor IV with output */ "movdqu 7*16(%[inbuf]), %%xmm5\n\t" "pxor %%xmm12, %%xmm9\n\t" /* xor IV with output */ "movdqu %%xmm1, 0*16(%[outbuf])\n\t" "pxor %%xmm13, %%xmm10\n\t" /* xor IV with output */ "movdqu %%xmm2, 1*16(%[outbuf])\n\t" "pxor %%xmm14, %%xmm11\n\t" /* xor IV with output */ "movdqu %%xmm3, 2*16(%[outbuf])\n\t" "movdqu %%xmm4, 3*16(%[outbuf])\n\t" "movdqu %%xmm8, 4*16(%[outbuf])\n\t" "movdqu %%xmm9, 5*16(%[outbuf])\n\t" "movdqu %%xmm10, 6*16(%[outbuf])\n\t" "movdqu %%xmm11, 7*16(%[outbuf])\n\t" : /* No output */ : [inbuf] "r" (inbuf), [outbuf] "r" (outbuf) : "memory"); outbuf += 8*BLOCKSIZE; inbuf += 8*BLOCKSIZE; } aesni_cleanup_7_15(); } #endif for ( ;nblocks >= 4 ; nblocks -= 4 ) { asm volatile ("movdqu 0*16(%[inbuf]), %%xmm1\n\t" /* load input blocks */ "movdqu 1*16(%[inbuf]), %%xmm2\n\t" "movdqu 2*16(%[inbuf]), %%xmm3\n\t" "movdqu 3*16(%[inbuf]), %%xmm4\n\t" : /* No output */ : [inbuf] "r" (inbuf) : "memory"); do_aesni_dec_vec4 (ctx); asm volatile ("pxor %%xmm5, %%xmm1\n\t" /* xor IV with output */ "movdqu 0*16(%[inbuf]), %%xmm5\n\t" /* load new IV */ "movdqu %%xmm1, 0*16(%[outbuf])\n\t" "pxor %%xmm5, %%xmm2\n\t" /* xor IV with output */ "movdqu 1*16(%[inbuf]), %%xmm5\n\t" /* load new IV */ "movdqu %%xmm2, 1*16(%[outbuf])\n\t" "pxor %%xmm5, %%xmm3\n\t" /* xor IV with output */ "movdqu 2*16(%[inbuf]), %%xmm5\n\t" /* load new IV */ "movdqu %%xmm3, 2*16(%[outbuf])\n\t" "pxor %%xmm5, %%xmm4\n\t" /* xor IV with output */ "movdqu 3*16(%[inbuf]), %%xmm5\n\t" /* load new IV */ 
"movdqu %%xmm4, 3*16(%[outbuf])\n\t" : /* No output */ : [inbuf] "r" (inbuf), [outbuf] "r" (outbuf) : "memory"); outbuf += 4*BLOCKSIZE; inbuf += 4*BLOCKSIZE; } /* Decrypt the remaining blocks one at a time. */ for ( ;nblocks; nblocks-- ) { asm volatile ("movdqu %[inbuf], %%xmm0\n\t" "movdqa %%xmm0, %%xmm2\n\t" /* use xmm2 as savebuf */ : /* No output */ : [inbuf] "m" (*inbuf) : "memory"); /* uses only xmm0 and xmm1 */ do_aesni_dec (ctx); asm volatile ("pxor %%xmm5, %%xmm0\n\t" /* xor IV with output */ "movdqu %%xmm0, %[outbuf]\n\t" "movdqu %%xmm2, %%xmm5\n\t" /* store savebuf as new IV */ : [outbuf] "=m" (*outbuf) : : "memory"); outbuf += BLOCKSIZE; inbuf += BLOCKSIZE; } /* Write the chaining value back to the caller. *iv is written by this statement, so it is declared as an output operand, as the CBC and CFB encryption functions in this file do. */ asm volatile ("movdqu %%xmm5, %[iv]\n\t" /* store IV */ : [iv] "=m" (*iv) : : "memory"); aesni_cleanup (); aesni_cleanup_2_6 (); } static void aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg, const void *inbuf_arg, size_t nblocks) { RIJNDAEL_context *ctx = (void *)&c->context.c; unsigned char *outbuf = outbuf_arg; const unsigned char *inbuf = inbuf_arg; u64 n = c->u_mode.ocb.data_nblocks; const unsigned char *l; aesni_prepare_2_6_variable; aesni_prepare (); aesni_prepare_2_6 (); /* Preload Offset and Checksum */ asm volatile ("movdqu %[iv], %%xmm5\n\t" "movdqu %[ctr], %%xmm6\n\t" : /* No output */ : [iv] "m" (*c->u_iv.iv), [ctr] "m" (*c->u_ctr.ctr) : "memory" ); for ( ;nblocks && n % 4; nblocks-- ) { l = ocb_get_l(c, ++n); /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* Checksum_i = Checksum_{i-1} xor P_i */ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ asm volatile ("movdqu %[l], %%xmm1\n\t" "movdqu %[inbuf], %%xmm0\n\t" "pxor %%xmm1, %%xmm5\n\t" "pxor %%xmm0, %%xmm6\n\t" "pxor %%xmm5, %%xmm0\n\t" : : [l] "m" (*l), [inbuf] "m" (*inbuf) : "memory" ); do_aesni_enc (ctx); asm volatile ("pxor %%xmm5, %%xmm0\n\t" "movdqu %%xmm0, %[outbuf]\n\t" : [outbuf] "=m" (*outbuf) : : "memory" ); inbuf += BLOCKSIZE; outbuf += BLOCKSIZE; } #ifdef __x86_64__ if (nblocks >= 8) { aesni_prepare_7_15_variable; aesni_prepare_7_15(); asm
volatile ("movdqu %[l0], %%xmm7\n\t" : : [l0] "m" (*c->u_mode.ocb.L[0]) : "memory" ); for ( ;nblocks >= 8 ; nblocks -= 8 ) { n += 4; l = ocb_get_l(c, n); /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* Checksum_i = Checksum_{i-1} xor P_i */ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ asm volatile ("movdqu %[l1], %%xmm10\n\t" "movdqu %[inbuf0], %%xmm1\n\t" "pxor %%xmm7, %%xmm5\n\t" "pxor %%xmm1, %%xmm6\n\t" "pxor %%xmm5, %%xmm1\n\t" "movdqa %%xmm5, %%xmm12\n\t" : : [l1] "m" (*c->u_mode.ocb.L[1]), [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE)) : "memory" ); asm volatile ("movdqu %[inbuf1], %%xmm2\n\t" "pxor %%xmm10, %%xmm5\n\t" "pxor %%xmm2, %%xmm6\n\t" "pxor %%xmm5, %%xmm2\n\t" "movdqa %%xmm5, %%xmm13\n\t" : : [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE)) : "memory" ); asm volatile ("movdqu %[inbuf2], %%xmm3\n\t" "pxor %%xmm7, %%xmm5\n\t" "pxor %%xmm3, %%xmm6\n\t" "pxor %%xmm5, %%xmm3\n\t" "movdqa %%xmm5, %%xmm14\n\t" : : [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE)) : "memory" ); asm volatile ("movdqu %[l3], %%xmm15\n\t" "movdqu %[inbuf3], %%xmm4\n\t" "pxor %%xmm15, %%xmm5\n\t" "pxor %%xmm4, %%xmm6\n\t" "pxor %%xmm5, %%xmm4\n\t" "movdqa %%xmm5, %%xmm15\n\t" : : [l3] "m" (*l), [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE)) : "memory" ); n += 4; l = ocb_get_l(c, n); asm volatile ("movdqu %[inbuf4], %%xmm8\n\t" "pxor %%xmm7, %%xmm5\n\t" "pxor %%xmm8, %%xmm6\n\t" "pxor %%xmm5, %%xmm8\n\t" "movdqu %%xmm5, %[outbuf4]\n\t" : [outbuf4] "=m" (*(outbuf + 4 * BLOCKSIZE)) : [inbuf4] "m" (*(inbuf + 4 * BLOCKSIZE)) : "memory" ); asm volatile ("movdqu %[inbuf5], %%xmm9\n\t" "pxor %%xmm10, %%xmm5\n\t" "pxor %%xmm9, %%xmm6\n\t" "pxor %%xmm5, %%xmm9\n\t" "movdqu %%xmm5, %[outbuf5]\n\t" : [outbuf5] "=m" (*(outbuf + 5 * BLOCKSIZE)) : [inbuf5] "m" (*(inbuf + 5 * BLOCKSIZE)) : "memory" ); asm volatile ("movdqu %[inbuf6], %%xmm10\n\t" "pxor %%xmm7, %%xmm5\n\t" "pxor %%xmm10, %%xmm6\n\t" "pxor %%xmm5, %%xmm10\n\t" "movdqu %%xmm5, %[outbuf6]\n\t" : [outbuf6] "=m" (*(outbuf + 6 * BLOCKSIZE)) : [inbuf6] 
"m" (*(inbuf + 6 * BLOCKSIZE)) : "memory" ); asm volatile ("movdqu %[l7], %%xmm11\n\t" "pxor %%xmm11, %%xmm5\n\t" "movdqu %[inbuf7], %%xmm11\n\t" "pxor %%xmm11, %%xmm6\n\t" "pxor %%xmm5, %%xmm11\n\t" : : [l7] "m" (*l), [inbuf7] "m" (*(inbuf + 7 * BLOCKSIZE)) : "memory" ); do_aesni_enc_vec8 (ctx); asm volatile ("pxor %%xmm12, %%xmm1\n\t" "pxor %%xmm13, %%xmm2\n\t" "movdqu %[outbuf4],%%xmm0\n\t" "movdqu %[outbuf5],%%xmm12\n\t" "movdqu %[outbuf6],%%xmm13\n\t" "pxor %%xmm14, %%xmm3\n\t" "pxor %%xmm15, %%xmm4\n\t" "pxor %%xmm0, %%xmm8\n\t" "pxor %%xmm12, %%xmm9\n\t" "pxor %%xmm13, %%xmm10\n\t" "pxor %%xmm5, %%xmm11\n\t" "movdqu %%xmm1, %[outbuf0]\n\t" "movdqu %%xmm2, %[outbuf1]\n\t" "movdqu %%xmm3, %[outbuf2]\n\t" "movdqu %%xmm4, %[outbuf3]\n\t" "movdqu %%xmm8, %[outbuf4]\n\t" "movdqu %%xmm9, %[outbuf5]\n\t" "movdqu %%xmm10, %[outbuf6]\n\t" "movdqu %%xmm11, %[outbuf7]\n\t" : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE)), [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE)), [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE)), [outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE)), [outbuf4] "+m" (*(outbuf + 4 * BLOCKSIZE)), [outbuf5] "+m" (*(outbuf + 5 * BLOCKSIZE)), [outbuf6] "+m" (*(outbuf + 6 * BLOCKSIZE)), [outbuf7] "=m" (*(outbuf + 7 * BLOCKSIZE)) : : "memory" ); outbuf += 8*BLOCKSIZE; inbuf += 8*BLOCKSIZE; } aesni_cleanup_7_15(); } #endif for ( ;nblocks >= 4 ; nblocks -= 4 ) { n += 4; l = ocb_get_l(c, n); /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* Checksum_i = Checksum_{i-1} xor P_i */ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ asm volatile ("movdqu %[l0], %%xmm4\n\t" "movdqu %[inbuf0], %%xmm1\n\t" "pxor %%xmm4, %%xmm5\n\t" "pxor %%xmm1, %%xmm6\n\t" "pxor %%xmm5, %%xmm1\n\t" "movdqu %%xmm5, %[outbuf0]\n\t" : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE)) : [l0] "m" (*c->u_mode.ocb.L[0]), [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE)) : "memory" ); asm volatile ("movdqu %[l1], %%xmm0\n\t" "movdqu %[inbuf1], %%xmm2\n\t" "pxor %%xmm0, %%xmm5\n\t" "pxor %%xmm2, %%xmm6\n\t" "pxor %%xmm5, 
%%xmm2\n\t" "movdqu %%xmm5, %[outbuf1]\n\t" : [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE)) : [l1] "m" (*c->u_mode.ocb.L[1]), [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE)) : "memory" ); asm volatile ("movdqu %[inbuf2], %%xmm3\n\t" "pxor %%xmm4, %%xmm5\n\t" "pxor %%xmm3, %%xmm6\n\t" "pxor %%xmm5, %%xmm3\n\t" "movdqu %%xmm5, %[outbuf2]\n\t" : [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE)) : [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE)) : "memory" ); asm volatile ("movdqu %[l3], %%xmm4\n\t" "pxor %%xmm4, %%xmm5\n\t" "movdqu %[inbuf3], %%xmm4\n\t" "pxor %%xmm4, %%xmm6\n\t" "pxor %%xmm5, %%xmm4\n\t" : : [l3] "m" (*l), [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE)) : "memory" ); do_aesni_enc_vec4 (ctx); asm volatile ("movdqu %[outbuf0],%%xmm0\n\t" "pxor %%xmm0, %%xmm1\n\t" "movdqu %%xmm1, %[outbuf0]\n\t" "movdqu %[outbuf1],%%xmm0\n\t" "pxor %%xmm0, %%xmm2\n\t" "movdqu %%xmm2, %[outbuf1]\n\t" "movdqu %[outbuf2],%%xmm0\n\t" "pxor %%xmm0, %%xmm3\n\t" "movdqu %%xmm3, %[outbuf2]\n\t" "pxor %%xmm5, %%xmm4\n\t" "movdqu %%xmm4, %[outbuf3]\n\t" : [outbuf0] "+m" (*(outbuf + 0 * BLOCKSIZE)), [outbuf1] "+m" (*(outbuf + 1 * BLOCKSIZE)), [outbuf2] "+m" (*(outbuf + 2 * BLOCKSIZE)), [outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE)) : : "memory" ); outbuf += 4*BLOCKSIZE; inbuf += 4*BLOCKSIZE; } for ( ;nblocks; nblocks-- ) { l = ocb_get_l(c, ++n); /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* Checksum_i = Checksum_{i-1} xor P_i */ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ asm volatile ("movdqu %[l], %%xmm1\n\t" "movdqu %[inbuf], %%xmm0\n\t" "pxor %%xmm1, %%xmm5\n\t" "pxor %%xmm0, %%xmm6\n\t" "pxor %%xmm5, %%xmm0\n\t" : : [l] "m" (*l), [inbuf] "m" (*inbuf) : "memory" ); do_aesni_enc (ctx); asm volatile ("pxor %%xmm5, %%xmm0\n\t" "movdqu %%xmm0, %[outbuf]\n\t" : [outbuf] "=m" (*outbuf) : : "memory" ); inbuf += BLOCKSIZE; outbuf += BLOCKSIZE; } c->u_mode.ocb.data_nblocks = n; asm volatile ("movdqu %%xmm5, %[iv]\n\t" "movdqu %%xmm6, %[ctr]\n\t" : [iv] "=m" (*c->u_iv.iv), [ctr] "=m" (*c->u_ctr.ctr) : : "memory" 
); aesni_cleanup (); aesni_cleanup_2_6 (); } static void aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg, const void *inbuf_arg, size_t nblocks) { RIJNDAEL_context *ctx = (void *)&c->context.c; unsigned char *outbuf = outbuf_arg; const unsigned char *inbuf = inbuf_arg; u64 n = c->u_mode.ocb.data_nblocks; const unsigned char *l; aesni_prepare_2_6_variable; aesni_prepare (); aesni_prepare_2_6 (); + if ( !ctx->decryption_prepared ) + { + do_aesni_prepare_decryption ( ctx ); + ctx->decryption_prepared = 1; + } + /* Preload Offset and Checksum */ asm volatile ("movdqu %[iv], %%xmm5\n\t" "movdqu %[ctr], %%xmm6\n\t" : /* No output */ : [iv] "m" (*c->u_iv.iv), [ctr] "m" (*c->u_ctr.ctr) : "memory" ); for ( ;nblocks && n % 4; nblocks-- ) { l = ocb_get_l(c, ++n); /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */ /* Checksum_i = Checksum_{i-1} xor P_i */ asm volatile ("movdqu %[l], %%xmm1\n\t" "movdqu %[inbuf], %%xmm0\n\t" "pxor %%xmm1, %%xmm5\n\t" "pxor %%xmm5, %%xmm0\n\t" : : [l] "m" (*l), [inbuf] "m" (*inbuf) : "memory" ); do_aesni_dec (ctx); asm volatile ("pxor %%xmm5, %%xmm0\n\t" "pxor %%xmm0, %%xmm6\n\t" "movdqu %%xmm0, %[outbuf]\n\t" : [outbuf] "=m" (*outbuf) : : "memory" ); inbuf += BLOCKSIZE; outbuf += BLOCKSIZE; } #ifdef __x86_64__ if (nblocks >= 8) { aesni_prepare_7_15_variable; aesni_prepare_7_15(); asm volatile ("movdqu %[l0], %%xmm7\n\t" : : [l0] "m" (*c->u_mode.ocb.L[0]) : "memory" ); for ( ;nblocks >= 8 ; nblocks -= 8 ) { n += 4; l = ocb_get_l(c, n); /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */ /* Checksum_i = Checksum_{i-1} xor P_i */ asm volatile ("movdqu %[l1], %%xmm10\n\t" "movdqu %[inbuf0], %%xmm1\n\t" "pxor %%xmm7, %%xmm5\n\t" "pxor %%xmm5, %%xmm1\n\t" "movdqa %%xmm5, %%xmm12\n\t" : : [l1] "m" (*c->u_mode.ocb.L[1]), [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE)) : "memory" ); asm volatile ("movdqu %[inbuf1], %%xmm2\n\t" "pxor %%xmm10, %%xmm5\n\t" "pxor 
%%xmm5, %%xmm2\n\t" "movdqa %%xmm5, %%xmm13\n\t" : : [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE)) : "memory" ); asm volatile ("movdqu %[inbuf2], %%xmm3\n\t" "pxor %%xmm7, %%xmm5\n\t" "pxor %%xmm5, %%xmm3\n\t" "movdqa %%xmm5, %%xmm14\n\t" : : [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE)) : "memory" ); asm volatile ("movdqu %[l3], %%xmm0\n\t" "movdqu %[inbuf3], %%xmm4\n\t" "pxor %%xmm0, %%xmm5\n\t" "pxor %%xmm5, %%xmm4\n\t" "movdqa %%xmm5, %%xmm15\n\t" : : [l3] "m" (*l), [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE)) : "memory" ); n += 4; l = ocb_get_l(c, n); asm volatile ("movdqu %[inbuf4], %%xmm8\n\t" "pxor %%xmm7, %%xmm5\n\t" "pxor %%xmm5, %%xmm8\n\t" "movdqu %%xmm5, %[outbuf4]\n\t" : [outbuf4] "=m" (*(outbuf + 4 * BLOCKSIZE)) : [inbuf4] "m" (*(inbuf + 4 * BLOCKSIZE)) : "memory" ); asm volatile ("movdqu %[inbuf5], %%xmm9\n\t" "pxor %%xmm10, %%xmm5\n\t" "pxor %%xmm5, %%xmm9\n\t" "movdqu %%xmm5, %[outbuf5]\n\t" : [outbuf5] "=m" (*(outbuf + 5 * BLOCKSIZE)) : [inbuf5] "m" (*(inbuf + 5 * BLOCKSIZE)) : "memory" ); asm volatile ("movdqu %[inbuf6], %%xmm10\n\t" "pxor %%xmm7, %%xmm5\n\t" "pxor %%xmm5, %%xmm10\n\t" "movdqu %%xmm5, %[outbuf6]\n\t" : [outbuf6] "=m" (*(outbuf + 6 * BLOCKSIZE)) : [inbuf6] "m" (*(inbuf + 6 * BLOCKSIZE)) : "memory" ); asm volatile ("movdqu %[l7], %%xmm0\n\t" "movdqu %[inbuf7], %%xmm11\n\t" "pxor %%xmm0, %%xmm5\n\t" "pxor %%xmm5, %%xmm11\n\t" : : [l7] "m" (*l), [inbuf7] "m" (*(inbuf + 7 * BLOCKSIZE)) : "memory" ); do_aesni_dec_vec8 (ctx); asm volatile ("pxor %%xmm12, %%xmm1\n\t" "pxor %%xmm13, %%xmm2\n\t" "movdqu %[outbuf4],%%xmm0\n\t" "movdqu %[outbuf5],%%xmm12\n\t" "movdqu %[outbuf6],%%xmm13\n\t" "pxor %%xmm14, %%xmm3\n\t" "pxor %%xmm15, %%xmm4\n\t" "pxor %%xmm0, %%xmm8\n\t" "pxor %%xmm12, %%xmm9\n\t" "pxor %%xmm13, %%xmm10\n\t" "pxor %%xmm5, %%xmm11\n\t" "movdqu %%xmm1, %[outbuf0]\n\t" "movdqu %%xmm2, %[outbuf1]\n\t" "movdqu %%xmm3, %[outbuf2]\n\t" "movdqu %%xmm4, %[outbuf3]\n\t" "movdqu %%xmm8, %[outbuf4]\n\t" "movdqu %%xmm9, %[outbuf5]\n\t" "movdqu %%xmm10, 
%[outbuf6]\n\t" "movdqu %%xmm11, %[outbuf7]\n\t" "pxor %%xmm2, %%xmm1\n\t" "pxor %%xmm4, %%xmm1\n\t" "pxor %%xmm9, %%xmm1\n\t" "pxor %%xmm11, %%xmm1\n\t" "pxor %%xmm3, %%xmm6\n\t" "pxor %%xmm8, %%xmm6\n\t" "pxor %%xmm10, %%xmm6\n\t" "pxor %%xmm1, %%xmm6\n\t" : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE)), [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE)), [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE)), [outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE)), [outbuf4] "+m" (*(outbuf + 4 * BLOCKSIZE)), [outbuf5] "+m" (*(outbuf + 5 * BLOCKSIZE)), [outbuf6] "+m" (*(outbuf + 6 * BLOCKSIZE)), [outbuf7] "=m" (*(outbuf + 7 * BLOCKSIZE)) : : "memory" ); outbuf += 8*BLOCKSIZE; inbuf += 8*BLOCKSIZE; } aesni_cleanup_7_15(); } #endif for ( ;nblocks >= 4 ; nblocks -= 4 ) { n += 4; l = ocb_get_l(c, n); /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */ /* Checksum_i = Checksum_{i-1} xor P_i */ asm volatile ("movdqu %[l0], %%xmm4\n\t" "movdqu %[inbuf0], %%xmm1\n\t" "pxor %%xmm4, %%xmm5\n\t" "pxor %%xmm5, %%xmm1\n\t" "movdqu %%xmm5, %[outbuf0]\n\t" : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE)) : [l0] "m" (*c->u_mode.ocb.L[0]), [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE)) : "memory" ); asm volatile ("movdqu %[l1], %%xmm0\n\t" "movdqu %[inbuf1], %%xmm2\n\t" "pxor %%xmm0, %%xmm5\n\t" "pxor %%xmm5, %%xmm2\n\t" "movdqu %%xmm5, %[outbuf1]\n\t" : [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE)) : [l1] "m" (*c->u_mode.ocb.L[1]), [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE)) : "memory" ); asm volatile ("movdqu %[inbuf2], %%xmm3\n\t" "pxor %%xmm4, %%xmm5\n\t" "pxor %%xmm5, %%xmm3\n\t" "movdqu %%xmm5, %[outbuf2]\n\t" : [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE)) : [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE)) : "memory" ); asm volatile ("movdqu %[l3], %%xmm0\n\t" "movdqu %[inbuf3], %%xmm4\n\t" "pxor %%xmm0, %%xmm5\n\t" "pxor %%xmm5, %%xmm4\n\t" : : [l3] "m" (*l), [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE)) : "memory" ); do_aesni_dec_vec4 (ctx); asm volatile ("movdqu %[outbuf0],%%xmm0\n\t" "pxor 
%%xmm0, %%xmm1\n\t" "movdqu %%xmm1, %[outbuf0]\n\t" "movdqu %[outbuf1],%%xmm0\n\t" "pxor %%xmm0, %%xmm2\n\t" "movdqu %%xmm2, %[outbuf1]\n\t" "movdqu %[outbuf2],%%xmm0\n\t" "pxor %%xmm0, %%xmm3\n\t" "movdqu %%xmm3, %[outbuf2]\n\t" "pxor %%xmm5, %%xmm4\n\t" "movdqu %%xmm4, %[outbuf3]\n\t" "pxor %%xmm1, %%xmm6\n\t" "pxor %%xmm2, %%xmm6\n\t" "pxor %%xmm3, %%xmm6\n\t" "pxor %%xmm4, %%xmm6\n\t" : [outbuf0] "+m" (*(outbuf + 0 * BLOCKSIZE)), [outbuf1] "+m" (*(outbuf + 1 * BLOCKSIZE)), [outbuf2] "+m" (*(outbuf + 2 * BLOCKSIZE)), [outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE)) : : "memory" ); outbuf += 4*BLOCKSIZE; inbuf += 4*BLOCKSIZE; } for ( ;nblocks; nblocks-- ) { l = ocb_get_l(c, ++n); /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */ /* Checksum_i = Checksum_{i-1} xor P_i */ asm volatile ("movdqu %[l], %%xmm1\n\t" "movdqu %[inbuf], %%xmm0\n\t" "pxor %%xmm1, %%xmm5\n\t" "pxor %%xmm5, %%xmm0\n\t" : : [l] "m" (*l), [inbuf] "m" (*inbuf) : "memory" ); do_aesni_dec (ctx); asm volatile ("pxor %%xmm5, %%xmm0\n\t" "pxor %%xmm0, %%xmm6\n\t" "movdqu %%xmm0, %[outbuf]\n\t" : [outbuf] "=m" (*outbuf) : : "memory" ); inbuf += BLOCKSIZE; outbuf += BLOCKSIZE; } c->u_mode.ocb.data_nblocks = n; asm volatile ("movdqu %%xmm5, %[iv]\n\t" "movdqu %%xmm6, %[ctr]\n\t" : [iv] "=m" (*c->u_iv.iv), [ctr] "=m" (*c->u_ctr.ctr) : : "memory" ); aesni_cleanup (); aesni_cleanup_2_6 (); } -void +size_t _gcry_aes_aesni_ocb_crypt(gcry_cipher_hd_t c, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int encrypt) { if (encrypt) aesni_ocb_enc(c, outbuf_arg, inbuf_arg, nblocks); else aesni_ocb_dec(c, outbuf_arg, inbuf_arg, nblocks); + + return 0; } -void +size_t _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks) { RIJNDAEL_context *ctx = (void *)&c->context.c; const unsigned char *abuf = abuf_arg; u64 n = c->u_mode.ocb.aad_nblocks; const unsigned char *l; aesni_prepare_2_6_variable; aesni_prepare (); aesni_prepare_2_6 (); /* 
Preload Offset and Sum */ asm volatile ("movdqu %[iv], %%xmm5\n\t" "movdqu %[ctr], %%xmm6\n\t" : /* No output */ : [iv] "m" (*c->u_mode.ocb.aad_offset), [ctr] "m" (*c->u_mode.ocb.aad_sum) : "memory" ); for ( ;nblocks && n % 4; nblocks-- ) { l = ocb_get_l(c, ++n); /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ asm volatile ("movdqu %[l], %%xmm1\n\t" "movdqu %[abuf], %%xmm0\n\t" "pxor %%xmm1, %%xmm5\n\t" "pxor %%xmm5, %%xmm0\n\t" : : [l] "m" (*l), [abuf] "m" (*abuf) : "memory" ); do_aesni_enc (ctx); asm volatile ("pxor %%xmm0, %%xmm6\n\t" : : : "memory" ); abuf += BLOCKSIZE; } #ifdef __x86_64__ if (nblocks >= 8) { aesni_prepare_7_15_variable; aesni_prepare_7_15(); asm volatile ("movdqu %[l0], %%xmm7\n\t" "movdqu %[l1], %%xmm12\n\t" : : [l0] "m" (*c->u_mode.ocb.L[0]), [l1] "m" (*c->u_mode.ocb.L[1]) : "memory" ); for ( ;nblocks >= 8 ; nblocks -= 8 ) { n += 4; l = ocb_get_l(c, n); /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ asm volatile ("movdqu %[abuf0], %%xmm1\n\t" "pxor %%xmm7, %%xmm5\n\t" "pxor %%xmm5, %%xmm1\n\t" : : [abuf0] "m" (*(abuf + 0 * BLOCKSIZE)) : "memory" ); asm volatile ("movdqu %[abuf1], %%xmm2\n\t" "pxor %%xmm12, %%xmm5\n\t" "pxor %%xmm5, %%xmm2\n\t" : : [abuf1] "m" (*(abuf + 1 * BLOCKSIZE)) : "memory" ); asm volatile ("movdqu %[abuf2], %%xmm3\n\t" "pxor %%xmm7, %%xmm5\n\t" "pxor %%xmm5, %%xmm3\n\t" : : [abuf2] "m" (*(abuf + 2 * BLOCKSIZE)) : "memory" ); asm volatile ("movdqu %[l3], %%xmm0\n\t" "movdqu %[abuf3], %%xmm4\n\t" "pxor %%xmm0, %%xmm5\n\t" "pxor %%xmm5, %%xmm4\n\t" : : [l3] "m" (*l), [abuf3] "m" (*(abuf + 3 * BLOCKSIZE)) : "memory" ); n += 4; l = ocb_get_l(c, n); asm volatile ("movdqu %[abuf4], %%xmm8\n\t" "pxor %%xmm7, %%xmm5\n\t" "pxor %%xmm5, %%xmm8\n\t" : : [abuf4] "m" (*(abuf + 4 * BLOCKSIZE)) : "memory" ); asm volatile ("movdqu %[abuf5], %%xmm9\n\t" "pxor %%xmm12, %%xmm5\n\t" "pxor %%xmm5, %%xmm9\n\t" : : [abuf5] "m" 
(*(abuf + 5 * BLOCKSIZE)) : "memory" ); asm volatile ("movdqu %[abuf6], %%xmm10\n\t" "pxor %%xmm7, %%xmm5\n\t" "pxor %%xmm5, %%xmm10\n\t" : : [abuf6] "m" (*(abuf + 6 * BLOCKSIZE)) : "memory" ); asm volatile ("movdqu %[l7], %%xmm0\n\t" "movdqu %[abuf7], %%xmm11\n\t" "pxor %%xmm0, %%xmm5\n\t" "pxor %%xmm5, %%xmm11\n\t" : : [l7] "m" (*l), [abuf7] "m" (*(abuf + 7 * BLOCKSIZE)) : "memory" ); do_aesni_enc_vec8 (ctx); asm volatile ("pxor %%xmm2, %%xmm1\n\t" "pxor %%xmm3, %%xmm1\n\t" "pxor %%xmm4, %%xmm1\n\t" "pxor %%xmm8, %%xmm1\n\t" "pxor %%xmm9, %%xmm6\n\t" "pxor %%xmm10, %%xmm6\n\t" "pxor %%xmm11, %%xmm6\n\t" "pxor %%xmm1, %%xmm6\n\t" : : : "memory" ); abuf += 8*BLOCKSIZE; } aesni_cleanup_7_15(); } #endif for ( ;nblocks >= 4 ; nblocks -= 4 ) { n += 4; l = ocb_get_l(c, n); /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ asm volatile ("movdqu %[l0], %%xmm4\n\t" "movdqu %[abuf0], %%xmm1\n\t" "pxor %%xmm4, %%xmm5\n\t" "pxor %%xmm5, %%xmm1\n\t" : : [l0] "m" (*c->u_mode.ocb.L[0]), [abuf0] "m" (*(abuf + 0 * BLOCKSIZE)) : "memory" ); asm volatile ("movdqu %[l1], %%xmm0\n\t" "movdqu %[abuf1], %%xmm2\n\t" "pxor %%xmm0, %%xmm5\n\t" "pxor %%xmm5, %%xmm2\n\t" : : [l1] "m" (*c->u_mode.ocb.L[1]), [abuf1] "m" (*(abuf + 1 * BLOCKSIZE)) : "memory" ); asm volatile ("movdqu %[abuf2], %%xmm3\n\t" "pxor %%xmm4, %%xmm5\n\t" "pxor %%xmm5, %%xmm3\n\t" : : [l2] "m" (*c->u_mode.ocb.L[0]), [abuf2] "m" (*(abuf + 2 * BLOCKSIZE)) : "memory" ); asm volatile ("movdqu %[l3], %%xmm0\n\t" "movdqu %[abuf3], %%xmm4\n\t" "pxor %%xmm0, %%xmm5\n\t" "pxor %%xmm5, %%xmm4\n\t" : : [l3] "m" (*l), [abuf3] "m" (*(abuf + 3 * BLOCKSIZE)) : "memory" ); do_aesni_enc_vec4 (ctx); asm volatile ("pxor %%xmm1, %%xmm6\n\t" "pxor %%xmm2, %%xmm6\n\t" "pxor %%xmm3, %%xmm6\n\t" "pxor %%xmm4, %%xmm6\n\t" : : : "memory" ); abuf += 4*BLOCKSIZE; } for ( ;nblocks; nblocks-- ) { l = ocb_get_l(c, ++n); /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* Sum_i = Sum_{i-1} xor 
ENCIPHER(K, A_i xor Offset_i) */ asm volatile ("movdqu %[l], %%xmm1\n\t" "movdqu %[abuf], %%xmm0\n\t" "pxor %%xmm1, %%xmm5\n\t" "pxor %%xmm5, %%xmm0\n\t" : : [l] "m" (*l), [abuf] "m" (*abuf) : "memory" ); do_aesni_enc (ctx); asm volatile ("pxor %%xmm0, %%xmm6\n\t" : : : "memory" ); abuf += BLOCKSIZE; } c->u_mode.ocb.aad_nblocks = n; asm volatile ("movdqu %%xmm5, %[iv]\n\t" "movdqu %%xmm6, %[ctr]\n\t" : [iv] "=m" (*c->u_mode.ocb.aad_offset), [ctr] "=m" (*c->u_mode.ocb.aad_sum) : : "memory" ); aesni_cleanup (); aesni_cleanup_2_6 (); + + return 0; } static const u64 xts_gfmul_const[16] __attribute__ ((aligned (16))) = { 0x87, 0x01 }; static void _gcry_aes_aesni_xts_enc (RIJNDAEL_context *ctx, unsigned char *tweak, unsigned char *outbuf, const unsigned char *inbuf, size_t nblocks) { aesni_prepare_2_6_variable; aesni_prepare (); aesni_prepare_2_6 (); /* Preload Tweak */ asm volatile ("movdqu %[tweak], %%xmm5\n\t" "movdqa %[gfmul], %%xmm6\n\t" : : [tweak] "m" (*tweak), [gfmul] "m" (*xts_gfmul_const) : "memory" ); for ( ;nblocks >= 4; nblocks -= 4 ) { asm volatile ("pshufd $0x13, %%xmm5, %%xmm4\n\t" "movdqu %[inbuf0], %%xmm1\n\t" "pxor %%xmm5, %%xmm1\n\t" "movdqu %%xmm5, %[outbuf0]\n\t" "movdqa %%xmm4, %%xmm0\n\t" "paddd %%xmm4, %%xmm4\n\t" "psrad $31, %%xmm0\n\t" "paddq %%xmm5, %%xmm5\n\t" "pand %%xmm6, %%xmm0\n\t" "pxor %%xmm0, %%xmm5\n\t" : [outbuf0] "=m" (*(outbuf + 0 * 16)) : [inbuf0] "m" (*(inbuf + 0 * 16)) : "memory" ); asm volatile ("movdqu %[inbuf1], %%xmm2\n\t" "pxor %%xmm5, %%xmm2\n\t" "movdqu %%xmm5, %[outbuf1]\n\t" "movdqa %%xmm4, %%xmm0\n\t" "paddd %%xmm4, %%xmm4\n\t" "psrad $31, %%xmm0\n\t" "paddq %%xmm5, %%xmm5\n\t" "pand %%xmm6, %%xmm0\n\t" "pxor %%xmm0, %%xmm5\n\t" : [outbuf1] "=m" (*(outbuf + 1 * 16)) : [inbuf1] "m" (*(inbuf + 1 * 16)) : "memory" ); asm volatile ("movdqu %[inbuf2], %%xmm3\n\t" "pxor %%xmm5, %%xmm3\n\t" "movdqu %%xmm5, %[outbuf2]\n\t" "movdqa %%xmm4, %%xmm0\n\t" "paddd %%xmm4, %%xmm4\n\t" "psrad $31, %%xmm0\n\t" "paddq %%xmm5, 
%%xmm5\n\t" "pand %%xmm6, %%xmm0\n\t" "pxor %%xmm0, %%xmm5\n\t" : [outbuf2] "=m" (*(outbuf + 2 * 16)) : [inbuf2] "m" (*(inbuf + 2 * 16)) : "memory" ); asm volatile ("movdqa %%xmm4, %%xmm0\n\t" "movdqu %[inbuf3], %%xmm4\n\t" "pxor %%xmm5, %%xmm4\n\t" "movdqu %%xmm5, %[outbuf3]\n\t" "psrad $31, %%xmm0\n\t" "paddq %%xmm5, %%xmm5\n\t" "pand %%xmm6, %%xmm0\n\t" "pxor %%xmm0, %%xmm5\n\t" : [outbuf3] "=m" (*(outbuf + 3 * 16)) : [inbuf3] "m" (*(inbuf + 3 * 16)) : "memory" ); do_aesni_enc_vec4 (ctx); asm volatile ("movdqu %[outbuf0], %%xmm0\n\t" "pxor %%xmm0, %%xmm1\n\t" "movdqu %[outbuf1], %%xmm0\n\t" "movdqu %%xmm1, %[outbuf0]\n\t" "movdqu %[outbuf2], %%xmm1\n\t" "pxor %%xmm0, %%xmm2\n\t" "movdqu %[outbuf3], %%xmm0\n\t" "pxor %%xmm1, %%xmm3\n\t" "pxor %%xmm0, %%xmm4\n\t" "movdqu %%xmm2, %[outbuf1]\n\t" "movdqu %%xmm3, %[outbuf2]\n\t" "movdqu %%xmm4, %[outbuf3]\n\t" : [outbuf0] "+m" (*(outbuf + 0 * 16)), [outbuf1] "+m" (*(outbuf + 1 * 16)), [outbuf2] "+m" (*(outbuf + 2 * 16)), [outbuf3] "+m" (*(outbuf + 3 * 16)) : : "memory" ); outbuf += BLOCKSIZE * 4; inbuf += BLOCKSIZE * 4; } for ( ;nblocks; nblocks-- ) { asm volatile ("movdqu %[inbuf], %%xmm0\n\t" "pxor %%xmm5, %%xmm0\n\t" "movdqa %%xmm5, %%xmm4\n\t" "pshufd $0x13, %%xmm5, %%xmm1\n\t" "psrad $31, %%xmm1\n\t" "paddq %%xmm5, %%xmm5\n\t" "pand %%xmm6, %%xmm1\n\t" "pxor %%xmm1, %%xmm5\n\t" : : [inbuf] "m" (*inbuf) : "memory" ); do_aesni_enc (ctx); asm volatile ("pxor %%xmm4, %%xmm0\n\t" "movdqu %%xmm0, %[outbuf]\n\t" : [outbuf] "=m" (*outbuf) : : "memory" ); outbuf += BLOCKSIZE; inbuf += BLOCKSIZE; } asm volatile ("movdqu %%xmm5, %[tweak]\n\t" : [tweak] "=m" (*tweak) : : "memory" ); aesni_cleanup (); aesni_cleanup_2_6 (); } static void _gcry_aes_aesni_xts_dec (RIJNDAEL_context *ctx, unsigned char *tweak, unsigned char *outbuf, const unsigned char *inbuf, size_t nblocks) { aesni_prepare_2_6_variable; aesni_prepare (); aesni_prepare_2_6 (); + if ( !ctx->decryption_prepared ) + { + do_aesni_prepare_decryption ( ctx ); + 
ctx->decryption_prepared = 1; + } + /* Preload Tweak */ asm volatile ("movdqu %[tweak], %%xmm5\n\t" "movdqa %[gfmul], %%xmm6\n\t" : : [tweak] "m" (*tweak), [gfmul] "m" (*xts_gfmul_const) : "memory" ); for ( ;nblocks >= 4; nblocks -= 4 ) { asm volatile ("pshufd $0x13, %%xmm5, %%xmm4\n\t" "movdqu %[inbuf0], %%xmm1\n\t" "pxor %%xmm5, %%xmm1\n\t" "movdqu %%xmm5, %[outbuf0]\n\t" "movdqa %%xmm4, %%xmm0\n\t" "paddd %%xmm4, %%xmm4\n\t" "psrad $31, %%xmm0\n\t" "paddq %%xmm5, %%xmm5\n\t" "pand %%xmm6, %%xmm0\n\t" "pxor %%xmm0, %%xmm5\n\t" : [outbuf0] "=m" (*(outbuf + 0 * 16)) : [inbuf0] "m" (*(inbuf + 0 * 16)) : "memory" ); asm volatile ("movdqu %[inbuf1], %%xmm2\n\t" "pxor %%xmm5, %%xmm2\n\t" "movdqu %%xmm5, %[outbuf1]\n\t" "movdqa %%xmm4, %%xmm0\n\t" "paddd %%xmm4, %%xmm4\n\t" "psrad $31, %%xmm0\n\t" "paddq %%xmm5, %%xmm5\n\t" "pand %%xmm6, %%xmm0\n\t" "pxor %%xmm0, %%xmm5\n\t" : [outbuf1] "=m" (*(outbuf + 1 * 16)) : [inbuf1] "m" (*(inbuf + 1 * 16)) : "memory" ); asm volatile ("movdqu %[inbuf2], %%xmm3\n\t" "pxor %%xmm5, %%xmm3\n\t" "movdqu %%xmm5, %[outbuf2]\n\t" "movdqa %%xmm4, %%xmm0\n\t" "paddd %%xmm4, %%xmm4\n\t" "psrad $31, %%xmm0\n\t" "paddq %%xmm5, %%xmm5\n\t" "pand %%xmm6, %%xmm0\n\t" "pxor %%xmm0, %%xmm5\n\t" : [outbuf2] "=m" (*(outbuf + 2 * 16)) : [inbuf2] "m" (*(inbuf + 2 * 16)) : "memory" ); asm volatile ("movdqa %%xmm4, %%xmm0\n\t" "movdqu %[inbuf3], %%xmm4\n\t" "pxor %%xmm5, %%xmm4\n\t" "movdqu %%xmm5, %[outbuf3]\n\t" "psrad $31, %%xmm0\n\t" "paddq %%xmm5, %%xmm5\n\t" "pand %%xmm6, %%xmm0\n\t" "pxor %%xmm0, %%xmm5\n\t" : [outbuf3] "=m" (*(outbuf + 3 * 16)) : [inbuf3] "m" (*(inbuf + 3 * 16)) : "memory" ); do_aesni_dec_vec4 (ctx); asm volatile ("movdqu %[outbuf0], %%xmm0\n\t" "pxor %%xmm0, %%xmm1\n\t" "movdqu %[outbuf1], %%xmm0\n\t" "movdqu %%xmm1, %[outbuf0]\n\t" "movdqu %[outbuf2], %%xmm1\n\t" "pxor %%xmm0, %%xmm2\n\t" "movdqu %[outbuf3], %%xmm0\n\t" "pxor %%xmm1, %%xmm3\n\t" "pxor %%xmm0, %%xmm4\n\t" "movdqu %%xmm2, %[outbuf1]\n\t" "movdqu %%xmm3, 
%[outbuf2]\n\t" "movdqu %%xmm4, %[outbuf3]\n\t" : [outbuf0] "+m" (*(outbuf + 0 * 16)), [outbuf1] "+m" (*(outbuf + 1 * 16)), [outbuf2] "+m" (*(outbuf + 2 * 16)), [outbuf3] "+m" (*(outbuf + 3 * 16)) : : "memory" ); outbuf += BLOCKSIZE * 4; inbuf += BLOCKSIZE * 4; } for ( ;nblocks; nblocks-- ) { asm volatile ("movdqu %[inbuf], %%xmm0\n\t" "pxor %%xmm5, %%xmm0\n\t" "movdqa %%xmm5, %%xmm4\n\t" "pshufd $0x13, %%xmm5, %%xmm1\n\t" "psrad $31, %%xmm1\n\t" "paddq %%xmm5, %%xmm5\n\t" "pand %%xmm6, %%xmm1\n\t" "pxor %%xmm1, %%xmm5\n\t" : : [inbuf] "m" (*inbuf) : "memory" ); do_aesni_dec (ctx); asm volatile ("pxor %%xmm4, %%xmm0\n\t" "movdqu %%xmm0, %[outbuf]\n\t" : [outbuf] "=m" (*outbuf) : : "memory" ); outbuf += BLOCKSIZE; inbuf += BLOCKSIZE; } asm volatile ("movdqu %%xmm5, %[tweak]\n\t" : [tweak] "=m" (*tweak) : : "memory" ); aesni_cleanup (); aesni_cleanup_2_6 (); } void _gcry_aes_aesni_xts_crypt (RIJNDAEL_context *ctx, unsigned char *tweak, unsigned char *outbuf, const unsigned char *inbuf, size_t nblocks, int encrypt) { if (encrypt) _gcry_aes_aesni_xts_enc(ctx, tweak, outbuf, inbuf, nblocks); else _gcry_aes_aesni_xts_dec(ctx, tweak, outbuf, inbuf, nblocks); } #endif /* USE_AESNI */ diff --git a/cipher/rijndael-armv8-ce.c b/cipher/rijndael-armv8-ce.c index 6af7108f..6e46830e 100644 --- a/cipher/rijndael-armv8-ce.c +++ b/cipher/rijndael-armv8-ce.c @@ -1,392 +1,414 @@ /* ARMv8 Crypto Extension AES for Libgcrypt * Copyright (C) 2016 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see <http://www.gnu.org/licenses/>. * */ #include #include #include #include /* for memcmp() */ #include "types.h" /* for byte and u32 typedefs */ #include "g10lib.h" #include "cipher.h" #include "bufhelp.h" #include "cipher-selftest.h" #include "rijndael-internal.h" #include "./cipher-internal.h" #ifdef USE_ARM_CE typedef struct u128_s { u32 a, b, c, d; } u128_t; extern u32 _gcry_aes_sbox4_armv8_ce(u32 in4b); extern void _gcry_aes_invmixcol_armv8_ce(u128_t *dst, const u128_t *src); extern unsigned int _gcry_aes_enc_armv8_ce(const void *keysched, byte *dst, const byte *src, unsigned int nrounds); extern unsigned int _gcry_aes_dec_armv8_ce(const void *keysched, byte *dst, const byte *src, unsigned int nrounds); extern void _gcry_aes_cbc_enc_armv8_ce (const void *keysched, unsigned char *outbuf, const unsigned char *inbuf, unsigned char *iv, size_t nblocks, int cbc_mac, unsigned int nrounds); extern void _gcry_aes_cbc_dec_armv8_ce (const void *keysched, unsigned char *outbuf, const unsigned char *inbuf, unsigned char *iv, size_t nblocks, unsigned int nrounds); extern void _gcry_aes_cfb_enc_armv8_ce (const void *keysched, unsigned char *outbuf, const unsigned char *inbuf, unsigned char *iv, size_t nblocks, unsigned int nrounds); extern void _gcry_aes_cfb_dec_armv8_ce (const void *keysched, unsigned char *outbuf, const unsigned char *inbuf, unsigned char *iv, size_t nblocks, unsigned int nrounds); extern void _gcry_aes_ctr_enc_armv8_ce (const void *keysched, unsigned char *outbuf, const unsigned char *inbuf, unsigned char *iv, size_t nblocks, unsigned int nrounds); extern void _gcry_aes_ocb_enc_armv8_ce (const void *keysched, unsigned char *outbuf, const unsigned char *inbuf, unsigned char *offset, unsigned char *checksum, unsigned char *L_table, size_t nblocks, unsigned int nrounds, unsigned int blkn); extern void
_gcry_aes_ocb_dec_armv8_ce (const void *keysched, unsigned char *outbuf, const unsigned char *inbuf, unsigned char *offset, unsigned char *checksum, unsigned char *L_table, size_t nblocks, unsigned int nrounds, unsigned int blkn); extern void _gcry_aes_ocb_auth_armv8_ce (const void *keysched, const unsigned char *abuf, unsigned char *offset, unsigned char *checksum, unsigned char *L_table, size_t nblocks, unsigned int nrounds, unsigned int blkn); extern void _gcry_aes_xts_enc_armv8_ce (const void *keysched, unsigned char *outbuf, const unsigned char *inbuf, unsigned char *tweak, size_t nblocks, unsigned int nrounds); extern void _gcry_aes_xts_dec_armv8_ce (const void *keysched, unsigned char *outbuf, const unsigned char *inbuf, unsigned char *tweak, size_t nblocks, unsigned int nrounds); typedef void (*ocb_crypt_fn_t) (const void *keysched, unsigned char *outbuf, const unsigned char *inbuf, unsigned char *offset, unsigned char *checksum, unsigned char *L_table, size_t nblocks, unsigned int nrounds, unsigned int blkn); typedef void (*xts_crypt_fn_t) (const void *keysched, unsigned char *outbuf, const unsigned char *inbuf, unsigned char *tweak, size_t nblocks, unsigned int nrounds); void _gcry_aes_armv8_ce_setkey (RIJNDAEL_context *ctx, const byte *key) { union { PROPERLY_ALIGNED_TYPE dummy; byte data[MAXKC][4]; u32 data32[MAXKC]; } tkk[2]; unsigned int rounds = ctx->rounds; int KC = rounds - 6; unsigned int keylen = KC * 4; unsigned int i, r, t; byte rcon = 1; int j; #define k tkk[0].data #define k_u32 tkk[0].data32 #define tk tkk[1].data #define tk_u32 tkk[1].data32 #define W (ctx->keyschenc) #define W_u32 (ctx->keyschenc32) for (i = 0; i < keylen; i++) { k[i >> 2][i & 3] = key[i]; } for (j = KC-1; j >= 0; j--) { tk_u32[j] = k_u32[j]; } r = 0; t = 0; /* Copy values into round key array. 
*/ for (j = 0; (j < KC) && (r < rounds + 1); ) { for (; (j < KC) && (t < 4); j++, t++) { W_u32[r][t] = le_bswap32(tk_u32[j]); } if (t == 4) { r++; t = 0; } } while (r < rounds + 1) { tk_u32[0] ^= _gcry_aes_sbox4_armv8_ce(rol(tk_u32[KC - 1], 24)) ^ rcon; if (KC != 8) { for (j = 1; j < KC; j++) { tk_u32[j] ^= tk_u32[j-1]; } } else { for (j = 1; j < KC/2; j++) { tk_u32[j] ^= tk_u32[j-1]; } tk_u32[KC/2] ^= _gcry_aes_sbox4_armv8_ce(tk_u32[KC/2 - 1]); for (j = KC/2 + 1; j < KC; j++) { tk_u32[j] ^= tk_u32[j-1]; } } /* Copy values into round key array. */ for (j = 0; (j < KC) && (r < rounds + 1); ) { for (; (j < KC) && (t < 4); j++, t++) { W_u32[r][t] = le_bswap32(tk_u32[j]); } if (t == 4) { r++; t = 0; } } rcon = (rcon << 1) ^ ((rcon >> 7) * 0x1b); } #undef W #undef tk #undef k #undef W_u32 #undef tk_u32 #undef k_u32 wipememory(&tkk, sizeof(tkk)); } /* Make a decryption key from an encryption key. */ void _gcry_aes_armv8_ce_prepare_decryption (RIJNDAEL_context *ctx) { u128_t *ekey = (u128_t *)(void *)ctx->keyschenc; u128_t *dkey = (u128_t *)(void *)ctx->keyschdec; int rounds = ctx->rounds; int rr; int r; #define DO_AESIMC() _gcry_aes_invmixcol_armv8_ce(&dkey[r], &ekey[rr]) dkey[0] = ekey[rounds]; r = 1; rr = rounds-1; DO_AESIMC(); r++; rr--; /* round 1 */ DO_AESIMC(); r++; rr--; /* round 2 */ DO_AESIMC(); r++; rr--; /* round 3 */ DO_AESIMC(); r++; rr--; /* round 4 */ DO_AESIMC(); r++; rr--; /* round 5 */ DO_AESIMC(); r++; rr--; /* round 6 */ DO_AESIMC(); r++; rr--; /* round 7 */ DO_AESIMC(); r++; rr--; /* round 8 */ DO_AESIMC(); r++; rr--; /* round 9 */ if (rounds >= 12) { if (rounds > 12) { DO_AESIMC(); r++; rr--; /* round 10 */ DO_AESIMC(); r++; rr--; /* round 11 */ } DO_AESIMC(); r++; rr--; /* round 12 / 10 */ DO_AESIMC(); r++; rr--; /* round 13 / 11 */ } dkey[r] = ekey[0]; #undef DO_AESIMC } unsigned int _gcry_aes_armv8_ce_encrypt (const RIJNDAEL_context *ctx, unsigned char *dst, const unsigned char *src) { const void *keysched = ctx->keyschenc32; unsigned int nrounds 
= ctx->rounds; return _gcry_aes_enc_armv8_ce(keysched, dst, src, nrounds); } unsigned int _gcry_aes_armv8_ce_decrypt (const RIJNDAEL_context *ctx, unsigned char *dst, const unsigned char *src) { const void *keysched = ctx->keyschdec32; unsigned int nrounds = ctx->rounds; return _gcry_aes_dec_armv8_ce(keysched, dst, src, nrounds); } void -_gcry_aes_armv8_ce_cbc_enc (const RIJNDAEL_context *ctx, unsigned char *outbuf, - const unsigned char *inbuf, unsigned char *iv, +_gcry_aes_armv8_ce_cbc_enc (const RIJNDAEL_context *ctx, unsigned char *iv, + unsigned char *outbuf, const unsigned char *inbuf, size_t nblocks, int cbc_mac) { const void *keysched = ctx->keyschenc32; unsigned int nrounds = ctx->rounds; _gcry_aes_cbc_enc_armv8_ce(keysched, outbuf, inbuf, iv, nblocks, cbc_mac, nrounds); } void -_gcry_aes_armv8_ce_cbc_dec (RIJNDAEL_context *ctx, unsigned char *outbuf, - const unsigned char *inbuf, unsigned char *iv, +_gcry_aes_armv8_ce_cbc_dec (RIJNDAEL_context *ctx, unsigned char *iv, + unsigned char *outbuf, const unsigned char *inbuf, size_t nblocks) { const void *keysched = ctx->keyschdec32; unsigned int nrounds = ctx->rounds; + if ( !ctx->decryption_prepared ) + { + _gcry_aes_armv8_ce_prepare_decryption ( ctx ); + ctx->decryption_prepared = 1; + } + _gcry_aes_cbc_dec_armv8_ce(keysched, outbuf, inbuf, iv, nblocks, nrounds); } void -_gcry_aes_armv8_ce_cfb_enc (RIJNDAEL_context *ctx, unsigned char *outbuf, - const unsigned char *inbuf, unsigned char *iv, +_gcry_aes_armv8_ce_cfb_enc (RIJNDAEL_context *ctx, unsigned char *iv, + unsigned char *outbuf, const unsigned char *inbuf, size_t nblocks) { const void *keysched = ctx->keyschenc32; unsigned int nrounds = ctx->rounds; _gcry_aes_cfb_enc_armv8_ce(keysched, outbuf, inbuf, iv, nblocks, nrounds); } void -_gcry_aes_armv8_ce_cfb_dec (RIJNDAEL_context *ctx, unsigned char *outbuf, - const unsigned char *inbuf, unsigned char *iv, +_gcry_aes_armv8_ce_cfb_dec (RIJNDAEL_context *ctx, unsigned char *iv, + unsigned char *outbuf, const 
unsigned char *inbuf, size_t nblocks) { const void *keysched = ctx->keyschenc32; unsigned int nrounds = ctx->rounds; _gcry_aes_cfb_dec_armv8_ce(keysched, outbuf, inbuf, iv, nblocks, nrounds); } void -_gcry_aes_armv8_ce_ctr_enc (RIJNDAEL_context *ctx, unsigned char *outbuf, - const unsigned char *inbuf, unsigned char *iv, +_gcry_aes_armv8_ce_ctr_enc (RIJNDAEL_context *ctx, unsigned char *iv, + unsigned char *outbuf, const unsigned char *inbuf, size_t nblocks) { const void *keysched = ctx->keyschenc32; unsigned int nrounds = ctx->rounds; _gcry_aes_ctr_enc_armv8_ce(keysched, outbuf, inbuf, iv, nblocks, nrounds); } -void +size_t _gcry_aes_armv8_ce_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int encrypt) { RIJNDAEL_context *ctx = (void *)&c->context.c; const void *keysched = encrypt ? ctx->keyschenc32 : ctx->keyschdec32; ocb_crypt_fn_t crypt_fn = encrypt ? _gcry_aes_ocb_enc_armv8_ce : _gcry_aes_ocb_dec_armv8_ce; unsigned char *outbuf = outbuf_arg; const unsigned char *inbuf = inbuf_arg; unsigned int nrounds = ctx->rounds; u64 blkn = c->u_mode.ocb.data_nblocks; + if ( !encrypt && !ctx->decryption_prepared ) + { + _gcry_aes_armv8_ce_prepare_decryption ( ctx ); + ctx->decryption_prepared = 1; + } + c->u_mode.ocb.data_nblocks = blkn + nblocks; crypt_fn(keysched, outbuf, inbuf, c->u_iv.iv, c->u_ctr.ctr, c->u_mode.ocb.L[0], nblocks, nrounds, (unsigned int)blkn); + + return 0; } /* Authenticate NBLOCKS blocks of additional data from ABUF_ARG in OCB mode. The AAD block counter in C is advanced by NBLOCKS and the AAD offset/sum state in C is passed to the ARMv8-CE assembly routine. ABUF_ARG is const-qualified because the buffer is only read here and so that this definition agrees with the prototype in rijndael.c. Always returns 0. */ -void +size_t _gcry_aes_armv8_ce_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks) { RIJNDAEL_context *ctx = (void *)&c->context.c; const void *keysched = ctx->keyschenc32; const unsigned char *abuf = abuf_arg; unsigned int nrounds = ctx->rounds; u64 blkn = c->u_mode.ocb.aad_nblocks; c->u_mode.ocb.aad_nblocks = blkn + nblocks; _gcry_aes_ocb_auth_armv8_ce(keysched, abuf, c->u_mode.ocb.aad_offset, c->u_mode.ocb.aad_sum, c->u_mode.ocb.L[0], nblocks, nrounds, (unsigned int)blkn); + + return 0; } void _gcry_aes_armv8_ce_xts_crypt (RIJNDAEL_context
*ctx, unsigned char *tweak, unsigned char *outbuf, const unsigned char *inbuf, size_t nblocks, int encrypt) { const void *keysched = encrypt ? ctx->keyschenc32 : ctx->keyschdec32; xts_crypt_fn_t crypt_fn = encrypt ? _gcry_aes_xts_enc_armv8_ce : _gcry_aes_xts_dec_armv8_ce; unsigned int nrounds = ctx->rounds; + if ( !encrypt && !ctx->decryption_prepared ) + { + _gcry_aes_armv8_ce_prepare_decryption ( ctx ); + ctx->decryption_prepared = 1; + } + crypt_fn(keysched, outbuf, inbuf, tweak, nblocks, nrounds); } #endif /* USE_ARM_CE */ diff --git a/cipher/rijndael-ssse3-amd64.c b/cipher/rijndael-ssse3-amd64.c index 98660ecc..07a64a4c 100644 --- a/cipher/rijndael-ssse3-amd64.c +++ b/cipher/rijndael-ssse3-amd64.c @@ -1,688 +1,712 @@ /* SSSE3 vector permutation AES for Libgcrypt * Copyright (C) 2014-2017 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see <http://www.gnu.org/licenses/>. * * * The code is based on the public domain library libvpaes version 0.5 * available at http://crypto.stanford.edu/vpaes/ and which carries * this notice: * * libvpaes: constant-time SSSE3 AES encryption and decryption. * version 0.5 * * By Mike Hamburg, Stanford University, 2009. Public domain. * I wrote essentially all of this code. I did not write the test * vectors; they are the NIST known answer tests. I hereby release all * the code and documentation here that I wrote into the public domain.
* * This is an implementation of AES following my paper, * "Accelerating AES with Vector Permute Instructions" * CHES 2009; http://shiftleft.org/papers/vector_aes/ */ #include #include #include #include /* for memcmp() */ #include "types.h" /* for byte and u32 typedefs */ #include "g10lib.h" #include "cipher.h" #include "bufhelp.h" #include "cipher-selftest.h" #include "rijndael-internal.h" #include "./cipher-internal.h" #ifdef USE_SSSE3 #if _GCRY_GCC_VERSION >= 40400 /* 4.4 */ /* Prevent compiler from issuing SSE instructions between asm blocks. */ # pragma GCC target("no-sse") #endif /* Assembly functions in rijndael-ssse3-amd64-asm.S. Note that these have custom calling convention (additional XMM parameters). */ extern void _gcry_aes_ssse3_enc_preload(void); extern void _gcry_aes_ssse3_dec_preload(void); extern void _gcry_aes_ssse3_schedule_core(const void *key, u64 keybits, void *buffer, u64 decrypt, u64 rotoffs); extern void _gcry_aes_ssse3_encrypt_core(const void *key, u64 nrounds); extern void _gcry_aes_ssse3_decrypt_core(const void *key, u64 nrounds); /* Two macros to be called prior to and after the use of SSSE3 instructions. There should be no external function calls between the use of these macros. Their purpose is to make sure that the SSE registers are cleared and won't reveal any information about the key or the data. */ #ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS # define SSSE3_STATE_SIZE (16 * 10) /* XMM6-XMM15 are callee-saved registers on WIN64.
*/ # define vpaes_ssse3_prepare() \ asm volatile ("movdqu %%xmm6, 0*16(%0)\n\t" \ "movdqu %%xmm7, 1*16(%0)\n\t" \ "movdqu %%xmm8, 2*16(%0)\n\t" \ "movdqu %%xmm9, 3*16(%0)\n\t" \ "movdqu %%xmm10, 4*16(%0)\n\t" \ "movdqu %%xmm11, 5*16(%0)\n\t" \ "movdqu %%xmm12, 6*16(%0)\n\t" \ "movdqu %%xmm13, 7*16(%0)\n\t" \ "movdqu %%xmm14, 8*16(%0)\n\t" \ "movdqu %%xmm15, 9*16(%0)\n\t" \ : \ : "r" (ssse3_state) \ : "memory" ) # define vpaes_ssse3_cleanup() \ asm volatile ("pxor %%xmm0, %%xmm0 \n\t" \ "pxor %%xmm1, %%xmm1 \n\t" \ "pxor %%xmm2, %%xmm2 \n\t" \ "pxor %%xmm3, %%xmm3 \n\t" \ "pxor %%xmm4, %%xmm4 \n\t" \ "pxor %%xmm5, %%xmm5 \n\t" \ "movdqu 0*16(%0), %%xmm6 \n\t" \ "movdqu 1*16(%0), %%xmm7 \n\t" \ "movdqu 2*16(%0), %%xmm8 \n\t" \ "movdqu 3*16(%0), %%xmm9 \n\t" \ "movdqu 4*16(%0), %%xmm10 \n\t" \ "movdqu 5*16(%0), %%xmm11 \n\t" \ "movdqu 6*16(%0), %%xmm12 \n\t" \ "movdqu 7*16(%0), %%xmm13 \n\t" \ "movdqu 8*16(%0), %%xmm14 \n\t" \ "movdqu 9*16(%0), %%xmm15 \n\t" \ : \ : "r" (ssse3_state) \ : "memory" ) #else # define SSSE3_STATE_SIZE 1 # define vpaes_ssse3_prepare() (void)ssse3_state # define vpaes_ssse3_cleanup() \ asm volatile ("pxor %%xmm0, %%xmm0 \n\t" \ "pxor %%xmm1, %%xmm1 \n\t" \ "pxor %%xmm2, %%xmm2 \n\t" \ "pxor %%xmm3, %%xmm3 \n\t" \ "pxor %%xmm4, %%xmm4 \n\t" \ "pxor %%xmm5, %%xmm5 \n\t" \ "pxor %%xmm6, %%xmm6 \n\t" \ "pxor %%xmm7, %%xmm7 \n\t" \ "pxor %%xmm8, %%xmm8 \n\t" \ ::: "memory" ) #endif #define vpaes_ssse3_prepare_enc() \ vpaes_ssse3_prepare(); \ _gcry_aes_ssse3_enc_preload(); #define vpaes_ssse3_prepare_dec() \ vpaes_ssse3_prepare(); \ _gcry_aes_ssse3_dec_preload(); void _gcry_aes_ssse3_do_setkey (RIJNDAEL_context *ctx, const byte *key) { unsigned int keybits = (ctx->rounds - 10) * 32 + 128; byte ssse3_state[SSSE3_STATE_SIZE]; vpaes_ssse3_prepare(); _gcry_aes_ssse3_schedule_core(key, keybits, &ctx->keyschenc32[0][0], 0, 48); /* Save key for setting up decryption. 
*/ if (keybits > 192) asm volatile ("movdqu (%[src]), %%xmm0\n\t" "movdqu 16(%[src]), %%xmm1\n\t" "movdqu %%xmm0, (%[dst])\n\t" "movdqu %%xmm1, 16(%[dst])\n\t" : /* No output */ : [dst] "r" (&ctx->keyschdec32[0][0]), [src] "r" (key) : "memory" ); else if (keybits == 192) asm volatile ("movdqu (%[src]), %%xmm0\n\t" "movq 16(%[src]), %%xmm1\n\t" "movdqu %%xmm0, (%[dst])\n\t" "movq %%xmm1, 16(%[dst])\n\t" : /* No output */ : [dst] "r" (&ctx->keyschdec32[0][0]), [src] "r" (key) : "memory" ); else asm volatile ("movdqu (%[src]), %%xmm0\n\t" "movdqu %%xmm0, (%[dst])\n\t" : /* No output */ : [dst] "r" (&ctx->keyschdec32[0][0]), [src] "r" (key) : "memory" ); vpaes_ssse3_cleanup(); } /* Make a decryption key from an encryption key. */ -void -_gcry_aes_ssse3_prepare_decryption (RIJNDAEL_context *ctx) +static inline void +do_ssse3_prepare_decryption (RIJNDAEL_context *ctx, + byte ssse3_state[SSSE3_STATE_SIZE]) { unsigned int keybits = (ctx->rounds - 10) * 32 + 128; - byte ssse3_state[SSSE3_STATE_SIZE]; vpaes_ssse3_prepare(); _gcry_aes_ssse3_schedule_core(&ctx->keyschdec32[0][0], keybits, &ctx->keyschdec32[ctx->rounds][0], 1, (keybits == 192) ? 0 : 32); vpaes_ssse3_cleanup(); } +void +_gcry_aes_ssse3_prepare_decryption (RIJNDAEL_context *ctx) +{ + byte ssse3_state[SSSE3_STATE_SIZE]; + + do_ssse3_prepare_decryption(ctx, ssse3_state); +} + /* Encrypt one block using the Intel SSSE3 instructions. Block is input * and output through SSE register xmm0. */ static inline void do_vpaes_ssse3_enc (const RIJNDAEL_context *ctx, unsigned int nrounds) { _gcry_aes_ssse3_encrypt_core(ctx->keyschenc32, nrounds); } /* Decrypt one block using the Intel SSSE3 instructions. Block is input * and output through SSE register xmm0. 
*/ static inline void do_vpaes_ssse3_dec (const RIJNDAEL_context *ctx, unsigned int nrounds) { _gcry_aes_ssse3_decrypt_core(ctx->keyschdec32, nrounds); } unsigned int _gcry_aes_ssse3_encrypt (const RIJNDAEL_context *ctx, unsigned char *dst, const unsigned char *src) { unsigned int nrounds = ctx->rounds; byte ssse3_state[SSSE3_STATE_SIZE]; vpaes_ssse3_prepare_enc (); asm volatile ("movdqu %[src], %%xmm0\n\t" : : [src] "m" (*src) : "memory" ); do_vpaes_ssse3_enc (ctx, nrounds); asm volatile ("movdqu %%xmm0, %[dst]\n\t" : [dst] "=m" (*dst) : : "memory" ); vpaes_ssse3_cleanup (); return 0; } void -_gcry_aes_ssse3_cfb_enc (RIJNDAEL_context *ctx, unsigned char *outbuf, - const unsigned char *inbuf, unsigned char *iv, - size_t nblocks) +_gcry_aes_ssse3_cfb_enc (RIJNDAEL_context *ctx, unsigned char *iv, + unsigned char *outbuf, const unsigned char *inbuf, + size_t nblocks) { unsigned int nrounds = ctx->rounds; byte ssse3_state[SSSE3_STATE_SIZE]; vpaes_ssse3_prepare_enc (); asm volatile ("movdqu %[iv], %%xmm0\n\t" : /* No output */ : [iv] "m" (*iv) : "memory" ); for ( ;nblocks; nblocks-- ) { do_vpaes_ssse3_enc (ctx, nrounds); asm volatile ("movdqu %[inbuf], %%xmm1\n\t" "pxor %%xmm1, %%xmm0\n\t" "movdqu %%xmm0, %[outbuf]\n\t" : [outbuf] "=m" (*outbuf) : [inbuf] "m" (*inbuf) : "memory" ); outbuf += BLOCKSIZE; inbuf += BLOCKSIZE; } asm volatile ("movdqu %%xmm0, %[iv]\n\t" : [iv] "=m" (*iv) : : "memory" ); vpaes_ssse3_cleanup (); } void -_gcry_aes_ssse3_cbc_enc (RIJNDAEL_context *ctx, unsigned char *outbuf, - const unsigned char *inbuf, unsigned char *iv, - size_t nblocks, int cbc_mac) +_gcry_aes_ssse3_cbc_enc (RIJNDAEL_context *ctx, unsigned char *iv, + unsigned char *outbuf, const unsigned char *inbuf, + size_t nblocks, int cbc_mac) { unsigned int nrounds = ctx->rounds; byte ssse3_state[SSSE3_STATE_SIZE]; vpaes_ssse3_prepare_enc (); asm volatile ("movdqu %[iv], %%xmm7\n\t" : /* No output */ : [iv] "m" (*iv) : "memory" ); for ( ;nblocks; nblocks-- ) { asm volatile ("movdqu 
%[inbuf], %%xmm0\n\t" "pxor %%xmm7, %%xmm0\n\t" : /* No output */ : [inbuf] "m" (*inbuf) : "memory" ); do_vpaes_ssse3_enc (ctx, nrounds); asm volatile ("movdqa %%xmm0, %%xmm7\n\t" "movdqu %%xmm0, %[outbuf]\n\t" : [outbuf] "=m" (*outbuf) : : "memory" ); inbuf += BLOCKSIZE; if (!cbc_mac) outbuf += BLOCKSIZE; } asm volatile ("movdqu %%xmm7, %[iv]\n\t" : [iv] "=m" (*iv) : : "memory" ); vpaes_ssse3_cleanup (); } void -_gcry_aes_ssse3_ctr_enc (RIJNDAEL_context *ctx, unsigned char *outbuf, - const unsigned char *inbuf, unsigned char *ctr, - size_t nblocks) +_gcry_aes_ssse3_ctr_enc (RIJNDAEL_context *ctx, unsigned char *ctr, + unsigned char *outbuf, const unsigned char *inbuf, + size_t nblocks) { static const unsigned char be_mask[16] __attribute__ ((aligned (16))) = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; unsigned int nrounds = ctx->rounds; byte ssse3_state[SSSE3_STATE_SIZE]; u64 ctrlow; vpaes_ssse3_prepare_enc (); asm volatile ("movdqa %[mask], %%xmm6\n\t" /* Preload mask */ "movdqa (%[ctr]), %%xmm7\n\t" /* Preload CTR */ "movq 8(%[ctr]), %q[ctrlow]\n\t" "bswapq %q[ctrlow]\n\t" : [ctrlow] "=r" (ctrlow) : [mask] "m" (*be_mask), [ctr] "r" (ctr) : "memory", "cc"); for ( ;nblocks; nblocks-- ) { asm volatile ("movdqa %%xmm7, %%xmm0\n\t" /* xmm0 := CTR (xmm7) */ "pcmpeqd %%xmm1, %%xmm1\n\t" "psrldq $8, %%xmm1\n\t" /* xmm1 = -1 */ "pshufb %%xmm6, %%xmm7\n\t" "psubq %%xmm1, %%xmm7\n\t" /* xmm7++ (big endian) */ /* detect if 64-bit carry handling is needed */ "incq %q[ctrlow]\n\t" "jnz .Lno_carry%=\n\t" "pslldq $8, %%xmm1\n\t" /* move lower 64-bit to high */ "psubq %%xmm1, %%xmm7\n\t" /* add carry to upper 64bits */ ".Lno_carry%=:\n\t" "pshufb %%xmm6, %%xmm7\n\t" : [ctrlow] "+r" (ctrlow) : : "cc", "memory"); do_vpaes_ssse3_enc (ctx, nrounds); asm volatile ("movdqu %[src], %%xmm1\n\t" /* xmm1 := input */ "pxor %%xmm1, %%xmm0\n\t" /* EncCTR ^= input */ "movdqu %%xmm0, %[dst]" /* Store EncCTR. 
*/ : [dst] "=m" (*outbuf) : [src] "m" (*inbuf) : "memory"); outbuf += BLOCKSIZE; inbuf += BLOCKSIZE; } asm volatile ("movdqu %%xmm7, %[ctr]\n\t" /* Update CTR (mem). */ : [ctr] "=m" (*ctr) : : "memory" ); vpaes_ssse3_cleanup (); } unsigned int _gcry_aes_ssse3_decrypt (const RIJNDAEL_context *ctx, unsigned char *dst, - const unsigned char *src) + const unsigned char *src) { unsigned int nrounds = ctx->rounds; byte ssse3_state[SSSE3_STATE_SIZE]; vpaes_ssse3_prepare_dec (); asm volatile ("movdqu %[src], %%xmm0\n\t" : : [src] "m" (*src) : "memory" ); do_vpaes_ssse3_dec (ctx, nrounds); asm volatile ("movdqu %%xmm0, %[dst]\n\t" : [dst] "=m" (*dst) : : "memory" ); vpaes_ssse3_cleanup (); return 0; } void -_gcry_aes_ssse3_cfb_dec (RIJNDAEL_context *ctx, unsigned char *outbuf, - const unsigned char *inbuf, unsigned char *iv, - size_t nblocks) +_gcry_aes_ssse3_cfb_dec (RIJNDAEL_context *ctx, unsigned char *iv, + unsigned char *outbuf, const unsigned char *inbuf, + size_t nblocks) { unsigned int nrounds = ctx->rounds; byte ssse3_state[SSSE3_STATE_SIZE]; vpaes_ssse3_prepare_enc (); asm volatile ("movdqu %[iv], %%xmm0\n\t" : /* No output */ : [iv] "m" (*iv) : "memory" ); for ( ;nblocks; nblocks-- ) { do_vpaes_ssse3_enc (ctx, nrounds); asm volatile ("movdqa %%xmm0, %%xmm6\n\t" "movdqu %[inbuf], %%xmm0\n\t" "pxor %%xmm0, %%xmm6\n\t" "movdqu %%xmm6, %[outbuf]\n\t" : [outbuf] "=m" (*outbuf) : [inbuf] "m" (*inbuf) : "memory" ); outbuf += BLOCKSIZE; inbuf += BLOCKSIZE; } asm volatile ("movdqu %%xmm0, %[iv]\n\t" : [iv] "=m" (*iv) : : "memory" ); vpaes_ssse3_cleanup (); } /* CBC-decrypt NBLOCKS blocks from INBUF into OUTBUF. The decryption key schedule is built on first use. The chaining value is loaded from IV into xmm7, replaced by each ciphertext block in turn, and written back to IV after the loop. */ void -_gcry_aes_ssse3_cbc_dec (RIJNDAEL_context *ctx, unsigned char *outbuf, - const unsigned char *inbuf, unsigned char *iv, - size_t nblocks) +_gcry_aes_ssse3_cbc_dec (RIJNDAEL_context *ctx, unsigned char *iv, + unsigned char *outbuf, const unsigned char *inbuf, + size_t nblocks) { unsigned int nrounds = ctx->rounds; byte ssse3_state[SSSE3_STATE_SIZE]; + if ( !ctx->decryption_prepared ) + { + 
do_ssse3_prepare_decryption ( ctx, ssse3_state ); + ctx->decryption_prepared = 1; + } + vpaes_ssse3_prepare_dec (); asm volatile ("movdqu %[iv], %%xmm7\n\t" /* use xmm7 as fast IV storage */ : /* No output */ : [iv] "m" (*iv) : "memory"); for ( ;nblocks; nblocks-- ) { asm volatile ("movdqu %[inbuf], %%xmm0\n\t" "movdqa %%xmm0, %%xmm6\n\t" /* use xmm6 as savebuf */ : /* No output */ : [inbuf] "m" (*inbuf) : "memory"); do_vpaes_ssse3_dec (ctx, nrounds); asm volatile ("pxor %%xmm7, %%xmm0\n\t" /* xor IV with output */ "movdqu %%xmm0, %[outbuf]\n\t" "movdqu %%xmm6, %%xmm7\n\t" /* store savebuf as new IV */ : [outbuf] "=m" (*outbuf) : : "memory"); outbuf += BLOCKSIZE; inbuf += BLOCKSIZE; } asm volatile ("movdqu %%xmm7, %[iv]\n\t" /* store IV */ : [iv] "=m" (*iv) /* IV is written by this asm, so it is an output operand */ : : "memory"); vpaes_ssse3_cleanup (); } static void ssse3_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg, const void *inbuf_arg, size_t nblocks) { RIJNDAEL_context *ctx = (void *)&c->context.c; unsigned char *outbuf = outbuf_arg; const unsigned char *inbuf = inbuf_arg; u64 n = c->u_mode.ocb.data_nblocks; unsigned int nrounds = ctx->rounds; byte ssse3_state[SSSE3_STATE_SIZE]; vpaes_ssse3_prepare_enc (); /* Preload Offset and Checksum */ asm volatile ("movdqu %[iv], %%xmm7\n\t" "movdqu %[ctr], %%xmm6\n\t" : /* No output */ : [iv] "m" (*c->u_iv.iv), [ctr] "m" (*c->u_ctr.ctr) : "memory" ); for ( ;nblocks; nblocks-- ) { const unsigned char *l; l = ocb_get_l(c, ++n); /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* Checksum_i = Checksum_{i-1} xor P_i */ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ asm volatile ("movdqu %[l], %%xmm1\n\t" "movdqu %[inbuf], %%xmm0\n\t" "pxor %%xmm1, %%xmm7\n\t" "pxor %%xmm0, %%xmm6\n\t" "pxor %%xmm7, %%xmm0\n\t" : : [l] "m" (*l), [inbuf] "m" (*inbuf) : "memory" ); do_vpaes_ssse3_enc (ctx, nrounds); asm volatile ("pxor %%xmm7, %%xmm0\n\t" "movdqu %%xmm0, %[outbuf]\n\t" : [outbuf] "=m" (*outbuf) : : "memory" ); inbuf += BLOCKSIZE; outbuf += BLOCKSIZE; }
c->u_mode.ocb.data_nblocks = n; asm volatile ("movdqu %%xmm7, %[iv]\n\t" "movdqu %%xmm6, %[ctr]\n\t" : [iv] "=m" (*c->u_iv.iv), [ctr] "=m" (*c->u_ctr.ctr) : : "memory" ); vpaes_ssse3_cleanup (); } static void ssse3_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg, const void *inbuf_arg, size_t nblocks) { RIJNDAEL_context *ctx = (void *)&c->context.c; unsigned char *outbuf = outbuf_arg; const unsigned char *inbuf = inbuf_arg; u64 n = c->u_mode.ocb.data_nblocks; unsigned int nrounds = ctx->rounds; byte ssse3_state[SSSE3_STATE_SIZE]; + if ( !ctx->decryption_prepared ) + { + do_ssse3_prepare_decryption ( ctx, ssse3_state ); + ctx->decryption_prepared = 1; + } + vpaes_ssse3_prepare_dec (); /* Preload Offset and Checksum */ asm volatile ("movdqu %[iv], %%xmm7\n\t" "movdqu %[ctr], %%xmm6\n\t" : /* No output */ : [iv] "m" (*c->u_iv.iv), [ctr] "m" (*c->u_ctr.ctr) : "memory" ); for ( ;nblocks; nblocks-- ) { const unsigned char *l; l = ocb_get_l(c, ++n); /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */ /* Checksum_i = Checksum_{i-1} xor P_i */ asm volatile ("movdqu %[l], %%xmm1\n\t" "movdqu %[inbuf], %%xmm0\n\t" "pxor %%xmm1, %%xmm7\n\t" "pxor %%xmm7, %%xmm0\n\t" : : [l] "m" (*l), [inbuf] "m" (*inbuf) : "memory" ); do_vpaes_ssse3_dec (ctx, nrounds); asm volatile ("pxor %%xmm7, %%xmm0\n\t" "pxor %%xmm0, %%xmm6\n\t" "movdqu %%xmm0, %[outbuf]\n\t" : [outbuf] "=m" (*outbuf) : : "memory" ); inbuf += BLOCKSIZE; outbuf += BLOCKSIZE; } c->u_mode.ocb.data_nblocks = n; asm volatile ("movdqu %%xmm7, %[iv]\n\t" "movdqu %%xmm6, %[ctr]\n\t" : [iv] "=m" (*c->u_iv.iv), [ctr] "=m" (*c->u_ctr.ctr) : : "memory" ); vpaes_ssse3_cleanup (); } -void +size_t _gcry_aes_ssse3_ocb_crypt(gcry_cipher_hd_t c, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int encrypt) { if (encrypt) ssse3_ocb_enc(c, outbuf_arg, inbuf_arg, nblocks); else ssse3_ocb_dec(c, outbuf_arg, inbuf_arg, nblocks); + + return 0; } -void +size_t _gcry_aes_ssse3_ocb_auth 
(gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks) { RIJNDAEL_context *ctx = (void *)&c->context.c; const unsigned char *abuf = abuf_arg; u64 n = c->u_mode.ocb.aad_nblocks; unsigned int nrounds = ctx->rounds; byte ssse3_state[SSSE3_STATE_SIZE]; vpaes_ssse3_prepare_enc (); /* Preload Offset and Sum */ asm volatile ("movdqu %[iv], %%xmm7\n\t" "movdqu %[ctr], %%xmm6\n\t" : /* No output */ : [iv] "m" (*c->u_mode.ocb.aad_offset), [ctr] "m" (*c->u_mode.ocb.aad_sum) : "memory" ); for ( ;nblocks; nblocks-- ) { const unsigned char *l; l = ocb_get_l(c, ++n); /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ asm volatile ("movdqu %[l], %%xmm1\n\t" "movdqu %[abuf], %%xmm0\n\t" "pxor %%xmm1, %%xmm7\n\t" "pxor %%xmm7, %%xmm0\n\t" : : [l] "m" (*l), [abuf] "m" (*abuf) : "memory" ); do_vpaes_ssse3_enc (ctx, nrounds); asm volatile ("pxor %%xmm0, %%xmm6\n\t" : : : "memory" ); abuf += BLOCKSIZE; } c->u_mode.ocb.aad_nblocks = n; asm volatile ("movdqu %%xmm7, %[iv]\n\t" "movdqu %%xmm6, %[ctr]\n\t" : [iv] "=m" (*c->u_mode.ocb.aad_offset), [ctr] "=m" (*c->u_mode.ocb.aad_sum) : : "memory" ); vpaes_ssse3_cleanup (); + + return 0; } #endif /* USE_SSSE3 */ diff --git a/cipher/rijndael.c b/cipher/rijndael.c index f9666d0c..d3fcb76f 100644 --- a/cipher/rijndael.c +++ b/cipher/rijndael.c @@ -1,2082 +1,2084 @@ /* Rijndael (AES) for GnuPG * Copyright (C) 2000, 2001, 2002, 2003, 2007, * 2008, 2011, 2012 Free Software Foundation, Inc. * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see <http://www.gnu.org/licenses/>. ******************************************************************* * The code here is based on the optimized implementation taken from * http://www.esat.kuleuven.ac.be/~rijmen/rijndael/ on Oct 2, 2000, * which carries this notice: *------------------------------------------ * rijndael-alg-fst.c v2.3 April '2000 * * Optimised ANSI C code * * authors: v1.0: Antoon Bosselaers * v2.0: Vincent Rijmen * v2.3: Paulo Barreto * * This code is placed in the public domain. *------------------------------------------ * * The SP800-38a document is available at: * http://csrc.nist.gov/publications/nistpubs/800-38a/sp800-38a.pdf * */ #include #include #include #include /* for memcmp() */ #include "types.h" /* for byte and u32 typedefs */ #include "g10lib.h" #include "cipher.h" #include "bufhelp.h" #include "cipher-selftest.h" #include "rijndael-internal.h" #include "./cipher-internal.h" #ifdef USE_AMD64_ASM /* AMD64 assembly implementations of AES */ extern unsigned int _gcry_aes_amd64_encrypt_block(const void *keysched_enc, unsigned char *out, const unsigned char *in, int rounds, const void *encT); extern unsigned int _gcry_aes_amd64_decrypt_block(const void *keysched_dec, unsigned char *out, const unsigned char *in, int rounds, const void *decT); #endif /*USE_AMD64_ASM*/ #ifdef USE_AESNI /* AES-NI (AMD64 & i386) accelerated implementations of AES */ extern void _gcry_aes_aesni_do_setkey(RIJNDAEL_context *ctx, const byte *key); extern void _gcry_aes_aesni_prepare_decryption(RIJNDAEL_context *ctx); extern unsigned int _gcry_aes_aesni_encrypt (const RIJNDAEL_context *ctx, unsigned char *dst, const unsigned char *src); extern unsigned int _gcry_aes_aesni_decrypt (const RIJNDAEL_context *ctx, unsigned char *dst, const unsigned char *src); -extern void _gcry_aes_aesni_cfb_enc (RIJNDAEL_context *ctx, -
unsigned char *outbuf, - const unsigned char *inbuf, - unsigned char *iv, size_t nblocks); -extern void _gcry_aes_aesni_cbc_enc (RIJNDAEL_context *ctx, - unsigned char *outbuf, - const unsigned char *inbuf, - unsigned char *iv, size_t nblocks, - int cbc_mac); -extern void _gcry_aes_aesni_ctr_enc (RIJNDAEL_context *ctx, - unsigned char *outbuf, - const unsigned char *inbuf, - unsigned char *ctr, size_t nblocks); -extern void _gcry_aes_aesni_cfb_dec (RIJNDAEL_context *ctx, - unsigned char *outbuf, - const unsigned char *inbuf, - unsigned char *iv, size_t nblocks); -extern void _gcry_aes_aesni_cbc_dec (RIJNDAEL_context *ctx, - unsigned char *outbuf, - const unsigned char *inbuf, - unsigned char *iv, size_t nblocks); -extern void _gcry_aes_aesni_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, - const void *inbuf_arg, size_t nblocks, - int encrypt); -extern void _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, - size_t nblocks); -extern void _gcry_aes_aesni_xts_crypt (RIJNDAEL_context *ctx, - unsigned char *tweak, - unsigned char *outbuf, - const unsigned char *inbuf, - size_t nblocks, int encrypt); +extern void _gcry_aes_aesni_cfb_enc (void *context, unsigned char *iv, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks); +extern void _gcry_aes_aesni_cbc_enc (void *context, unsigned char *iv, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks, int cbc_mac); +extern void _gcry_aes_aesni_ctr_enc (void *context, unsigned char *ctr, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks); +extern void _gcry_aes_aesni_cfb_dec (void *context, unsigned char *iv, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks); +extern void _gcry_aes_aesni_cbc_dec (void *context, unsigned char *iv, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks); +extern size_t _gcry_aes_aesni_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, + const void *inbuf_arg, size_t nblocks, + int encrypt); +extern size_t _gcry_aes_aesni_ocb_auth 
(gcry_cipher_hd_t c, const void *abuf_arg, + size_t nblocks); +extern void _gcry_aes_aesni_xts_crypt (void *context, unsigned char *tweak, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks, int encrypt); #endif #ifdef USE_SSSE3 /* SSSE3 (AMD64) vector permutation implementation of AES */ extern void _gcry_aes_ssse3_do_setkey(RIJNDAEL_context *ctx, const byte *key); extern void _gcry_aes_ssse3_prepare_decryption(RIJNDAEL_context *ctx); extern unsigned int _gcry_aes_ssse3_encrypt (const RIJNDAEL_context *ctx, unsigned char *dst, const unsigned char *src); extern unsigned int _gcry_aes_ssse3_decrypt (const RIJNDAEL_context *ctx, unsigned char *dst, const unsigned char *src); -extern void _gcry_aes_ssse3_cfb_enc (RIJNDAEL_context *ctx, - unsigned char *outbuf, - const unsigned char *inbuf, - unsigned char *iv, size_t nblocks); -extern void _gcry_aes_ssse3_cbc_enc (RIJNDAEL_context *ctx, - unsigned char *outbuf, - const unsigned char *inbuf, - unsigned char *iv, size_t nblocks, +extern void _gcry_aes_ssse3_cfb_enc (void *context, unsigned char *iv, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks); +extern void _gcry_aes_ssse3_cbc_enc (void *context, unsigned char *iv, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks, int cbc_mac); -extern void _gcry_aes_ssse3_ctr_enc (RIJNDAEL_context *ctx, - unsigned char *outbuf, - const unsigned char *inbuf, - unsigned char *ctr, size_t nblocks); -extern void _gcry_aes_ssse3_cfb_dec (RIJNDAEL_context *ctx, - unsigned char *outbuf, - const unsigned char *inbuf, - unsigned char *iv, size_t nblocks); -extern void _gcry_aes_ssse3_cbc_dec (RIJNDAEL_context *ctx, - unsigned char *outbuf, - const unsigned char *inbuf, - unsigned char *iv, size_t nblocks); -extern void _gcry_aes_ssse3_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, - const void *inbuf_arg, size_t nblocks, - int encrypt); -extern void _gcry_aes_ssse3_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, - size_t nblocks); +extern void 
_gcry_aes_ssse3_ctr_enc (void *context, unsigned char *ctr, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks); +extern void _gcry_aes_ssse3_cfb_dec (void *context, unsigned char *iv, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks); +extern void _gcry_aes_ssse3_cbc_dec (void *context, unsigned char *iv, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks); +extern size_t _gcry_aes_ssse3_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, + const void *inbuf_arg, size_t nblocks, + int encrypt); +extern size_t _gcry_aes_ssse3_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, + size_t nblocks); #endif #ifdef USE_PADLOCK extern unsigned int _gcry_aes_padlock_encrypt (const RIJNDAEL_context *ctx, unsigned char *bx, const unsigned char *ax); extern unsigned int _gcry_aes_padlock_decrypt (const RIJNDAEL_context *ctx, unsigned char *bx, const unsigned char *ax); #endif #ifdef USE_ARM_ASM /* ARM assembly implementations of AES */ extern unsigned int _gcry_aes_arm_encrypt_block(const void *keysched_enc, unsigned char *out, const unsigned char *in, int rounds, const void *encT); extern unsigned int _gcry_aes_arm_decrypt_block(const void *keysched_dec, unsigned char *out, const unsigned char *in, int rounds, const void *decT); #endif /*USE_ARM_ASM*/ #ifdef USE_ARM_CE /* ARMv8 Crypto Extension implementations of AES */ extern void _gcry_aes_armv8_ce_setkey(RIJNDAEL_context *ctx, const byte *key); extern void _gcry_aes_armv8_ce_prepare_decryption(RIJNDAEL_context *ctx); extern unsigned int _gcry_aes_armv8_ce_encrypt(const RIJNDAEL_context *ctx, unsigned char *dst, const unsigned char *src); extern unsigned int _gcry_aes_armv8_ce_decrypt(const RIJNDAEL_context *ctx, unsigned char *dst, const unsigned char *src); -extern void _gcry_aes_armv8_ce_cfb_enc (RIJNDAEL_context *ctx, - unsigned char *outbuf, - const unsigned char *inbuf, - unsigned char *iv, size_t nblocks); -extern void _gcry_aes_armv8_ce_cbc_enc (RIJNDAEL_context *ctx, - unsigned char 
*outbuf, - const unsigned char *inbuf, - unsigned char *iv, size_t nblocks, +extern void _gcry_aes_armv8_ce_cfb_enc (void *context, unsigned char *iv, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks); +extern void _gcry_aes_armv8_ce_cbc_enc (void *context, unsigned char *iv, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks, int cbc_mac); -extern void _gcry_aes_armv8_ce_ctr_enc (RIJNDAEL_context *ctx, - unsigned char *outbuf, - const unsigned char *inbuf, - unsigned char *ctr, size_t nblocks); -extern void _gcry_aes_armv8_ce_cfb_dec (RIJNDAEL_context *ctx, - unsigned char *outbuf, - const unsigned char *inbuf, - unsigned char *iv, size_t nblocks); -extern void _gcry_aes_armv8_ce_cbc_dec (RIJNDAEL_context *ctx, - unsigned char *outbuf, - const unsigned char *inbuf, - unsigned char *iv, size_t nblocks); -extern void _gcry_aes_armv8_ce_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, - const void *inbuf_arg, size_t nblocks, - int encrypt); -extern void _gcry_aes_armv8_ce_ocb_auth (gcry_cipher_hd_t c, - const void *abuf_arg, size_t nblocks); -extern void _gcry_aes_armv8_ce_xts_crypt (RIJNDAEL_context *ctx, - unsigned char *tweak, - unsigned char *outbuf, - const unsigned char *inbuf, +extern void _gcry_aes_armv8_ce_ctr_enc (void *context, unsigned char *ctr, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks); +extern void _gcry_aes_armv8_ce_cfb_dec (void *context, unsigned char *iv, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks); +extern void _gcry_aes_armv8_ce_cbc_dec (void *context, unsigned char *iv, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks); +extern size_t _gcry_aes_armv8_ce_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, + const void *inbuf_arg, size_t nblocks, + int encrypt); +extern size_t _gcry_aes_armv8_ce_ocb_auth (gcry_cipher_hd_t c, + const void *abuf_arg, size_t nblocks); +extern void _gcry_aes_armv8_ce_xts_crypt (void *context, unsigned char *tweak, + void *outbuf_arg, + const void 
*inbuf_arg, size_t nblocks, int encrypt); #endif /*USE_ARM_CE*/ static unsigned int do_encrypt (const RIJNDAEL_context *ctx, unsigned char *bx, const unsigned char *ax); static unsigned int do_decrypt (const RIJNDAEL_context *ctx, unsigned char *bx, const unsigned char *ax); /* All the numbers. */ #include "rijndael-tables.h" /* Function prototypes. */ static const char *selftest(void); /* Prefetching for encryption/decryption tables. Reads one byte at every 32-byte step of TAB, eight steps per loop iteration, and finally the last byte; TAB is volatile-qualified so the reads are not optimized away. NOTE(review): the unrolled loop reads up to tab[i + 7 * 32], so LEN is presumably a multiple of 8 * 32 -- confirm against the table sizes. */ static void prefetch_table(const volatile byte *tab, size_t len) { size_t i; for (i = 0; i < len; i += 8 * 32) { (void)tab[i + 0 * 32]; (void)tab[i + 1 * 32]; (void)tab[i + 2 * 32]; (void)tab[i + 3 * 32]; (void)tab[i + 4 * 32]; (void)tab[i + 5 * 32]; (void)tab[i + 6 * 32]; (void)tab[i + 7 * 32]; } (void)tab[len - 1]; } /* Touch all of encT via prefetch_table. */ static void prefetch_enc(void) { prefetch_table((const void *)encT, sizeof(encT)); } /* Touch all of dec_tables via prefetch_table. */ static void prefetch_dec(void) { prefetch_table((const void *)&dec_tables, sizeof(dec_tables)); } /* Perform the key setup. */ static gcry_err_code_t -do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen) +do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen, + gcry_cipher_hd_t hd) { static int initialized = 0; static const char *selftest_failed = 0; int rounds; int i,j, r, t, rconpointer = 0; int KC; #if defined(USE_AESNI) || defined(USE_PADLOCK) || defined(USE_SSSE3) \ || defined(USE_ARM_CE) unsigned int hwfeatures; #endif /* The on-the-fly self tests are only run in non-fips mode. In fips mode explicit self-tests are required. Actually the on-the-fly self-tests are not fully thread-safe and it might happen that a failed self-test won't get noticed in another thread. FIXME: We might want to have a central registry of succeeded self-tests.
*/ if (!fips_mode () && !initialized) { initialized = 1; selftest_failed = selftest (); if (selftest_failed) log_error ("%s\n", selftest_failed ); } if (selftest_failed) return GPG_ERR_SELFTEST_FAILED; if( keylen == 128/8 ) { rounds = 10; KC = 4; } else if ( keylen == 192/8 ) { rounds = 12; KC = 6; } else if ( keylen == 256/8 ) { rounds = 14; KC = 8; } else return GPG_ERR_INV_KEYLEN; ctx->rounds = rounds; #if defined(USE_AESNI) || defined(USE_PADLOCK) || defined(USE_SSSE3) \ || defined(USE_ARM_CE) hwfeatures = _gcry_get_hw_features (); #endif ctx->decryption_prepared = 0; #ifdef USE_PADLOCK ctx->use_padlock = 0; #endif #ifdef USE_AESNI ctx->use_aesni = 0; #endif #ifdef USE_SSSE3 ctx->use_ssse3 = 0; #endif #ifdef USE_ARM_CE ctx->use_arm_ce = 0; #endif if (0) { ; } #ifdef USE_AESNI else if (hwfeatures & HWF_INTEL_AESNI) { ctx->encrypt_fn = _gcry_aes_aesni_encrypt; ctx->decrypt_fn = _gcry_aes_aesni_decrypt; ctx->prefetch_enc_fn = NULL; ctx->prefetch_dec_fn = NULL; ctx->use_aesni = 1; + if (hd) + { + hd->bulk.cfb_enc = _gcry_aes_aesni_cfb_enc; + hd->bulk.cfb_dec = _gcry_aes_aesni_cfb_dec; + hd->bulk.cbc_enc = _gcry_aes_aesni_cbc_enc; + hd->bulk.cbc_dec = _gcry_aes_aesni_cbc_dec; + hd->bulk.ctr_enc = _gcry_aes_aesni_ctr_enc; + hd->bulk.ocb_crypt = _gcry_aes_aesni_ocb_crypt; + hd->bulk.ocb_auth = _gcry_aes_aesni_ocb_auth; + hd->bulk.xts_crypt = _gcry_aes_aesni_xts_crypt; + } } #endif #ifdef USE_PADLOCK else if (hwfeatures & HWF_PADLOCK_AES && keylen == 128/8) { ctx->encrypt_fn = _gcry_aes_padlock_encrypt; ctx->decrypt_fn = _gcry_aes_padlock_decrypt; ctx->prefetch_enc_fn = NULL; ctx->prefetch_dec_fn = NULL; ctx->use_padlock = 1; memcpy (ctx->padlockkey, key, keylen); } #endif #ifdef USE_SSSE3 else if (hwfeatures & HWF_INTEL_SSSE3) { ctx->encrypt_fn = _gcry_aes_ssse3_encrypt; ctx->decrypt_fn = _gcry_aes_ssse3_decrypt; ctx->prefetch_enc_fn = NULL; ctx->prefetch_dec_fn = NULL; ctx->use_ssse3 = 1; + if (hd) + { + hd->bulk.cfb_enc = _gcry_aes_ssse3_cfb_enc; + hd->bulk.cfb_dec 
= _gcry_aes_ssse3_cfb_dec; + hd->bulk.cbc_enc = _gcry_aes_ssse3_cbc_enc; + hd->bulk.cbc_dec = _gcry_aes_ssse3_cbc_dec; + hd->bulk.ctr_enc = _gcry_aes_ssse3_ctr_enc; + hd->bulk.ocb_crypt = _gcry_aes_ssse3_ocb_crypt; + hd->bulk.ocb_auth = _gcry_aes_ssse3_ocb_auth; + } } #endif #ifdef USE_ARM_CE else if (hwfeatures & HWF_ARM_AES) { ctx->encrypt_fn = _gcry_aes_armv8_ce_encrypt; ctx->decrypt_fn = _gcry_aes_armv8_ce_decrypt; ctx->prefetch_enc_fn = NULL; ctx->prefetch_dec_fn = NULL; ctx->use_arm_ce = 1; + if (hd) + { + hd->bulk.cfb_enc = _gcry_aes_armv8_ce_cfb_enc; + hd->bulk.cfb_dec = _gcry_aes_armv8_ce_cfb_dec; + hd->bulk.cbc_enc = _gcry_aes_armv8_ce_cbc_enc; + hd->bulk.cbc_dec = _gcry_aes_armv8_ce_cbc_dec; + hd->bulk.ctr_enc = _gcry_aes_armv8_ce_ctr_enc; + hd->bulk.ocb_crypt = _gcry_aes_armv8_ce_ocb_crypt; + hd->bulk.ocb_auth = _gcry_aes_armv8_ce_ocb_auth; + hd->bulk.xts_crypt = _gcry_aes_armv8_ce_xts_crypt; + } } #endif else { ctx->encrypt_fn = do_encrypt; ctx->decrypt_fn = do_decrypt; ctx->prefetch_enc_fn = prefetch_enc; ctx->prefetch_dec_fn = prefetch_dec; } /* NB: We don't yet support Padlock hardware key generation. */ if (0) { ; } #ifdef USE_AESNI else if (ctx->use_aesni) _gcry_aes_aesni_do_setkey (ctx, key); #endif #ifdef USE_SSSE3 else if (ctx->use_ssse3) _gcry_aes_ssse3_do_setkey (ctx, key); #endif #ifdef USE_ARM_CE else if (ctx->use_arm_ce) _gcry_aes_armv8_ce_setkey (ctx, key); #endif else { const byte *sbox = ((const byte *)encT) + 1; union { PROPERLY_ALIGNED_TYPE dummy; byte data[MAXKC][4]; u32 data32[MAXKC]; } tkk[2]; #define k tkk[0].data #define k_u32 tkk[0].data32 #define tk tkk[1].data #define tk_u32 tkk[1].data32 #define W (ctx->keyschenc) #define W_u32 (ctx->keyschenc32) prefetch_enc(); for (i = 0; i < keylen; i++) { k[i >> 2][i & 3] = key[i]; } for (j = KC-1; j >= 0; j--) { tk_u32[j] = k_u32[j]; } r = 0; t = 0; /* Copy values into round key array. 
*/ for (j = 0; (j < KC) && (r < rounds + 1); ) { for (; (j < KC) && (t < 4); j++, t++) { W_u32[r][t] = le_bswap32(tk_u32[j]); } if (t == 4) { r++; t = 0; } } while (r < rounds + 1) { /* While not enough round key material calculated calculate new values. */ tk[0][0] ^= sbox[tk[KC-1][1] * 4]; tk[0][1] ^= sbox[tk[KC-1][2] * 4]; tk[0][2] ^= sbox[tk[KC-1][3] * 4]; tk[0][3] ^= sbox[tk[KC-1][0] * 4]; tk[0][0] ^= rcon[rconpointer++]; if (KC != 8) { for (j = 1; j < KC; j++) { tk_u32[j] ^= tk_u32[j-1]; } } else { for (j = 1; j < KC/2; j++) { tk_u32[j] ^= tk_u32[j-1]; } tk[KC/2][0] ^= sbox[tk[KC/2 - 1][0] * 4]; tk[KC/2][1] ^= sbox[tk[KC/2 - 1][1] * 4]; tk[KC/2][2] ^= sbox[tk[KC/2 - 1][2] * 4]; tk[KC/2][3] ^= sbox[tk[KC/2 - 1][3] * 4]; for (j = KC/2 + 1; j < KC; j++) { tk_u32[j] ^= tk_u32[j-1]; } } /* Copy values into round key array. */ for (j = 0; (j < KC) && (r < rounds + 1); ) { for (; (j < KC) && (t < 4); j++, t++) { W_u32[r][t] = le_bswap32(tk_u32[j]); } if (t == 4) { r++; t = 0; } } } #undef W #undef tk #undef k #undef W_u32 #undef tk_u32 #undef k_u32 wipememory(&tkk, sizeof(tkk)); } return 0; } static gcry_err_code_t rijndael_setkey (void *context, const byte *key, const unsigned keylen, gcry_cipher_hd_t hd) { RIJNDAEL_context *ctx = context; - (void)hd; - return do_setkey (ctx, key, keylen); + return do_setkey (ctx, key, keylen, hd); } /* Make a decryption key from an encryption key. */ static void prepare_decryption( RIJNDAEL_context *ctx ) { int r; if (0) ; #ifdef USE_AESNI else if (ctx->use_aesni) { _gcry_aes_aesni_prepare_decryption (ctx); } #endif /*USE_AESNI*/ #ifdef USE_SSSE3 else if (ctx->use_ssse3) { _gcry_aes_ssse3_prepare_decryption (ctx); } #endif /*USE_SSSE3*/ #ifdef USE_ARM_CE else if (ctx->use_arm_ce) { _gcry_aes_armv8_ce_prepare_decryption (ctx); } #endif /*USE_SSSE3*/ #ifdef USE_PADLOCK else if (ctx->use_padlock) { /* Padlock does not need decryption subkeys. 
*/ } #endif /*USE_PADLOCK*/ else { const byte *sbox = ((const byte *)encT) + 1; prefetch_enc(); prefetch_dec(); ctx->keyschdec32[0][0] = ctx->keyschenc32[0][0]; ctx->keyschdec32[0][1] = ctx->keyschenc32[0][1]; ctx->keyschdec32[0][2] = ctx->keyschenc32[0][2]; ctx->keyschdec32[0][3] = ctx->keyschenc32[0][3]; for (r = 1; r < ctx->rounds; r++) { u32 *wi = ctx->keyschenc32[r]; u32 *wo = ctx->keyschdec32[r]; u32 wt; wt = wi[0]; wo[0] = rol(decT[sbox[(byte)(wt >> 0) * 4]], 8 * 0) ^ rol(decT[sbox[(byte)(wt >> 8) * 4]], 8 * 1) ^ rol(decT[sbox[(byte)(wt >> 16) * 4]], 8 * 2) ^ rol(decT[sbox[(byte)(wt >> 24) * 4]], 8 * 3); wt = wi[1]; wo[1] = rol(decT[sbox[(byte)(wt >> 0) * 4]], 8 * 0) ^ rol(decT[sbox[(byte)(wt >> 8) * 4]], 8 * 1) ^ rol(decT[sbox[(byte)(wt >> 16) * 4]], 8 * 2) ^ rol(decT[sbox[(byte)(wt >> 24) * 4]], 8 * 3); wt = wi[2]; wo[2] = rol(decT[sbox[(byte)(wt >> 0) * 4]], 8 * 0) ^ rol(decT[sbox[(byte)(wt >> 8) * 4]], 8 * 1) ^ rol(decT[sbox[(byte)(wt >> 16) * 4]], 8 * 2) ^ rol(decT[sbox[(byte)(wt >> 24) * 4]], 8 * 3); wt = wi[3]; wo[3] = rol(decT[sbox[(byte)(wt >> 0) * 4]], 8 * 0) ^ rol(decT[sbox[(byte)(wt >> 8) * 4]], 8 * 1) ^ rol(decT[sbox[(byte)(wt >> 16) * 4]], 8 * 2) ^ rol(decT[sbox[(byte)(wt >> 24) * 4]], 8 * 3); } ctx->keyschdec32[r][0] = ctx->keyschenc32[r][0]; ctx->keyschdec32[r][1] = ctx->keyschenc32[r][1]; ctx->keyschdec32[r][2] = ctx->keyschenc32[r][2]; ctx->keyschdec32[r][3] = ctx->keyschenc32[r][3]; } } #if !defined(USE_ARM_ASM) && !defined(USE_AMD64_ASM) /* Encrypt one block. A and B may be the same. 
*/ static unsigned int do_encrypt_fn (const RIJNDAEL_context *ctx, unsigned char *b, const unsigned char *a) { #define rk (ctx->keyschenc32) const byte *sbox = ((const byte *)encT) + 1; int rounds = ctx->rounds; int r; u32 sa[4]; u32 sb[4]; sb[0] = buf_get_le32(a + 0); sb[1] = buf_get_le32(a + 4); sb[2] = buf_get_le32(a + 8); sb[3] = buf_get_le32(a + 12); sa[0] = sb[0] ^ rk[0][0]; sa[1] = sb[1] ^ rk[0][1]; sa[2] = sb[2] ^ rk[0][2]; sa[3] = sb[3] ^ rk[0][3]; sb[0] = rol(encT[(byte)(sa[0] >> (0 * 8))], (0 * 8)); sb[3] = rol(encT[(byte)(sa[0] >> (1 * 8))], (1 * 8)); sb[2] = rol(encT[(byte)(sa[0] >> (2 * 8))], (2 * 8)); sb[1] = rol(encT[(byte)(sa[0] >> (3 * 8))], (3 * 8)); sa[0] = rk[1][0] ^ sb[0]; sb[1] ^= rol(encT[(byte)(sa[1] >> (0 * 8))], (0 * 8)); sa[0] ^= rol(encT[(byte)(sa[1] >> (1 * 8))], (1 * 8)); sb[3] ^= rol(encT[(byte)(sa[1] >> (2 * 8))], (2 * 8)); sb[2] ^= rol(encT[(byte)(sa[1] >> (3 * 8))], (3 * 8)); sa[1] = rk[1][1] ^ sb[1]; sb[2] ^= rol(encT[(byte)(sa[2] >> (0 * 8))], (0 * 8)); sa[1] ^= rol(encT[(byte)(sa[2] >> (1 * 8))], (1 * 8)); sa[0] ^= rol(encT[(byte)(sa[2] >> (2 * 8))], (2 * 8)); sb[3] ^= rol(encT[(byte)(sa[2] >> (3 * 8))], (3 * 8)); sa[2] = rk[1][2] ^ sb[2]; sb[3] ^= rol(encT[(byte)(sa[3] >> (0 * 8))], (0 * 8)); sa[2] ^= rol(encT[(byte)(sa[3] >> (1 * 8))], (1 * 8)); sa[1] ^= rol(encT[(byte)(sa[3] >> (2 * 8))], (2 * 8)); sa[0] ^= rol(encT[(byte)(sa[3] >> (3 * 8))], (3 * 8)); sa[3] = rk[1][3] ^ sb[3]; for (r = 2; r < rounds; r++) { sb[0] = rol(encT[(byte)(sa[0] >> (0 * 8))], (0 * 8)); sb[3] = rol(encT[(byte)(sa[0] >> (1 * 8))], (1 * 8)); sb[2] = rol(encT[(byte)(sa[0] >> (2 * 8))], (2 * 8)); sb[1] = rol(encT[(byte)(sa[0] >> (3 * 8))], (3 * 8)); sa[0] = rk[r][0] ^ sb[0]; sb[1] ^= rol(encT[(byte)(sa[1] >> (0 * 8))], (0 * 8)); sa[0] ^= rol(encT[(byte)(sa[1] >> (1 * 8))], (1 * 8)); sb[3] ^= rol(encT[(byte)(sa[1] >> (2 * 8))], (2 * 8)); sb[2] ^= rol(encT[(byte)(sa[1] >> (3 * 8))], (3 * 8)); sa[1] = rk[r][1] ^ sb[1]; sb[2] ^= rol(encT[(byte)(sa[2] >> (0 * 
8))], (0 * 8)); sa[1] ^= rol(encT[(byte)(sa[2] >> (1 * 8))], (1 * 8)); sa[0] ^= rol(encT[(byte)(sa[2] >> (2 * 8))], (2 * 8)); sb[3] ^= rol(encT[(byte)(sa[2] >> (3 * 8))], (3 * 8)); sa[2] = rk[r][2] ^ sb[2]; sb[3] ^= rol(encT[(byte)(sa[3] >> (0 * 8))], (0 * 8)); sa[2] ^= rol(encT[(byte)(sa[3] >> (1 * 8))], (1 * 8)); sa[1] ^= rol(encT[(byte)(sa[3] >> (2 * 8))], (2 * 8)); sa[0] ^= rol(encT[(byte)(sa[3] >> (3 * 8))], (3 * 8)); sa[3] = rk[r][3] ^ sb[3]; r++; sb[0] = rol(encT[(byte)(sa[0] >> (0 * 8))], (0 * 8)); sb[3] = rol(encT[(byte)(sa[0] >> (1 * 8))], (1 * 8)); sb[2] = rol(encT[(byte)(sa[0] >> (2 * 8))], (2 * 8)); sb[1] = rol(encT[(byte)(sa[0] >> (3 * 8))], (3 * 8)); sa[0] = rk[r][0] ^ sb[0]; sb[1] ^= rol(encT[(byte)(sa[1] >> (0 * 8))], (0 * 8)); sa[0] ^= rol(encT[(byte)(sa[1] >> (1 * 8))], (1 * 8)); sb[3] ^= rol(encT[(byte)(sa[1] >> (2 * 8))], (2 * 8)); sb[2] ^= rol(encT[(byte)(sa[1] >> (3 * 8))], (3 * 8)); sa[1] = rk[r][1] ^ sb[1]; sb[2] ^= rol(encT[(byte)(sa[2] >> (0 * 8))], (0 * 8)); sa[1] ^= rol(encT[(byte)(sa[2] >> (1 * 8))], (1 * 8)); sa[0] ^= rol(encT[(byte)(sa[2] >> (2 * 8))], (2 * 8)); sb[3] ^= rol(encT[(byte)(sa[2] >> (3 * 8))], (3 * 8)); sa[2] = rk[r][2] ^ sb[2]; sb[3] ^= rol(encT[(byte)(sa[3] >> (0 * 8))], (0 * 8)); sa[2] ^= rol(encT[(byte)(sa[3] >> (1 * 8))], (1 * 8)); sa[1] ^= rol(encT[(byte)(sa[3] >> (2 * 8))], (2 * 8)); sa[0] ^= rol(encT[(byte)(sa[3] >> (3 * 8))], (3 * 8)); sa[3] = rk[r][3] ^ sb[3]; } /* Last round is special. 
*/ sb[0] = (sbox[(byte)(sa[0] >> (0 * 8)) * 4]) << (0 * 8); sb[3] = (sbox[(byte)(sa[0] >> (1 * 8)) * 4]) << (1 * 8); sb[2] = (sbox[(byte)(sa[0] >> (2 * 8)) * 4]) << (2 * 8); sb[1] = (sbox[(byte)(sa[0] >> (3 * 8)) * 4]) << (3 * 8); sa[0] = rk[r][0] ^ sb[0]; sb[1] ^= (sbox[(byte)(sa[1] >> (0 * 8)) * 4]) << (0 * 8); sa[0] ^= (sbox[(byte)(sa[1] >> (1 * 8)) * 4]) << (1 * 8); sb[3] ^= (sbox[(byte)(sa[1] >> (2 * 8)) * 4]) << (2 * 8); sb[2] ^= (sbox[(byte)(sa[1] >> (3 * 8)) * 4]) << (3 * 8); sa[1] = rk[r][1] ^ sb[1]; sb[2] ^= (sbox[(byte)(sa[2] >> (0 * 8)) * 4]) << (0 * 8); sa[1] ^= (sbox[(byte)(sa[2] >> (1 * 8)) * 4]) << (1 * 8); sa[0] ^= (sbox[(byte)(sa[2] >> (2 * 8)) * 4]) << (2 * 8); sb[3] ^= (sbox[(byte)(sa[2] >> (3 * 8)) * 4]) << (3 * 8); sa[2] = rk[r][2] ^ sb[2]; sb[3] ^= (sbox[(byte)(sa[3] >> (0 * 8)) * 4]) << (0 * 8); sa[2] ^= (sbox[(byte)(sa[3] >> (1 * 8)) * 4]) << (1 * 8); sa[1] ^= (sbox[(byte)(sa[3] >> (2 * 8)) * 4]) << (2 * 8); sa[0] ^= (sbox[(byte)(sa[3] >> (3 * 8)) * 4]) << (3 * 8); sa[3] = rk[r][3] ^ sb[3]; buf_put_le32(b + 0, sa[0]); buf_put_le32(b + 4, sa[1]); buf_put_le32(b + 8, sa[2]); buf_put_le32(b + 12, sa[3]); #undef rk return (56 + 2*sizeof(int)); } #endif /*!USE_ARM_ASM && !USE_AMD64_ASM*/ static unsigned int do_encrypt (const RIJNDAEL_context *ctx, unsigned char *bx, const unsigned char *ax) { #ifdef USE_AMD64_ASM return _gcry_aes_amd64_encrypt_block(ctx->keyschenc, bx, ax, ctx->rounds, encT); #elif defined(USE_ARM_ASM) return _gcry_aes_arm_encrypt_block(ctx->keyschenc, bx, ax, ctx->rounds, encT); #else return do_encrypt_fn (ctx, bx, ax); #endif /* !USE_ARM_ASM && !USE_AMD64_ASM*/ } static unsigned int rijndael_encrypt (void *context, byte *b, const byte *a) { RIJNDAEL_context *ctx = context; if (ctx->prefetch_enc_fn) ctx->prefetch_enc_fn(); return ctx->encrypt_fn (ctx, b, a); } /* Bulk encryption of complete blocks in CFB mode. Caller needs to make sure that IV is aligned on an unsigned long boundary. 
This function is only intended for the bulk encryption feature of cipher.c. */ void _gcry_aes_cfb_enc (void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks) { RIJNDAEL_context *ctx = context; unsigned char *outbuf = outbuf_arg; const unsigned char *inbuf = inbuf_arg; unsigned int burn_depth = 0; - if (ctx->prefetch_enc_fn) - ctx->prefetch_enc_fn(); - if (0) ; #ifdef USE_AESNI else if (ctx->use_aesni) { - _gcry_aes_aesni_cfb_enc (ctx, outbuf, inbuf, iv, nblocks); - burn_depth = 0; + _gcry_aes_aesni_cfb_enc (ctx, iv, outbuf, inbuf, nblocks); + return; } #endif /*USE_AESNI*/ #ifdef USE_SSSE3 else if (ctx->use_ssse3) { - _gcry_aes_ssse3_cfb_enc (ctx, outbuf, inbuf, iv, nblocks); - burn_depth = 0; + _gcry_aes_ssse3_cfb_enc (ctx, iv, outbuf, inbuf, nblocks); + return; } #endif /*USE_SSSE3*/ #ifdef USE_ARM_CE else if (ctx->use_arm_ce) { - _gcry_aes_armv8_ce_cfb_enc (ctx, outbuf, inbuf, iv, nblocks); - burn_depth = 0; + _gcry_aes_armv8_ce_cfb_enc (ctx, iv, outbuf, inbuf, nblocks); + return; } #endif /*USE_ARM_CE*/ else { rijndael_cryptfn_t encrypt_fn = ctx->encrypt_fn; + if (ctx->prefetch_enc_fn) + ctx->prefetch_enc_fn(); + for ( ;nblocks; nblocks-- ) { /* Encrypt the IV. */ burn_depth = encrypt_fn (ctx, iv, iv); /* XOR the input with the IV and store input into IV. */ buf_xor_2dst(outbuf, iv, inbuf, BLOCKSIZE); outbuf += BLOCKSIZE; inbuf += BLOCKSIZE; } } if (burn_depth) _gcry_burn_stack (burn_depth + 4 * sizeof(void *)); } /* Bulk encryption of complete blocks in CBC mode. Caller needs to make sure that IV is aligned on an unsigned long boundary. This function is only intended for the bulk encryption feature of cipher.c. 
*/ void _gcry_aes_cbc_enc (void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int cbc_mac) { RIJNDAEL_context *ctx = context; unsigned char *outbuf = outbuf_arg; const unsigned char *inbuf = inbuf_arg; unsigned char *last_iv; unsigned int burn_depth = 0; - if (ctx->prefetch_enc_fn) - ctx->prefetch_enc_fn(); - if (0) ; #ifdef USE_AESNI else if (ctx->use_aesni) { - _gcry_aes_aesni_cbc_enc (ctx, outbuf, inbuf, iv, nblocks, cbc_mac); - burn_depth = 0; + _gcry_aes_aesni_cbc_enc (ctx, iv, outbuf, inbuf, nblocks, cbc_mac); + return; } #endif /*USE_AESNI*/ #ifdef USE_SSSE3 else if (ctx->use_ssse3) { - _gcry_aes_ssse3_cbc_enc (ctx, outbuf, inbuf, iv, nblocks, cbc_mac); - burn_depth = 0; + _gcry_aes_ssse3_cbc_enc (ctx, iv, outbuf, inbuf, nblocks, cbc_mac); + return; } #endif /*USE_SSSE3*/ #ifdef USE_ARM_CE else if (ctx->use_arm_ce) { - _gcry_aes_armv8_ce_cbc_enc (ctx, outbuf, inbuf, iv, nblocks, cbc_mac); - burn_depth = 0; + _gcry_aes_armv8_ce_cbc_enc (ctx, iv, outbuf, inbuf, nblocks, cbc_mac); + return; } #endif /*USE_ARM_CE*/ else { rijndael_cryptfn_t encrypt_fn = ctx->encrypt_fn; + if (ctx->prefetch_enc_fn) + ctx->prefetch_enc_fn(); + last_iv = iv; for ( ;nblocks; nblocks-- ) { buf_xor(outbuf, inbuf, last_iv, BLOCKSIZE); burn_depth = encrypt_fn (ctx, outbuf, outbuf); last_iv = outbuf; inbuf += BLOCKSIZE; if (!cbc_mac) outbuf += BLOCKSIZE; } if (last_iv != iv) buf_cpy (iv, last_iv, BLOCKSIZE); } if (burn_depth) _gcry_burn_stack (burn_depth + 4 * sizeof(void *)); } /* Bulk encryption of complete blocks in CTR mode. Caller needs to make sure that CTR is aligned on a 16 byte boundary if AESNI; the minimum alignment is for an u32. This function is only intended for the bulk encryption feature of cipher.c. CTR is expected to be of size BLOCKSIZE. 
*/ void _gcry_aes_ctr_enc (void *context, unsigned char *ctr, void *outbuf_arg, const void *inbuf_arg, size_t nblocks) { RIJNDAEL_context *ctx = context; unsigned char *outbuf = outbuf_arg; const unsigned char *inbuf = inbuf_arg; unsigned int burn_depth = 0; int i; - if (ctx->prefetch_enc_fn) - ctx->prefetch_enc_fn(); - if (0) ; #ifdef USE_AESNI else if (ctx->use_aesni) { - _gcry_aes_aesni_ctr_enc (ctx, outbuf, inbuf, ctr, nblocks); - burn_depth = 0; + _gcry_aes_aesni_ctr_enc (ctx, ctr, outbuf, inbuf, nblocks); + return; } #endif /*USE_AESNI*/ #ifdef USE_SSSE3 else if (ctx->use_ssse3) { - _gcry_aes_ssse3_ctr_enc (ctx, outbuf, inbuf, ctr, nblocks); - burn_depth = 0; + _gcry_aes_ssse3_ctr_enc (ctx, ctr, outbuf, inbuf, nblocks); + return; } #endif /*USE_SSSE3*/ #ifdef USE_ARM_CE else if (ctx->use_arm_ce) { - _gcry_aes_armv8_ce_ctr_enc (ctx, outbuf, inbuf, ctr, nblocks); - burn_depth = 0; + _gcry_aes_armv8_ce_ctr_enc (ctx, ctr, outbuf, inbuf, nblocks); + return; } #endif /*USE_ARM_CE*/ else { union { unsigned char x1[16] ATTR_ALIGNED_16; u32 x32[4]; } tmp; rijndael_cryptfn_t encrypt_fn = ctx->encrypt_fn; + if (ctx->prefetch_enc_fn) + ctx->prefetch_enc_fn(); + for ( ;nblocks; nblocks-- ) { /* Encrypt the counter. */ burn_depth = encrypt_fn (ctx, tmp.x1, ctr); /* XOR the input with the encrypted counter and store in output. */ buf_xor(outbuf, tmp.x1, inbuf, BLOCKSIZE); outbuf += BLOCKSIZE; inbuf += BLOCKSIZE; /* Increment the counter. */ for (i = BLOCKSIZE; i > 0; i--) { ctr[i-1]++; if (ctr[i-1]) break; } } wipememory(&tmp, sizeof(tmp)); } if (burn_depth) _gcry_burn_stack (burn_depth + 4 * sizeof(void *)); } #if !defined(USE_ARM_ASM) && !defined(USE_AMD64_ASM) /* Decrypt one block. A and B may be the same. 
*/ static unsigned int do_decrypt_fn (const RIJNDAEL_context *ctx, unsigned char *b, const unsigned char *a) { #define rk (ctx->keyschdec32) int rounds = ctx->rounds; int r; u32 sa[4]; u32 sb[4]; sb[0] = buf_get_le32(a + 0); sb[1] = buf_get_le32(a + 4); sb[2] = buf_get_le32(a + 8); sb[3] = buf_get_le32(a + 12); sa[0] = sb[0] ^ rk[rounds][0]; sa[1] = sb[1] ^ rk[rounds][1]; sa[2] = sb[2] ^ rk[rounds][2]; sa[3] = sb[3] ^ rk[rounds][3]; for (r = rounds - 1; r > 1; r--) { sb[0] = rol(decT[(byte)(sa[0] >> (0 * 8))], (0 * 8)); sb[1] = rol(decT[(byte)(sa[0] >> (1 * 8))], (1 * 8)); sb[2] = rol(decT[(byte)(sa[0] >> (2 * 8))], (2 * 8)); sb[3] = rol(decT[(byte)(sa[0] >> (3 * 8))], (3 * 8)); sa[0] = rk[r][0] ^ sb[0]; sb[1] ^= rol(decT[(byte)(sa[1] >> (0 * 8))], (0 * 8)); sb[2] ^= rol(decT[(byte)(sa[1] >> (1 * 8))], (1 * 8)); sb[3] ^= rol(decT[(byte)(sa[1] >> (2 * 8))], (2 * 8)); sa[0] ^= rol(decT[(byte)(sa[1] >> (3 * 8))], (3 * 8)); sa[1] = rk[r][1] ^ sb[1]; sb[2] ^= rol(decT[(byte)(sa[2] >> (0 * 8))], (0 * 8)); sb[3] ^= rol(decT[(byte)(sa[2] >> (1 * 8))], (1 * 8)); sa[0] ^= rol(decT[(byte)(sa[2] >> (2 * 8))], (2 * 8)); sa[1] ^= rol(decT[(byte)(sa[2] >> (3 * 8))], (3 * 8)); sa[2] = rk[r][2] ^ sb[2]; sb[3] ^= rol(decT[(byte)(sa[3] >> (0 * 8))], (0 * 8)); sa[0] ^= rol(decT[(byte)(sa[3] >> (1 * 8))], (1 * 8)); sa[1] ^= rol(decT[(byte)(sa[3] >> (2 * 8))], (2 * 8)); sa[2] ^= rol(decT[(byte)(sa[3] >> (3 * 8))], (3 * 8)); sa[3] = rk[r][3] ^ sb[3]; r--; sb[0] = rol(decT[(byte)(sa[0] >> (0 * 8))], (0 * 8)); sb[1] = rol(decT[(byte)(sa[0] >> (1 * 8))], (1 * 8)); sb[2] = rol(decT[(byte)(sa[0] >> (2 * 8))], (2 * 8)); sb[3] = rol(decT[(byte)(sa[0] >> (3 * 8))], (3 * 8)); sa[0] = rk[r][0] ^ sb[0]; sb[1] ^= rol(decT[(byte)(sa[1] >> (0 * 8))], (0 * 8)); sb[2] ^= rol(decT[(byte)(sa[1] >> (1 * 8))], (1 * 8)); sb[3] ^= rol(decT[(byte)(sa[1] >> (2 * 8))], (2 * 8)); sa[0] ^= rol(decT[(byte)(sa[1] >> (3 * 8))], (3 * 8)); sa[1] = rk[r][1] ^ sb[1]; sb[2] ^= rol(decT[(byte)(sa[2] >> (0 * 8))], (0 * 8)); 
sb[3] ^= rol(decT[(byte)(sa[2] >> (1 * 8))], (1 * 8)); sa[0] ^= rol(decT[(byte)(sa[2] >> (2 * 8))], (2 * 8)); sa[1] ^= rol(decT[(byte)(sa[2] >> (3 * 8))], (3 * 8)); sa[2] = rk[r][2] ^ sb[2]; sb[3] ^= rol(decT[(byte)(sa[3] >> (0 * 8))], (0 * 8)); sa[0] ^= rol(decT[(byte)(sa[3] >> (1 * 8))], (1 * 8)); sa[1] ^= rol(decT[(byte)(sa[3] >> (2 * 8))], (2 * 8)); sa[2] ^= rol(decT[(byte)(sa[3] >> (3 * 8))], (3 * 8)); sa[3] = rk[r][3] ^ sb[3]; } sb[0] = rol(decT[(byte)(sa[0] >> (0 * 8))], (0 * 8)); sb[1] = rol(decT[(byte)(sa[0] >> (1 * 8))], (1 * 8)); sb[2] = rol(decT[(byte)(sa[0] >> (2 * 8))], (2 * 8)); sb[3] = rol(decT[(byte)(sa[0] >> (3 * 8))], (3 * 8)); sa[0] = rk[1][0] ^ sb[0]; sb[1] ^= rol(decT[(byte)(sa[1] >> (0 * 8))], (0 * 8)); sb[2] ^= rol(decT[(byte)(sa[1] >> (1 * 8))], (1 * 8)); sb[3] ^= rol(decT[(byte)(sa[1] >> (2 * 8))], (2 * 8)); sa[0] ^= rol(decT[(byte)(sa[1] >> (3 * 8))], (3 * 8)); sa[1] = rk[1][1] ^ sb[1]; sb[2] ^= rol(decT[(byte)(sa[2] >> (0 * 8))], (0 * 8)); sb[3] ^= rol(decT[(byte)(sa[2] >> (1 * 8))], (1 * 8)); sa[0] ^= rol(decT[(byte)(sa[2] >> (2 * 8))], (2 * 8)); sa[1] ^= rol(decT[(byte)(sa[2] >> (3 * 8))], (3 * 8)); sa[2] = rk[1][2] ^ sb[2]; sb[3] ^= rol(decT[(byte)(sa[3] >> (0 * 8))], (0 * 8)); sa[0] ^= rol(decT[(byte)(sa[3] >> (1 * 8))], (1 * 8)); sa[1] ^= rol(decT[(byte)(sa[3] >> (2 * 8))], (2 * 8)); sa[2] ^= rol(decT[(byte)(sa[3] >> (3 * 8))], (3 * 8)); sa[3] = rk[1][3] ^ sb[3]; /* Last round is special. 
*/ sb[0] = inv_sbox[(byte)(sa[0] >> (0 * 8))] << (0 * 8); sb[1] = inv_sbox[(byte)(sa[0] >> (1 * 8))] << (1 * 8); sb[2] = inv_sbox[(byte)(sa[0] >> (2 * 8))] << (2 * 8); sb[3] = inv_sbox[(byte)(sa[0] >> (3 * 8))] << (3 * 8); sa[0] = sb[0] ^ rk[0][0]; sb[1] ^= inv_sbox[(byte)(sa[1] >> (0 * 8))] << (0 * 8); sb[2] ^= inv_sbox[(byte)(sa[1] >> (1 * 8))] << (1 * 8); sb[3] ^= inv_sbox[(byte)(sa[1] >> (2 * 8))] << (2 * 8); sa[0] ^= inv_sbox[(byte)(sa[1] >> (3 * 8))] << (3 * 8); sa[1] = sb[1] ^ rk[0][1]; sb[2] ^= inv_sbox[(byte)(sa[2] >> (0 * 8))] << (0 * 8); sb[3] ^= inv_sbox[(byte)(sa[2] >> (1 * 8))] << (1 * 8); sa[0] ^= inv_sbox[(byte)(sa[2] >> (2 * 8))] << (2 * 8); sa[1] ^= inv_sbox[(byte)(sa[2] >> (3 * 8))] << (3 * 8); sa[2] = sb[2] ^ rk[0][2]; sb[3] ^= inv_sbox[(byte)(sa[3] >> (0 * 8))] << (0 * 8); sa[0] ^= inv_sbox[(byte)(sa[3] >> (1 * 8))] << (1 * 8); sa[1] ^= inv_sbox[(byte)(sa[3] >> (2 * 8))] << (2 * 8); sa[2] ^= inv_sbox[(byte)(sa[3] >> (3 * 8))] << (3 * 8); sa[3] = sb[3] ^ rk[0][3]; buf_put_le32(b + 0, sa[0]); buf_put_le32(b + 4, sa[1]); buf_put_le32(b + 8, sa[2]); buf_put_le32(b + 12, sa[3]); #undef rk return (56+2*sizeof(int)); } #endif /*!USE_ARM_ASM && !USE_AMD64_ASM*/ /* Decrypt one block. AX and BX may be the same. 
*/ static unsigned int do_decrypt (const RIJNDAEL_context *ctx, unsigned char *bx, const unsigned char *ax) { #ifdef USE_AMD64_ASM return _gcry_aes_amd64_decrypt_block(ctx->keyschdec, bx, ax, ctx->rounds, &dec_tables); #elif defined(USE_ARM_ASM) return _gcry_aes_arm_decrypt_block(ctx->keyschdec, bx, ax, ctx->rounds, &dec_tables); #else return do_decrypt_fn (ctx, bx, ax); #endif /*!USE_ARM_ASM && !USE_AMD64_ASM*/ } static inline void check_decryption_preparation (RIJNDAEL_context *ctx) { if ( !ctx->decryption_prepared ) { prepare_decryption ( ctx ); ctx->decryption_prepared = 1; } } static unsigned int rijndael_decrypt (void *context, byte *b, const byte *a) { RIJNDAEL_context *ctx = context; check_decryption_preparation (ctx); if (ctx->prefetch_dec_fn) ctx->prefetch_dec_fn(); return ctx->decrypt_fn (ctx, b, a); } /* Bulk decryption of complete blocks in CFB mode. Caller needs to make sure that IV is aligned on an unsigned long boundary. This function is only intended for the bulk encryption feature of cipher.c. 
*/ void _gcry_aes_cfb_dec (void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks) { RIJNDAEL_context *ctx = context; unsigned char *outbuf = outbuf_arg; const unsigned char *inbuf = inbuf_arg; unsigned int burn_depth = 0; - if (ctx->prefetch_enc_fn) - ctx->prefetch_enc_fn(); - if (0) ; #ifdef USE_AESNI else if (ctx->use_aesni) { - _gcry_aes_aesni_cfb_dec (ctx, outbuf, inbuf, iv, nblocks); - burn_depth = 0; + _gcry_aes_aesni_cfb_dec (ctx, iv, outbuf, inbuf, nblocks); + return; } #endif /*USE_AESNI*/ #ifdef USE_SSSE3 else if (ctx->use_ssse3) { - _gcry_aes_ssse3_cfb_dec (ctx, outbuf, inbuf, iv, nblocks); - burn_depth = 0; + _gcry_aes_ssse3_cfb_dec (ctx, iv, outbuf, inbuf, nblocks); + return; } #endif /*USE_SSSE3*/ #ifdef USE_ARM_CE else if (ctx->use_arm_ce) { - _gcry_aes_armv8_ce_cfb_dec (ctx, outbuf, inbuf, iv, nblocks); - burn_depth = 0; + _gcry_aes_armv8_ce_cfb_dec (ctx, iv, outbuf, inbuf, nblocks); + return; } #endif /*USE_ARM_CE*/ else { rijndael_cryptfn_t encrypt_fn = ctx->encrypt_fn; + if (ctx->prefetch_enc_fn) + ctx->prefetch_enc_fn(); + for ( ;nblocks; nblocks-- ) { burn_depth = encrypt_fn (ctx, iv, iv); buf_xor_n_copy(outbuf, iv, inbuf, BLOCKSIZE); outbuf += BLOCKSIZE; inbuf += BLOCKSIZE; } } if (burn_depth) _gcry_burn_stack (burn_depth + 4 * sizeof(void *)); } /* Bulk decryption of complete blocks in CBC mode. Caller needs to make sure that IV is aligned on an unsigned long boundary. This function is only intended for the bulk encryption feature of cipher.c. 
*/ void _gcry_aes_cbc_dec (void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks) { RIJNDAEL_context *ctx = context; unsigned char *outbuf = outbuf_arg; const unsigned char *inbuf = inbuf_arg; unsigned int burn_depth = 0; - check_decryption_preparation (ctx); - - if (ctx->prefetch_dec_fn) - ctx->prefetch_dec_fn(); - if (0) ; #ifdef USE_AESNI else if (ctx->use_aesni) { - _gcry_aes_aesni_cbc_dec (ctx, outbuf, inbuf, iv, nblocks); - burn_depth = 0; + _gcry_aes_aesni_cbc_dec (ctx, iv, outbuf, inbuf, nblocks); + return; } #endif /*USE_AESNI*/ #ifdef USE_SSSE3 else if (ctx->use_ssse3) { - _gcry_aes_ssse3_cbc_dec (ctx, outbuf, inbuf, iv, nblocks); - burn_depth = 0; + _gcry_aes_ssse3_cbc_dec (ctx, iv, outbuf, inbuf, nblocks); + return; } #endif /*USE_SSSE3*/ #ifdef USE_ARM_CE else if (ctx->use_arm_ce) { - _gcry_aes_armv8_ce_cbc_dec (ctx, outbuf, inbuf, iv, nblocks); - burn_depth = 0; + _gcry_aes_armv8_ce_cbc_dec (ctx, iv, outbuf, inbuf, nblocks); + return; } #endif /*USE_ARM_CE*/ else { unsigned char savebuf[BLOCKSIZE] ATTR_ALIGNED_16; rijndael_cryptfn_t decrypt_fn = ctx->decrypt_fn; + check_decryption_preparation (ctx); + + if (ctx->prefetch_dec_fn) + ctx->prefetch_dec_fn(); + for ( ;nblocks; nblocks-- ) { /* INBUF is needed later and it may be identical to OUTBUF, so store the intermediate result to SAVEBUF. */ burn_depth = decrypt_fn (ctx, savebuf, inbuf); buf_xor_n_copy_2(outbuf, savebuf, iv, inbuf, BLOCKSIZE); inbuf += BLOCKSIZE; outbuf += BLOCKSIZE; } wipememory(savebuf, sizeof(savebuf)); } if (burn_depth) _gcry_burn_stack (burn_depth + 4 * sizeof(void *)); } /* Bulk encryption/decryption of complete blocks in OCB mode. 
*/
 /* Reading aid: the text below is unified-diff material.  A leading '-'
    marks a statement the patch removes, a leading '+' marks one it
    adds, and every other line is unchanged context.

    _gcry_aes_ocb_crypt: C is the cipher handle; its context.c member is
    used as the RIJNDAEL_context.  INBUF_ARG and OUTBUF_ARG are walked
    BLOCKSIZE bytes per processed block, NBLOCKS is the number of blocks
    and a non-zero ENCRYPT selects the encryption branch.  The generic C
    branches return 0.

    Patch summary for this function: the removed prologue ran the
    prefetch hook (and check_decryption_preparation for decryption)
    before dispatching; the added lines run them only inside the generic
    C branches, and the AES-NI, SSSE3 and ARMv8-CE branches now return
    the value of their back-end function directly.  */
 size_t
 _gcry_aes_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
                      const void *inbuf_arg, size_t nblocks, int encrypt)
 {
   RIJNDAEL_context *ctx = (void *)&c->context.c;
   unsigned char *outbuf = outbuf_arg;
   const unsigned char *inbuf = inbuf_arg;
   unsigned int burn_depth = 0;

-  if (encrypt)
-    {
-      if (ctx->prefetch_enc_fn)
-        ctx->prefetch_enc_fn();
-    }
-  else
-    {
-      check_decryption_preparation (ctx);
-
-      if (ctx->prefetch_dec_fn)
-        ctx->prefetch_dec_fn();
-    }
-
   /* Empty first branch so that every conditionally compiled back end
      below can be written as a uniform "else if".  */
   if (0)
     ;
 #ifdef USE_AESNI
   else if (ctx->use_aesni)
     {
-      _gcry_aes_aesni_ocb_crypt (c, outbuf, inbuf, nblocks, encrypt);
-      burn_depth = 0;
+      return _gcry_aes_aesni_ocb_crypt (c, outbuf, inbuf, nblocks, encrypt);
     }
 #endif /*USE_AESNI*/
 #ifdef USE_SSSE3
   else if (ctx->use_ssse3)
     {
-      _gcry_aes_ssse3_ocb_crypt (c, outbuf, inbuf, nblocks, encrypt);
-      burn_depth = 0;
+      return _gcry_aes_ssse3_ocb_crypt (c, outbuf, inbuf, nblocks, encrypt);
     }
 #endif /*USE_SSSE3*/
 #ifdef USE_ARM_CE
   else if (ctx->use_arm_ce)
     {
-      _gcry_aes_armv8_ce_ocb_crypt (c, outbuf, inbuf, nblocks, encrypt);
-      burn_depth = 0;
+      return _gcry_aes_armv8_ce_ocb_crypt (c, outbuf, inbuf, nblocks, encrypt);
     }
 #endif /*USE_ARM_CE*/
   else if (encrypt)
     {
       /* Generic C encryption path, one block per iteration.
          NOTE(review): unlike _gcry_aes_ocb_auth, l_tmp is not wiped
          after the loop here - confirm whether that is intended.  */
       union { unsigned char x1[16] ATTR_ALIGNED_16; u32 x32[4]; } l_tmp;
       rijndael_cryptfn_t encrypt_fn = ctx->encrypt_fn;

+      if (ctx->prefetch_enc_fn)
+        ctx->prefetch_enc_fn();
+
       for ( ;nblocks; nblocks-- )
         {
           /* The running data block counter selects the L value.  */
           u64 i = ++c->u_mode.ocb.data_nblocks;
           const unsigned char *l = ocb_get_l(c, i);

           /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
           buf_xor_1 (c->u_iv.iv, l, BLOCKSIZE);
           buf_cpy (l_tmp.x1, inbuf, BLOCKSIZE);
           /* Checksum_i = Checksum_{i-1} xor P_i */
           buf_xor_1 (c->u_ctr.ctr, l_tmp.x1, BLOCKSIZE);
           /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
           buf_xor_1 (l_tmp.x1, c->u_iv.iv, BLOCKSIZE);
           /* The block function's return value is kept and later
              handed to _gcry_burn_stack.  */
           burn_depth = encrypt_fn (ctx, l_tmp.x1, l_tmp.x1);
           buf_xor_1 (l_tmp.x1, c->u_iv.iv, BLOCKSIZE);
           buf_cpy (outbuf, l_tmp.x1, BLOCKSIZE);

           inbuf += BLOCKSIZE;
           outbuf += BLOCKSIZE;
         }
     }
   else
     {
       /* Generic C decryption path; the checksum is updated with the
          block only after it has been decrypted.  */
       union { unsigned char x1[16] ATTR_ALIGNED_16; u32 x32[4]; } l_tmp;
       rijndael_cryptfn_t decrypt_fn = ctx->decrypt_fn;

+      check_decryption_preparation (ctx);
+
+      if (ctx->prefetch_dec_fn)
+        ctx->prefetch_dec_fn();
+
       for ( ;nblocks; nblocks-- )
         {
           u64 i = ++c->u_mode.ocb.data_nblocks;
           const unsigned char *l = ocb_get_l(c, i);

           /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
           buf_xor_1 (c->u_iv.iv, l, BLOCKSIZE);
           buf_cpy (l_tmp.x1, inbuf, BLOCKSIZE);
           /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
           buf_xor_1 (l_tmp.x1, c->u_iv.iv, BLOCKSIZE);
           burn_depth = decrypt_fn (ctx, l_tmp.x1, l_tmp.x1);
           buf_xor_1 (l_tmp.x1, c->u_iv.iv, BLOCKSIZE);
           /* Checksum_i = Checksum_{i-1} xor P_i */
           buf_xor_1 (c->u_ctr.ctr, l_tmp.x1, BLOCKSIZE);
           buf_cpy (outbuf, l_tmp.x1, BLOCKSIZE);

           inbuf += BLOCKSIZE;
           outbuf += BLOCKSIZE;
         }
     }

   /* Only the generic paths can leave a non-zero burn_depth.  */
   if (burn_depth)
     _gcry_burn_stack (burn_depth + 4 * sizeof(void *));

   return 0;
 }

 /* Bulk authentication of complete blocks in OCB mode. */
 /* ABUF_ARG supplies NBLOCKS blocks of BLOCKSIZE bytes; the generic
    path folds each of them into c->u_mode.ocb.aad_sum and returns 0.
    The patch moves the prefetch hook into the generic branch and lets
    the accelerated branches return their back end's result.  */
 size_t
 _gcry_aes_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks)
 {
   RIJNDAEL_context *ctx = (void *)&c->context.c;
   const unsigned char *abuf = abuf_arg;
   unsigned int burn_depth = 0;

-  if (ctx->prefetch_enc_fn)
-    ctx->prefetch_enc_fn();
-
   if (0)
     ;
 #ifdef USE_AESNI
   else if (ctx->use_aesni)
     {
-      _gcry_aes_aesni_ocb_auth (c, abuf, nblocks);
-      burn_depth = 0;
+      return _gcry_aes_aesni_ocb_auth (c, abuf, nblocks);
     }
 #endif /*USE_AESNI*/
 #ifdef USE_SSSE3
   else if (ctx->use_ssse3)
     {
-      _gcry_aes_ssse3_ocb_auth (c, abuf, nblocks);
-      burn_depth = 0;
+      return _gcry_aes_ssse3_ocb_auth (c, abuf, nblocks);
     }
 #endif /*USE_SSSE3*/
 #ifdef USE_ARM_CE
   else if (ctx->use_arm_ce)
     {
-      _gcry_aes_armv8_ce_ocb_auth (c, abuf, nblocks);
-      burn_depth = 0;
+      return _gcry_aes_armv8_ce_ocb_auth (c, abuf, nblocks);
     }
 #endif /*USE_ARM_CE*/
   else
     {
       union { unsigned char x1[16] ATTR_ALIGNED_16; u32 x32[4]; } l_tmp;
       rijndael_cryptfn_t encrypt_fn = ctx->encrypt_fn;

+      if (ctx->prefetch_enc_fn)
+        ctx->prefetch_enc_fn();
+
       for ( ;nblocks; nblocks-- )
         {
           /* The AAD block counter is separate from the data counter.  */
           u64 i = ++c->u_mode.ocb.aad_nblocks;
           const unsigned char *l = ocb_get_l(c, i);

           /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
           buf_xor_1 (c->u_mode.ocb.aad_offset, l, BLOCKSIZE);
           /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
           buf_xor (l_tmp.x1, c->u_mode.ocb.aad_offset, abuf, BLOCKSIZE);
           burn_depth = encrypt_fn (ctx, l_tmp.x1, l_tmp.x1);
           buf_xor_1 (c->u_mode.ocb.aad_sum, l_tmp.x1, BLOCKSIZE);

           abuf += BLOCKSIZE;
         }

       /* Clear the temporary block before leaving the branch.  */
       wipememory(&l_tmp, sizeof(l_tmp));
     }

   if (burn_depth)
     _gcry_burn_stack (burn_depth + 4 * sizeof(void *));

   return 0;
 }

 /* Bulk encryption/decryption of complete blocks in XTS mode. */
 /* CONTEXT is used as the RIJNDAEL_context.  TWEAK points to 16 bytes
    that are read as two little-endian 64-bit words and, on the generic
    path, written back with the tweak for the block after the last one
    processed.  Blocks are GCRY_XTS_BLOCK_LEN bytes each.  The patch
    moves the choice of crypt_fn, the prefetch hook and
    check_decryption_preparation into the generic branch, and makes the
    AES-NI and ARMv8-CE branches return right after their back end.  */
 void
 _gcry_aes_xts_crypt (void *context, unsigned char *tweak,
                      void *outbuf_arg, const void *inbuf_arg,
                      size_t nblocks, int encrypt)
 {
   RIJNDAEL_context *ctx = context;
   unsigned char *outbuf = outbuf_arg;
   const unsigned char *inbuf = inbuf_arg;
   unsigned int burn_depth = 0;
   rijndael_cryptfn_t crypt_fn;
   u64 tweak_lo, tweak_hi, tweak_next_lo, tweak_next_hi, tmp_lo, tmp_hi, carry;

-  if (encrypt)
-    {
-      if (ctx->prefetch_enc_fn)
-        ctx->prefetch_enc_fn();
-
-      crypt_fn = ctx->encrypt_fn;
-    }
-  else
-    {
-      check_decryption_preparation (ctx);
-
-      if (ctx->prefetch_dec_fn)
-        ctx->prefetch_dec_fn();
-
-      crypt_fn = ctx->decrypt_fn;
-    }
-
   if (0)
     ;
 #ifdef USE_AESNI
   else if (ctx->use_aesni)
     {
       _gcry_aes_aesni_xts_crypt (ctx, tweak, outbuf, inbuf, nblocks, encrypt);
-      burn_depth = 0;
+      return;
     }
 #endif /*USE_AESNI*/
 #ifdef USE_ARM_CE
   else if (ctx->use_arm_ce)
     {
       _gcry_aes_armv8_ce_xts_crypt (ctx, tweak, outbuf, inbuf, nblocks, encrypt);
-      burn_depth = 0;
+      return;
     }
 #endif /*USE_ARM_CE*/
   else
     {
+      if (encrypt)
+        {
+          if (ctx->prefetch_enc_fn)
+            ctx->prefetch_enc_fn();
+
+          crypt_fn = ctx->encrypt_fn;
+        }
+      else
+        {
+          check_decryption_preparation (ctx);
+
+          if (ctx->prefetch_dec_fn)
+            ctx->prefetch_dec_fn();
+
+          crypt_fn = ctx->decrypt_fn;
+        }
+
       tweak_next_lo = buf_get_le64 (tweak + 0);
       tweak_next_hi = buf_get_le64 (tweak + 8);

       while (nblocks)
         {
           tweak_lo = tweak_next_lo;
           tweak_hi = tweak_next_hi;

           /* Xor-Encrypt/Decrypt-Xor block. */
           tmp_lo = buf_get_le64 (inbuf + 0) ^ tweak_lo;
           tmp_hi = buf_get_le64 (inbuf + 8) ^ tweak_hi;

           buf_put_le64 (outbuf + 0, tmp_lo);
           buf_put_le64 (outbuf + 8, tmp_hi);

           /* Generate next tweak. */
           /* The 128-bit value hi:lo is shifted left by one bit; when
              the bit shifted out of the top was set, 0x87 is xored
              into the low word.  */
           carry = -(tweak_next_hi >> 63) & 0x87;
           tweak_next_hi = (tweak_next_hi << 1) + (tweak_next_lo >> 63);
           tweak_next_lo = (tweak_next_lo << 1) ^ carry;

           /* The masked block already sits in outbuf, so the cipher
              runs in place there.  */
           burn_depth = crypt_fn (ctx, outbuf, outbuf);

           buf_put_le64 (outbuf + 0, buf_get_le64 (outbuf + 0) ^ tweak_lo);
           buf_put_le64 (outbuf + 8, buf_get_le64 (outbuf + 8) ^ tweak_hi);

           outbuf += GCRY_XTS_BLOCK_LEN;
           inbuf += GCRY_XTS_BLOCK_LEN;
           nblocks--;
         }

       /* Hand the follow-up tweak back to the caller.  */
       buf_put_le64 (tweak + 0, tweak_next_lo);
       buf_put_le64 (tweak + 8, tweak_next_hi);
     }

   if (burn_depth)
     _gcry_burn_stack (burn_depth + 5 * sizeof(void *));
 }

 /* Run the self-tests for AES 128. Returns NULL on success. */
 /* One block is encrypted with a fixed key and compared with the
    expected ciphertext, then decrypted in place and compared with the
    plaintext.  A static error string is returned on allocation failure
    or on the first mismatch.  */
 static const char*
 selftest_basic_128 (void)
 {
   RIJNDAEL_context *ctx;
   unsigned char *ctxmem;
   unsigned char scratch[16];

   /* The test vectors are from the AES supplied ones; more or less
      randomly taken from ecb_tbl.txt (I=42,81,14) */
 #if 1
   static const unsigned char plaintext_128[16] =
     {
       0x01,0x4B,0xAF,0x22,0x78,0xA6,0x9D,0x33,
       0x1D,0x51,0x80,0x10,0x36,0x43,0xE9,0x9A
     };
   static const unsigned char key_128[16] =
     {
       0xE8,0xE9,0xEA,0xEB,0xED,0xEE,0xEF,0xF0,
       0xF2,0xF3,0xF4,0xF5,0xF7,0xF8,0xF9,0xFA
     };
   static const unsigned char ciphertext_128[16] =
     {
       0x67,0x43,0xC3,0xD1,0x51,0x9A,0xB4,0xF2,
       0xCD,0x9A,0x78,0xAB,0x09,0xA5,0x11,0xBD
     };
 #else
   /* Test vectors from fips-197, appendix C. */
 # warning debug test vectors in use
   static const unsigned char plaintext_128[16] =
     {
       0x00,0x11,0x22,0x33,0x44,0x55,0x66,0x77,
       0x88,0x99,0xaa,0xbb,0xcc,0xdd,0xee,0xff
     };
   static const unsigned char key_128[16] =
     {
       0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
       0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f
       /* 0x2b, 0x7e, 0x15, 0x16, 0x28, 0xae, 0xd2, 0xa6, */
       /* 0xab, 0xf7, 0x15, 0x88, 0x09, 0xcf, 0x4f, 0x3c */
     };
   static const unsigned char ciphertext_128[16] =
     {
       0x69,0xc4,0xe0,0xd8,0x6a,0x7b,0x04,0x30,
       0xd8,0xcd,0xb7,0x80,0x70,0xb4,0xc5,0x5a
     };
 #endif

   /* Because gcc/ld can only align the CTX struct on 8 bytes on the
      stack, we need to allocate that context on the heap. */
   ctx = _gcry_cipher_selftest_alloc_ctx (sizeof *ctx, &ctxmem);
   if (!ctx)
     return "failed to allocate memory";

   rijndael_setkey (ctx, key_128, sizeof (key_128), NULL);
   rijndael_encrypt (ctx, scratch, plaintext_128);
   if (memcmp (scratch, ciphertext_128, sizeof (ciphertext_128)))
     {
       xfree (ctxmem);
       return "AES-128 test encryption failed.";
     }
   rijndael_decrypt (ctx, scratch, scratch);
   /* The context is no longer needed once the block is decrypted.  */
   xfree (ctxmem);
   if (memcmp (scratch, plaintext_128, sizeof (plaintext_128)))
     return "AES-128 test decryption failed.";

   return NULL;
 }


 /* Run the self-tests for AES 192. Returns NULL on success.
*/
/* Known-answer test for AES-192.  A heap-allocated context is keyed
   with a fixed 24-byte key, one block is encrypted and compared with
   the expected ciphertext, and the result is decrypted in place and
   compared with the plaintext.  Returns a static error string when the
   context cannot be allocated or when either comparison fails, NULL
   otherwise.  */
static const char*
selftest_basic_192 (void)
{
  RIJNDAEL_context *ctx;
  unsigned char *ctxmem;
  unsigned char scratch[16];
  const char *errtxt = NULL;

  /* The vectors are only ever read, so they are const-qualified just
     like the vectors of selftest_basic_128.  */
  static const unsigned char plaintext_192[16] =
    {
      0x76,0x77,0x74,0x75,0xF1,0xF2,0xF3,0xF4,
      0xF8,0xF9,0xE6,0xE7,0x77,0x70,0x71,0x72
    };
  static const unsigned char key_192[24] =
    {
      0x04,0x05,0x06,0x07,0x09,0x0A,0x0B,0x0C,
      0x0E,0x0F,0x10,0x11,0x13,0x14,0x15,0x16,
      0x18,0x19,0x1A,0x1B,0x1D,0x1E,0x1F,0x20
    };
  static const unsigned char ciphertext_192[16] =
    {
      0x5D,0x1E,0xF2,0x0D,0xCE,0xD6,0xBC,0xBC,
      0x12,0x13,0x1A,0xC7,0xC5,0x47,0x88,0xAA
    };

  ctx = _gcry_cipher_selftest_alloc_ctx (sizeof *ctx, &ctxmem);
  if (!ctx)
    return "failed to allocate memory";

  rijndael_setkey (ctx, key_192, sizeof(key_192), NULL);
  rijndael_encrypt (ctx, scratch, plaintext_192);
  if (memcmp (scratch, ciphertext_192, sizeof (ciphertext_192)))
    errtxt = "AES-192 test encryption failed.";
  else
    {
      /* Decrypt in place; SCRATCH lives on the stack, so it can still
         be compared after the context has been released.  */
      rijndael_decrypt (ctx, scratch, scratch);
      if (memcmp (scratch, plaintext_192, sizeof (plaintext_192)))
        errtxt = "AES-192 test decryption failed.";
    }

  /* Single release point for the context memory.  */
  xfree (ctxmem);
  return errtxt;
}


/* Run the self-tests for AES 256. Returns NULL on success.
*/ static const char* selftest_basic_256 (void) { RIJNDAEL_context *ctx; unsigned char *ctxmem; unsigned char scratch[16]; static unsigned char plaintext_256[16] = { 0x06,0x9A,0x00,0x7F,0xC7,0x6A,0x45,0x9F, 0x98,0xBA,0xF9,0x17,0xFE,0xDF,0x95,0x21 }; static unsigned char key_256[32] = { 0x08,0x09,0x0A,0x0B,0x0D,0x0E,0x0F,0x10, 0x12,0x13,0x14,0x15,0x17,0x18,0x19,0x1A, 0x1C,0x1D,0x1E,0x1F,0x21,0x22,0x23,0x24, 0x26,0x27,0x28,0x29,0x2B,0x2C,0x2D,0x2E }; static const unsigned char ciphertext_256[16] = { 0x08,0x0E,0x95,0x17,0xEB,0x16,0x77,0x71, 0x9A,0xCF,0x72,0x80,0x86,0x04,0x0A,0xE3 }; ctx = _gcry_cipher_selftest_alloc_ctx (sizeof *ctx, &ctxmem); if (!ctx) return "failed to allocate memory"; rijndael_setkey (ctx, key_256, sizeof(key_256), NULL); rijndael_encrypt (ctx, scratch, plaintext_256); if (memcmp (scratch, ciphertext_256, sizeof (ciphertext_256))) { xfree (ctxmem); return "AES-256 test encryption failed."; } rijndael_decrypt (ctx, scratch, scratch); xfree (ctxmem); if (memcmp (scratch, plaintext_256, sizeof (plaintext_256))) return "AES-256 test decryption failed."; return NULL; } /* Run the self-tests for AES-CTR-128, tests IV increment of bulk CTR encryption. Returns NULL on success. */ static const char* selftest_ctr_128 (void) { const int nblocks = 8+1; const int blocksize = BLOCKSIZE; const int context_size = sizeof(RIJNDAEL_context); return _gcry_selftest_helper_ctr("AES", &rijndael_setkey, &rijndael_encrypt, &_gcry_aes_ctr_enc, nblocks, blocksize, context_size); } /* Run the self-tests for AES-CBC-128, tests bulk CBC decryption. Returns NULL on success. */ static const char* selftest_cbc_128 (void) { const int nblocks = 8+2; const int blocksize = BLOCKSIZE; const int context_size = sizeof(RIJNDAEL_context); return _gcry_selftest_helper_cbc("AES", &rijndael_setkey, &rijndael_encrypt, &_gcry_aes_cbc_dec, nblocks, blocksize, context_size); } /* Run the self-tests for AES-CFB-128, tests bulk CFB decryption. Returns NULL on success. 
*/ static const char* selftest_cfb_128 (void) { const int nblocks = 8+2; const int blocksize = BLOCKSIZE; const int context_size = sizeof(RIJNDAEL_context); return _gcry_selftest_helper_cfb("AES", &rijndael_setkey, &rijndael_encrypt, &_gcry_aes_cfb_dec, nblocks, blocksize, context_size); } /* Run all the self-tests and return NULL on success. This function is used for the on-the-fly self-tests. */ static const char * selftest (void) { const char *r; if ( (r = selftest_basic_128 ()) || (r = selftest_basic_192 ()) || (r = selftest_basic_256 ()) ) return r; if ( (r = selftest_ctr_128 ()) ) return r; if ( (r = selftest_cbc_128 ()) ) return r; if ( (r = selftest_cfb_128 ()) ) return r; return r; } /* SP800-38a.pdf for AES-128. */ static const char * selftest_fips_128_38a (int requested_mode) { static const struct tv { int mode; const unsigned char key[16]; const unsigned char iv[16]; struct { const unsigned char input[16]; const unsigned char output[16]; } data[4]; } tv[2] = { { GCRY_CIPHER_MODE_CFB, /* F.3.13, CFB128-AES128 */ { 0x2b, 0x7e, 0x15, 0x16, 0x28, 0xae, 0xd2, 0xa6, 0xab, 0xf7, 0x15, 0x88, 0x09, 0xcf, 0x4f, 0x3c }, { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f }, { { { 0x6b, 0xc1, 0xbe, 0xe2, 0x2e, 0x40, 0x9f, 0x96, 0xe9, 0x3d, 0x7e, 0x11, 0x73, 0x93, 0x17, 0x2a }, { 0x3b, 0x3f, 0xd9, 0x2e, 0xb7, 0x2d, 0xad, 0x20, 0x33, 0x34, 0x49, 0xf8, 0xe8, 0x3c, 0xfb, 0x4a } }, { { 0xae, 0x2d, 0x8a, 0x57, 0x1e, 0x03, 0xac, 0x9c, 0x9e, 0xb7, 0x6f, 0xac, 0x45, 0xaf, 0x8e, 0x51 }, { 0xc8, 0xa6, 0x45, 0x37, 0xa0, 0xb3, 0xa9, 0x3f, 0xcd, 0xe3, 0xcd, 0xad, 0x9f, 0x1c, 0xe5, 0x8b } }, { { 0x30, 0xc8, 0x1c, 0x46, 0xa3, 0x5c, 0xe4, 0x11, 0xe5, 0xfb, 0xc1, 0x19, 0x1a, 0x0a, 0x52, 0xef }, { 0x26, 0x75, 0x1f, 0x67, 0xa3, 0xcb, 0xb1, 0x40, 0xb1, 0x80, 0x8c, 0xf1, 0x87, 0xa4, 0xf4, 0xdf } }, { { 0xf6, 0x9f, 0x24, 0x45, 0xdf, 0x4f, 0x9b, 0x17, 0xad, 0x2b, 0x41, 0x7b, 0xe6, 0x6c, 0x37, 0x10 }, { 0xc0, 0x4b, 0x05, 0x35, 0x7c, 0x5d, 0x1c, 
0x0e, 0xea, 0xc4, 0xc6, 0x6f, 0x9f, 0xf7, 0xf2, 0xe6 } } } }, { GCRY_CIPHER_MODE_OFB, { 0x2b, 0x7e, 0x15, 0x16, 0x28, 0xae, 0xd2, 0xa6, 0xab, 0xf7, 0x15, 0x88, 0x09, 0xcf, 0x4f, 0x3c }, { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f }, { { { 0x6b, 0xc1, 0xbe, 0xe2, 0x2e, 0x40, 0x9f, 0x96, 0xe9, 0x3d, 0x7e, 0x11, 0x73, 0x93, 0x17, 0x2a }, { 0x3b, 0x3f, 0xd9, 0x2e, 0xb7, 0x2d, 0xad, 0x20, 0x33, 0x34, 0x49, 0xf8, 0xe8, 0x3c, 0xfb, 0x4a } }, { { 0xae, 0x2d, 0x8a, 0x57, 0x1e, 0x03, 0xac, 0x9c, 0x9e, 0xb7, 0x6f, 0xac, 0x45, 0xaf, 0x8e, 0x51 }, { 0x77, 0x89, 0x50, 0x8d, 0x16, 0x91, 0x8f, 0x03, 0xf5, 0x3c, 0x52, 0xda, 0xc5, 0x4e, 0xd8, 0x25 } }, { { 0x30, 0xc8, 0x1c, 0x46, 0xa3, 0x5c, 0xe4, 0x11, 0xe5, 0xfb, 0xc1, 0x19, 0x1a, 0x0a, 0x52, 0xef }, { 0x97, 0x40, 0x05, 0x1e, 0x9c, 0x5f, 0xec, 0xf6, 0x43, 0x44, 0xf7, 0xa8, 0x22, 0x60, 0xed, 0xcc } }, { { 0xf6, 0x9f, 0x24, 0x45, 0xdf, 0x4f, 0x9b, 0x17, 0xad, 0x2b, 0x41, 0x7b, 0xe6, 0x6c, 0x37, 0x10 }, { 0x30, 0x4c, 0x65, 0x28, 0xf6, 0x59, 0xc7, 0x78, 0x66, 0xa5, 0x10, 0xd9, 0xc1, 0xd6, 0xae, 0x5e } }, } } }; unsigned char scratch[16]; gpg_error_t err; int tvi, idx; gcry_cipher_hd_t hdenc = NULL; gcry_cipher_hd_t hddec = NULL; #define Fail(a) do { \ _gcry_cipher_close (hdenc); \ _gcry_cipher_close (hddec); \ return a; \ } while (0) gcry_assert (sizeof tv[0].data[0].input == sizeof scratch); gcry_assert (sizeof tv[0].data[0].output == sizeof scratch); for (tvi=0; tvi < DIM (tv); tvi++) if (tv[tvi].mode == requested_mode) break; if (tvi == DIM (tv)) Fail ("no test data for this mode"); err = _gcry_cipher_open (&hdenc, GCRY_CIPHER_AES, tv[tvi].mode, 0); if (err) Fail ("open"); err = _gcry_cipher_open (&hddec, GCRY_CIPHER_AES, tv[tvi].mode, 0); if (err) Fail ("open"); err = _gcry_cipher_setkey (hdenc, tv[tvi].key, sizeof tv[tvi].key); if (!err) err = _gcry_cipher_setkey (hddec, tv[tvi].key, sizeof tv[tvi].key); if (err) Fail ("set key"); err = _gcry_cipher_setiv (hdenc, tv[tvi].iv, 
sizeof tv[tvi].iv); if (!err) err = _gcry_cipher_setiv (hddec, tv[tvi].iv, sizeof tv[tvi].iv); if (err) Fail ("set IV"); for (idx=0; idx < DIM (tv[tvi].data); idx++) { err = _gcry_cipher_encrypt (hdenc, scratch, sizeof scratch, tv[tvi].data[idx].input, sizeof tv[tvi].data[idx].input); if (err) Fail ("encrypt command"); if (memcmp (scratch, tv[tvi].data[idx].output, sizeof scratch)) Fail ("encrypt mismatch"); err = _gcry_cipher_decrypt (hddec, scratch, sizeof scratch, tv[tvi].data[idx].output, sizeof tv[tvi].data[idx].output); if (err) Fail ("decrypt command"); if (memcmp (scratch, tv[tvi].data[idx].input, sizeof scratch)) Fail ("decrypt mismatch"); } #undef Fail _gcry_cipher_close (hdenc); _gcry_cipher_close (hddec); return NULL; } /* Complete selftest for AES-128 with all modes and driver code. */ static gpg_err_code_t selftest_fips_128 (int extended, selftest_report_func_t report) { const char *what; const char *errtxt; what = "low-level"; errtxt = selftest_basic_128 (); if (errtxt) goto failed; if (extended) { what = "cfb"; errtxt = selftest_fips_128_38a (GCRY_CIPHER_MODE_CFB); if (errtxt) goto failed; what = "ofb"; errtxt = selftest_fips_128_38a (GCRY_CIPHER_MODE_OFB); if (errtxt) goto failed; } return 0; /* Succeeded. */ failed: if (report) report ("cipher", GCRY_CIPHER_AES128, what, errtxt); return GPG_ERR_SELFTEST_FAILED; } /* Complete selftest for AES-192. */ static gpg_err_code_t selftest_fips_192 (int extended, selftest_report_func_t report) { const char *what; const char *errtxt; (void)extended; /* No extended tests available. */ what = "low-level"; errtxt = selftest_basic_192 (); if (errtxt) goto failed; return 0; /* Succeeded. */ failed: if (report) report ("cipher", GCRY_CIPHER_AES192, what, errtxt); return GPG_ERR_SELFTEST_FAILED; } /* Complete selftest for AES-256. */ static gpg_err_code_t selftest_fips_256 (int extended, selftest_report_func_t report) { const char *what; const char *errtxt; (void)extended; /* No extended tests available. 
*/ what = "low-level"; errtxt = selftest_basic_256 (); if (errtxt) goto failed; return 0; /* Succeeded. */ failed: if (report) report ("cipher", GCRY_CIPHER_AES256, what, errtxt); return GPG_ERR_SELFTEST_FAILED; } /* Run a full self-test for ALGO and return 0 on success. */ static gpg_err_code_t run_selftests (int algo, int extended, selftest_report_func_t report) { gpg_err_code_t ec; switch (algo) { case GCRY_CIPHER_AES128: ec = selftest_fips_128 (extended, report); break; case GCRY_CIPHER_AES192: ec = selftest_fips_192 (extended, report); break; case GCRY_CIPHER_AES256: ec = selftest_fips_256 (extended, report); break; default: ec = GPG_ERR_CIPHER_ALGO; break; } return ec; } static const char *rijndael_names[] = { "RIJNDAEL", "AES128", "AES-128", NULL }; static gcry_cipher_oid_spec_t rijndael_oids[] = { { "2.16.840.1.101.3.4.1.1", GCRY_CIPHER_MODE_ECB }, { "2.16.840.1.101.3.4.1.2", GCRY_CIPHER_MODE_CBC }, { "2.16.840.1.101.3.4.1.3", GCRY_CIPHER_MODE_OFB }, { "2.16.840.1.101.3.4.1.4", GCRY_CIPHER_MODE_CFB }, { NULL } }; gcry_cipher_spec_t _gcry_cipher_spec_aes = { GCRY_CIPHER_AES, {0, 1}, "AES", rijndael_names, rijndael_oids, 16, 128, sizeof (RIJNDAEL_context), rijndael_setkey, rijndael_encrypt, rijndael_decrypt, NULL, NULL, run_selftests }; static const char *rijndael192_names[] = { "RIJNDAEL192", "AES-192", NULL }; static gcry_cipher_oid_spec_t rijndael192_oids[] = { { "2.16.840.1.101.3.4.1.21", GCRY_CIPHER_MODE_ECB }, { "2.16.840.1.101.3.4.1.22", GCRY_CIPHER_MODE_CBC }, { "2.16.840.1.101.3.4.1.23", GCRY_CIPHER_MODE_OFB }, { "2.16.840.1.101.3.4.1.24", GCRY_CIPHER_MODE_CFB }, { NULL } }; gcry_cipher_spec_t _gcry_cipher_spec_aes192 = { GCRY_CIPHER_AES192, {0, 1}, "AES192", rijndael192_names, rijndael192_oids, 16, 192, sizeof (RIJNDAEL_context), rijndael_setkey, rijndael_encrypt, rijndael_decrypt, NULL, NULL, run_selftests }; static const char *rijndael256_names[] = { "RIJNDAEL256", "AES-256", NULL }; static gcry_cipher_oid_spec_t rijndael256_oids[] = { { 
"2.16.840.1.101.3.4.1.41", GCRY_CIPHER_MODE_ECB }, { "2.16.840.1.101.3.4.1.42", GCRY_CIPHER_MODE_CBC }, { "2.16.840.1.101.3.4.1.43", GCRY_CIPHER_MODE_OFB }, { "2.16.840.1.101.3.4.1.44", GCRY_CIPHER_MODE_CFB }, { NULL } }; gcry_cipher_spec_t _gcry_cipher_spec_aes256 = { GCRY_CIPHER_AES256, {0, 1}, "AES256", rijndael256_names, rijndael256_oids, 16, 256, sizeof (RIJNDAEL_context), rijndael_setkey, rijndael_encrypt, rijndael_decrypt, NULL, NULL, run_selftests };