diff --git a/cipher/rijndael-aesni.c b/cipher/rijndael-aesni.c
index 747ef662..95ec4c2b 100644
--- a/cipher/rijndael-aesni.c
+++ b/cipher/rijndael-aesni.c
@@ -1,3944 +1,3965 @@
/* AES-NI accelerated AES for Libgcrypt
 * Copyright (C) 2000, 2001, 2002, 2003, 2007,
 *               2008, 2011, 2012 Free Software Foundation, Inc.
 *
 * This file is part of Libgcrypt.
 *
 * Libgcrypt is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * Libgcrypt is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, see <http://www.gnu.org/licenses/>.
 */

#include <config.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h> /* for memcmp() */

#include "types.h"  /* for byte and u32 typedefs */
#include "g10lib.h"
#include "cipher.h"
#include "bufhelp.h"
#include "cipher-selftest.h"
#include "rijndael-internal.h"
#include "./cipher-internal.h"


#ifdef USE_AESNI


#if _GCRY_GCC_VERSION >= 40400 /* 4.4 */
/* Prevent compiler from issuing SSE instructions between asm blocks. */
# pragma GCC target("no-sse")
#endif
#if __clang__
# pragma clang attribute push (__attribute__((target("no-sse"))), apply_to = function)
#endif

#define ALWAYS_INLINE inline __attribute__((always_inline))
#define NO_INLINE __attribute__((noinline))
#define NO_INSTRUMENT_FUNCTION __attribute__((no_instrument_function))

#define ASM_FUNC_ATTR          NO_INSTRUMENT_FUNCTION
#define ASM_FUNC_ATTR_INLINE   ASM_FUNC_ATTR ALWAYS_INLINE
#define ASM_FUNC_ATTR_NOINLINE ASM_FUNC_ATTR NO_INLINE


typedef struct u128_s
{
  u32 a, b, c, d;
} __attribute__((packed, aligned(1), may_alias)) u128_t;


/* Copy of ocb_get_l needed here as GCC is unable to inline ocb_get_l
   because of 'pragma target'. */
static ASM_FUNC_ATTR_INLINE const unsigned char *
aes_ocb_get_l (gcry_cipher_hd_t c, u64 n)
{
  unsigned long ntz;

  /* Assumes that N != 0. */
  asm ("rep;bsfl %k[low], %k[ntz]\n\t"
       : [ntz] "=r" (ntz)
       : [low] "r" ((unsigned long)n)
       : "cc");

  return c->u_mode.ocb.L[ntz];
}


/* Two macros to be called prior to and after the use of AESNI
   instructions.  There should be no external function calls between
   the use of these macros.  Their purpose is to make sure that the
   SSE registers are cleared and won't reveal any information about
   the key or the data.  */
#ifdef __WIN64__
/* XMM6-XMM15 are callee-saved registers on WIN64.
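   The aesni_prepare_*_variable macros declare the stack buffers these
   registers are saved into.  A minimal usage sketch, mirroring the
   functions later in this file:

     aesni_prepare_2_7_variable;

     aesni_prepare ();
     aesni_prepare_2_7 ();
     ... inline asm blocks using xmm0..xmm7 ...
     aesni_cleanup ();
     aesni_cleanup_2_7 ();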
*/ # define aesni_prepare_2_7_variable char win64tmp[16 * 2] # define aesni_prepare_8_15_variable char win64tmp8_15[16 * 8] # define aesni_prepare() do { } while (0) # define aesni_prepare_2_7() \ do { asm volatile ("movdqu %%xmm6, %0\n\t" \ "movdqu %%xmm7, %1\n\t" \ : "=m" (*win64tmp), "=m" (*(win64tmp+16)) \ : \ : "memory"); \ } while (0) # define aesni_prepare_8_15() \ do { asm volatile ("movdqu %%xmm8, 0*16(%0)\n\t" \ "movdqu %%xmm9, 1*16(%0)\n\t" \ "movdqu %%xmm10, 2*16(%0)\n\t" \ "movdqu %%xmm11, 3*16(%0)\n\t" \ "movdqu %%xmm12, 4*16(%0)\n\t" \ "movdqu %%xmm13, 5*16(%0)\n\t" \ "movdqu %%xmm14, 6*16(%0)\n\t" \ "movdqu %%xmm15, 7*16(%0)\n\t" \ : \ : "r" (win64tmp8_15) \ : "memory"); \ } while (0) # define aesni_cleanup() \ do { asm volatile ("pxor %%xmm0, %%xmm0\n\t" \ "pxor %%xmm1, %%xmm1\n" :: ); \ } while (0) # define aesni_cleanup_2_7() \ do { asm volatile ("movdqu %0, %%xmm6\n\t" \ "movdqu %1, %%xmm7\n\t" \ "pxor %%xmm2, %%xmm2\n" \ "pxor %%xmm3, %%xmm3\n" \ "pxor %%xmm4, %%xmm4\n" \ "pxor %%xmm5, %%xmm5\n" \ : \ : "m" (*win64tmp), "m" (*(win64tmp+16)) \ : "memory"); \ } while (0) # define aesni_cleanup_8_15() \ do { asm volatile ("movdqu 0*16(%0), %%xmm8\n\t" \ "movdqu 1*16(%0), %%xmm9\n\t" \ "movdqu 2*16(%0), %%xmm10\n\t" \ "movdqu 3*16(%0), %%xmm11\n\t" \ "movdqu 4*16(%0), %%xmm12\n\t" \ "movdqu 5*16(%0), %%xmm13\n\t" \ "movdqu 6*16(%0), %%xmm14\n\t" \ "movdqu 7*16(%0), %%xmm15\n\t" \ : \ : "r" (win64tmp8_15) \ : "memory"); \ } while (0) #else # define aesni_prepare_2_7_variable # define aesni_prepare() do { } while (0) # define aesni_prepare_2_7() do { } while (0) # define aesni_cleanup() \ do { asm volatile ("pxor %%xmm0, %%xmm0\n\t" \ "pxor %%xmm1, %%xmm1\n" :: ); \ } while (0) # define aesni_cleanup_2_7() \ do { asm volatile ("pxor %%xmm7, %%xmm7\n\t" \ "pxor %%xmm2, %%xmm2\n\t" \ "pxor %%xmm3, %%xmm3\n" \ "pxor %%xmm4, %%xmm4\n" \ "pxor %%xmm5, %%xmm5\n" \ "pxor %%xmm6, %%xmm6\n":: ); \ } while (0) # ifdef __x86_64__ # define aesni_prepare_8_15_variable # define aesni_prepare_8_15() do { } while (0) # define aesni_cleanup_8_15() \ do { asm volatile ("pxor %%xmm8, %%xmm8\n" \ "pxor %%xmm9, %%xmm9\n" \ "pxor %%xmm10, %%xmm10\n" \ "pxor %%xmm11, %%xmm11\n" \ "pxor %%xmm12, %%xmm12\n" \ "pxor %%xmm13, %%xmm13\n" \ "pxor %%xmm14, %%xmm14\n" \ "pxor %%xmm15, %%xmm15\n":: ); \ } while (0) # endif #endif void ASM_FUNC_ATTR _gcry_aes_aesni_do_setkey (RIJNDAEL_context *ctx, const byte *key) { aesni_prepare_2_7_variable; aesni_prepare(); aesni_prepare_2_7(); if (ctx->rounds < 12) { /* 128-bit key */ #define AESKEYGENASSIST_xmm1_xmm2(imm8) \ ".byte 0x66, 0x0f, 0x3a, 0xdf, 0xd1, " #imm8 " \n\t" #define AESKEY_EXPAND128 \ "pshufd $0xff, %%xmm2, %%xmm2\n\t" \ "movdqa %%xmm1, %%xmm3\n\t" \ "pslldq $4, %%xmm3\n\t" \ "pxor %%xmm3, %%xmm1\n\t" \ "pslldq $4, %%xmm3\n\t" \ "pxor %%xmm3, %%xmm1\n\t" \ "pslldq $4, %%xmm3\n\t" \ "pxor %%xmm3, %%xmm2\n\t" \ "pxor %%xmm2, %%xmm1\n\t" asm volatile ("movdqu (%[key]), %%xmm1\n\t" /* xmm1 := key */ "movdqa %%xmm1, (%[ksch])\n\t" /* ksch[0] := xmm1 */ AESKEYGENASSIST_xmm1_xmm2(0x01) AESKEY_EXPAND128 "movdqa %%xmm1, 0x10(%[ksch])\n\t" /* ksch[1] := xmm1 */ AESKEYGENASSIST_xmm1_xmm2(0x02) AESKEY_EXPAND128 "movdqa %%xmm1, 0x20(%[ksch])\n\t" /* ksch[2] := xmm1 */ AESKEYGENASSIST_xmm1_xmm2(0x04) AESKEY_EXPAND128 "movdqa %%xmm1, 0x30(%[ksch])\n\t" /* ksch[3] := xmm1 */ AESKEYGENASSIST_xmm1_xmm2(0x08) AESKEY_EXPAND128 "movdqa %%xmm1, 0x40(%[ksch])\n\t" /* ksch[4] := xmm1 */ AESKEYGENASSIST_xmm1_xmm2(0x10) AESKEY_EXPAND128 "movdqa %%xmm1, 0x50(%[ksch])\n\t" /* 
ksch[5] := xmm1 */ AESKEYGENASSIST_xmm1_xmm2(0x20) AESKEY_EXPAND128 "movdqa %%xmm1, 0x60(%[ksch])\n\t" /* ksch[6] := xmm1 */ AESKEYGENASSIST_xmm1_xmm2(0x40) AESKEY_EXPAND128 "movdqa %%xmm1, 0x70(%[ksch])\n\t" /* ksch[7] := xmm1 */ AESKEYGENASSIST_xmm1_xmm2(0x80) AESKEY_EXPAND128 "movdqa %%xmm1, 0x80(%[ksch])\n\t" /* ksch[8] := xmm1 */ AESKEYGENASSIST_xmm1_xmm2(0x1b) AESKEY_EXPAND128 "movdqa %%xmm1, 0x90(%[ksch])\n\t" /* ksch[9] := xmm1 */ AESKEYGENASSIST_xmm1_xmm2(0x36) AESKEY_EXPAND128 "movdqa %%xmm1, 0xa0(%[ksch])\n\t" /* ksch[10] := xmm1 */ : : [key] "r" (key), [ksch] "r" (ctx->keyschenc) : "cc", "memory" ); #undef AESKEYGENASSIST_xmm1_xmm2 #undef AESKEY_EXPAND128 } else if (ctx->rounds == 12) { /* 192-bit key */ #define AESKEYGENASSIST_xmm3_xmm2(imm8) \ ".byte 0x66, 0x0f, 0x3a, 0xdf, 0xd3, " #imm8 " \n\t" #define AESKEY_EXPAND192 \ "pshufd $0x55, %%xmm2, %%xmm2\n\t" \ "movdqu %%xmm1, %%xmm4\n\t" \ "pslldq $4, %%xmm4\n\t" \ "pxor %%xmm4, %%xmm1\n\t" \ "pslldq $4, %%xmm4\n\t" \ "pxor %%xmm4, %%xmm1\n\t" \ "pslldq $4, %%xmm4\n\t" \ "pxor %%xmm4, %%xmm1\n\t" \ "pxor %%xmm2, %%xmm1\n\t" \ "pshufd $0xff, %%xmm1, %%xmm2\n\t" \ "movdqu %%xmm3, %%xmm4\n\t" \ "pslldq $4, %%xmm4\n\t" \ "pxor %%xmm4, %%xmm3\n\t" \ "pxor %%xmm2, %%xmm3\n\t" asm volatile ("movdqu (%[key]), %%xmm1\n\t" /* xmm1 := key[0..15] */ "movq 16(%[key]), %%xmm3\n\t" /* xmm3 := key[16..23] */ "movdqa %%xmm1, (%[ksch])\n\t" /* ksch[0] := xmm1 */ "movdqa %%xmm3, %%xmm5\n\t" AESKEYGENASSIST_xmm3_xmm2(0x01) AESKEY_EXPAND192 "shufpd $0, %%xmm1, %%xmm5\n\t" "movdqa %%xmm5, 0x10(%[ksch])\n\t" /* ksch[1] := xmm5 */ "movdqa %%xmm1, %%xmm6\n\t" "shufpd $1, %%xmm3, %%xmm6\n\t" "movdqa %%xmm6, 0x20(%[ksch])\n\t" /* ksch[2] := xmm6 */ AESKEYGENASSIST_xmm3_xmm2(0x02) AESKEY_EXPAND192 "movdqa %%xmm1, 0x30(%[ksch])\n\t" /* ksch[3] := xmm1 */ "movdqa %%xmm3, %%xmm5\n\t" AESKEYGENASSIST_xmm3_xmm2(0x04) AESKEY_EXPAND192 "shufpd $0, %%xmm1, %%xmm5\n\t" "movdqa %%xmm5, 0x40(%[ksch])\n\t" /* ksch[4] := xmm5 */ "movdqa %%xmm1, %%xmm6\n\t" "shufpd $1, %%xmm3, %%xmm6\n\t" "movdqa %%xmm6, 0x50(%[ksch])\n\t" /* ksch[5] := xmm6 */ AESKEYGENASSIST_xmm3_xmm2(0x08) AESKEY_EXPAND192 "movdqa %%xmm1, 0x60(%[ksch])\n\t" /* ksch[6] := xmm1 */ "movdqa %%xmm3, %%xmm5\n\t" AESKEYGENASSIST_xmm3_xmm2(0x10) AESKEY_EXPAND192 "shufpd $0, %%xmm1, %%xmm5\n\t" "movdqa %%xmm5, 0x70(%[ksch])\n\t" /* ksch[7] := xmm5 */ "movdqa %%xmm1, %%xmm6\n\t" "shufpd $1, %%xmm3, %%xmm6\n\t" "movdqa %%xmm6, 0x80(%[ksch])\n\t" /* ksch[8] := xmm6 */ AESKEYGENASSIST_xmm3_xmm2(0x20) AESKEY_EXPAND192 "movdqa %%xmm1, 0x90(%[ksch])\n\t" /* ksch[9] := xmm1 */ "movdqa %%xmm3, %%xmm5\n\t" AESKEYGENASSIST_xmm3_xmm2(0x40) AESKEY_EXPAND192 "shufpd $0, %%xmm1, %%xmm5\n\t" "movdqa %%xmm5, 0xa0(%[ksch])\n\t" /* ksch[10] := xmm5 */ "movdqa %%xmm1, %%xmm6\n\t" "shufpd $1, %%xmm3, %%xmm6\n\t" "movdqa %%xmm6, 0xb0(%[ksch])\n\t" /* ksch[11] := xmm6 */ AESKEYGENASSIST_xmm3_xmm2(0x80) AESKEY_EXPAND192 "movdqa %%xmm1, 0xc0(%[ksch])\n\t" /* ksch[12] := xmm1 */ : : [key] "r" (key), [ksch] "r" (ctx->keyschenc) : "cc", "memory" ); #undef AESKEYGENASSIST_xmm3_xmm2 #undef AESKEY_EXPAND192 } else if (ctx->rounds > 12) { /* 256-bit key */ #define AESKEYGENASSIST_xmm1_xmm2(imm8) \ ".byte 0x66, 0x0f, 0x3a, 0xdf, 0xd1, " #imm8 " \n\t" #define AESKEYGENASSIST_xmm3_xmm2(imm8) \ ".byte 0x66, 0x0f, 0x3a, 0xdf, 0xd3, " #imm8 " \n\t" #define AESKEY_EXPAND256_A \ "pshufd $0xff, %%xmm2, %%xmm2\n\t" \ "movdqa %%xmm1, %%xmm4\n\t" \ "pslldq $4, %%xmm4\n\t" \ "pxor %%xmm4, %%xmm1\n\t" \ "pslldq $4, %%xmm4\n\t" \ "pxor %%xmm4, 
%%xmm1\n\t" \ "pslldq $4, %%xmm4\n\t" \ "pxor %%xmm4, %%xmm1\n\t" \ "pxor %%xmm2, %%xmm1\n\t" #define AESKEY_EXPAND256_B \ "pshufd $0xaa, %%xmm2, %%xmm2\n\t" \ "movdqa %%xmm3, %%xmm4\n\t" \ "pslldq $4, %%xmm4\n\t" \ "pxor %%xmm4, %%xmm3\n\t" \ "pslldq $4, %%xmm4\n\t" \ "pxor %%xmm4, %%xmm3\n\t" \ "pslldq $4, %%xmm4\n\t" \ "pxor %%xmm4, %%xmm3\n\t" \ "pxor %%xmm2, %%xmm3\n\t" asm volatile ("movdqu (%[key]), %%xmm1\n\t" /* xmm1 := key[0..15] */ "movdqu 16(%[key]), %%xmm3\n\t" /* xmm3 := key[16..31] */ "movdqa %%xmm1, (%[ksch])\n\t" /* ksch[0] := xmm1 */ "movdqa %%xmm3, 0x10(%[ksch])\n\t" /* ksch[1] := xmm3 */ AESKEYGENASSIST_xmm3_xmm2(0x01) AESKEY_EXPAND256_A "movdqa %%xmm1, 0x20(%[ksch])\n\t" /* ksch[2] := xmm1 */ AESKEYGENASSIST_xmm1_xmm2(0x00) AESKEY_EXPAND256_B "movdqa %%xmm3, 0x30(%[ksch])\n\t" /* ksch[3] := xmm3 */ AESKEYGENASSIST_xmm3_xmm2(0x02) AESKEY_EXPAND256_A "movdqa %%xmm1, 0x40(%[ksch])\n\t" /* ksch[4] := xmm1 */ AESKEYGENASSIST_xmm1_xmm2(0x00) AESKEY_EXPAND256_B "movdqa %%xmm3, 0x50(%[ksch])\n\t" /* ksch[5] := xmm3 */ AESKEYGENASSIST_xmm3_xmm2(0x04) AESKEY_EXPAND256_A "movdqa %%xmm1, 0x60(%[ksch])\n\t" /* ksch[6] := xmm1 */ AESKEYGENASSIST_xmm1_xmm2(0x00) AESKEY_EXPAND256_B "movdqa %%xmm3, 0x70(%[ksch])\n\t" /* ksch[7] := xmm3 */ AESKEYGENASSIST_xmm3_xmm2(0x08) AESKEY_EXPAND256_A "movdqa %%xmm1, 0x80(%[ksch])\n\t" /* ksch[8] := xmm1 */ AESKEYGENASSIST_xmm1_xmm2(0x00) AESKEY_EXPAND256_B "movdqa %%xmm3, 0x90(%[ksch])\n\t" /* ksch[9] := xmm3 */ AESKEYGENASSIST_xmm3_xmm2(0x10) AESKEY_EXPAND256_A "movdqa %%xmm1, 0xa0(%[ksch])\n\t" /* ksch[10] := xmm1 */ AESKEYGENASSIST_xmm1_xmm2(0x00) AESKEY_EXPAND256_B "movdqa %%xmm3, 0xb0(%[ksch])\n\t" /* ksch[11] := xmm3 */ AESKEYGENASSIST_xmm3_xmm2(0x20) AESKEY_EXPAND256_A "movdqa %%xmm1, 0xc0(%[ksch])\n\t" /* ksch[12] := xmm1 */ AESKEYGENASSIST_xmm1_xmm2(0x00) AESKEY_EXPAND256_B "movdqa %%xmm3, 0xd0(%[ksch])\n\t" /* ksch[13] := xmm3 */ AESKEYGENASSIST_xmm3_xmm2(0x40) AESKEY_EXPAND256_A "movdqa %%xmm1, 0xe0(%[ksch])\n\t" /* ksch[14] := xmm1 */ : : [key] "r" (key), [ksch] "r" (ctx->keyschenc) : "cc", "memory" ); #undef AESKEYGENASSIST_xmm1_xmm2 #undef AESKEYGENASSIST_xmm3_xmm2 #undef AESKEY_EXPAND256_A #undef AESKEY_EXPAND256_B } aesni_cleanup(); aesni_cleanup_2_7(); } /* Make a decryption key from an encryption key. */ static ASM_FUNC_ATTR_INLINE void do_aesni_prepare_decryption (RIJNDAEL_context *ctx) { /* The AES-NI decrypt instructions use the Equivalent Inverse Cipher, thus we can't use the the standard decrypt key preparation. 
*/ u128_t *ekey = (u128_t *)ctx->keyschenc; u128_t *dkey = (u128_t *)ctx->keyschdec; int rr; int r; #define DO_AESNI_AESIMC() \ asm volatile ("movdqa %[ekey], %%xmm1\n\t" \ /*"aesimc %%xmm1, %%xmm1\n\t"*/ \ ".byte 0x66, 0x0f, 0x38, 0xdb, 0xc9\n\t" \ "movdqa %%xmm1, %[dkey]" \ : [dkey] "=m" (dkey[r]) \ : [ekey] "m" (ekey[rr]) \ : "memory") dkey[0] = ekey[ctx->rounds]; r=1; rr=ctx->rounds-1; DO_AESNI_AESIMC(); r++; rr--; /* round 1 */ DO_AESNI_AESIMC(); r++; rr--; /* round 2 */ DO_AESNI_AESIMC(); r++; rr--; /* round 3 */ DO_AESNI_AESIMC(); r++; rr--; /* round 4 */ DO_AESNI_AESIMC(); r++; rr--; /* round 5 */ DO_AESNI_AESIMC(); r++; rr--; /* round 6 */ DO_AESNI_AESIMC(); r++; rr--; /* round 7 */ DO_AESNI_AESIMC(); r++; rr--; /* round 8 */ DO_AESNI_AESIMC(); r++; rr--; /* round 9 */ if (ctx->rounds > 10) { DO_AESNI_AESIMC(); r++; rr--; /* round 10 */ DO_AESNI_AESIMC(); r++; rr--; /* round 11 */ if (ctx->rounds > 12) { DO_AESNI_AESIMC(); r++; rr--; /* round 12 */ DO_AESNI_AESIMC(); r++; rr--; /* round 13 */ } } dkey[r] = ekey[0]; #undef DO_AESNI_AESIMC } void ASM_FUNC_ATTR _gcry_aes_aesni_prepare_decryption (RIJNDAEL_context *ctx) { aesni_prepare(); do_aesni_prepare_decryption (ctx); aesni_cleanup(); } /* Encrypt one block using the Intel AES-NI instructions. Block is input * and output through SSE register xmm0. */ static ASM_FUNC_ATTR_INLINE void do_aesni_enc (const RIJNDAEL_context *ctx) { #define aesenc_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xc1\n\t" #define aesenclast_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xc1\n\t" asm volatile ("movdqa (%[key]), %%xmm1\n\t" /* xmm1 := key[0] */ "pxor %%xmm1, %%xmm0\n\t" /* xmm0 ^= key[0] */ "movdqa 0x10(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 "movdqa 0x20(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 "movdqa 0x30(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 "movdqa 0x40(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 "movdqa 0x50(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 "movdqa 0x60(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 "movdqa 0x70(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 "movdqa 0x80(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 "movdqa 0x90(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 "movdqa 0xa0(%[key]), %%xmm1\n\t" "cmpl $10, %[rounds]\n\t" "jz .Lenclast%=\n\t" aesenc_xmm1_xmm0 "movdqa 0xb0(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 "movdqa 0xc0(%[key]), %%xmm1\n\t" "cmpl $12, %[rounds]\n\t" "jz .Lenclast%=\n\t" aesenc_xmm1_xmm0 "movdqa 0xd0(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 "movdqa 0xe0(%[key]), %%xmm1\n" ".Lenclast%=:\n\t" aesenclast_xmm1_xmm0 "\n" : : [key] "r" (ctx->keyschenc), [rounds] "r" (ctx->rounds) : "cc", "memory"); #undef aesenc_xmm1_xmm0 #undef aesenclast_xmm1_xmm0 } /* Decrypt one block using the Intel AES-NI instructions. Block is input * and output through SSE register xmm0. 
*/ static ASM_FUNC_ATTR_INLINE void do_aesni_dec (const RIJNDAEL_context *ctx) { #define aesdec_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xde, 0xc1\n\t" #define aesdeclast_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdf, 0xc1\n\t" asm volatile ("movdqa (%[key]), %%xmm1\n\t" "pxor %%xmm1, %%xmm0\n\t" /* xmm0 ^= key[0] */ "movdqa 0x10(%[key]), %%xmm1\n\t" aesdec_xmm1_xmm0 "movdqa 0x20(%[key]), %%xmm1\n\t" aesdec_xmm1_xmm0 "movdqa 0x30(%[key]), %%xmm1\n\t" aesdec_xmm1_xmm0 "movdqa 0x40(%[key]), %%xmm1\n\t" aesdec_xmm1_xmm0 "movdqa 0x50(%[key]), %%xmm1\n\t" aesdec_xmm1_xmm0 "movdqa 0x60(%[key]), %%xmm1\n\t" aesdec_xmm1_xmm0 "movdqa 0x70(%[key]), %%xmm1\n\t" aesdec_xmm1_xmm0 "movdqa 0x80(%[key]), %%xmm1\n\t" aesdec_xmm1_xmm0 "movdqa 0x90(%[key]), %%xmm1\n\t" aesdec_xmm1_xmm0 "movdqa 0xa0(%[key]), %%xmm1\n\t" "cmpl $10, %[rounds]\n\t" "jz .Ldeclast%=\n\t" aesdec_xmm1_xmm0 "movdqa 0xb0(%[key]), %%xmm1\n\t" aesdec_xmm1_xmm0 "movdqa 0xc0(%[key]), %%xmm1\n\t" "cmpl $12, %[rounds]\n\t" "jz .Ldeclast%=\n\t" aesdec_xmm1_xmm0 "movdqa 0xd0(%[key]), %%xmm1\n\t" aesdec_xmm1_xmm0 "movdqa 0xe0(%[key]), %%xmm1\n" ".Ldeclast%=:\n\t" aesdeclast_xmm1_xmm0 "\n" : : [key] "r" (ctx->keyschdec), [rounds] "r" (ctx->rounds) : "cc", "memory"); #undef aesdec_xmm1_xmm0 #undef aesdeclast_xmm1_xmm0 } /* Encrypt four blocks using the Intel AES-NI instructions. Blocks are input * and output through SSE registers xmm1 to xmm4. */ static ASM_FUNC_ATTR_INLINE void do_aesni_enc_vec4 (const RIJNDAEL_context *ctx) { #define aesenc_xmm0_xmm1 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xc8\n\t" #define aesenc_xmm0_xmm2 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xd0\n\t" #define aesenc_xmm0_xmm3 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xd8\n\t" #define aesenc_xmm0_xmm4 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xe0\n\t" #define aesenclast_xmm0_xmm1 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xc8\n\t" #define aesenclast_xmm0_xmm2 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xd0\n\t" #define aesenclast_xmm0_xmm3 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xd8\n\t" #define aesenclast_xmm0_xmm4 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xe0\n\t" asm volatile ("movdqa (%[key]), %%xmm0\n\t" "pxor %%xmm0, %%xmm1\n\t" /* xmm1 ^= key[0] */ "pxor %%xmm0, %%xmm2\n\t" /* xmm2 ^= key[0] */ "pxor %%xmm0, %%xmm3\n\t" /* xmm3 ^= key[0] */ "pxor %%xmm0, %%xmm4\n\t" /* xmm4 ^= key[0] */ "movdqa 0x10(%[key]), %%xmm0\n\t" aesenc_xmm0_xmm1 aesenc_xmm0_xmm2 aesenc_xmm0_xmm3 aesenc_xmm0_xmm4 "movdqa 0x20(%[key]), %%xmm0\n\t" aesenc_xmm0_xmm1 aesenc_xmm0_xmm2 aesenc_xmm0_xmm3 aesenc_xmm0_xmm4 "movdqa 0x30(%[key]), %%xmm0\n\t" aesenc_xmm0_xmm1 aesenc_xmm0_xmm2 aesenc_xmm0_xmm3 aesenc_xmm0_xmm4 "movdqa 0x40(%[key]), %%xmm0\n\t" aesenc_xmm0_xmm1 aesenc_xmm0_xmm2 aesenc_xmm0_xmm3 aesenc_xmm0_xmm4 "movdqa 0x50(%[key]), %%xmm0\n\t" aesenc_xmm0_xmm1 aesenc_xmm0_xmm2 aesenc_xmm0_xmm3 aesenc_xmm0_xmm4 "movdqa 0x60(%[key]), %%xmm0\n\t" aesenc_xmm0_xmm1 aesenc_xmm0_xmm2 aesenc_xmm0_xmm3 aesenc_xmm0_xmm4 "movdqa 0x70(%[key]), %%xmm0\n\t" aesenc_xmm0_xmm1 aesenc_xmm0_xmm2 aesenc_xmm0_xmm3 aesenc_xmm0_xmm4 "movdqa 0x80(%[key]), %%xmm0\n\t" aesenc_xmm0_xmm1 aesenc_xmm0_xmm2 aesenc_xmm0_xmm3 aesenc_xmm0_xmm4 "movdqa 0x90(%[key]), %%xmm0\n\t" aesenc_xmm0_xmm1 aesenc_xmm0_xmm2 aesenc_xmm0_xmm3 aesenc_xmm0_xmm4 "movdqa 0xa0(%[key]), %%xmm0\n\t" "cmpl $10, %[rounds]\n\t" "jz .Ldeclast%=\n\t" aesenc_xmm0_xmm1 aesenc_xmm0_xmm2 aesenc_xmm0_xmm3 aesenc_xmm0_xmm4 "movdqa 0xb0(%[key]), %%xmm0\n\t" aesenc_xmm0_xmm1 aesenc_xmm0_xmm2 aesenc_xmm0_xmm3 aesenc_xmm0_xmm4 "movdqa 0xc0(%[key]), %%xmm0\n\t" "cmpl $12, %[rounds]\n\t" "jz .Ldeclast%=\n\t" aesenc_xmm0_xmm1 aesenc_xmm0_xmm2 
aesenc_xmm0_xmm3 aesenc_xmm0_xmm4 "movdqa 0xd0(%[key]), %%xmm0\n\t" aesenc_xmm0_xmm1 aesenc_xmm0_xmm2 aesenc_xmm0_xmm3 aesenc_xmm0_xmm4 "movdqa 0xe0(%[key]), %%xmm0\n" ".Ldeclast%=:\n\t" aesenclast_xmm0_xmm1 aesenclast_xmm0_xmm2 aesenclast_xmm0_xmm3 aesenclast_xmm0_xmm4 : /* no output */ : [key] "r" (ctx->keyschenc), [rounds] "r" (ctx->rounds) : "cc", "memory"); #undef aesenc_xmm0_xmm1 #undef aesenc_xmm0_xmm2 #undef aesenc_xmm0_xmm3 #undef aesenc_xmm0_xmm4 #undef aesenclast_xmm0_xmm1 #undef aesenclast_xmm0_xmm2 #undef aesenclast_xmm0_xmm3 #undef aesenclast_xmm0_xmm4 } /* Decrypt four blocks using the Intel AES-NI instructions. Blocks are input * and output through SSE registers xmm1 to xmm4. */ static ASM_FUNC_ATTR_INLINE void do_aesni_dec_vec4 (const RIJNDAEL_context *ctx) { #define aesdec_xmm0_xmm1 ".byte 0x66, 0x0f, 0x38, 0xde, 0xc8\n\t" #define aesdec_xmm0_xmm2 ".byte 0x66, 0x0f, 0x38, 0xde, 0xd0\n\t" #define aesdec_xmm0_xmm3 ".byte 0x66, 0x0f, 0x38, 0xde, 0xd8\n\t" #define aesdec_xmm0_xmm4 ".byte 0x66, 0x0f, 0x38, 0xde, 0xe0\n\t" #define aesdeclast_xmm0_xmm1 ".byte 0x66, 0x0f, 0x38, 0xdf, 0xc8\n\t" #define aesdeclast_xmm0_xmm2 ".byte 0x66, 0x0f, 0x38, 0xdf, 0xd0\n\t" #define aesdeclast_xmm0_xmm3 ".byte 0x66, 0x0f, 0x38, 0xdf, 0xd8\n\t" #define aesdeclast_xmm0_xmm4 ".byte 0x66, 0x0f, 0x38, 0xdf, 0xe0\n\t" asm volatile ("movdqa (%[key]), %%xmm0\n\t" "pxor %%xmm0, %%xmm1\n\t" /* xmm1 ^= key[0] */ "pxor %%xmm0, %%xmm2\n\t" /* xmm2 ^= key[0] */ "pxor %%xmm0, %%xmm3\n\t" /* xmm3 ^= key[0] */ "pxor %%xmm0, %%xmm4\n\t" /* xmm4 ^= key[0] */ "movdqa 0x10(%[key]), %%xmm0\n\t" aesdec_xmm0_xmm1 aesdec_xmm0_xmm2 aesdec_xmm0_xmm3 aesdec_xmm0_xmm4 "movdqa 0x20(%[key]), %%xmm0\n\t" aesdec_xmm0_xmm1 aesdec_xmm0_xmm2 aesdec_xmm0_xmm3 aesdec_xmm0_xmm4 "movdqa 0x30(%[key]), %%xmm0\n\t" aesdec_xmm0_xmm1 aesdec_xmm0_xmm2 aesdec_xmm0_xmm3 aesdec_xmm0_xmm4 "movdqa 0x40(%[key]), %%xmm0\n\t" aesdec_xmm0_xmm1 aesdec_xmm0_xmm2 aesdec_xmm0_xmm3 aesdec_xmm0_xmm4 "movdqa 0x50(%[key]), %%xmm0\n\t" aesdec_xmm0_xmm1 aesdec_xmm0_xmm2 aesdec_xmm0_xmm3 aesdec_xmm0_xmm4 "movdqa 0x60(%[key]), %%xmm0\n\t" aesdec_xmm0_xmm1 aesdec_xmm0_xmm2 aesdec_xmm0_xmm3 aesdec_xmm0_xmm4 "movdqa 0x70(%[key]), %%xmm0\n\t" aesdec_xmm0_xmm1 aesdec_xmm0_xmm2 aesdec_xmm0_xmm3 aesdec_xmm0_xmm4 "movdqa 0x80(%[key]), %%xmm0\n\t" aesdec_xmm0_xmm1 aesdec_xmm0_xmm2 aesdec_xmm0_xmm3 aesdec_xmm0_xmm4 "movdqa 0x90(%[key]), %%xmm0\n\t" aesdec_xmm0_xmm1 aesdec_xmm0_xmm2 aesdec_xmm0_xmm3 aesdec_xmm0_xmm4 "movdqa 0xa0(%[key]), %%xmm0\n\t" "cmpl $10, %[rounds]\n\t" "jz .Ldeclast%=\n\t" aesdec_xmm0_xmm1 aesdec_xmm0_xmm2 aesdec_xmm0_xmm3 aesdec_xmm0_xmm4 "movdqa 0xb0(%[key]), %%xmm0\n\t" aesdec_xmm0_xmm1 aesdec_xmm0_xmm2 aesdec_xmm0_xmm3 aesdec_xmm0_xmm4 "movdqa 0xc0(%[key]), %%xmm0\n\t" "cmpl $12, %[rounds]\n\t" "jz .Ldeclast%=\n\t" aesdec_xmm0_xmm1 aesdec_xmm0_xmm2 aesdec_xmm0_xmm3 aesdec_xmm0_xmm4 "movdqa 0xd0(%[key]), %%xmm0\n\t" aesdec_xmm0_xmm1 aesdec_xmm0_xmm2 aesdec_xmm0_xmm3 aesdec_xmm0_xmm4 "movdqa 0xe0(%[key]), %%xmm0\n" ".Ldeclast%=:\n\t" aesdeclast_xmm0_xmm1 aesdeclast_xmm0_xmm2 aesdeclast_xmm0_xmm3 aesdeclast_xmm0_xmm4 : /* no output */ : [key] "r" (ctx->keyschdec), [rounds] "r" (ctx->rounds) : "cc", "memory"); #undef aesdec_xmm0_xmm1 #undef aesdec_xmm0_xmm2 #undef aesdec_xmm0_xmm3 #undef aesdec_xmm0_xmm4 #undef aesdeclast_xmm0_xmm1 #undef aesdeclast_xmm0_xmm2 #undef aesdeclast_xmm0_xmm3 #undef aesdeclast_xmm0_xmm4 } #ifdef __x86_64__ /* Encrypt eight blocks using the Intel AES-NI instructions. 
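   The first and the last round are left to the caller: key[0] must
   already be xored into the blocks on entry, and on exit xmm0 holds
   the last round key so that the caller can fuse the final aesenclast
   with its own xor of the feed blocks (see e.g. _gcry_aes_aesni_cfb_dec).
   The same convention is used by do_aesni_dec_vec8 below.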
Blocks are input * and output through SSE registers xmm1 to xmm4 and xmm8 to xmm11. */ static ASM_FUNC_ATTR_INLINE void do_aesni_enc_vec8 (const RIJNDAEL_context *ctx) { asm volatile ("movdqa 0x10(%[key]), %%xmm0\n\t" "aesenc %%xmm0, %%xmm1\n\t" "aesenc %%xmm0, %%xmm2\n\t" "aesenc %%xmm0, %%xmm3\n\t" "aesenc %%xmm0, %%xmm4\n\t" "aesenc %%xmm0, %%xmm8\n\t" "aesenc %%xmm0, %%xmm9\n\t" "aesenc %%xmm0, %%xmm10\n\t" "aesenc %%xmm0, %%xmm11\n\t" "movdqa 0x20(%[key]), %%xmm0\n\t" "cmpl $12, %[rounds]\n\t" "aesenc %%xmm0, %%xmm1\n\t" "aesenc %%xmm0, %%xmm2\n\t" "aesenc %%xmm0, %%xmm3\n\t" "aesenc %%xmm0, %%xmm4\n\t" "aesenc %%xmm0, %%xmm8\n\t" "aesenc %%xmm0, %%xmm9\n\t" "aesenc %%xmm0, %%xmm10\n\t" "aesenc %%xmm0, %%xmm11\n\t" "movdqa 0x30(%[key]), %%xmm0\n\t" "aesenc %%xmm0, %%xmm1\n\t" "aesenc %%xmm0, %%xmm2\n\t" "aesenc %%xmm0, %%xmm3\n\t" "aesenc %%xmm0, %%xmm4\n\t" "aesenc %%xmm0, %%xmm8\n\t" "aesenc %%xmm0, %%xmm9\n\t" "aesenc %%xmm0, %%xmm10\n\t" "aesenc %%xmm0, %%xmm11\n\t" "movdqa 0x40(%[key]), %%xmm0\n\t" "aesenc %%xmm0, %%xmm1\n\t" "aesenc %%xmm0, %%xmm2\n\t" "aesenc %%xmm0, %%xmm3\n\t" "aesenc %%xmm0, %%xmm4\n\t" "aesenc %%xmm0, %%xmm8\n\t" "aesenc %%xmm0, %%xmm9\n\t" "aesenc %%xmm0, %%xmm10\n\t" "aesenc %%xmm0, %%xmm11\n\t" "movdqa 0x50(%[key]), %%xmm0\n\t" "aesenc %%xmm0, %%xmm1\n\t" "aesenc %%xmm0, %%xmm2\n\t" "aesenc %%xmm0, %%xmm3\n\t" "aesenc %%xmm0, %%xmm4\n\t" "aesenc %%xmm0, %%xmm8\n\t" "aesenc %%xmm0, %%xmm9\n\t" "aesenc %%xmm0, %%xmm10\n\t" "aesenc %%xmm0, %%xmm11\n\t" "movdqa 0x60(%[key]), %%xmm0\n\t" "aesenc %%xmm0, %%xmm1\n\t" "aesenc %%xmm0, %%xmm2\n\t" "aesenc %%xmm0, %%xmm3\n\t" "aesenc %%xmm0, %%xmm4\n\t" "aesenc %%xmm0, %%xmm8\n\t" "aesenc %%xmm0, %%xmm9\n\t" "aesenc %%xmm0, %%xmm10\n\t" "aesenc %%xmm0, %%xmm11\n\t" "movdqa 0x70(%[key]), %%xmm0\n\t" "aesenc %%xmm0, %%xmm1\n\t" "aesenc %%xmm0, %%xmm2\n\t" "aesenc %%xmm0, %%xmm3\n\t" "aesenc %%xmm0, %%xmm4\n\t" "aesenc %%xmm0, %%xmm8\n\t" "aesenc %%xmm0, %%xmm9\n\t" "aesenc %%xmm0, %%xmm10\n\t" "aesenc %%xmm0, %%xmm11\n\t" "movdqa 0x80(%[key]), %%xmm0\n\t" "aesenc %%xmm0, %%xmm1\n\t" "aesenc %%xmm0, %%xmm2\n\t" "aesenc %%xmm0, %%xmm3\n\t" "aesenc %%xmm0, %%xmm4\n\t" "aesenc %%xmm0, %%xmm8\n\t" "aesenc %%xmm0, %%xmm9\n\t" "aesenc %%xmm0, %%xmm10\n\t" "aesenc %%xmm0, %%xmm11\n\t" "movdqa 0x90(%[key]), %%xmm0\n\t" "aesenc %%xmm0, %%xmm1\n\t" "aesenc %%xmm0, %%xmm2\n\t" "aesenc %%xmm0, %%xmm3\n\t" "aesenc %%xmm0, %%xmm4\n\t" "aesenc %%xmm0, %%xmm8\n\t" "aesenc %%xmm0, %%xmm9\n\t" "aesenc %%xmm0, %%xmm10\n\t" "aesenc %%xmm0, %%xmm11\n\t" "movdqa 0xa0(%[key]), %%xmm0\n\t" "jb .Ldeclast%=\n\t" "aesenc %%xmm0, %%xmm1\n\t" "aesenc %%xmm0, %%xmm2\n\t" "aesenc %%xmm0, %%xmm3\n\t" "aesenc %%xmm0, %%xmm4\n\t" "aesenc %%xmm0, %%xmm8\n\t" "aesenc %%xmm0, %%xmm9\n\t" "aesenc %%xmm0, %%xmm10\n\t" "aesenc %%xmm0, %%xmm11\n\t" "movdqa 0xb0(%[key]), %%xmm0\n\t" "aesenc %%xmm0, %%xmm1\n\t" "aesenc %%xmm0, %%xmm2\n\t" "aesenc %%xmm0, %%xmm3\n\t" "aesenc %%xmm0, %%xmm4\n\t" "aesenc %%xmm0, %%xmm8\n\t" "aesenc %%xmm0, %%xmm9\n\t" "aesenc %%xmm0, %%xmm10\n\t" "aesenc %%xmm0, %%xmm11\n\t" "movdqa 0xc0(%[key]), %%xmm0\n\t" "je .Ldeclast%=\n\t" "aesenc %%xmm0, %%xmm1\n\t" "aesenc %%xmm0, %%xmm2\n\t" "aesenc %%xmm0, %%xmm3\n\t" "aesenc %%xmm0, %%xmm4\n\t" "aesenc %%xmm0, %%xmm8\n\t" "aesenc %%xmm0, %%xmm9\n\t" "aesenc %%xmm0, %%xmm10\n\t" "aesenc %%xmm0, %%xmm11\n\t" "movdqa 0xd0(%[key]), %%xmm0\n\t" "aesenc %%xmm0, %%xmm1\n\t" "aesenc %%xmm0, %%xmm2\n\t" "aesenc %%xmm0, %%xmm3\n\t" "aesenc %%xmm0, %%xmm4\n\t" "aesenc %%xmm0, %%xmm8\n\t" "aesenc 
%%xmm0, %%xmm9\n\t" "aesenc %%xmm0, %%xmm10\n\t" "aesenc %%xmm0, %%xmm11\n\t" "movdqa 0xe0(%[key]), %%xmm0\n" ".Ldeclast%=:\n\t" : /* no output */ : [key] "r" (ctx->keyschenc), [rounds] "r" (ctx->rounds) : "cc", "memory"); } /* Decrypt eight blocks using the Intel AES-NI instructions. Blocks are input * and output through SSE registers xmm1 to xmm4 and xmm8 to xmm11. */ static ASM_FUNC_ATTR_INLINE void do_aesni_dec_vec8 (const RIJNDAEL_context *ctx) { asm volatile ("movdqa 0x10(%[key]), %%xmm0\n\t" "cmpl $12, %[rounds]\n\t" "aesdec %%xmm0, %%xmm1\n\t" "aesdec %%xmm0, %%xmm2\n\t" "aesdec %%xmm0, %%xmm3\n\t" "aesdec %%xmm0, %%xmm4\n\t" "aesdec %%xmm0, %%xmm8\n\t" "aesdec %%xmm0, %%xmm9\n\t" "aesdec %%xmm0, %%xmm10\n\t" "aesdec %%xmm0, %%xmm11\n\t" "movdqa 0x20(%[key]), %%xmm0\n\t" "aesdec %%xmm0, %%xmm1\n\t" "aesdec %%xmm0, %%xmm2\n\t" "aesdec %%xmm0, %%xmm3\n\t" "aesdec %%xmm0, %%xmm4\n\t" "aesdec %%xmm0, %%xmm8\n\t" "aesdec %%xmm0, %%xmm9\n\t" "aesdec %%xmm0, %%xmm10\n\t" "aesdec %%xmm0, %%xmm11\n\t" "movdqa 0x30(%[key]), %%xmm0\n\t" "aesdec %%xmm0, %%xmm1\n\t" "aesdec %%xmm0, %%xmm2\n\t" "aesdec %%xmm0, %%xmm3\n\t" "aesdec %%xmm0, %%xmm4\n\t" "aesdec %%xmm0, %%xmm8\n\t" "aesdec %%xmm0, %%xmm9\n\t" "aesdec %%xmm0, %%xmm10\n\t" "aesdec %%xmm0, %%xmm11\n\t" "movdqa 0x40(%[key]), %%xmm0\n\t" "aesdec %%xmm0, %%xmm1\n\t" "aesdec %%xmm0, %%xmm2\n\t" "aesdec %%xmm0, %%xmm3\n\t" "aesdec %%xmm0, %%xmm4\n\t" "aesdec %%xmm0, %%xmm8\n\t" "aesdec %%xmm0, %%xmm9\n\t" "aesdec %%xmm0, %%xmm10\n\t" "aesdec %%xmm0, %%xmm11\n\t" "movdqa 0x50(%[key]), %%xmm0\n\t" "aesdec %%xmm0, %%xmm1\n\t" "aesdec %%xmm0, %%xmm2\n\t" "aesdec %%xmm0, %%xmm3\n\t" "aesdec %%xmm0, %%xmm4\n\t" "aesdec %%xmm0, %%xmm8\n\t" "aesdec %%xmm0, %%xmm9\n\t" "aesdec %%xmm0, %%xmm10\n\t" "aesdec %%xmm0, %%xmm11\n\t" "movdqa 0x60(%[key]), %%xmm0\n\t" "aesdec %%xmm0, %%xmm1\n\t" "aesdec %%xmm0, %%xmm2\n\t" "aesdec %%xmm0, %%xmm3\n\t" "aesdec %%xmm0, %%xmm4\n\t" "aesdec %%xmm0, %%xmm8\n\t" "aesdec %%xmm0, %%xmm9\n\t" "aesdec %%xmm0, %%xmm10\n\t" "aesdec %%xmm0, %%xmm11\n\t" "movdqa 0x70(%[key]), %%xmm0\n\t" "aesdec %%xmm0, %%xmm1\n\t" "aesdec %%xmm0, %%xmm2\n\t" "aesdec %%xmm0, %%xmm3\n\t" "aesdec %%xmm0, %%xmm4\n\t" "aesdec %%xmm0, %%xmm8\n\t" "aesdec %%xmm0, %%xmm9\n\t" "aesdec %%xmm0, %%xmm10\n\t" "aesdec %%xmm0, %%xmm11\n\t" "movdqa 0x80(%[key]), %%xmm0\n\t" "aesdec %%xmm0, %%xmm1\n\t" "aesdec %%xmm0, %%xmm2\n\t" "aesdec %%xmm0, %%xmm3\n\t" "aesdec %%xmm0, %%xmm4\n\t" "aesdec %%xmm0, %%xmm8\n\t" "aesdec %%xmm0, %%xmm9\n\t" "aesdec %%xmm0, %%xmm10\n\t" "aesdec %%xmm0, %%xmm11\n\t" "movdqa 0x90(%[key]), %%xmm0\n\t" "aesdec %%xmm0, %%xmm1\n\t" "aesdec %%xmm0, %%xmm2\n\t" "aesdec %%xmm0, %%xmm3\n\t" "aesdec %%xmm0, %%xmm4\n\t" "aesdec %%xmm0, %%xmm8\n\t" "aesdec %%xmm0, %%xmm9\n\t" "aesdec %%xmm0, %%xmm10\n\t" "aesdec %%xmm0, %%xmm11\n\t" "movdqa 0xa0(%[key]), %%xmm0\n\t" "jb .Ldeclast%=\n\t" "aesdec %%xmm0, %%xmm1\n\t" "aesdec %%xmm0, %%xmm2\n\t" "aesdec %%xmm0, %%xmm3\n\t" "aesdec %%xmm0, %%xmm4\n\t" "aesdec %%xmm0, %%xmm8\n\t" "aesdec %%xmm0, %%xmm9\n\t" "aesdec %%xmm0, %%xmm10\n\t" "aesdec %%xmm0, %%xmm11\n\t" "movdqa 0xb0(%[key]), %%xmm0\n\t" "aesdec %%xmm0, %%xmm1\n\t" "aesdec %%xmm0, %%xmm2\n\t" "aesdec %%xmm0, %%xmm3\n\t" "aesdec %%xmm0, %%xmm4\n\t" "aesdec %%xmm0, %%xmm8\n\t" "aesdec %%xmm0, %%xmm9\n\t" "aesdec %%xmm0, %%xmm10\n\t" "aesdec %%xmm0, %%xmm11\n\t" "movdqa 0xc0(%[key]), %%xmm0\n\t" "je .Ldeclast%=\n\t" "aesdec %%xmm0, %%xmm1\n\t" "aesdec %%xmm0, %%xmm2\n\t" "aesdec %%xmm0, %%xmm3\n\t" "aesdec %%xmm0, %%xmm4\n\t" "aesdec 
%%xmm0, %%xmm8\n\t" "aesdec %%xmm0, %%xmm9\n\t" "aesdec %%xmm0, %%xmm10\n\t" "aesdec %%xmm0, %%xmm11\n\t" "movdqa 0xd0(%[key]), %%xmm0\n\t" "aesdec %%xmm0, %%xmm1\n\t" "aesdec %%xmm0, %%xmm2\n\t" "aesdec %%xmm0, %%xmm3\n\t" "aesdec %%xmm0, %%xmm4\n\t" "aesdec %%xmm0, %%xmm8\n\t" "aesdec %%xmm0, %%xmm9\n\t" "aesdec %%xmm0, %%xmm10\n\t" "aesdec %%xmm0, %%xmm11\n\t" "movdqa 0xe0(%[key]), %%xmm0\n" ".Ldeclast%=:\n\t" : /* no output */ : [key] "r" (ctx->keyschdec), [rounds] "r" (ctx->rounds) : "cc", "memory"); } #endif /* __x86_64__ */ /* Perform a CTR encryption round using the counter CTR and the input block A. Write the result to the output block B and update CTR. CTR needs to be a 16 byte aligned little-endian value. */ static ASM_FUNC_ATTR_INLINE void do_aesni_ctr (const RIJNDAEL_context *ctx, unsigned char *ctr, unsigned char *b, const unsigned char *a) { #define aesenc_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xc1\n\t" #define aesenclast_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xc1\n\t" asm volatile ("movdqa %%xmm5, %%xmm0\n\t" /* xmm0 := CTR (xmm5) */ "pcmpeqd %%xmm1, %%xmm1\n\t" "psrldq $8, %%xmm1\n\t" /* xmm1 = -1 */ "pshufb %%xmm6, %%xmm5\n\t" "psubq %%xmm1, %%xmm5\n\t" /* xmm5++ (big endian) */ /* detect if 64-bit carry handling is needed */ "cmpl $0xffffffff, 8(%[ctr])\n\t" "jne .Lno_carry%=\n\t" "cmpl $0xffffffff, 12(%[ctr])\n\t" "jne .Lno_carry%=\n\t" "pslldq $8, %%xmm1\n\t" /* move lower 64-bit to high */ "psubq %%xmm1, %%xmm5\n\t" /* add carry to upper 64bits */ ".Lno_carry%=:\n\t" "pshufb %%xmm6, %%xmm5\n\t" "movdqa %%xmm5, (%[ctr])\n\t" /* Update CTR (mem). */ "pxor (%[key]), %%xmm0\n\t" /* xmm1 ^= key[0] */ "movdqa 0x10(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 "movdqa 0x20(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 "movdqa 0x30(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 "movdqa 0x40(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 "movdqa 0x50(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 "movdqa 0x60(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 "movdqa 0x70(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 "movdqa 0x80(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 "movdqa 0x90(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 "movdqa 0xa0(%[key]), %%xmm1\n\t" "cmpl $10, %[rounds]\n\t" "jz .Lenclast%=\n\t" aesenc_xmm1_xmm0 "movdqa 0xb0(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 "movdqa 0xc0(%[key]), %%xmm1\n\t" "cmpl $12, %[rounds]\n\t" "jz .Lenclast%=\n\t" aesenc_xmm1_xmm0 "movdqa 0xd0(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 "movdqa 0xe0(%[key]), %%xmm1\n" ".Lenclast%=:\n\t" aesenclast_xmm1_xmm0 "movdqu %[src], %%xmm1\n\t" /* xmm1 := input */ "pxor %%xmm1, %%xmm0\n\t" /* EncCTR ^= input */ "movdqu %%xmm0, %[dst]" /* Store EncCTR. */ : [dst] "=m" (*b) : [src] "m" (*a), [ctr] "r" (ctr), [key] "r" (ctx->keyschenc), [rounds] "g" (ctx->rounds) : "cc", "memory"); #undef aesenc_xmm1_xmm0 #undef aesenclast_xmm1_xmm0 } /* Four blocks at a time variant of do_aesni_ctr. 
*/ static ASM_FUNC_ATTR_INLINE void do_aesni_ctr_4 (const RIJNDAEL_context *ctx, unsigned char *ctr, unsigned char *b, const unsigned char *a) { static const byte bige_addb_const[4][16] __attribute__ ((aligned (16))) = { { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4 } }; const void *bige_addb = bige_addb_const; #define aesenc_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xc1\n\t" #define aesenc_xmm1_xmm2 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xd1\n\t" #define aesenc_xmm1_xmm3 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xd9\n\t" #define aesenc_xmm1_xmm4 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xe1\n\t" #define aesenclast_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xc1\n\t" #define aesenclast_xmm1_xmm2 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xd1\n\t" #define aesenclast_xmm1_xmm3 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xd9\n\t" #define aesenclast_xmm1_xmm4 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xe1\n\t" /* Register usage: [key] keyschedule xmm0 CTR-0 xmm1 temp / round key xmm2 CTR-1 xmm3 CTR-2 xmm4 CTR-3 xmm5 copy of *ctr xmm6 endian swapping mask */ asm volatile (/* detect if 8-bit carry handling is needed */ "addb $4, 15(%[ctr])\n\t" "jc .Ladd32bit%=\n\t" "movdqa %%xmm5, %%xmm0\n\t" /* xmm0 := CTR (xmm5) */ "movdqa 0*16(%[addb]), %%xmm2\n\t" /* xmm2 := be(1) */ "movdqa 1*16(%[addb]), %%xmm3\n\t" /* xmm3 := be(2) */ "movdqa 2*16(%[addb]), %%xmm4\n\t" /* xmm4 := be(3) */ "movdqa 3*16(%[addb]), %%xmm5\n\t" /* xmm5 := be(4) */ "paddb %%xmm0, %%xmm2\n\t" /* xmm2 := be(1) + CTR (xmm0) */ "paddb %%xmm0, %%xmm3\n\t" /* xmm3 := be(2) + CTR (xmm0) */ "paddb %%xmm0, %%xmm4\n\t" /* xmm4 := be(3) + CTR (xmm0) */ "paddb %%xmm0, %%xmm5\n\t" /* xmm5 := be(4) + CTR (xmm0) */ "movdqa (%[key]), %%xmm1\n\t" /* xmm1 := key[0] */ "jmp .Ldone_ctr%=\n\t" ".Ladd32bit%=:\n\t" "movdqa %%xmm5, (%[ctr])\n\t" /* Restore CTR. */ "movdqa %%xmm5, %%xmm0\n\t" /* xmm0, xmm2 := CTR (xmm5) */ "movdqa %%xmm0, %%xmm2\n\t" "pcmpeqd %%xmm1, %%xmm1\n\t" "psrldq $8, %%xmm1\n\t" /* xmm1 = -1 */ "pshufb %%xmm6, %%xmm2\n\t" /* xmm2 := le(xmm2) */ "psubq %%xmm1, %%xmm2\n\t" /* xmm2++ */ "movdqa %%xmm2, %%xmm3\n\t" /* xmm3 := xmm2 */ "psubq %%xmm1, %%xmm3\n\t" /* xmm3++ */ "movdqa %%xmm3, %%xmm4\n\t" /* xmm4 := xmm3 */ "psubq %%xmm1, %%xmm4\n\t" /* xmm4++ */ "movdqa %%xmm4, %%xmm5\n\t" /* xmm5 := xmm4 */ "psubq %%xmm1, %%xmm5\n\t" /* xmm5++ */ /* detect if 64-bit carry handling is needed */ "cmpl $0xffffffff, 8(%[ctr])\n\t" "jne .Lno_carry%=\n\t" "movl 12(%[ctr]), %%esi\n\t" "bswapl %%esi\n\t" "cmpl $0xfffffffc, %%esi\n\t" "jb .Lno_carry%=\n\t" /* no carry */ "pslldq $8, %%xmm1\n\t" /* move lower 64-bit to high */ "je .Lcarry_xmm5%=\n\t" /* esi == 0xfffffffc */ "cmpl $0xfffffffe, %%esi\n\t" "jb .Lcarry_xmm4%=\n\t" /* esi == 0xfffffffd */ "je .Lcarry_xmm3%=\n\t" /* esi == 0xfffffffe */ /* esi == 0xffffffff */ "psubq %%xmm1, %%xmm2\n\t" ".Lcarry_xmm3%=:\n\t" "psubq %%xmm1, %%xmm3\n\t" ".Lcarry_xmm4%=:\n\t" "psubq %%xmm1, %%xmm4\n\t" ".Lcarry_xmm5%=:\n\t" "psubq %%xmm1, %%xmm5\n\t" ".Lno_carry%=:\n\t" "movdqa (%[key]), %%xmm1\n\t" /* xmm1 := key[0] */ "pshufb %%xmm6, %%xmm2\n\t" /* xmm2 := be(xmm2) */ "pshufb %%xmm6, %%xmm3\n\t" /* xmm3 := be(xmm3) */ "pshufb %%xmm6, %%xmm4\n\t" /* xmm4 := be(xmm4) */ "pshufb %%xmm6, %%xmm5\n\t" /* xmm5 := be(xmm5) */ "movdqa %%xmm5, (%[ctr])\n\t" /* Update CTR (mem). 
*/ ".Ldone_ctr%=:\n\t" : : [ctr] "r" (ctr), [key] "r" (ctx->keyschenc), [addb] "r" (bige_addb) : "%esi", "cc", "memory"); asm volatile ("pxor %%xmm1, %%xmm0\n\t" /* xmm0 ^= key[0] */ "pxor %%xmm1, %%xmm2\n\t" /* xmm2 ^= key[0] */ "pxor %%xmm1, %%xmm3\n\t" /* xmm3 ^= key[0] */ "pxor %%xmm1, %%xmm4\n\t" /* xmm4 ^= key[0] */ "movdqa 0x10(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 aesenc_xmm1_xmm2 aesenc_xmm1_xmm3 aesenc_xmm1_xmm4 "movdqa 0x20(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 aesenc_xmm1_xmm2 aesenc_xmm1_xmm3 aesenc_xmm1_xmm4 "movdqa 0x30(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 aesenc_xmm1_xmm2 aesenc_xmm1_xmm3 aesenc_xmm1_xmm4 "movdqa 0x40(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 aesenc_xmm1_xmm2 aesenc_xmm1_xmm3 aesenc_xmm1_xmm4 "movdqa 0x50(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 aesenc_xmm1_xmm2 aesenc_xmm1_xmm3 aesenc_xmm1_xmm4 "movdqa 0x60(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 aesenc_xmm1_xmm2 aesenc_xmm1_xmm3 aesenc_xmm1_xmm4 "movdqa 0x70(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 aesenc_xmm1_xmm2 aesenc_xmm1_xmm3 aesenc_xmm1_xmm4 "movdqa 0x80(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 aesenc_xmm1_xmm2 aesenc_xmm1_xmm3 aesenc_xmm1_xmm4 "movdqa 0x90(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 aesenc_xmm1_xmm2 aesenc_xmm1_xmm3 aesenc_xmm1_xmm4 "movdqa 0xa0(%[key]), %%xmm1\n\t" "cmpl $10, %[rounds]\n\t" "jz .Lenclast%=\n\t" aesenc_xmm1_xmm0 aesenc_xmm1_xmm2 aesenc_xmm1_xmm3 aesenc_xmm1_xmm4 "movdqa 0xb0(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 aesenc_xmm1_xmm2 aesenc_xmm1_xmm3 aesenc_xmm1_xmm4 "movdqa 0xc0(%[key]), %%xmm1\n\t" "cmpl $12, %[rounds]\n\t" "jz .Lenclast%=\n\t" aesenc_xmm1_xmm0 aesenc_xmm1_xmm2 aesenc_xmm1_xmm3 aesenc_xmm1_xmm4 "movdqa 0xd0(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 aesenc_xmm1_xmm2 aesenc_xmm1_xmm3 aesenc_xmm1_xmm4 "movdqa 0xe0(%[key]), %%xmm1\n" ".Lenclast%=:\n\t" aesenclast_xmm1_xmm0 aesenclast_xmm1_xmm2 aesenclast_xmm1_xmm3 aesenclast_xmm1_xmm4 : : [key] "r" (ctx->keyschenc), [rounds] "r" (ctx->rounds) : "cc", "memory"); asm volatile ("movdqu (%[src]), %%xmm1\n\t" /* Get block 1. */ "pxor %%xmm1, %%xmm0\n\t" /* EncCTR-1 ^= input */ "movdqu %%xmm0, (%[dst])\n\t" /* Store block 1 */ "movdqu 16(%[src]), %%xmm1\n\t" /* Get block 2. */ "pxor %%xmm1, %%xmm2\n\t" /* EncCTR-2 ^= input */ "movdqu %%xmm2, 16(%[dst])\n\t" /* Store block 2. */ "movdqu 32(%[src]), %%xmm1\n\t" /* Get block 3. */ "pxor %%xmm1, %%xmm3\n\t" /* EncCTR-3 ^= input */ "movdqu %%xmm3, 32(%[dst])\n\t" /* Store block 3. */ "movdqu 48(%[src]), %%xmm1\n\t" /* Get block 4. */ "pxor %%xmm1, %%xmm4\n\t" /* EncCTR-4 ^= input */ "movdqu %%xmm4, 48(%[dst])" /* Store block 4. */ : : [src] "r" (a), [dst] "r" (b) : "memory"); #undef aesenc_xmm1_xmm0 #undef aesenc_xmm1_xmm2 #undef aesenc_xmm1_xmm3 #undef aesenc_xmm1_xmm4 #undef aesenclast_xmm1_xmm0 #undef aesenclast_xmm1_xmm2 #undef aesenclast_xmm1_xmm3 #undef aesenclast_xmm1_xmm4 } #ifdef __x86_64__ /* Eight blocks at a time variant of do_aesni_ctr. 
*/ static ASM_FUNC_ATTR_INLINE void do_aesni_ctr_8 (const RIJNDAEL_context *ctx, unsigned char *ctr, unsigned char *b, const unsigned char *a) { static const byte bige_addb_const[8][16] __attribute__ ((aligned (16))) = { { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8 } }; const void *bige_addb = bige_addb_const; /* Register usage: [key] keyschedule xmm0 CTR-0 xmm1 temp / round key xmm2 CTR-1 xmm3 CTR-2 xmm4 CTR-3 xmm5 copy of *ctr xmm6 endian swapping mask xmm8 CTR-4 xmm9 CTR-5 xmm10 CTR-6 xmm11 CTR-7 xmm12 temp xmm13 temp xmm14 temp xmm15 temp */ asm volatile (/* detect if 8-bit carry handling is needed */ "addb $8, 15(%[ctr])\n\t" "jc .Ladd32bit%=\n\t" "movdqa (%[key]), %%xmm1\n\t" /* xmm1 := key[0] */ "movdqa 16(%[key]), %%xmm7\n\t" /* xmm7 := key[1] */ "movdqa %%xmm5, %%xmm0\n\t" /* xmm0 := CTR (xmm5) */ "movdqa %%xmm5, %%xmm2\n\t" /* xmm2 := CTR (xmm5) */ "movdqa %%xmm5, %%xmm3\n\t" /* xmm3 := CTR (xmm5) */ "movdqa %%xmm5, %%xmm4\n\t" /* xmm4 := CTR (xmm5) */ "paddb 0*16(%[addb]), %%xmm2\n\t" /* xmm2 := be(1) + CTR */ "paddb 1*16(%[addb]), %%xmm3\n\t" /* xmm3 := be(2) + CTR */ "paddb 2*16(%[addb]), %%xmm4\n\t" /* xmm4 := be(3) + CTR */ "pxor %%xmm1, %%xmm0\n\t" /* xmm0 ^= key[0] */ "pxor %%xmm1, %%xmm2\n\t" /* xmm2 ^= key[0] */ "pxor %%xmm1, %%xmm3\n\t" /* xmm3 ^= key[0] */ "pxor %%xmm1, %%xmm4\n\t" /* xmm4 ^= key[0] */ "aesenc %%xmm7, %%xmm0\n\t" "aesenc %%xmm7, %%xmm2\n\t" "aesenc %%xmm7, %%xmm3\n\t" "aesenc %%xmm7, %%xmm4\n\t" "movdqa %%xmm5, %%xmm8\n\t" /* xmm8 := CTR (xmm5) */ "movdqa %%xmm5, %%xmm9\n\t" /* xmm9 := CTR (xmm5) */ "movdqa %%xmm5, %%xmm10\n\t" /* xmm10 := CTR (xmm5) */ "movdqa %%xmm5, %%xmm11\n\t" /* xmm11 := CTR (xmm5) */ "paddb 3*16(%[addb]), %%xmm8\n\t" /* xmm8 := be(4) + CTR */ "paddb 4*16(%[addb]), %%xmm9\n\t" /* xmm9 := be(5) + CTR */ "paddb 5*16(%[addb]), %%xmm10\n\t" /* xmm10 := be(6) + CTR */ "paddb 6*16(%[addb]), %%xmm11\n\t" /* xmm11 := be(7) + CTR */ "pxor %%xmm1, %%xmm8\n\t" /* xmm8 ^= key[0] */ "pxor %%xmm1, %%xmm9\n\t" /* xmm9 ^= key[0] */ "pxor %%xmm1, %%xmm10\n\t" /* xmm10 ^= key[0] */ "pxor %%xmm1, %%xmm11\n\t" /* xmm11 ^= key[0] */ "aesenc %%xmm7, %%xmm8\n\t" "aesenc %%xmm7, %%xmm9\n\t" "aesenc %%xmm7, %%xmm10\n\t" "aesenc %%xmm7, %%xmm11\n\t" "paddb 7*16(%[addb]), %%xmm5\n\t" /* xmm5 := be(8) + CTR */ "jmp .Ldone_ctr%=\n\t" ".Ladd32bit%=:\n\t" "movdqa %%xmm5, (%[ctr])\n\t" /* Restore CTR. 
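   (The addb at the top already bumped the low byte of the in-memory
   counter; on this carry path the unmodified copy kept in xmm5 is
   written back first, and the full 32/64-bit carry arithmetic below
   then recomputes all eight counter values.)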
*/ "movdqa %%xmm5, %%xmm0\n\t" /* xmm0, xmm2 := CTR (xmm5) */ "movdqa %%xmm0, %%xmm2\n\t" "pcmpeqd %%xmm1, %%xmm1\n\t" "psrldq $8, %%xmm1\n\t" /* xmm1 = -1 */ "pshufb %%xmm6, %%xmm2\n\t" /* xmm2 := le(xmm2) */ "psubq %%xmm1, %%xmm2\n\t" /* xmm2++ */ "movdqa %%xmm2, %%xmm3\n\t" /* xmm3 := xmm2 */ "psubq %%xmm1, %%xmm3\n\t" /* xmm3++ */ "movdqa %%xmm3, %%xmm4\n\t" /* xmm4 := xmm3 */ "psubq %%xmm1, %%xmm4\n\t" /* xmm4++ */ "movdqa %%xmm4, %%xmm8\n\t" /* xmm8 := xmm4 */ "psubq %%xmm1, %%xmm8\n\t" /* xmm8++ */ "movdqa %%xmm8, %%xmm9\n\t" /* xmm9 := xmm8 */ "psubq %%xmm1, %%xmm9\n\t" /* xmm9++ */ "movdqa %%xmm9, %%xmm10\n\t" /* xmm10 := xmm9 */ "psubq %%xmm1, %%xmm10\n\t" /* xmm10++ */ "movdqa %%xmm10, %%xmm11\n\t" /* xmm11 := xmm10 */ "psubq %%xmm1, %%xmm11\n\t" /* xmm11++ */ "movdqa %%xmm11, %%xmm5\n\t" /* xmm5 := xmm11 */ "psubq %%xmm1, %%xmm5\n\t" /* xmm5++ */ /* detect if 64-bit carry handling is needed */ "cmpl $0xffffffff, 8(%[ctr])\n\t" "jne .Lno_carry%=\n\t" "movl 12(%[ctr]), %%esi\n\t" "bswapl %%esi\n\t" "cmpl $0xfffffff8, %%esi\n\t" "jb .Lno_carry%=\n\t" /* no carry */ "pslldq $8, %%xmm1\n\t" /* move lower 64-bit to high */ "je .Lcarry_xmm5%=\n\t" /* esi == 0xfffffff8 */ "cmpl $0xfffffffa, %%esi\n\t" "jb .Lcarry_xmm11%=\n\t" /* esi == 0xfffffff9 */ "je .Lcarry_xmm10%=\n\t" /* esi == 0xfffffffa */ "cmpl $0xfffffffc, %%esi\n\t" "jb .Lcarry_xmm9%=\n\t" /* esi == 0xfffffffb */ "je .Lcarry_xmm8%=\n\t" /* esi == 0xfffffffc */ "cmpl $0xfffffffe, %%esi\n\t" "jb .Lcarry_xmm4%=\n\t" /* esi == 0xfffffffd */ "je .Lcarry_xmm3%=\n\t" /* esi == 0xfffffffe */ /* esi == 0xffffffff */ "psubq %%xmm1, %%xmm2\n\t" ".Lcarry_xmm3%=:\n\t" "psubq %%xmm1, %%xmm3\n\t" ".Lcarry_xmm4%=:\n\t" "psubq %%xmm1, %%xmm4\n\t" ".Lcarry_xmm8%=:\n\t" "psubq %%xmm1, %%xmm8\n\t" ".Lcarry_xmm9%=:\n\t" "psubq %%xmm1, %%xmm9\n\t" ".Lcarry_xmm10%=:\n\t" "psubq %%xmm1, %%xmm10\n\t" ".Lcarry_xmm11%=:\n\t" "psubq %%xmm1, %%xmm11\n\t" ".Lcarry_xmm5%=:\n\t" "psubq %%xmm1, %%xmm5\n\t" ".Lno_carry%=:\n\t" "movdqa (%[key]), %%xmm1\n\t" /* xmm1 := key[0] */ "movdqa 16(%[key]), %%xmm7\n\t" /* xmm7 := key[1] */ "pshufb %%xmm6, %%xmm2\n\t" /* xmm2 := be(xmm2) */ "pshufb %%xmm6, %%xmm3\n\t" /* xmm3 := be(xmm3) */ "pshufb %%xmm6, %%xmm4\n\t" /* xmm4 := be(xmm4) */ "pxor %%xmm1, %%xmm0\n\t" /* xmm0 ^= key[0] */ "pxor %%xmm1, %%xmm2\n\t" /* xmm2 ^= key[0] */ "pxor %%xmm1, %%xmm3\n\t" /* xmm3 ^= key[0] */ "pxor %%xmm1, %%xmm4\n\t" /* xmm4 ^= key[0] */ "aesenc %%xmm7, %%xmm0\n\t" "aesenc %%xmm7, %%xmm2\n\t" "aesenc %%xmm7, %%xmm3\n\t" "aesenc %%xmm7, %%xmm4\n\t" "pshufb %%xmm6, %%xmm8\n\t" /* xmm8 := be(xmm8) */ "pshufb %%xmm6, %%xmm9\n\t" /* xmm9 := be(xmm9) */ "pshufb %%xmm6, %%xmm10\n\t" /* xmm10 := be(xmm10) */ "pshufb %%xmm6, %%xmm11\n\t" /* xmm11 := be(xmm11) */ "pxor %%xmm1, %%xmm8\n\t" /* xmm8 ^= key[0] */ "pxor %%xmm1, %%xmm9\n\t" /* xmm9 ^= key[0] */ "pxor %%xmm1, %%xmm10\n\t" /* xmm10 ^= key[0] */ "pxor %%xmm1, %%xmm11\n\t" /* xmm11 ^= key[0] */ "aesenc %%xmm7, %%xmm8\n\t" "aesenc %%xmm7, %%xmm9\n\t" "aesenc %%xmm7, %%xmm10\n\t" "aesenc %%xmm7, %%xmm11\n\t" "pshufb %%xmm6, %%xmm5\n\t" /* xmm5 := be(xmm5) */ "movdqa %%xmm5, (%[ctr])\n\t" /* Update CTR (mem). */ ".align 16\n\t" ".Ldone_ctr%=:\n\t" : : [ctr] "r" (ctr), [key] "r" (ctx->keyschenc), [addb] "r" (bige_addb) : "%esi", "cc", "memory"); asm volatile ("movdqa 0x20(%[key]), %%xmm1\n\t" "movdqu 0*16(%[src]), %%xmm12\n\t" /* Get block 1. */ "movdqu 1*16(%[src]), %%xmm13\n\t" /* Get block 2. */ "movdqu 2*16(%[src]), %%xmm14\n\t" /* Get block 3. 
*/ "movdqu 3*16(%[src]), %%xmm15\n\t" /* Get block 4. */ "movdqu 4*16(%[src]), %%xmm7\n\t" /* Get block 5. */ "aesenc %%xmm1, %%xmm0\n\t" "aesenc %%xmm1, %%xmm2\n\t" "aesenc %%xmm1, %%xmm3\n\t" "aesenc %%xmm1, %%xmm4\n\t" "aesenc %%xmm1, %%xmm8\n\t" "aesenc %%xmm1, %%xmm9\n\t" "aesenc %%xmm1, %%xmm10\n\t" "aesenc %%xmm1, %%xmm11\n\t" "cmpl $12, %[rounds]\n\t" "movdqa 0x30(%[key]), %%xmm1\n\t" "aesenc %%xmm1, %%xmm0\n\t" "aesenc %%xmm1, %%xmm2\n\t" "aesenc %%xmm1, %%xmm3\n\t" "aesenc %%xmm1, %%xmm4\n\t" "aesenc %%xmm1, %%xmm8\n\t" "aesenc %%xmm1, %%xmm9\n\t" "aesenc %%xmm1, %%xmm10\n\t" "aesenc %%xmm1, %%xmm11\n\t" "movdqa 0x40(%[key]), %%xmm1\n\t" "aesenc %%xmm1, %%xmm0\n\t" "aesenc %%xmm1, %%xmm2\n\t" "aesenc %%xmm1, %%xmm3\n\t" "aesenc %%xmm1, %%xmm4\n\t" "aesenc %%xmm1, %%xmm8\n\t" "aesenc %%xmm1, %%xmm9\n\t" "aesenc %%xmm1, %%xmm10\n\t" "aesenc %%xmm1, %%xmm11\n\t" "movdqa 0x50(%[key]), %%xmm1\n\t" "aesenc %%xmm1, %%xmm0\n\t" "aesenc %%xmm1, %%xmm2\n\t" "aesenc %%xmm1, %%xmm3\n\t" "aesenc %%xmm1, %%xmm4\n\t" "aesenc %%xmm1, %%xmm8\n\t" "aesenc %%xmm1, %%xmm9\n\t" "aesenc %%xmm1, %%xmm10\n\t" "aesenc %%xmm1, %%xmm11\n\t" "movdqa 0x60(%[key]), %%xmm1\n\t" "aesenc %%xmm1, %%xmm0\n\t" "aesenc %%xmm1, %%xmm2\n\t" "aesenc %%xmm1, %%xmm3\n\t" "aesenc %%xmm1, %%xmm4\n\t" "aesenc %%xmm1, %%xmm8\n\t" "aesenc %%xmm1, %%xmm9\n\t" "aesenc %%xmm1, %%xmm10\n\t" "aesenc %%xmm1, %%xmm11\n\t" "movdqa 0x70(%[key]), %%xmm1\n\t" "aesenc %%xmm1, %%xmm0\n\t" "aesenc %%xmm1, %%xmm2\n\t" "aesenc %%xmm1, %%xmm3\n\t" "aesenc %%xmm1, %%xmm4\n\t" "aesenc %%xmm1, %%xmm8\n\t" "aesenc %%xmm1, %%xmm9\n\t" "aesenc %%xmm1, %%xmm10\n\t" "aesenc %%xmm1, %%xmm11\n\t" "movdqa 0x80(%[key]), %%xmm1\n\t" "aesenc %%xmm1, %%xmm0\n\t" "aesenc %%xmm1, %%xmm2\n\t" "aesenc %%xmm1, %%xmm3\n\t" "aesenc %%xmm1, %%xmm4\n\t" "aesenc %%xmm1, %%xmm8\n\t" "aesenc %%xmm1, %%xmm9\n\t" "aesenc %%xmm1, %%xmm10\n\t" "aesenc %%xmm1, %%xmm11\n\t" "movdqa 0x90(%[key]), %%xmm1\n\t" "aesenc %%xmm1, %%xmm0\n\t" "aesenc %%xmm1, %%xmm2\n\t" "aesenc %%xmm1, %%xmm3\n\t" "aesenc %%xmm1, %%xmm4\n\t" "aesenc %%xmm1, %%xmm8\n\t" "aesenc %%xmm1, %%xmm9\n\t" "aesenc %%xmm1, %%xmm10\n\t" "aesenc %%xmm1, %%xmm11\n\t" "movdqa 0xa0(%[key]), %%xmm1\n\t" "jb .Lenclast%=\n\t" "aesenc %%xmm1, %%xmm0\n\t" "aesenc %%xmm1, %%xmm2\n\t" "aesenc %%xmm1, %%xmm3\n\t" "aesenc %%xmm1, %%xmm4\n\t" "aesenc %%xmm1, %%xmm8\n\t" "aesenc %%xmm1, %%xmm9\n\t" "aesenc %%xmm1, %%xmm10\n\t" "aesenc %%xmm1, %%xmm11\n\t" "movdqa 0xb0(%[key]), %%xmm1\n\t" "aesenc %%xmm1, %%xmm0\n\t" "aesenc %%xmm1, %%xmm2\n\t" "aesenc %%xmm1, %%xmm3\n\t" "aesenc %%xmm1, %%xmm4\n\t" "aesenc %%xmm1, %%xmm8\n\t" "aesenc %%xmm1, %%xmm9\n\t" "aesenc %%xmm1, %%xmm10\n\t" "aesenc %%xmm1, %%xmm11\n\t" "movdqa 0xc0(%[key]), %%xmm1\n\t" "je .Lenclast%=\n\t" "aesenc %%xmm1, %%xmm0\n\t" "aesenc %%xmm1, %%xmm2\n\t" "aesenc %%xmm1, %%xmm3\n\t" "aesenc %%xmm1, %%xmm4\n\t" "aesenc %%xmm1, %%xmm8\n\t" "aesenc %%xmm1, %%xmm9\n\t" "aesenc %%xmm1, %%xmm10\n\t" "aesenc %%xmm1, %%xmm11\n\t" "movdqa 0xd0(%[key]), %%xmm1\n\t" "aesenc %%xmm1, %%xmm0\n\t" "aesenc %%xmm1, %%xmm2\n\t" "aesenc %%xmm1, %%xmm3\n\t" "aesenc %%xmm1, %%xmm4\n\t" "aesenc %%xmm1, %%xmm8\n\t" "aesenc %%xmm1, %%xmm9\n\t" "aesenc %%xmm1, %%xmm10\n\t" "aesenc %%xmm1, %%xmm11\n\t" "movdqa 0xe0(%[key]), %%xmm1\n" ".Lenclast%=:\n\t" : : [key] "r" (ctx->keyschenc), [rounds] "r" (ctx->rounds), [src] "r" (a) : "cc", "memory"); asm volatile ("pxor %%xmm1, %%xmm12\n\t" /* block1 ^= lastkey */ "pxor %%xmm1, %%xmm13\n\t" /* block2 ^= lastkey */ "pxor %%xmm1, %%xmm14\n\t" /* 
block3 ^= lastkey */
                "pxor %%xmm1, %%xmm15\n\t"          /* block4 ^= lastkey */
                "aesenclast %%xmm12, %%xmm0\n\t"
                "aesenclast %%xmm13, %%xmm2\n\t"
                "aesenclast %%xmm14, %%xmm3\n\t"
                "aesenclast %%xmm15, %%xmm4\n\t"
                "movdqu 5*16(%[src]), %%xmm12\n\t"  /* Get block 6. */
                "movdqu 6*16(%[src]), %%xmm13\n\t"  /* Get block 7. */
                "movdqu 7*16(%[src]), %%xmm14\n\t"  /* Get block 8. */
                "movdqu %%xmm0, 0*16(%[dst])\n\t"   /* Store block 1. */
                "movdqu %%xmm2, 1*16(%[dst])\n\t"   /* Store block 2. */
                "movdqu %%xmm3, 2*16(%[dst])\n\t"   /* Store block 3. */
                "movdqu %%xmm4, 3*16(%[dst])\n\t"   /* Store block 4. */
                "pxor %%xmm1, %%xmm7\n\t"           /* block5 ^= lastkey */
                "pxor %%xmm1, %%xmm12\n\t"          /* block6 ^= lastkey */
                "pxor %%xmm1, %%xmm13\n\t"          /* block7 ^= lastkey */
                "pxor %%xmm1, %%xmm14\n\t"          /* block8 ^= lastkey */
                "aesenclast %%xmm7, %%xmm8\n\t"
                "aesenclast %%xmm12, %%xmm9\n\t"
                "aesenclast %%xmm13, %%xmm10\n\t"
                "aesenclast %%xmm14, %%xmm11\n\t"
                "movdqu %%xmm8, 4*16(%[dst])\n\t"   /* Store block 5. */
                "movdqu %%xmm9, 5*16(%[dst])\n\t"   /* Store block 6. */
                "movdqu %%xmm10, 6*16(%[dst])\n\t"  /* Store block 7. */
                "movdqu %%xmm11, 7*16(%[dst])\n\t"  /* Store block 8. */
                :
                : [src] "r" (a), [dst] "r" (b)
                : "memory");
}
#endif /* __x86_64__ */


unsigned int ASM_FUNC_ATTR
_gcry_aes_aesni_encrypt (const RIJNDAEL_context *ctx, unsigned char *dst,
                         const unsigned char *src)
{
  aesni_prepare ();
  asm volatile ("movdqu %[src], %%xmm0\n\t"
                :
                : [src] "m" (*src)
                : "memory" );
  do_aesni_enc (ctx);
  asm volatile ("movdqu %%xmm0, %[dst]\n\t"
                : [dst] "=m" (*dst)
                :
                : "memory" );
  aesni_cleanup ();
  return 0;
}


void ASM_FUNC_ATTR
_gcry_aes_aesni_cfb_enc (RIJNDAEL_context *ctx, unsigned char *iv,
                         unsigned char *outbuf, const unsigned char *inbuf,
                         size_t nblocks)
{
  aesni_prepare ();

  asm volatile ("movdqu %[iv], %%xmm0\n\t"
                : /* No output */
                : [iv] "m" (*iv)
                : "memory" );

  for ( ;nblocks; nblocks-- )
    {
      do_aesni_enc (ctx);

      asm volatile ("movdqu %[inbuf], %%xmm1\n\t"
                    "pxor %%xmm1, %%xmm0\n\t"
                    "movdqu %%xmm0, %[outbuf]\n\t"
                    : [outbuf] "=m" (*outbuf)
                    : [inbuf] "m" (*inbuf)
                    : "memory" );

      outbuf += BLOCKSIZE;
      inbuf += BLOCKSIZE;
    }

  asm volatile ("movdqu %%xmm0, %[iv]\n\t"
                : [iv] "=m" (*iv)
                :
                : "memory" );

  aesni_cleanup ();
}


void ASM_FUNC_ATTR
_gcry_aes_aesni_cbc_enc (RIJNDAEL_context *ctx, unsigned char *iv,
                         unsigned char *outbuf, const unsigned char *inbuf,
                         size_t nblocks, int cbc_mac)
{
  aesni_prepare_2_7_variable;

  aesni_prepare ();
  aesni_prepare_2_7();

  asm volatile ("movdqu %[iv], %%xmm5\n\t"
                : /* No output */
                : [iv] "m" (*iv)
                : "memory" );

  for ( ;nblocks; nblocks-- )
    {
      asm volatile ("movdqu %[inbuf], %%xmm0\n\t"
                    "pxor %%xmm5, %%xmm0\n\t"
                    : /* No output */
                    : [inbuf] "m" (*inbuf)
                    : "memory" );

      do_aesni_enc (ctx);

      asm volatile ("movdqa %%xmm0, %%xmm5\n\t"
                    "movdqu %%xmm0, %[outbuf]\n\t"
                    : [outbuf] "=m" (*outbuf)
                    :
                    : "memory" );

      inbuf += BLOCKSIZE;
      if (!cbc_mac)
        outbuf += BLOCKSIZE;
    }

  asm volatile ("movdqu %%xmm5, %[iv]\n\t"
                : [iv] "=m" (*iv)
                :
                : "memory" );

  aesni_cleanup ();
  aesni_cleanup_2_7 ();
}


void ASM_FUNC_ATTR
_gcry_aes_aesni_ctr_enc (RIJNDAEL_context *ctx, unsigned char *ctr,
                         unsigned char *outbuf, const unsigned char *inbuf,
                         size_t nblocks)
{
  static const unsigned char be_mask[16] __attribute__ ((aligned (16))) =
    { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
  aesni_prepare_2_7_variable;

  aesni_prepare ();
  aesni_prepare_2_7();

  asm volatile ("movdqa %[mask], %%xmm6\n\t" /* Preload mask */
                "movdqa %[ctr], %%xmm5\n\t"  /* Preload CTR */
                : /* No output */
                : [mask] "m" (*be_mask),
                  [ctr] "m" (*ctr)
                : "memory");

#ifdef __x86_64__
  if (nblocks >= 8)
    {
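      /* Bulk path: eight blocks per do_aesni_ctr_8 call.  This also
         uses xmm8-xmm15, so on WIN64 those callee-saved registers are
         saved and restored by the aesni_prepare_8_15() and
         aesni_cleanup_8_15() pair. */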
aesni_prepare_8_15_variable; aesni_prepare_8_15(); for ( ;nblocks >= 8 ; nblocks -= 8 ) { do_aesni_ctr_8 (ctx, ctr, outbuf, inbuf); outbuf += 8*BLOCKSIZE; inbuf += 8*BLOCKSIZE; } aesni_cleanup_8_15(); } #endif for ( ;nblocks >= 4 ; nblocks -= 4 ) { do_aesni_ctr_4 (ctx, ctr, outbuf, inbuf); outbuf += 4*BLOCKSIZE; inbuf += 4*BLOCKSIZE; } for ( ;nblocks; nblocks-- ) { do_aesni_ctr (ctx, ctr, outbuf, inbuf); outbuf += BLOCKSIZE; inbuf += BLOCKSIZE; } aesni_cleanup (); aesni_cleanup_2_7 (); } unsigned int ASM_FUNC_ATTR _gcry_aes_aesni_decrypt (const RIJNDAEL_context *ctx, unsigned char *dst, const unsigned char *src) { aesni_prepare (); asm volatile ("movdqu %[src], %%xmm0\n\t" : : [src] "m" (*src) : "memory" ); do_aesni_dec (ctx); asm volatile ("movdqu %%xmm0, %[dst]\n\t" : [dst] "=m" (*dst) : : "memory" ); aesni_cleanup (); return 0; } void ASM_FUNC_ATTR _gcry_aes_aesni_cfb_dec (RIJNDAEL_context *ctx, unsigned char *iv, unsigned char *outbuf, const unsigned char *inbuf, size_t nblocks) { aesni_prepare_2_7_variable; aesni_prepare (); aesni_prepare_2_7(); asm volatile ("movdqu %[iv], %%xmm6\n\t" : /* No output */ : [iv] "m" (*iv) : "memory" ); /* CFB decryption can be parallelized */ #ifdef __x86_64__ if (nblocks >= 8) { aesni_prepare_8_15_variable; aesni_prepare_8_15(); for ( ;nblocks >= 8; nblocks -= 8) { asm volatile ("movdqa (%[key]), %%xmm0\n\t" "movdqu %%xmm6, %%xmm1\n\t" /* load input blocks */ "movdqu 0*16(%[inbuf]), %%xmm2\n\t" "movdqu 1*16(%[inbuf]), %%xmm3\n\t" "movdqu 2*16(%[inbuf]), %%xmm4\n\t" "movdqu 3*16(%[inbuf]), %%xmm8\n\t" "movdqu 4*16(%[inbuf]), %%xmm9\n\t" "movdqu 5*16(%[inbuf]), %%xmm10\n\t" "movdqu 6*16(%[inbuf]), %%xmm11\n\t" "movdqu 7*16(%[inbuf]), %%xmm6\n\t" /* update IV */ "movdqa %%xmm2, %%xmm12\n\t" "movdqa %%xmm3, %%xmm13\n\t" "movdqa %%xmm4, %%xmm14\n\t" "movdqa %%xmm8, %%xmm15\n\t" "pxor %%xmm0, %%xmm1\n\t" /* xmm1 ^= key[0] */ "pxor %%xmm0, %%xmm2\n\t" /* xmm2 ^= key[0] */ "pxor %%xmm0, %%xmm3\n\t" /* xmm3 ^= key[0] */ "pxor %%xmm0, %%xmm4\n\t" /* xmm4 ^= key[0] */ "pxor %%xmm0, %%xmm8\n\t" /* xmm8 ^= key[0] */ "pxor %%xmm0, %%xmm9\n\t" /* xmm9 ^= key[0] */ "pxor %%xmm0, %%xmm10\n\t" /* xmm10 ^= key[0] */ "pxor %%xmm0, %%xmm11\n\t" /* xmm11 ^= key[0] */ : /* No output */ : [inbuf] "r" (inbuf), [key] "r" (ctx->keyschenc) : "memory"); do_aesni_enc_vec8 (ctx); asm volatile ( "pxor %%xmm0, %%xmm12\n\t" "pxor %%xmm0, %%xmm13\n\t" "pxor %%xmm0, %%xmm14\n\t" "pxor %%xmm0, %%xmm15\n\t" "aesenclast %%xmm12, %%xmm1\n\t" "aesenclast %%xmm13, %%xmm2\n\t" "aesenclast %%xmm14, %%xmm3\n\t" "aesenclast %%xmm15, %%xmm4\n\t" "movdqu 4*16(%[inbuf]), %%xmm12\n\t" "movdqu 5*16(%[inbuf]), %%xmm13\n\t" "movdqu 6*16(%[inbuf]), %%xmm14\n\t" "movdqu 7*16(%[inbuf]), %%xmm15\n\t" "pxor %%xmm0, %%xmm12\n\t" "pxor %%xmm0, %%xmm13\n\t" "pxor %%xmm0, %%xmm14\n\t" "pxor %%xmm0, %%xmm15\n\t" "aesenclast %%xmm12, %%xmm8\n\t" "aesenclast %%xmm13, %%xmm9\n\t" "aesenclast %%xmm14, %%xmm10\n\t" "aesenclast %%xmm15, %%xmm11\n\t" "movdqu %%xmm1, 0*16(%[outbuf])\n\t" "movdqu %%xmm2, 1*16(%[outbuf])\n\t" "movdqu %%xmm3, 2*16(%[outbuf])\n\t" "movdqu %%xmm4, 3*16(%[outbuf])\n\t" "movdqu %%xmm8, 4*16(%[outbuf])\n\t" "movdqu %%xmm9, 5*16(%[outbuf])\n\t" "movdqu %%xmm10, 6*16(%[outbuf])\n\t" "movdqu %%xmm11, 7*16(%[outbuf])\n\t" : /* No output */ : [inbuf] "r" (inbuf), [outbuf] "r" (outbuf) : "memory"); outbuf += 8*BLOCKSIZE; inbuf += 8*BLOCKSIZE; } aesni_cleanup_8_15(); } #endif for ( ;nblocks >= 4; nblocks -= 4) { asm volatile ("movdqu %%xmm6, %%xmm1\n\t" /* load input blocks */ "movdqu 0*16(%[inbuf]), 
%%xmm2\n\t" "movdqu 1*16(%[inbuf]), %%xmm3\n\t" "movdqu 2*16(%[inbuf]), %%xmm4\n\t" "movdqu 3*16(%[inbuf]), %%xmm6\n\t" /* update IV */ : /* No output */ : [inbuf] "r" (inbuf) : "memory"); do_aesni_enc_vec4 (ctx); asm volatile ("movdqu 0*16(%[inbuf]), %%xmm5\n\t" "pxor %%xmm5, %%xmm1\n\t" "movdqu %%xmm1, 0*16(%[outbuf])\n\t" "movdqu 1*16(%[inbuf]), %%xmm5\n\t" "pxor %%xmm5, %%xmm2\n\t" "movdqu %%xmm2, 1*16(%[outbuf])\n\t" "movdqu 2*16(%[inbuf]), %%xmm5\n\t" "pxor %%xmm5, %%xmm3\n\t" "movdqu %%xmm3, 2*16(%[outbuf])\n\t" "movdqu 3*16(%[inbuf]), %%xmm5\n\t" "pxor %%xmm5, %%xmm4\n\t" "movdqu %%xmm4, 3*16(%[outbuf])\n\t" : /* No output */ : [inbuf] "r" (inbuf), [outbuf] "r" (outbuf) : "memory"); outbuf += 4*BLOCKSIZE; inbuf += 4*BLOCKSIZE; } asm volatile ("movdqu %%xmm6, %%xmm0\n\t" ::: "cc"); for ( ;nblocks; nblocks-- ) { do_aesni_enc (ctx); asm volatile ("movdqa %%xmm0, %%xmm6\n\t" "movdqu %[inbuf], %%xmm0\n\t" "pxor %%xmm0, %%xmm6\n\t" "movdqu %%xmm6, %[outbuf]\n\t" : [outbuf] "=m" (*outbuf) : [inbuf] "m" (*inbuf) : "memory" ); outbuf += BLOCKSIZE; inbuf += BLOCKSIZE; } asm volatile ("movdqu %%xmm0, %[iv]\n\t" : [iv] "=m" (*iv) : : "memory" ); aesni_cleanup (); aesni_cleanup_2_7 (); } void ASM_FUNC_ATTR _gcry_aes_aesni_cbc_dec (RIJNDAEL_context *ctx, unsigned char *iv, unsigned char *outbuf, const unsigned char *inbuf, size_t nblocks) { aesni_prepare_2_7_variable; aesni_prepare (); aesni_prepare_2_7(); if ( !ctx->decryption_prepared ) { do_aesni_prepare_decryption ( ctx ); ctx->decryption_prepared = 1; } asm volatile ("movdqu %[iv], %%xmm5\n\t" /* use xmm5 as fast IV storage */ : /* No output */ : [iv] "m" (*iv) : "memory"); #ifdef __x86_64__ if (nblocks >= 8) { aesni_prepare_8_15_variable; aesni_prepare_8_15(); for ( ;nblocks >= 8 ; nblocks -= 8 ) { asm volatile ("movdqa (%[key]), %%xmm0\n\t" "movdqu 0*16(%[inbuf]), %%xmm1\n\t" /* load input blocks */ "movdqu 1*16(%[inbuf]), %%xmm2\n\t" "movdqu 2*16(%[inbuf]), %%xmm3\n\t" "movdqu 3*16(%[inbuf]), %%xmm4\n\t" "movdqu 4*16(%[inbuf]), %%xmm8\n\t" "movdqu 5*16(%[inbuf]), %%xmm9\n\t" "movdqu 6*16(%[inbuf]), %%xmm10\n\t" "movdqu 7*16(%[inbuf]), %%xmm11\n\t" "movdqa %%xmm1, %%xmm12\n\t" "movdqa %%xmm2, %%xmm13\n\t" "movdqa %%xmm3, %%xmm14\n\t" "movdqa %%xmm4, %%xmm15\n\t" "pxor %%xmm0, %%xmm1\n\t" /* xmm1 ^= key[0] */ "pxor %%xmm0, %%xmm2\n\t" /* xmm2 ^= key[0] */ "pxor %%xmm0, %%xmm3\n\t" /* xmm3 ^= key[0] */ "pxor %%xmm0, %%xmm4\n\t" /* xmm4 ^= key[0] */ "pxor %%xmm0, %%xmm8\n\t" /* xmm8 ^= key[0] */ "pxor %%xmm0, %%xmm9\n\t" /* xmm9 ^= key[0] */ "pxor %%xmm0, %%xmm10\n\t" /* xmm10 ^= key[0] */ "pxor %%xmm0, %%xmm11\n\t" /* xmm11 ^= key[0] */ : /* No output */ : [inbuf] "r" (inbuf), [key] "r" (ctx->keyschdec) : "memory"); do_aesni_dec_vec8 (ctx); asm volatile ( "pxor %%xmm0, %%xmm5\n\t" /* xor IV with key */ "pxor %%xmm0, %%xmm12\n\t" /* xor IV with key */ "pxor %%xmm0, %%xmm13\n\t" /* xor IV with key */ "pxor %%xmm0, %%xmm14\n\t" /* xor IV with key */ "pxor %%xmm0, %%xmm15\n\t" /* xor IV with key */ "aesdeclast %%xmm5, %%xmm1\n\t" "aesdeclast %%xmm12, %%xmm2\n\t" "aesdeclast %%xmm13, %%xmm3\n\t" "aesdeclast %%xmm14, %%xmm4\n\t" "movdqu 4*16(%[inbuf]), %%xmm12\n\t" "movdqu 5*16(%[inbuf]), %%xmm13\n\t" "movdqu 6*16(%[inbuf]), %%xmm14\n\t" "movdqu 7*16(%[inbuf]), %%xmm5\n\t" "pxor %%xmm0, %%xmm12\n\t" /* xor IV with key */ "pxor %%xmm0, %%xmm13\n\t" /* xor IV with key */ "pxor %%xmm0, %%xmm14\n\t" /* xor IV with key */ "aesdeclast %%xmm15, %%xmm8\n\t" "aesdeclast %%xmm12, %%xmm9\n\t" "aesdeclast %%xmm13, %%xmm10\n\t" "aesdeclast %%xmm14, 
%%xmm11\n\t" "movdqu %%xmm1, 0*16(%[outbuf])\n\t" "movdqu %%xmm2, 1*16(%[outbuf])\n\t" "movdqu %%xmm3, 2*16(%[outbuf])\n\t" "movdqu %%xmm4, 3*16(%[outbuf])\n\t" "movdqu %%xmm8, 4*16(%[outbuf])\n\t" "movdqu %%xmm9, 5*16(%[outbuf])\n\t" "movdqu %%xmm10, 6*16(%[outbuf])\n\t" "movdqu %%xmm11, 7*16(%[outbuf])\n\t" : /* No output */ : [inbuf] "r" (inbuf), [outbuf] "r" (outbuf) : "memory"); outbuf += 8*BLOCKSIZE; inbuf += 8*BLOCKSIZE; } aesni_cleanup_8_15(); } #endif for ( ;nblocks >= 4 ; nblocks -= 4 ) { asm volatile ("movdqu 0*16(%[inbuf]), %%xmm1\n\t" /* load input blocks */ "movdqu 1*16(%[inbuf]), %%xmm2\n\t" "movdqu 2*16(%[inbuf]), %%xmm3\n\t" "movdqu 3*16(%[inbuf]), %%xmm4\n\t" : /* No output */ : [inbuf] "r" (inbuf) : "memory"); do_aesni_dec_vec4 (ctx); asm volatile ("pxor %%xmm5, %%xmm1\n\t" /* xor IV with output */ "movdqu 0*16(%[inbuf]), %%xmm5\n\t" /* load new IV */ "movdqu %%xmm1, 0*16(%[outbuf])\n\t" "pxor %%xmm5, %%xmm2\n\t" /* xor IV with output */ "movdqu 1*16(%[inbuf]), %%xmm5\n\t" /* load new IV */ "movdqu %%xmm2, 1*16(%[outbuf])\n\t" "pxor %%xmm5, %%xmm3\n\t" /* xor IV with output */ "movdqu 2*16(%[inbuf]), %%xmm5\n\t" /* load new IV */ "movdqu %%xmm3, 2*16(%[outbuf])\n\t" "pxor %%xmm5, %%xmm4\n\t" /* xor IV with output */ "movdqu 3*16(%[inbuf]), %%xmm5\n\t" /* load new IV */ "movdqu %%xmm4, 3*16(%[outbuf])\n\t" : /* No output */ : [inbuf] "r" (inbuf), [outbuf] "r" (outbuf) : "memory"); outbuf += 4*BLOCKSIZE; inbuf += 4*BLOCKSIZE; } for ( ;nblocks; nblocks-- ) { asm volatile ("movdqu %[inbuf], %%xmm0\n\t" "movdqa %%xmm0, %%xmm2\n\t" /* use xmm2 as savebuf */ : /* No output */ : [inbuf] "m" (*inbuf) : "memory"); /* uses only xmm0 and xmm1 */ do_aesni_dec (ctx); asm volatile ("pxor %%xmm5, %%xmm0\n\t" /* xor IV with output */ "movdqu %%xmm0, %[outbuf]\n\t" "movdqu %%xmm2, %%xmm5\n\t" /* store savebuf as new IV */ : [outbuf] "=m" (*outbuf) : : "memory"); outbuf += BLOCKSIZE; inbuf += BLOCKSIZE; } asm volatile ("movdqu %%xmm5, %[iv]\n\t" /* store IV */ : /* No output */ : [iv] "m" (*iv) : "memory"); aesni_cleanup (); aesni_cleanup_2_7 (); } static ASM_FUNC_ATTR_INLINE void aesni_ocb_checksum (gcry_cipher_hd_t c, const unsigned char *plaintext, size_t nblocks) { RIJNDAEL_context *ctx = (void *)&c->context.c; /* Calculate checksum */ asm volatile ("movdqu %[checksum], %%xmm6\n\t" "pxor %%xmm1, %%xmm1\n\t" "pxor %%xmm2, %%xmm2\n\t" "pxor %%xmm3, %%xmm3\n\t" : :[checksum] "m" (*c->u_ctr.ctr) : "memory" ); if (0) {} #if defined(HAVE_GCC_INLINE_ASM_AVX2) else if (nblocks >= 16 && ctx->use_avx2) { /* Use wider 256-bit registers for fast xoring of plaintext. 
*/ asm volatile ("vzeroupper\n\t" "vpxor %%xmm0, %%xmm0, %%xmm0\n\t" "vpxor %%xmm4, %%xmm4, %%xmm4\n\t" "vpxor %%xmm5, %%xmm5, %%xmm5\n\t" "vpxor %%xmm7, %%xmm7, %%xmm7\n\t" : : : "memory"); for (;nblocks >= 16; nblocks -= 16) { asm volatile ("vpxor %[ptr0], %%ymm6, %%ymm6\n\t" "vpxor %[ptr1], %%ymm1, %%ymm1\n\t" "vpxor %[ptr2], %%ymm2, %%ymm2\n\t" "vpxor %[ptr3], %%ymm3, %%ymm3\n\t" - "vpxor %[ptr4], %%ymm0, %%ymm0\n\t" - "vpxor %[ptr5], %%ymm4, %%ymm4\n\t" - "vpxor %[ptr6], %%ymm5, %%ymm5\n\t" - "vpxor %[ptr7], %%ymm7, %%ymm7\n\t" : : [ptr0] "m" (*(plaintext + 0 * BLOCKSIZE * 2)), [ptr1] "m" (*(plaintext + 1 * BLOCKSIZE * 2)), [ptr2] "m" (*(plaintext + 2 * BLOCKSIZE * 2)), - [ptr3] "m" (*(plaintext + 3 * BLOCKSIZE * 2)), - [ptr4] "m" (*(plaintext + 4 * BLOCKSIZE * 2)), + [ptr3] "m" (*(plaintext + 3 * BLOCKSIZE * 2)) + : "memory" ); + asm volatile ("vpxor %[ptr4], %%ymm0, %%ymm0\n\t" + "vpxor %[ptr5], %%ymm4, %%ymm4\n\t" + "vpxor %[ptr6], %%ymm5, %%ymm5\n\t" + "vpxor %[ptr7], %%ymm7, %%ymm7\n\t" + : + : [ptr4] "m" (*(plaintext + 4 * BLOCKSIZE * 2)), [ptr5] "m" (*(plaintext + 5 * BLOCKSIZE * 2)), [ptr6] "m" (*(plaintext + 6 * BLOCKSIZE * 2)), [ptr7] "m" (*(plaintext + 7 * BLOCKSIZE * 2)) : "memory" ); plaintext += BLOCKSIZE * 16; } asm volatile ("vpxor %%ymm0, %%ymm6, %%ymm6\n\t" "vpxor %%ymm4, %%ymm1, %%ymm1\n\t" "vpxor %%ymm5, %%ymm2, %%ymm2\n\t" "vpxor %%ymm7, %%ymm3, %%ymm3\n\t" "vextracti128 $1, %%ymm6, %%xmm0\n\t" "vextracti128 $1, %%ymm1, %%xmm4\n\t" "vextracti128 $1, %%ymm2, %%xmm5\n\t" "vextracti128 $1, %%ymm3, %%xmm7\n\t" "vpxor %%xmm0, %%xmm6, %%xmm6\n\t" "vpxor %%xmm4, %%xmm1, %%xmm1\n\t" "vpxor %%xmm5, %%xmm2, %%xmm2\n\t" "vpxor %%xmm7, %%xmm3, %%xmm3\n\t" "vzeroupper\n\t" : : : "memory" ); } #endif #if defined(HAVE_GCC_INLINE_ASM_AVX) else if (nblocks >= 16 && ctx->use_avx) { /* Same as AVX2, except using 256-bit floating point instructions. 
*/ asm volatile ("vzeroupper\n\t" "vxorpd %%xmm0, %%xmm0, %%xmm0\n\t" "vxorpd %%xmm4, %%xmm4, %%xmm4\n\t" "vxorpd %%xmm5, %%xmm5, %%xmm5\n\t" "vxorpd %%xmm7, %%xmm7, %%xmm7\n\t" : : : "memory"); for (;nblocks >= 16; nblocks -= 16) { asm volatile ("vxorpd %[ptr0], %%ymm6, %%ymm6\n\t" "vxorpd %[ptr1], %%ymm1, %%ymm1\n\t" "vxorpd %[ptr2], %%ymm2, %%ymm2\n\t" "vxorpd %[ptr3], %%ymm3, %%ymm3\n\t" - "vxorpd %[ptr4], %%ymm0, %%ymm0\n\t" - "vxorpd %[ptr5], %%ymm4, %%ymm4\n\t" - "vxorpd %[ptr6], %%ymm5, %%ymm5\n\t" - "vxorpd %[ptr7], %%ymm7, %%ymm7\n\t" : : [ptr0] "m" (*(plaintext + 0 * BLOCKSIZE * 2)), [ptr1] "m" (*(plaintext + 1 * BLOCKSIZE * 2)), [ptr2] "m" (*(plaintext + 2 * BLOCKSIZE * 2)), - [ptr3] "m" (*(plaintext + 3 * BLOCKSIZE * 2)), - [ptr4] "m" (*(plaintext + 4 * BLOCKSIZE * 2)), + [ptr3] "m" (*(plaintext + 3 * BLOCKSIZE * 2)) + : "memory" ); + asm volatile ("vxorpd %[ptr4], %%ymm0, %%ymm0\n\t" + "vxorpd %[ptr5], %%ymm4, %%ymm4\n\t" + "vxorpd %[ptr6], %%ymm5, %%ymm5\n\t" + "vxorpd %[ptr7], %%ymm7, %%ymm7\n\t" + : + : [ptr4] "m" (*(plaintext + 4 * BLOCKSIZE * 2)), [ptr5] "m" (*(plaintext + 5 * BLOCKSIZE * 2)), [ptr6] "m" (*(plaintext + 6 * BLOCKSIZE * 2)), [ptr7] "m" (*(plaintext + 7 * BLOCKSIZE * 2)) : "memory" ); plaintext += BLOCKSIZE * 16; } asm volatile ("vxorpd %%ymm0, %%ymm6, %%ymm6\n\t" "vxorpd %%ymm4, %%ymm1, %%ymm1\n\t" "vxorpd %%ymm5, %%ymm2, %%ymm2\n\t" "vxorpd %%ymm7, %%ymm3, %%ymm3\n\t" "vextractf128 $1, %%ymm6, %%xmm0\n\t" "vextractf128 $1, %%ymm1, %%xmm4\n\t" "vextractf128 $1, %%ymm2, %%xmm5\n\t" "vextractf128 $1, %%ymm3, %%xmm7\n\t" "vxorpd %%xmm0, %%xmm6, %%xmm6\n\t" "vxorpd %%xmm4, %%xmm1, %%xmm1\n\t" "vxorpd %%xmm5, %%xmm2, %%xmm2\n\t" "vxorpd %%xmm7, %%xmm3, %%xmm3\n\t" "vzeroupper\n\t" : : : "memory" ); } #endif for (;nblocks >= 4; nblocks -= 4) { asm volatile ("movdqu %[ptr0], %%xmm0\n\t" "movdqu %[ptr1], %%xmm4\n\t" "movdqu %[ptr2], %%xmm5\n\t" "movdqu %[ptr3], %%xmm7\n\t" "pxor %%xmm0, %%xmm6\n\t" "pxor %%xmm4, %%xmm1\n\t" "pxor %%xmm5, %%xmm2\n\t" "pxor %%xmm7, %%xmm3\n\t" : : [ptr0] "m" (*(plaintext + 0 * BLOCKSIZE)), [ptr1] "m" (*(plaintext + 1 * BLOCKSIZE)), [ptr2] "m" (*(plaintext + 2 * BLOCKSIZE)), [ptr3] "m" (*(plaintext + 3 * BLOCKSIZE)) : "memory" ); plaintext += BLOCKSIZE * 4; } for (;nblocks >= 1; nblocks -= 1) { asm volatile ("movdqu %[ptr0], %%xmm0\n\t" "pxor %%xmm0, %%xmm6\n\t" : : [ptr0] "m" (*(plaintext + 0 * BLOCKSIZE)) : "memory" ); plaintext += BLOCKSIZE; } asm volatile ("pxor %%xmm1, %%xmm6\n\t" "pxor %%xmm2, %%xmm6\n\t" "pxor %%xmm3, %%xmm6\n\t" "movdqu %%xmm6, %[checksum]\n\t" : [checksum] "=m" (*c->u_ctr.ctr) : : "memory" ); } static unsigned int ASM_FUNC_ATTR_NOINLINE aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg, const void *inbuf_arg, size_t nblocks) { RIJNDAEL_context *ctx = (void *)&c->context.c; unsigned char *outbuf = outbuf_arg; const unsigned char *inbuf = inbuf_arg; u64 n = c->u_mode.ocb.data_nblocks; const unsigned char *l; byte tmpbuf_store[3 * 16 + 15]; byte *tmpbuf; aesni_prepare_2_7_variable; asm volatile ("" : "=r" (tmpbuf) : "0" (tmpbuf_store) : "memory"); tmpbuf = tmpbuf + (-(uintptr_t)tmpbuf & 15); aesni_prepare (); aesni_prepare_2_7 (); /* Preload Offset */ asm volatile ("movdqu %[iv], %%xmm5\n\t" "movdqu %[ctr], %%xmm7\n\t" : /* No output */ : [iv] "m" (*c->u_iv.iv), [ctr] "m" (*c->u_ctr.ctr) : "memory" ); for ( ;nblocks && n % 4; nblocks-- ) { l = aes_ocb_get_l(c, ++n); /* Checksum_i = Checksum_{i-1} xor P_i */ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ asm 
volatile ("movdqu %[l], %%xmm1\n\t" "movdqu %[inbuf], %%xmm0\n\t" "pxor %%xmm1, %%xmm5\n\t" "pxor %%xmm0, %%xmm7\n\t" "pxor %%xmm5, %%xmm0\n\t" : : [l] "m" (*l), [inbuf] "m" (*inbuf) : "memory" ); do_aesni_enc (ctx); asm volatile ("pxor %%xmm5, %%xmm0\n\t" "movdqu %%xmm0, %[outbuf]\n\t" : [outbuf] "=m" (*outbuf) : : "memory" ); inbuf += BLOCKSIZE; outbuf += BLOCKSIZE; } #ifdef __x86_64__ if (nblocks >= 8) { unsigned char last_xor_first_key_store[16 + 15]; unsigned char *lxf_key; aesni_prepare_8_15_variable; asm volatile ("" : "=r" (lxf_key) : "0" (last_xor_first_key_store) : "memory"); lxf_key = lxf_key + (-(uintptr_t)lxf_key & 15); aesni_prepare_8_15(); asm volatile ("movdqu %[l0], %%xmm6\n\t" "movdqa %[last_key], %%xmm0\n\t" "pxor %[first_key], %%xmm5\n\t" "pxor %[first_key], %%xmm0\n\t" "movdqa %%xmm0, %[lxfkey]\n\t" : [lxfkey] "=m" (*lxf_key) : [l0] "m" (*c->u_mode.ocb.L[0]), [last_key] "m" (ctx->keyschenc[ctx->rounds][0][0]), [first_key] "m" (ctx->keyschenc[0][0][0]) : "memory" ); for ( ;nblocks >= 8 ; nblocks -= 8 ) { n += 4; l = aes_ocb_get_l(c, n); asm volatile ("movdqu %[l0l1], %%xmm10\n\t" "movdqu %[l1], %%xmm11\n\t" "movdqu %[l3], %%xmm15\n\t" : : [l0l1] "m" (*c->u_mode.ocb.L0L1), [l1] "m" (*c->u_mode.ocb.L[1]), [l3] "m" (*l) : "memory" ); n += 4; l = aes_ocb_get_l(c, n); /* Checksum_i = Checksum_{i-1} xor P_i */ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* P_i = Offset_i xor ENCIPHER(K, C_i xor Offset_i) */ asm volatile ("movdqu %[inbuf0], %%xmm1\n\t" "movdqu %[inbuf1], %%xmm2\n\t" "movdqu %[inbuf2], %%xmm3\n\t" : : [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE)), [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE)), [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE)) : "memory" ); asm volatile ("movdqu %[inbuf3], %%xmm4\n\t" "movdqu %[inbuf4], %%xmm8\n\t" "movdqu %[inbuf5], %%xmm9\n\t" : : [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE)), [inbuf4] "m" (*(inbuf + 4 * BLOCKSIZE)), [inbuf5] "m" (*(inbuf + 5 * BLOCKSIZE)) : "memory" ); asm volatile ("movdqa %[lxfkey], %%xmm0\n\t" "movdqa %%xmm6, %%xmm12\n\t" "pxor %%xmm5, %%xmm12\n\t" "pxor %%xmm1, %%xmm7\n\t" "pxor %%xmm12, %%xmm1\n\t" "pxor %%xmm0, %%xmm12\n\t" "movdqa %%xmm10, %%xmm13\n\t" "pxor %%xmm5, %%xmm13\n\t" "pxor %%xmm2, %%xmm7\n\t" "pxor %%xmm13, %%xmm2\n\t" "pxor %%xmm0, %%xmm13\n\t" "movdqa %%xmm11, %%xmm14\n\t" "pxor %%xmm5, %%xmm14\n\t" "pxor %%xmm3, %%xmm7\n\t" "pxor %%xmm14, %%xmm3\n\t" "pxor %%xmm0, %%xmm14\n\t" "pxor %%xmm11, %%xmm5\n\t" "pxor %%xmm15, %%xmm5\n\t" "pxor %%xmm4, %%xmm7\n\t" "pxor %%xmm5, %%xmm4\n\t" "movdqa %%xmm5, %%xmm15\n\t" "pxor %%xmm0, %%xmm15\n\t" "movdqa %%xmm5, %%xmm0\n\t" "pxor %%xmm6, %%xmm0\n\t" "pxor %%xmm8, %%xmm7\n\t" "pxor %%xmm0, %%xmm8\n\t" "pxor %[lxfkey], %%xmm0\n\t" "movdqa %%xmm0, %[tmpbuf0]\n\t" "movdqa %%xmm10, %%xmm0\n\t" "pxor %%xmm5, %%xmm0\n\t" "pxor %%xmm9, %%xmm7\n\t" "pxor %%xmm0, %%xmm9\n\t" "pxor %[lxfkey], %%xmm0\n" "movdqa %%xmm0, %[tmpbuf1]\n\t" : [tmpbuf0] "=m" (*(tmpbuf + 0 * BLOCKSIZE)), [tmpbuf1] "=m" (*(tmpbuf + 1 * BLOCKSIZE)) : [lxfkey] "m" (*lxf_key) : "memory" ); asm volatile ("movdqu %[inbuf6], %%xmm10\n\t" "movdqa %%xmm11, %%xmm0\n\t" "pxor %%xmm5, %%xmm0\n\t" "pxor %%xmm10, %%xmm7\n\t" "pxor %%xmm0, %%xmm10\n\t" "pxor %[lxfkey], %%xmm0\n\t" "movdqa %%xmm0, %[tmpbuf2]\n\t" : [tmpbuf2] "=m" (*(tmpbuf + 2 * BLOCKSIZE)) : [inbuf6] "m" (*(inbuf + 6 * BLOCKSIZE)), [lxfkey] "m" (*lxf_key) : "memory" ); asm volatile ("movdqu %[l7], %%xmm0\n\t" "pxor %%xmm11, %%xmm5\n\t" "pxor %%xmm0, %%xmm5\n\t" "movdqa 0x10(%[key]), %%xmm0\n\t" "movdqu %[inbuf7], %%xmm11\n\t" "pxor %%xmm11, %%xmm7\n\t" "pxor 
%%xmm5, %%xmm11\n\t" : : [l7] "m" (*l), [inbuf7] "m" (*(inbuf + 7 * BLOCKSIZE)), [key] "r" (ctx->keyschenc) : "memory" ); asm volatile ("cmpl $12, %[rounds]\n\t" "aesenc %%xmm0, %%xmm1\n\t" "aesenc %%xmm0, %%xmm2\n\t" "aesenc %%xmm0, %%xmm3\n\t" "aesenc %%xmm0, %%xmm4\n\t" "aesenc %%xmm0, %%xmm8\n\t" "aesenc %%xmm0, %%xmm9\n\t" "aesenc %%xmm0, %%xmm10\n\t" "aesenc %%xmm0, %%xmm11\n\t" "movdqa 0x20(%[key]), %%xmm0\n\t" "aesenc %%xmm0, %%xmm1\n\t" "aesenc %%xmm0, %%xmm2\n\t" "aesenc %%xmm0, %%xmm3\n\t" "aesenc %%xmm0, %%xmm4\n\t" "aesenc %%xmm0, %%xmm8\n\t" "aesenc %%xmm0, %%xmm9\n\t" "aesenc %%xmm0, %%xmm10\n\t" "aesenc %%xmm0, %%xmm11\n\t" "movdqa 0x30(%[key]), %%xmm0\n\t" "aesenc %%xmm0, %%xmm1\n\t" "aesenc %%xmm0, %%xmm2\n\t" "aesenc %%xmm0, %%xmm3\n\t" "aesenc %%xmm0, %%xmm4\n\t" "aesenc %%xmm0, %%xmm8\n\t" "aesenc %%xmm0, %%xmm9\n\t" "aesenc %%xmm0, %%xmm10\n\t" "aesenc %%xmm0, %%xmm11\n\t" "movdqa 0x40(%[key]), %%xmm0\n\t" "aesenc %%xmm0, %%xmm1\n\t" "aesenc %%xmm0, %%xmm2\n\t" "aesenc %%xmm0, %%xmm3\n\t" "aesenc %%xmm0, %%xmm4\n\t" "aesenc %%xmm0, %%xmm8\n\t" "aesenc %%xmm0, %%xmm9\n\t" "aesenc %%xmm0, %%xmm10\n\t" "aesenc %%xmm0, %%xmm11\n\t" "movdqa 0x50(%[key]), %%xmm0\n\t" "aesenc %%xmm0, %%xmm1\n\t" "aesenc %%xmm0, %%xmm2\n\t" "aesenc %%xmm0, %%xmm3\n\t" "aesenc %%xmm0, %%xmm4\n\t" "aesenc %%xmm0, %%xmm8\n\t" "aesenc %%xmm0, %%xmm9\n\t" "aesenc %%xmm0, %%xmm10\n\t" "aesenc %%xmm0, %%xmm11\n\t" "movdqa 0x60(%[key]), %%xmm0\n\t" "aesenc %%xmm0, %%xmm1\n\t" "aesenc %%xmm0, %%xmm2\n\t" "aesenc %%xmm0, %%xmm3\n\t" "aesenc %%xmm0, %%xmm4\n\t" "aesenc %%xmm0, %%xmm8\n\t" "aesenc %%xmm0, %%xmm9\n\t" "aesenc %%xmm0, %%xmm10\n\t" "aesenc %%xmm0, %%xmm11\n\t" "movdqa 0x70(%[key]), %%xmm0\n\t" "aesenc %%xmm0, %%xmm1\n\t" "aesenc %%xmm0, %%xmm2\n\t" "aesenc %%xmm0, %%xmm3\n\t" "aesenc %%xmm0, %%xmm4\n\t" "aesenc %%xmm0, %%xmm8\n\t" "aesenc %%xmm0, %%xmm9\n\t" "aesenc %%xmm0, %%xmm10\n\t" "aesenc %%xmm0, %%xmm11\n\t" "movdqa 0x80(%[key]), %%xmm0\n\t" "aesenc %%xmm0, %%xmm1\n\t" "aesenc %%xmm0, %%xmm2\n\t" "aesenc %%xmm0, %%xmm3\n\t" "aesenc %%xmm0, %%xmm4\n\t" "aesenc %%xmm0, %%xmm8\n\t" "aesenc %%xmm0, %%xmm9\n\t" "aesenc %%xmm0, %%xmm10\n\t" "aesenc %%xmm0, %%xmm11\n\t" "movdqa 0x90(%[key]), %%xmm0\n\t" "aesenc %%xmm0, %%xmm1\n\t" "aesenc %%xmm0, %%xmm2\n\t" "aesenc %%xmm0, %%xmm3\n\t" "aesenc %%xmm0, %%xmm4\n\t" "aesenc %%xmm0, %%xmm8\n\t" "aesenc %%xmm0, %%xmm9\n\t" "aesenc %%xmm0, %%xmm10\n\t" "aesenc %%xmm0, %%xmm11\n\t" "jb .Ldeclast%=\n\t" "movdqa 0xa0(%[key]), %%xmm0\n\t" "aesenc %%xmm0, %%xmm1\n\t" "aesenc %%xmm0, %%xmm2\n\t" "aesenc %%xmm0, %%xmm3\n\t" "aesenc %%xmm0, %%xmm4\n\t" "aesenc %%xmm0, %%xmm8\n\t" "aesenc %%xmm0, %%xmm9\n\t" "aesenc %%xmm0, %%xmm10\n\t" "aesenc %%xmm0, %%xmm11\n\t" "movdqa 0xb0(%[key]), %%xmm0\n\t" "aesenc %%xmm0, %%xmm1\n\t" "aesenc %%xmm0, %%xmm2\n\t" "aesenc %%xmm0, %%xmm3\n\t" "aesenc %%xmm0, %%xmm4\n\t" "aesenc %%xmm0, %%xmm8\n\t" "aesenc %%xmm0, %%xmm9\n\t" "aesenc %%xmm0, %%xmm10\n\t" "aesenc %%xmm0, %%xmm11\n\t" "je .Ldeclast%=\n\t" "movdqa 0xc0(%[key]), %%xmm0\n\t" "aesenc %%xmm0, %%xmm1\n\t" "aesenc %%xmm0, %%xmm2\n\t" "aesenc %%xmm0, %%xmm3\n\t" "aesenc %%xmm0, %%xmm4\n\t" "aesenc %%xmm0, %%xmm8\n\t" "aesenc %%xmm0, %%xmm9\n\t" "aesenc %%xmm0, %%xmm10\n\t" "aesenc %%xmm0, %%xmm11\n\t" "movdqa 0xd0(%[key]), %%xmm0\n\t" "aesenc %%xmm0, %%xmm1\n\t" "aesenc %%xmm0, %%xmm2\n\t" "aesenc %%xmm0, %%xmm3\n\t" "aesenc %%xmm0, %%xmm4\n\t" "aesenc %%xmm0, %%xmm8\n\t" "aesenc %%xmm0, %%xmm9\n\t" "aesenc %%xmm0, %%xmm10\n\t" "aesenc %%xmm0, %%xmm11\n\t" 
".Ldeclast%=:\n\t" : : [key] "r" (ctx->keyschenc), [rounds] "r" (ctx->rounds) : "cc", "memory"); asm volatile ("aesenclast %%xmm12, %%xmm1\n\t" "aesenclast %%xmm13, %%xmm2\n\t" "aesenclast %%xmm14, %%xmm3\n\t" "aesenclast %%xmm15, %%xmm4\n\t" "aesenclast %[tmpbuf0],%%xmm8\n\t" "aesenclast %[tmpbuf1],%%xmm9\n\t" "aesenclast %[tmpbuf2],%%xmm10\n\t" - "aesenclast %%xmm5, %%xmm11\n\t" + : + : [tmpbuf0] "m" (*(tmpbuf + 0 * BLOCKSIZE)), + [tmpbuf1] "m" (*(tmpbuf + 1 * BLOCKSIZE)), + [tmpbuf2] "m" (*(tmpbuf + 2 * BLOCKSIZE)), + [lxfkey] "m" (*lxf_key) + : "memory" ); + asm volatile ("aesenclast %%xmm5, %%xmm11\n\t" "pxor %[lxfkey], %%xmm11\n\t" "movdqu %%xmm1, %[outbuf0]\n\t" "movdqu %%xmm2, %[outbuf1]\n\t" - "movdqu %%xmm3, %[outbuf2]\n\t" + : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE)), + [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE)) + : [lxfkey] "m" (*lxf_key) + : "memory" ); + asm volatile ("movdqu %%xmm3, %[outbuf2]\n\t" "movdqu %%xmm4, %[outbuf3]\n\t" "movdqu %%xmm8, %[outbuf4]\n\t" - "movdqu %%xmm9, %[outbuf5]\n\t" + : [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE)), + [outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE)), + [outbuf4] "=m" (*(outbuf + 4 * BLOCKSIZE)) + : + : "memory" ); + asm volatile ("movdqu %%xmm9, %[outbuf5]\n\t" "movdqu %%xmm10, %[outbuf6]\n\t" "movdqu %%xmm11, %[outbuf7]\n\t" - : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE)), - [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE)), - [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE)), - [outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE)), - [outbuf4] "=m" (*(outbuf + 4 * BLOCKSIZE)), - [outbuf5] "=m" (*(outbuf + 5 * BLOCKSIZE)), + : [outbuf5] "=m" (*(outbuf + 5 * BLOCKSIZE)), [outbuf6] "=m" (*(outbuf + 6 * BLOCKSIZE)), [outbuf7] "=m" (*(outbuf + 7 * BLOCKSIZE)) - : [tmpbuf0] "m" (*(tmpbuf + 0 * BLOCKSIZE)), - [tmpbuf1] "m" (*(tmpbuf + 1 * BLOCKSIZE)), - [tmpbuf2] "m" (*(tmpbuf + 2 * BLOCKSIZE)), - [lxfkey] "m" (*lxf_key) + : : "memory" ); outbuf += 8*BLOCKSIZE; inbuf += 8*BLOCKSIZE; } asm volatile ("pxor %[first_key], %%xmm5\n\t" "pxor %%xmm0, %%xmm0\n\t" "movdqu %%xmm0, %[lxfkey]\n\t" : [lxfkey] "=m" (*lxf_key) : [first_key] "m" (ctx->keyschenc[0][0][0]) : "memory" ); aesni_cleanup_8_15(); } #endif for ( ;nblocks >= 4 ; nblocks -= 4 ) { n += 4; l = aes_ocb_get_l(c, n); /* Checksum_i = Checksum_{i-1} xor P_i */ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ asm volatile ("movdqu %[l0], %%xmm0\n\t" "movdqu %[inbuf0], %%xmm1\n\t" "movdqu %[l0l1], %%xmm3\n\t" : : [l0] "m" (*c->u_mode.ocb.L[0]), [l0l1] "m" (*c->u_mode.ocb.L0L1), [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE)) : "memory" ); asm volatile ("movdqu %[l1], %%xmm4\n\t" "movdqu %[l3], %%xmm6\n\t" "pxor %%xmm5, %%xmm0\n\t" "pxor %%xmm1, %%xmm7\n\t" "pxor %%xmm0, %%xmm1\n\t" "movdqa %%xmm0, %[tmpbuf0]\n\t" : [tmpbuf0] "=m" (*(tmpbuf + 0 * BLOCKSIZE)) : [l1] "m" (*c->u_mode.ocb.L[1]), [l3] "m" (*l) : "memory" ); asm volatile ("movdqu %[inbuf1], %%xmm2\n\t" "pxor %%xmm5, %%xmm3\n\t" "pxor %%xmm2, %%xmm7\n\t" "pxor %%xmm3, %%xmm2\n\t" "movdqa %%xmm3, %[tmpbuf1]\n\t" : [tmpbuf1] "=m" (*(tmpbuf + 1 * BLOCKSIZE)) : [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE)) : "memory" ); asm volatile ("movdqa %%xmm4, %%xmm0\n\t" "movdqu %[inbuf2], %%xmm3\n\t" "pxor %%xmm5, %%xmm0\n\t" "pxor %%xmm3, %%xmm7\n\t" "pxor %%xmm0, %%xmm3\n\t" "movdqa %%xmm0, %[tmpbuf2]\n\t" : [tmpbuf2] "=m" (*(tmpbuf + 2 * BLOCKSIZE)) : [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE)) : "memory" ); asm volatile ("pxor %%xmm6, %%xmm5\n\t" "pxor %%xmm4, %%xmm5\n\t" "movdqu %[inbuf3], %%xmm4\n\t" "pxor %%xmm4, %%xmm7\n\t" "pxor 
%%xmm5, %%xmm4\n\t" : : [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE)) : "memory" ); do_aesni_enc_vec4 (ctx); asm volatile ("pxor %[tmpbuf0],%%xmm1\n\t" "movdqu %%xmm1, %[outbuf0]\n\t" "pxor %[tmpbuf1],%%xmm2\n\t" "movdqu %%xmm2, %[outbuf1]\n\t" - "pxor %[tmpbuf2],%%xmm3\n\t" + : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE)), + [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE)) + : [tmpbuf0] "m" (*(tmpbuf + 0 * BLOCKSIZE)), + [tmpbuf1] "m" (*(tmpbuf + 1 * BLOCKSIZE)) + : "memory" ); + asm volatile ("pxor %[tmpbuf2],%%xmm3\n\t" "movdqu %%xmm3, %[outbuf2]\n\t" "pxor %%xmm5, %%xmm4\n\t" "movdqu %%xmm4, %[outbuf3]\n\t" - : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE)), - [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE)), - [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE)), + : [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE)), [outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE)) - : [tmpbuf0] "m" (*(tmpbuf + 0 * BLOCKSIZE)), - [tmpbuf1] "m" (*(tmpbuf + 1 * BLOCKSIZE)), - [tmpbuf2] "m" (*(tmpbuf + 2 * BLOCKSIZE)) + : [tmpbuf2] "m" (*(tmpbuf + 2 * BLOCKSIZE)) : "memory" ); outbuf += 4*BLOCKSIZE; inbuf += 4*BLOCKSIZE; } for ( ;nblocks; nblocks-- ) { l = aes_ocb_get_l(c, ++n); /* Checksum_i = Checksum_{i-1} xor P_i */ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ asm volatile ("movdqu %[l], %%xmm1\n\t" "movdqu %[inbuf], %%xmm0\n\t" "pxor %%xmm1, %%xmm5\n\t" "pxor %%xmm0, %%xmm7\n\t" "pxor %%xmm5, %%xmm0\n\t" : : [l] "m" (*l), [inbuf] "m" (*inbuf) : "memory" ); do_aesni_enc (ctx); asm volatile ("pxor %%xmm5, %%xmm0\n\t" "movdqu %%xmm0, %[outbuf]\n\t" : [outbuf] "=m" (*outbuf) : : "memory" ); inbuf += BLOCKSIZE; outbuf += BLOCKSIZE; } c->u_mode.ocb.data_nblocks = n; asm volatile ("movdqu %%xmm5, %[iv]\n\t" "movdqu %%xmm7, %[ctr]\n\t" : [iv] "=m" (*c->u_iv.iv), [ctr] "=m" (*c->u_ctr.ctr) : : "memory" ); asm volatile ("pxor %%xmm0, %%xmm0\n\t" "movdqa %%xmm0, %[tmpbuf0]\n\t" "movdqa %%xmm0, %[tmpbuf1]\n\t" "movdqa %%xmm0, %[tmpbuf2]\n\t" : [tmpbuf0] "=m" (*(tmpbuf + 0 * BLOCKSIZE)), [tmpbuf1] "=m" (*(tmpbuf + 1 * BLOCKSIZE)), [tmpbuf2] "=m" (*(tmpbuf + 2 * BLOCKSIZE)) : : "memory" ); aesni_cleanup (); aesni_cleanup_2_7 (); return 0; } static unsigned int ASM_FUNC_ATTR_NOINLINE aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg, const void *inbuf_arg, size_t nblocks_arg) { RIJNDAEL_context *ctx = (void *)&c->context.c; unsigned char *outbuf = outbuf_arg; const unsigned char *inbuf = inbuf_arg; u64 n = c->u_mode.ocb.data_nblocks; const unsigned char *l; size_t nblocks = nblocks_arg; byte tmpbuf_store[3 * 16 + 15]; byte *tmpbuf; aesni_prepare_2_7_variable; asm volatile ("" : "=r" (tmpbuf) : "0" (tmpbuf_store) : "memory"); tmpbuf = tmpbuf + (-(uintptr_t)tmpbuf & 15); aesni_prepare (); aesni_prepare_2_7 (); if ( !ctx->decryption_prepared ) { do_aesni_prepare_decryption ( ctx ); ctx->decryption_prepared = 1; } /* Preload Offset */ asm volatile ("movdqu %[iv], %%xmm5\n\t" : /* No output */ : [iv] "m" (*c->u_iv.iv) : "memory" ); for ( ;nblocks && n % 4; nblocks-- ) { l = aes_ocb_get_l(c, ++n); /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */ asm volatile ("movdqu %[l], %%xmm1\n\t" "movdqu %[inbuf], %%xmm0\n\t" "pxor %%xmm1, %%xmm5\n\t" "pxor %%xmm5, %%xmm0\n\t" : : [l] "m" (*l), [inbuf] "m" (*inbuf) : "memory" ); do_aesni_dec (ctx); asm volatile ("pxor %%xmm5, %%xmm0\n\t" "movdqu %%xmm0, %[outbuf]\n\t" : [outbuf] "=m" (*outbuf) : : "memory" ); inbuf += BLOCKSIZE; outbuf += BLOCKSIZE; } #ifdef __x86_64__ if (nblocks >= 8) { unsigned char 
last_xor_first_key_store[16 + 15]; unsigned char *lxf_key; aesni_prepare_8_15_variable; asm volatile ("" : "=r" (lxf_key) : "0" (last_xor_first_key_store) : "memory"); lxf_key = lxf_key + (-(uintptr_t)lxf_key & 15); aesni_prepare_8_15(); asm volatile ("movdqu %[l0], %%xmm6\n\t" "movdqa %[last_key], %%xmm0\n\t" "pxor %[first_key], %%xmm5\n\t" "pxor %[first_key], %%xmm0\n\t" "movdqa %%xmm0, %[lxfkey]\n\t" : [lxfkey] "=m" (*lxf_key) : [l0] "m" (*c->u_mode.ocb.L[0]), [last_key] "m" (ctx->keyschdec[ctx->rounds][0][0]), [first_key] "m" (ctx->keyschdec[0][0][0]) : "memory" ); for ( ;nblocks >= 8 ; nblocks -= 8 ) { n += 4; l = aes_ocb_get_l(c, n); asm volatile ("movdqu %[l0l1], %%xmm10\n\t" "movdqu %[l1], %%xmm11\n\t" "movdqu %[l3], %%xmm15\n\t" : : [l0l1] "m" (*c->u_mode.ocb.L0L1), [l1] "m" (*c->u_mode.ocb.L[1]), [l3] "m" (*l) : "memory" ); n += 4; l = aes_ocb_get_l(c, n); /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* P_i = Offset_i xor ENCIPHER(K, C_i xor Offset_i) */ asm volatile ("movdqu %[inbuf0], %%xmm1\n\t" "movdqu %[inbuf1], %%xmm2\n\t" "movdqu %[inbuf2], %%xmm3\n\t" : : [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE)), [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE)), [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE)) : "memory" ); asm volatile ("movdqu %[inbuf3], %%xmm4\n\t" "movdqu %[inbuf4], %%xmm8\n\t" "movdqu %[inbuf5], %%xmm9\n\t" : : [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE)), [inbuf4] "m" (*(inbuf + 4 * BLOCKSIZE)), [inbuf5] "m" (*(inbuf + 5 * BLOCKSIZE)) : "memory" ); asm volatile ("movdqa %[lxfkey], %%xmm0\n\t" "movdqa %%xmm6, %%xmm12\n\t" "pxor %%xmm5, %%xmm12\n\t" "pxor %%xmm12, %%xmm1\n\t" "pxor %%xmm0, %%xmm12\n\t" "movdqa %%xmm10, %%xmm13\n\t" "pxor %%xmm5, %%xmm13\n\t" "pxor %%xmm13, %%xmm2\n\t" "pxor %%xmm0, %%xmm13\n\t" "movdqa %%xmm11, %%xmm14\n\t" "pxor %%xmm5, %%xmm14\n\t" "pxor %%xmm14, %%xmm3\n\t" "pxor %%xmm0, %%xmm14\n\t" "pxor %%xmm11, %%xmm5\n\t" "pxor %%xmm15, %%xmm5\n\t" "pxor %%xmm5, %%xmm4\n\t" "movdqa %%xmm5, %%xmm15\n\t" "pxor %%xmm0, %%xmm15\n\t" "movdqa %%xmm5, %%xmm0\n\t" "pxor %%xmm6, %%xmm0\n\t" "pxor %%xmm0, %%xmm8\n\t" "pxor %[lxfkey], %%xmm0\n\t" "movdqa %%xmm0, %[tmpbuf0]\n\t" "movdqa %%xmm10, %%xmm0\n\t" "pxor %%xmm5, %%xmm0\n\t" "pxor %%xmm0, %%xmm9\n\t" "pxor %[lxfkey], %%xmm0\n" "movdqa %%xmm0, %[tmpbuf1]\n\t" : [tmpbuf0] "=m" (*(tmpbuf + 0 * BLOCKSIZE)), [tmpbuf1] "=m" (*(tmpbuf + 1 * BLOCKSIZE)) : [lxfkey] "m" (*lxf_key) : "memory" ); asm volatile ("movdqu %[inbuf6], %%xmm10\n\t" "movdqa %%xmm11, %%xmm0\n\t" "pxor %%xmm5, %%xmm0\n\t" "pxor %%xmm0, %%xmm10\n\t" "pxor %[lxfkey], %%xmm0\n\t" "movdqa %%xmm0, %[tmpbuf2]\n\t" : [tmpbuf2] "=m" (*(tmpbuf + 2 * BLOCKSIZE)) : [inbuf6] "m" (*(inbuf + 6 * BLOCKSIZE)), [lxfkey] "m" (*lxf_key) : "memory" ); asm volatile ("movdqu %[l7], %%xmm0\n\t" "pxor %%xmm11, %%xmm5\n\t" "pxor %%xmm0, %%xmm5\n\t" "movdqa 0x10(%[key]), %%xmm0\n\t" "movdqu %[inbuf7], %%xmm11\n\t" "pxor %%xmm5, %%xmm11\n\t" : : [l7] "m" (*l), [inbuf7] "m" (*(inbuf + 7 * BLOCKSIZE)), [key] "r" (ctx->keyschdec) : "memory" ); asm volatile ("cmpl $12, %[rounds]\n\t" "aesdec %%xmm0, %%xmm1\n\t" "aesdec %%xmm0, %%xmm2\n\t" "aesdec %%xmm0, %%xmm3\n\t" "aesdec %%xmm0, %%xmm4\n\t" "aesdec %%xmm0, %%xmm8\n\t" "aesdec %%xmm0, %%xmm9\n\t" "aesdec %%xmm0, %%xmm10\n\t" "aesdec %%xmm0, %%xmm11\n\t" "movdqa 0x20(%[key]), %%xmm0\n\t" "aesdec %%xmm0, %%xmm1\n\t" "aesdec %%xmm0, %%xmm2\n\t" "aesdec %%xmm0, %%xmm3\n\t" "aesdec %%xmm0, %%xmm4\n\t" "aesdec %%xmm0, %%xmm8\n\t" "aesdec %%xmm0, %%xmm9\n\t" "aesdec %%xmm0, %%xmm10\n\t" "aesdec %%xmm0, %%xmm11\n\t" "movdqa 0x30(%[key]), 
%%xmm0\n\t" "aesdec %%xmm0, %%xmm1\n\t" "aesdec %%xmm0, %%xmm2\n\t" "aesdec %%xmm0, %%xmm3\n\t" "aesdec %%xmm0, %%xmm4\n\t" "aesdec %%xmm0, %%xmm8\n\t" "aesdec %%xmm0, %%xmm9\n\t" "aesdec %%xmm0, %%xmm10\n\t" "aesdec %%xmm0, %%xmm11\n\t" "movdqa 0x40(%[key]), %%xmm0\n\t" "aesdec %%xmm0, %%xmm1\n\t" "aesdec %%xmm0, %%xmm2\n\t" "aesdec %%xmm0, %%xmm3\n\t" "aesdec %%xmm0, %%xmm4\n\t" "aesdec %%xmm0, %%xmm8\n\t" "aesdec %%xmm0, %%xmm9\n\t" "aesdec %%xmm0, %%xmm10\n\t" "aesdec %%xmm0, %%xmm11\n\t" "movdqa 0x50(%[key]), %%xmm0\n\t" "aesdec %%xmm0, %%xmm1\n\t" "aesdec %%xmm0, %%xmm2\n\t" "aesdec %%xmm0, %%xmm3\n\t" "aesdec %%xmm0, %%xmm4\n\t" "aesdec %%xmm0, %%xmm8\n\t" "aesdec %%xmm0, %%xmm9\n\t" "aesdec %%xmm0, %%xmm10\n\t" "aesdec %%xmm0, %%xmm11\n\t" "movdqa 0x60(%[key]), %%xmm0\n\t" "aesdec %%xmm0, %%xmm1\n\t" "aesdec %%xmm0, %%xmm2\n\t" "aesdec %%xmm0, %%xmm3\n\t" "aesdec %%xmm0, %%xmm4\n\t" "aesdec %%xmm0, %%xmm8\n\t" "aesdec %%xmm0, %%xmm9\n\t" "aesdec %%xmm0, %%xmm10\n\t" "aesdec %%xmm0, %%xmm11\n\t" "movdqa 0x70(%[key]), %%xmm0\n\t" "aesdec %%xmm0, %%xmm1\n\t" "aesdec %%xmm0, %%xmm2\n\t" "aesdec %%xmm0, %%xmm3\n\t" "aesdec %%xmm0, %%xmm4\n\t" "aesdec %%xmm0, %%xmm8\n\t" "aesdec %%xmm0, %%xmm9\n\t" "aesdec %%xmm0, %%xmm10\n\t" "aesdec %%xmm0, %%xmm11\n\t" "movdqa 0x80(%[key]), %%xmm0\n\t" "aesdec %%xmm0, %%xmm1\n\t" "aesdec %%xmm0, %%xmm2\n\t" "aesdec %%xmm0, %%xmm3\n\t" "aesdec %%xmm0, %%xmm4\n\t" "aesdec %%xmm0, %%xmm8\n\t" "aesdec %%xmm0, %%xmm9\n\t" "aesdec %%xmm0, %%xmm10\n\t" "aesdec %%xmm0, %%xmm11\n\t" "movdqa 0x90(%[key]), %%xmm0\n\t" "aesdec %%xmm0, %%xmm1\n\t" "aesdec %%xmm0, %%xmm2\n\t" "aesdec %%xmm0, %%xmm3\n\t" "aesdec %%xmm0, %%xmm4\n\t" "aesdec %%xmm0, %%xmm8\n\t" "aesdec %%xmm0, %%xmm9\n\t" "aesdec %%xmm0, %%xmm10\n\t" "aesdec %%xmm0, %%xmm11\n\t" "jb .Ldeclast%=\n\t" "movdqa 0xa0(%[key]), %%xmm0\n\t" "aesdec %%xmm0, %%xmm1\n\t" "aesdec %%xmm0, %%xmm2\n\t" "aesdec %%xmm0, %%xmm3\n\t" "aesdec %%xmm0, %%xmm4\n\t" "aesdec %%xmm0, %%xmm8\n\t" "aesdec %%xmm0, %%xmm9\n\t" "aesdec %%xmm0, %%xmm10\n\t" "aesdec %%xmm0, %%xmm11\n\t" "movdqa 0xb0(%[key]), %%xmm0\n\t" "aesdec %%xmm0, %%xmm1\n\t" "aesdec %%xmm0, %%xmm2\n\t" "aesdec %%xmm0, %%xmm3\n\t" "aesdec %%xmm0, %%xmm4\n\t" "aesdec %%xmm0, %%xmm8\n\t" "aesdec %%xmm0, %%xmm9\n\t" "aesdec %%xmm0, %%xmm10\n\t" "aesdec %%xmm0, %%xmm11\n\t" "je .Ldeclast%=\n\t" "movdqa 0xc0(%[key]), %%xmm0\n\t" "aesdec %%xmm0, %%xmm1\n\t" "aesdec %%xmm0, %%xmm2\n\t" "aesdec %%xmm0, %%xmm3\n\t" "aesdec %%xmm0, %%xmm4\n\t" "aesdec %%xmm0, %%xmm8\n\t" "aesdec %%xmm0, %%xmm9\n\t" "aesdec %%xmm0, %%xmm10\n\t" "aesdec %%xmm0, %%xmm11\n\t" "movdqa 0xd0(%[key]), %%xmm0\n\t" "aesdec %%xmm0, %%xmm1\n\t" "aesdec %%xmm0, %%xmm2\n\t" "aesdec %%xmm0, %%xmm3\n\t" "aesdec %%xmm0, %%xmm4\n\t" "aesdec %%xmm0, %%xmm8\n\t" "aesdec %%xmm0, %%xmm9\n\t" "aesdec %%xmm0, %%xmm10\n\t" "aesdec %%xmm0, %%xmm11\n\t" ".Ldeclast%=:\n\t" : : [key] "r" (ctx->keyschdec), [rounds] "r" (ctx->rounds) : "cc", "memory"); asm volatile ("aesdeclast %%xmm12, %%xmm1\n\t" "aesdeclast %%xmm13, %%xmm2\n\t" "aesdeclast %%xmm14, %%xmm3\n\t" "aesdeclast %%xmm15, %%xmm4\n\t" "aesdeclast %[tmpbuf0],%%xmm8\n\t" "aesdeclast %[tmpbuf1],%%xmm9\n\t" "aesdeclast %[tmpbuf2],%%xmm10\n\t" - "aesdeclast %%xmm5, %%xmm11\n\t" + : + : [tmpbuf0] "m" (*(tmpbuf + 0 * BLOCKSIZE)), + [tmpbuf1] "m" (*(tmpbuf + 1 * BLOCKSIZE)), + [tmpbuf2] "m" (*(tmpbuf + 2 * BLOCKSIZE)) + : "memory" ); + asm volatile ("aesdeclast %%xmm5, %%xmm11\n\t" "pxor %[lxfkey], %%xmm11\n\t" "movdqu %%xmm1, %[outbuf0]\n\t" "movdqu %%xmm2, 
%[outbuf1]\n\t" - "movdqu %%xmm3, %[outbuf2]\n\t" + : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE)), + [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE)) + : [lxfkey] "m" (*lxf_key) + : "memory" ); + asm volatile ("movdqu %%xmm3, %[outbuf2]\n\t" "movdqu %%xmm4, %[outbuf3]\n\t" "movdqu %%xmm8, %[outbuf4]\n\t" - "movdqu %%xmm9, %[outbuf5]\n\t" + : [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE)), + [outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE)), + [outbuf4] "=m" (*(outbuf + 4 * BLOCKSIZE)) + : + : "memory" ); + asm volatile ("movdqu %%xmm9, %[outbuf5]\n\t" "movdqu %%xmm10, %[outbuf6]\n\t" "movdqu %%xmm11, %[outbuf7]\n\t" - : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE)), - [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE)), - [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE)), - [outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE)), - [outbuf4] "=m" (*(outbuf + 4 * BLOCKSIZE)), - [outbuf5] "=m" (*(outbuf + 5 * BLOCKSIZE)), + : [outbuf5] "=m" (*(outbuf + 5 * BLOCKSIZE)), [outbuf6] "=m" (*(outbuf + 6 * BLOCKSIZE)), [outbuf7] "=m" (*(outbuf + 7 * BLOCKSIZE)) - : [tmpbuf0] "m" (*(tmpbuf + 0 * BLOCKSIZE)), - [tmpbuf1] "m" (*(tmpbuf + 1 * BLOCKSIZE)), - [tmpbuf2] "m" (*(tmpbuf + 2 * BLOCKSIZE)), - [lxfkey] "m" (*lxf_key) + : : "memory" ); outbuf += 8*BLOCKSIZE; inbuf += 8*BLOCKSIZE; } asm volatile ("pxor %[first_key], %%xmm5\n\t" "pxor %%xmm0, %%xmm0\n\t" "movdqu %%xmm0, %[lxfkey]\n\t" : [lxfkey] "=m" (*lxf_key) : [first_key] "m" (ctx->keyschdec[0][0][0]) : "memory" ); aesni_cleanup_8_15(); } #endif for ( ;nblocks >= 4 ; nblocks -= 4 ) { n += 4; l = aes_ocb_get_l(c, n); /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* C_i = Offset_i xor DECIPHER(K, P_i xor Offset_i) */ asm volatile ("movdqu %[l0], %%xmm0\n\t" "movdqu %[inbuf0], %%xmm1\n\t" "movdqu %[l0l1], %%xmm3\n\t" : : [l0] "m" (*c->u_mode.ocb.L[0]), [l0l1] "m" (*c->u_mode.ocb.L0L1), [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE)) : "memory" ); asm volatile ("movdqu %[l1], %%xmm4\n\t" "movdqu %[l3], %%xmm6\n\t" "pxor %%xmm5, %%xmm0\n\t" "pxor %%xmm0, %%xmm1\n\t" "movdqa %%xmm0, %[tmpbuf0]\n\t" : [tmpbuf0] "=m" (*(tmpbuf + 0 * BLOCKSIZE)) : [l1] "m" (*c->u_mode.ocb.L[1]), [l3] "m" (*l) : "memory" ); asm volatile ("movdqu %[inbuf1], %%xmm2\n\t" "pxor %%xmm5, %%xmm3\n\t" "pxor %%xmm3, %%xmm2\n\t" "movdqa %%xmm3, %[tmpbuf1]\n\t" : [tmpbuf1] "=m" (*(tmpbuf + 1 * BLOCKSIZE)) : [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE)) : "memory" ); asm volatile ("movdqa %%xmm4, %%xmm0\n\t" "movdqu %[inbuf2], %%xmm3\n\t" "pxor %%xmm5, %%xmm0\n\t" "pxor %%xmm0, %%xmm3\n\t" "movdqa %%xmm0, %[tmpbuf2]\n\t" : [tmpbuf2] "=m" (*(tmpbuf + 2 * BLOCKSIZE)) : [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE)) : "memory" ); asm volatile ("pxor %%xmm6, %%xmm5\n\t" "pxor %%xmm4, %%xmm5\n\t" "movdqu %[inbuf3], %%xmm4\n\t" "pxor %%xmm5, %%xmm4\n\t" : : [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE)) : "memory" ); do_aesni_dec_vec4 (ctx); asm volatile ("pxor %[tmpbuf0],%%xmm1\n\t" "movdqu %%xmm1, %[outbuf0]\n\t" "pxor %[tmpbuf1],%%xmm2\n\t" "movdqu %%xmm2, %[outbuf1]\n\t" - "pxor %[tmpbuf2],%%xmm3\n\t" + : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE)), + [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE)) + : [tmpbuf0] "m" (*(tmpbuf + 0 * BLOCKSIZE)), + [tmpbuf1] "m" (*(tmpbuf + 1 * BLOCKSIZE)) + : "memory" ); + asm volatile ("pxor %[tmpbuf2],%%xmm3\n\t" "movdqu %%xmm3, %[outbuf2]\n\t" "pxor %%xmm5, %%xmm4\n\t" "movdqu %%xmm4, %[outbuf3]\n\t" - : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE)), - [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE)), - [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE)), + : [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE)), [outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE)) - : 
[tmpbuf0] "m" (*(tmpbuf + 0 * BLOCKSIZE)), - [tmpbuf1] "m" (*(tmpbuf + 1 * BLOCKSIZE)), - [tmpbuf2] "m" (*(tmpbuf + 2 * BLOCKSIZE)) + : [tmpbuf2] "m" (*(tmpbuf + 2 * BLOCKSIZE)) : "memory" ); outbuf += 4*BLOCKSIZE; inbuf += 4*BLOCKSIZE; } for ( ;nblocks; nblocks-- ) { l = aes_ocb_get_l(c, ++n); /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */ /* Checksum_i = Checksum_{i-1} xor P_i */ asm volatile ("movdqu %[l], %%xmm1\n\t" "movdqu %[inbuf], %%xmm0\n\t" "pxor %%xmm1, %%xmm5\n\t" "pxor %%xmm5, %%xmm0\n\t" : : [l] "m" (*l), [inbuf] "m" (*inbuf) : "memory" ); do_aesni_dec (ctx); asm volatile ("pxor %%xmm5, %%xmm0\n\t" "movdqu %%xmm0, %[outbuf]\n\t" : [outbuf] "=m" (*outbuf) : : "memory" ); inbuf += BLOCKSIZE; outbuf += BLOCKSIZE; } c->u_mode.ocb.data_nblocks = n; asm volatile ("movdqu %%xmm5, %[iv]\n\t" : [iv] "=m" (*c->u_iv.iv) : : "memory" ); asm volatile ("pxor %%xmm0, %%xmm0\n\t" "movdqa %%xmm0, %[tmpbuf0]\n\t" "movdqa %%xmm0, %[tmpbuf1]\n\t" "movdqa %%xmm0, %[tmpbuf2]\n\t" : [tmpbuf0] "=m" (*(tmpbuf + 0 * BLOCKSIZE)), [tmpbuf1] "=m" (*(tmpbuf + 1 * BLOCKSIZE)), [tmpbuf2] "=m" (*(tmpbuf + 2 * BLOCKSIZE)) : : "memory" ); aesni_ocb_checksum (c, outbuf_arg, nblocks_arg); aesni_cleanup (); aesni_cleanup_2_7 (); return 0; } size_t ASM_FUNC_ATTR _gcry_aes_aesni_ocb_crypt(gcry_cipher_hd_t c, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int encrypt) { if (encrypt) return aesni_ocb_enc(c, outbuf_arg, inbuf_arg, nblocks); else return aesni_ocb_dec(c, outbuf_arg, inbuf_arg, nblocks); } size_t ASM_FUNC_ATTR _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks) { RIJNDAEL_context *ctx = (void *)&c->context.c; const unsigned char *abuf = abuf_arg; u64 n = c->u_mode.ocb.aad_nblocks; const unsigned char *l; aesni_prepare_2_7_variable; aesni_prepare (); aesni_prepare_2_7 (); /* Preload Offset and Sum */ asm volatile ("movdqu %[iv], %%xmm5\n\t" "movdqu %[ctr], %%xmm6\n\t" : /* No output */ : [iv] "m" (*c->u_mode.ocb.aad_offset), [ctr] "m" (*c->u_mode.ocb.aad_sum) : "memory" ); for ( ;nblocks && n % 4; nblocks-- ) { l = aes_ocb_get_l(c, ++n); /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ asm volatile ("movdqu %[l], %%xmm1\n\t" "movdqu %[abuf], %%xmm0\n\t" "pxor %%xmm1, %%xmm5\n\t" "pxor %%xmm5, %%xmm0\n\t" : : [l] "m" (*l), [abuf] "m" (*abuf) : "memory" ); do_aesni_enc (ctx); asm volatile ("pxor %%xmm0, %%xmm6\n\t" : : : "memory" ); abuf += BLOCKSIZE; } #ifdef __x86_64__ if (nblocks >= 8) { aesni_prepare_8_15_variable; aesni_prepare_8_15(); asm volatile ("movdqu %[l0], %%xmm7\n\t" "movdqu %[l0l1], %%xmm12\n\t" "movdqu %[l1], %%xmm13\n\t" : : [l0] "m" (*c->u_mode.ocb.L[0]), [l0l1] "m" (*c->u_mode.ocb.L0L1), [l1] "m" (*c->u_mode.ocb.L[1]) : "memory" ); for ( ;nblocks >= 8 ; nblocks -= 8 ) { n += 4; l = aes_ocb_get_l(c, n); asm volatile ("movdqu %[l3], %%xmm0\n\t" "pxor %%xmm13, %%xmm0\n\t" : : [l3] "m" (*l) : "memory" ); n += 4; l = aes_ocb_get_l(c, n); asm volatile ("movdqu %[l7], %%xmm14\n\t" "pxor %%xmm13, %%xmm14\n\t" : : [l7] "m" (*l) : "memory" ); /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ asm volatile ("movdqu %[abuf0], %%xmm1\n\t" "movdqu %[abuf1], %%xmm2\n\t" "movdqu %[abuf2], %%xmm3\n\t" "movdqu %[abuf3], %%xmm4\n\t" - "movdqu %[abuf4], %%xmm8\n\t" - "movdqu %[abuf5], %%xmm9\n\t" - "movdqu %[abuf6], %%xmm10\n\t" - "movdqu %[abuf7], %%xmm11\n\t" : : [abuf0] "m" (*(abuf + 0 * BLOCKSIZE)), [abuf1] 
"m" (*(abuf + 1 * BLOCKSIZE)), [abuf2] "m" (*(abuf + 2 * BLOCKSIZE)), - [abuf3] "m" (*(abuf + 3 * BLOCKSIZE)), - [abuf4] "m" (*(abuf + 4 * BLOCKSIZE)), + [abuf3] "m" (*(abuf + 3 * BLOCKSIZE)) + : "memory" ); + asm volatile ("movdqu %[abuf4], %%xmm8\n\t" + "movdqu %[abuf5], %%xmm9\n\t" + "movdqu %[abuf6], %%xmm10\n\t" + "movdqu %[abuf7], %%xmm11\n\t" + : + : [abuf4] "m" (*(abuf + 4 * BLOCKSIZE)), [abuf5] "m" (*(abuf + 5 * BLOCKSIZE)), [abuf6] "m" (*(abuf + 6 * BLOCKSIZE)), [abuf7] "m" (*(abuf + 7 * BLOCKSIZE)) : "memory" ); asm volatile ("pxor %%xmm7, %%xmm1\n\t" "pxor %%xmm5, %%xmm1\n\t" "pxor %%xmm12, %%xmm2\n\t" "pxor %%xmm5, %%xmm2\n\t" "pxor %%xmm13, %%xmm3\n\t" "pxor %%xmm5, %%xmm3\n\t" "pxor %%xmm0, %%xmm5\n\t" "movdqa (%[key]), %%xmm0\n\t" "pxor %%xmm5, %%xmm4\n\t" "pxor %%xmm0, %%xmm1\n\t" /* xmm1 ^= key[0] */ "pxor %%xmm0, %%xmm2\n\t" /* xmm2 ^= key[0] */ "pxor %%xmm0, %%xmm3\n\t" /* xmm3 ^= key[0] */ "pxor %%xmm0, %%xmm4\n\t" /* xmm4 ^= key[0] */ "pxor %%xmm7, %%xmm8\n\t" "pxor %%xmm5, %%xmm8\n\t" "pxor %%xmm12, %%xmm9\n\t" "pxor %%xmm5, %%xmm9\n\t" "pxor %%xmm13, %%xmm10\n\t" "pxor %%xmm5, %%xmm10\n\t" "pxor %%xmm14, %%xmm5\n\t" "pxor %%xmm5, %%xmm11\n\t" "pxor %%xmm0, %%xmm8\n\t" /* xmm8 ^= key[0] */ "pxor %%xmm0, %%xmm9\n\t" /* xmm9 ^= key[0] */ "pxor %%xmm0, %%xmm10\n\t" /* xmm10 ^= key[0] */ "pxor %%xmm0, %%xmm11\n\t" /* xmm11 ^= key[0] */ : : [key] "r" (ctx->keyschenc) : "memory" ); do_aesni_enc_vec8 (ctx); asm volatile ( "aesenclast %%xmm0, %%xmm1\n\t" "aesenclast %%xmm0, %%xmm2\n\t" "aesenclast %%xmm0, %%xmm3\n\t" "aesenclast %%xmm0, %%xmm4\n\t" "aesenclast %%xmm0, %%xmm8\n\t" "aesenclast %%xmm0, %%xmm9\n\t" "aesenclast %%xmm0, %%xmm10\n\t" "aesenclast %%xmm0, %%xmm11\n\t" "pxor %%xmm2, %%xmm1\n\t" "pxor %%xmm3, %%xmm1\n\t" "pxor %%xmm4, %%xmm1\n\t" "pxor %%xmm8, %%xmm1\n\t" "pxor %%xmm9, %%xmm6\n\t" "pxor %%xmm10, %%xmm6\n\t" "pxor %%xmm11, %%xmm6\n\t" "pxor %%xmm1, %%xmm6\n\t" : : : "memory" ); abuf += 8*BLOCKSIZE; } aesni_cleanup_8_15(); } #endif for ( ;nblocks >= 4 ; nblocks -= 4 ) { n += 4; l = aes_ocb_get_l(c, n); /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ asm volatile ("movdqu %[l0], %%xmm0\n\t" "movdqu %[abuf0], %%xmm1\n\t" "movdqu %[l0l1], %%xmm3\n\t" : : [l0] "m" (*c->u_mode.ocb.L[0]), [l0l1] "m" (*c->u_mode.ocb.L0L1), [abuf0] "m" (*(abuf + 0 * BLOCKSIZE)) : "memory" ); asm volatile ("movdqu %[l1], %%xmm4\n\t" "movdqu %[l3], %%xmm7\n\t" "pxor %%xmm5, %%xmm0\n\t" "pxor %%xmm0, %%xmm1\n\t" : : [l1] "m" (*c->u_mode.ocb.L[1]), [l3] "m" (*l) : "memory" ); asm volatile ("movdqu %[abuf1], %%xmm2\n\t" "pxor %%xmm5, %%xmm3\n\t" "pxor %%xmm3, %%xmm2\n\t" : : [abuf1] "m" (*(abuf + 1 * BLOCKSIZE)) : "memory" ); asm volatile ("movdqa %%xmm4, %%xmm0\n\t" "movdqu %[abuf2], %%xmm3\n\t" "pxor %%xmm5, %%xmm0\n\t" "pxor %%xmm0, %%xmm3\n\t" : : [abuf2] "m" (*(abuf + 2 * BLOCKSIZE)) : "memory" ); asm volatile ("pxor %%xmm7, %%xmm5\n\t" "pxor %%xmm4, %%xmm5\n\t" "movdqu %[abuf3], %%xmm4\n\t" "pxor %%xmm5, %%xmm4\n\t" : : [abuf3] "m" (*(abuf + 3 * BLOCKSIZE)) : "memory" ); do_aesni_enc_vec4 (ctx); asm volatile ("pxor %%xmm1, %%xmm6\n\t" "pxor %%xmm2, %%xmm6\n\t" "pxor %%xmm3, %%xmm6\n\t" "pxor %%xmm4, %%xmm6\n\t" : : : "memory" ); abuf += 4*BLOCKSIZE; } for ( ;nblocks; nblocks-- ) { l = aes_ocb_get_l(c, ++n); /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ asm volatile ("movdqu %[l], %%xmm1\n\t" "movdqu %[abuf], %%xmm0\n\t" "pxor %%xmm1, %%xmm5\n\t" "pxor %%xmm5, 
%%xmm0\n\t" : : [l] "m" (*l), [abuf] "m" (*abuf) : "memory" ); do_aesni_enc (ctx); asm volatile ("pxor %%xmm0, %%xmm6\n\t" : : : "memory" ); abuf += BLOCKSIZE; } c->u_mode.ocb.aad_nblocks = n; asm volatile ("movdqu %%xmm5, %[iv]\n\t" "movdqu %%xmm6, %[ctr]\n\t" : [iv] "=m" (*c->u_mode.ocb.aad_offset), [ctr] "=m" (*c->u_mode.ocb.aad_sum) : : "memory" ); aesni_cleanup (); aesni_cleanup_2_7 (); return 0; } static const u64 xts_gfmul_const[16] __attribute__ ((aligned (16))) = { 0x87, 0x01 }; static void ASM_FUNC_ATTR _gcry_aes_aesni_xts_enc (RIJNDAEL_context *ctx, unsigned char *tweak, unsigned char *outbuf, const unsigned char *inbuf, size_t nblocks) { aesni_prepare_2_7_variable; aesni_prepare (); aesni_prepare_2_7 (); /* Preload Tweak */ asm volatile ("movdqu %[tweak], %%xmm5\n\t" "movdqa %[gfmul], %%xmm6\n\t" : : [tweak] "m" (*tweak), [gfmul] "m" (*xts_gfmul_const) : "memory" ); for ( ;nblocks >= 4; nblocks -= 4 ) { asm volatile ("pshufd $0x13, %%xmm5, %%xmm4\n\t" "movdqu %[inbuf0], %%xmm1\n\t" "pxor %%xmm5, %%xmm1\n\t" "movdqu %%xmm5, %[outbuf0]\n\t" "movdqa %%xmm4, %%xmm0\n\t" "paddd %%xmm4, %%xmm4\n\t" "psrad $31, %%xmm0\n\t" "paddq %%xmm5, %%xmm5\n\t" "pand %%xmm6, %%xmm0\n\t" "pxor %%xmm0, %%xmm5\n\t" : [outbuf0] "=m" (*(outbuf + 0 * 16)) : [inbuf0] "m" (*(inbuf + 0 * 16)) : "memory" ); asm volatile ("movdqu %[inbuf1], %%xmm2\n\t" "pxor %%xmm5, %%xmm2\n\t" "movdqu %%xmm5, %[outbuf1]\n\t" "movdqa %%xmm4, %%xmm0\n\t" "paddd %%xmm4, %%xmm4\n\t" "psrad $31, %%xmm0\n\t" "paddq %%xmm5, %%xmm5\n\t" "pand %%xmm6, %%xmm0\n\t" "pxor %%xmm0, %%xmm5\n\t" : [outbuf1] "=m" (*(outbuf + 1 * 16)) : [inbuf1] "m" (*(inbuf + 1 * 16)) : "memory" ); asm volatile ("movdqu %[inbuf2], %%xmm3\n\t" "pxor %%xmm5, %%xmm3\n\t" "movdqu %%xmm5, %[outbuf2]\n\t" "movdqa %%xmm4, %%xmm0\n\t" "paddd %%xmm4, %%xmm4\n\t" "psrad $31, %%xmm0\n\t" "paddq %%xmm5, %%xmm5\n\t" "pand %%xmm6, %%xmm0\n\t" "pxor %%xmm0, %%xmm5\n\t" : [outbuf2] "=m" (*(outbuf + 2 * 16)) : [inbuf2] "m" (*(inbuf + 2 * 16)) : "memory" ); asm volatile ("movdqa %%xmm4, %%xmm0\n\t" "movdqu %[inbuf3], %%xmm4\n\t" "pxor %%xmm5, %%xmm4\n\t" "movdqu %%xmm5, %[outbuf3]\n\t" "psrad $31, %%xmm0\n\t" "paddq %%xmm5, %%xmm5\n\t" "pand %%xmm6, %%xmm0\n\t" "pxor %%xmm0, %%xmm5\n\t" : [outbuf3] "=m" (*(outbuf + 3 * 16)) : [inbuf3] "m" (*(inbuf + 3 * 16)) : "memory" ); do_aesni_enc_vec4 (ctx); asm volatile ("movdqu %[outbuf0], %%xmm0\n\t" "pxor %%xmm0, %%xmm1\n\t" "movdqu %[outbuf1], %%xmm0\n\t" "movdqu %%xmm1, %[outbuf0]\n\t" "movdqu %[outbuf2], %%xmm1\n\t" "pxor %%xmm0, %%xmm2\n\t" "movdqu %[outbuf3], %%xmm0\n\t" "pxor %%xmm1, %%xmm3\n\t" "pxor %%xmm0, %%xmm4\n\t" "movdqu %%xmm2, %[outbuf1]\n\t" "movdqu %%xmm3, %[outbuf2]\n\t" "movdqu %%xmm4, %[outbuf3]\n\t" : [outbuf0] "+m" (*(outbuf + 0 * 16)), [outbuf1] "+m" (*(outbuf + 1 * 16)), [outbuf2] "+m" (*(outbuf + 2 * 16)), [outbuf3] "+m" (*(outbuf + 3 * 16)) : : "memory" ); outbuf += BLOCKSIZE * 4; inbuf += BLOCKSIZE * 4; } for ( ;nblocks; nblocks-- ) { asm volatile ("movdqu %[inbuf], %%xmm0\n\t" "pxor %%xmm5, %%xmm0\n\t" "movdqa %%xmm5, %%xmm4\n\t" "pshufd $0x13, %%xmm5, %%xmm1\n\t" "psrad $31, %%xmm1\n\t" "paddq %%xmm5, %%xmm5\n\t" "pand %%xmm6, %%xmm1\n\t" "pxor %%xmm1, %%xmm5\n\t" : : [inbuf] "m" (*inbuf) : "memory" ); do_aesni_enc (ctx); asm volatile ("pxor %%xmm4, %%xmm0\n\t" "movdqu %%xmm0, %[outbuf]\n\t" : [outbuf] "=m" (*outbuf) : : "memory" ); outbuf += BLOCKSIZE; inbuf += BLOCKSIZE; } asm volatile ("movdqu %%xmm5, %[tweak]\n\t" : [tweak] "=m" (*tweak) : : "memory" ); aesni_cleanup (); aesni_cleanup_2_7 (); } 
static void ASM_FUNC_ATTR _gcry_aes_aesni_xts_dec (RIJNDAEL_context *ctx, unsigned char *tweak, unsigned char *outbuf, const unsigned char *inbuf, size_t nblocks) { aesni_prepare_2_7_variable; aesni_prepare (); aesni_prepare_2_7 (); if ( !ctx->decryption_prepared ) { do_aesni_prepare_decryption ( ctx ); ctx->decryption_prepared = 1; } /* Preload Tweak */ asm volatile ("movdqu %[tweak], %%xmm5\n\t" "movdqa %[gfmul], %%xmm6\n\t" : : [tweak] "m" (*tweak), [gfmul] "m" (*xts_gfmul_const) : "memory" ); for ( ;nblocks >= 4; nblocks -= 4 ) { asm volatile ("pshufd $0x13, %%xmm5, %%xmm4\n\t" "movdqu %[inbuf0], %%xmm1\n\t" "pxor %%xmm5, %%xmm1\n\t" "movdqu %%xmm5, %[outbuf0]\n\t" "movdqa %%xmm4, %%xmm0\n\t" "paddd %%xmm4, %%xmm4\n\t" "psrad $31, %%xmm0\n\t" "paddq %%xmm5, %%xmm5\n\t" "pand %%xmm6, %%xmm0\n\t" "pxor %%xmm0, %%xmm5\n\t" : [outbuf0] "=m" (*(outbuf + 0 * 16)) : [inbuf0] "m" (*(inbuf + 0 * 16)) : "memory" ); asm volatile ("movdqu %[inbuf1], %%xmm2\n\t" "pxor %%xmm5, %%xmm2\n\t" "movdqu %%xmm5, %[outbuf1]\n\t" "movdqa %%xmm4, %%xmm0\n\t" "paddd %%xmm4, %%xmm4\n\t" "psrad $31, %%xmm0\n\t" "paddq %%xmm5, %%xmm5\n\t" "pand %%xmm6, %%xmm0\n\t" "pxor %%xmm0, %%xmm5\n\t" : [outbuf1] "=m" (*(outbuf + 1 * 16)) : [inbuf1] "m" (*(inbuf + 1 * 16)) : "memory" ); asm volatile ("movdqu %[inbuf2], %%xmm3\n\t" "pxor %%xmm5, %%xmm3\n\t" "movdqu %%xmm5, %[outbuf2]\n\t" "movdqa %%xmm4, %%xmm0\n\t" "paddd %%xmm4, %%xmm4\n\t" "psrad $31, %%xmm0\n\t" "paddq %%xmm5, %%xmm5\n\t" "pand %%xmm6, %%xmm0\n\t" "pxor %%xmm0, %%xmm5\n\t" : [outbuf2] "=m" (*(outbuf + 2 * 16)) : [inbuf2] "m" (*(inbuf + 2 * 16)) : "memory" ); asm volatile ("movdqa %%xmm4, %%xmm0\n\t" "movdqu %[inbuf3], %%xmm4\n\t" "pxor %%xmm5, %%xmm4\n\t" "movdqu %%xmm5, %[outbuf3]\n\t" "psrad $31, %%xmm0\n\t" "paddq %%xmm5, %%xmm5\n\t" "pand %%xmm6, %%xmm0\n\t" "pxor %%xmm0, %%xmm5\n\t" : [outbuf3] "=m" (*(outbuf + 3 * 16)) : [inbuf3] "m" (*(inbuf + 3 * 16)) : "memory" ); do_aesni_dec_vec4 (ctx); asm volatile ("movdqu %[outbuf0], %%xmm0\n\t" "pxor %%xmm0, %%xmm1\n\t" "movdqu %[outbuf1], %%xmm0\n\t" "movdqu %%xmm1, %[outbuf0]\n\t" "movdqu %[outbuf2], %%xmm1\n\t" "pxor %%xmm0, %%xmm2\n\t" "movdqu %[outbuf3], %%xmm0\n\t" "pxor %%xmm1, %%xmm3\n\t" "pxor %%xmm0, %%xmm4\n\t" "movdqu %%xmm2, %[outbuf1]\n\t" "movdqu %%xmm3, %[outbuf2]\n\t" "movdqu %%xmm4, %[outbuf3]\n\t" : [outbuf0] "+m" (*(outbuf + 0 * 16)), [outbuf1] "+m" (*(outbuf + 1 * 16)), [outbuf2] "+m" (*(outbuf + 2 * 16)), [outbuf3] "+m" (*(outbuf + 3 * 16)) : : "memory" ); outbuf += BLOCKSIZE * 4; inbuf += BLOCKSIZE * 4; } for ( ;nblocks; nblocks-- ) { asm volatile ("movdqu %[inbuf], %%xmm0\n\t" "pxor %%xmm5, %%xmm0\n\t" "movdqa %%xmm5, %%xmm4\n\t" "pshufd $0x13, %%xmm5, %%xmm1\n\t" "psrad $31, %%xmm1\n\t" "paddq %%xmm5, %%xmm5\n\t" "pand %%xmm6, %%xmm1\n\t" "pxor %%xmm1, %%xmm5\n\t" : : [inbuf] "m" (*inbuf) : "memory" ); do_aesni_dec (ctx); asm volatile ("pxor %%xmm4, %%xmm0\n\t" "movdqu %%xmm0, %[outbuf]\n\t" : [outbuf] "=m" (*outbuf) : : "memory" ); outbuf += BLOCKSIZE; inbuf += BLOCKSIZE; } asm volatile ("movdqu %%xmm5, %[tweak]\n\t" : [tweak] "=m" (*tweak) : : "memory" ); aesni_cleanup (); aesni_cleanup_2_7 (); } void ASM_FUNC_ATTR _gcry_aes_aesni_xts_crypt (RIJNDAEL_context *ctx, unsigned char *tweak, unsigned char *outbuf, const unsigned char *inbuf, size_t nblocks, int encrypt) { if (encrypt) _gcry_aes_aesni_xts_enc(ctx, tweak, outbuf, inbuf, nblocks); else _gcry_aes_aesni_xts_dec(ctx, tweak, outbuf, inbuf, nblocks); } #if __clang__ # pragma clang attribute pop #endif #endif /* USE_AESNI */ diff 
--git a/cipher/sha512-ssse3-i386.c b/cipher/sha512-ssse3-i386.c index 4b12cee4..0fc98d8e 100644 --- a/cipher/sha512-ssse3-i386.c +++ b/cipher/sha512-ssse3-i386.c @@ -1,400 +1,404 @@ /* sha512-ssse3-i386.c - i386/SSSE3 implementation of SHA-512 transform * Copyright (C) 2019 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . */ /* * SHA512 Message Expansion (I2 and W2 macros) based on implementation * from file "sha512-ssse3-amd64.s": ************************************************************************ * Copyright (c) 2012, Intel Corporation * * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the * distribution. * * * Neither the name of the Intel Corporation nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * * THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ************************************************************************ */ #include #if defined(__i386__) && SIZEOF_UNSIGNED_LONG == 4 && __GNUC__ >= 4 && \ defined(HAVE_GCC_INLINE_ASM_SSSE3) && defined(USE_SHA512) #include "bufhelp.h" #if _GCRY_GCC_VERSION >= 40400 /* 4.4 */ /* Prevent compiler from issuing SSE/MMX instructions between asm blocks. 
*/ # pragma GCC target("no-sse") # pragma GCC target("no-mmx") #endif #if __clang__ # pragma clang attribute push (__attribute__((target("no-sse"))), apply_to = function) # pragma clang attribute push (__attribute__((target("no-mmx"))), apply_to = function) #endif #define ALWAYS_INLINE inline __attribute__((always_inline)) #define NO_INLINE __attribute__((noinline)) #define NO_INSTRUMENT_FUNCTION __attribute__((no_instrument_function)) #define ASM_FUNC_ATTR NO_INSTRUMENT_FUNCTION #define ASM_FUNC_ATTR_INLINE ASM_FUNC_ATTR ALWAYS_INLINE #define ASM_FUNC_ATTR_NOINLINE ASM_FUNC_ATTR NO_INLINE static const u64 K[80] __attribute__ ((aligned (16))) = { U64_C(0x428a2f98d728ae22), U64_C(0x7137449123ef65cd), U64_C(0xb5c0fbcfec4d3b2f), U64_C(0xe9b5dba58189dbbc), U64_C(0x3956c25bf348b538), U64_C(0x59f111f1b605d019), U64_C(0x923f82a4af194f9b), U64_C(0xab1c5ed5da6d8118), U64_C(0xd807aa98a3030242), U64_C(0x12835b0145706fbe), U64_C(0x243185be4ee4b28c), U64_C(0x550c7dc3d5ffb4e2), U64_C(0x72be5d74f27b896f), U64_C(0x80deb1fe3b1696b1), U64_C(0x9bdc06a725c71235), U64_C(0xc19bf174cf692694), U64_C(0xe49b69c19ef14ad2), U64_C(0xefbe4786384f25e3), U64_C(0x0fc19dc68b8cd5b5), U64_C(0x240ca1cc77ac9c65), U64_C(0x2de92c6f592b0275), U64_C(0x4a7484aa6ea6e483), U64_C(0x5cb0a9dcbd41fbd4), U64_C(0x76f988da831153b5), U64_C(0x983e5152ee66dfab), U64_C(0xa831c66d2db43210), U64_C(0xb00327c898fb213f), U64_C(0xbf597fc7beef0ee4), U64_C(0xc6e00bf33da88fc2), U64_C(0xd5a79147930aa725), U64_C(0x06ca6351e003826f), U64_C(0x142929670a0e6e70), U64_C(0x27b70a8546d22ffc), U64_C(0x2e1b21385c26c926), U64_C(0x4d2c6dfc5ac42aed), U64_C(0x53380d139d95b3df), U64_C(0x650a73548baf63de), U64_C(0x766a0abb3c77b2a8), U64_C(0x81c2c92e47edaee6), U64_C(0x92722c851482353b), U64_C(0xa2bfe8a14cf10364), U64_C(0xa81a664bbc423001), U64_C(0xc24b8b70d0f89791), U64_C(0xc76c51a30654be30), U64_C(0xd192e819d6ef5218), U64_C(0xd69906245565a910), U64_C(0xf40e35855771202a), U64_C(0x106aa07032bbd1b8), U64_C(0x19a4c116b8d2d0c8), U64_C(0x1e376c085141ab53), U64_C(0x2748774cdf8eeb99), U64_C(0x34b0bcb5e19b48a8), U64_C(0x391c0cb3c5c95a63), U64_C(0x4ed8aa4ae3418acb), U64_C(0x5b9cca4f7763e373), U64_C(0x682e6ff3d6b2b8a3), U64_C(0x748f82ee5defb2fc), U64_C(0x78a5636f43172f60), U64_C(0x84c87814a1f0ab72), U64_C(0x8cc702081a6439ec), U64_C(0x90befffa23631e28), U64_C(0xa4506cebde82bde9), U64_C(0xbef9a3f7b2c67915), U64_C(0xc67178f2e372532b), U64_C(0xca273eceea26619c), U64_C(0xd186b8c721c0c207), U64_C(0xeada7dd6cde0eb1e), U64_C(0xf57d4f7fee6ed178), U64_C(0x06f067aa72176fba), U64_C(0x0a637dc5a2c898a6), U64_C(0x113f9804bef90dae), U64_C(0x1b710b35131c471b), U64_C(0x28db77f523047d84), U64_C(0x32caab7b40c72493), U64_C(0x3c9ebe0a15c9bebc), U64_C(0x431d67c49c100d4c), U64_C(0x4cc5d4becb3e42b6), U64_C(0x597f299cfc657e2a), U64_C(0x5fcb6fab3ad6faec), U64_C(0x6c44198c4a475817) }; static const unsigned char bshuf_mask[16] __attribute__ ((aligned (16))) = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 }; /* SHA2 round */ #define RA "%%mm0" #define RB "%%mm1" #define RC "%%mm2" #define RD "%%mm3" #define RE "%%mm4" #define RF "%%mm5" #define RG "%%mm6" #define RH "%%mm7" #define Rx(a,b,c,d,e,f,g,h,wk) \ asm volatile (/* Cho + Sum1 */ \ "movq2dq "a", %%xmm2;\n\t" \ "movq "e", "a";\n\t" \ "movq2dq "c", %%xmm3;\n\t" \ "movq "e", "c";\n\t" \ "movq2dq "b", %%xmm4;\n\t" \ "movq "e", "b";\n\t" \ "psrlq $(41-18), "c";\n\t" \ "pandn "g", "a";\n\t" \ "pxor "e", "c";\n\t" \ "pand "f", "b";\n\t" \ "psrlq $(18-14), "c";\n\t" \ "paddq "a", "h";\n\t" \ wk(a) \ "pxor "e", "c";\n\t" \ "paddq "b", "h";\n\t" \ "psrlq 
$(14), "c";\n\t" \ "movq "e", "b";\n\t" \ "psllq $(50-46), "b";\n\t" \ "paddq "a", "h";\n\t" \ "movdq2q %%xmm2, "a";\n\t" \ "pxor "e", "b";\n\t" \ "psllq $(46-23), "b";\n\t" \ "pxor "e", "b";\n\t" \ "psllq $(23), "b";\n\t" \ "pxor "b", "c";\n\t" \ "movdq2q %%xmm4, "b";\n\t" \ "paddq "c", "h";\n\t" \ "movdq2q %%xmm3, "c";\n\t" \ \ /* Maj + Sum0 */ \ "movq2dq "e", %%xmm2;\n\t" \ "movq "a", "e";\n\t" \ "movq2dq "g", %%xmm3;\n\t" \ "movq "a", "g";\n\t" \ "movq2dq "f", %%xmm4;\n\t" \ "movq "a", "f";\n\t" \ "psrlq $(39-34), "g";\n\t" \ "pxor "b", "e";\n\t" \ "pxor "a", "g";\n\t" \ "pand "b", "f";\n\t" \ "psrlq $(34-28), "g";\n\t" \ "pand "c", "e";\n\t" \ "pxor "a", "g";\n\t" \ "paddq "h", "d";\n\t" \ "paddq "f", "h";\n\t" \ "movdq2q %%xmm4, "f";\n\t" \ "psrlq $28, "g";\n\t" \ "paddq "e", "h";\n\t" \ "movq "a", "e";\n\t" \ "psllq $(36-30), "e";\n\t" \ "pxor "a", "e";\n\t" \ "psllq $(30-25), "e";\n\t" \ "pxor "a", "e";\n\t" \ "psllq $(25), "e";\n\t" \ "pxor "e", "g";\n\t" \ "movdq2q %%xmm2, "e";\n\t" \ "paddq "g", "h";\n\t" \ "movdq2q %%xmm3, "g";\n\t" \ \ : \ : \ : "memory" ) #define WK0(tmp) "movdq2q %%xmm0, "tmp";\n\t" \ "pshufd $0xee, %%xmm0, %%xmm0;\n\t" #define WK1(tmp) "movdq2q %%xmm0, "tmp";\n\t" /* Message expansion */ #define I2(i) \ asm volatile ("movdqu %[inbuf], %%xmm0;\n\t" \ "pshufb %%xmm6, %%xmm0;\n\t" \ "movdqu %%xmm0, %[w];\n\t" \ "paddq %[k], %%xmm0;\n\t" \ : \ : [k] "m" (K[i]), \ [w] "m" (w[i]), \ [inbuf] "m" (data[(i)*8]) \ : "memory" ) #define W2(i) \ asm volatile ("movdqu %[w_t_m_2], %%xmm2;\n\t" \ "movdqa %%xmm2, %%xmm0;\n\t" \ "movdqu %[w_t_m_15], %%xmm5;\n\t" \ - "movdqa %%xmm5, %%xmm3;\n\t" \ + : \ + : [w_t_m_2] "m" (w[(i)-2]), \ + [w_t_m_15] "m" (w[(i)-15]) \ + : "memory" ); \ + asm volatile ("movdqa %%xmm5, %%xmm3;\n\t" \ "psrlq $(61-19), %%xmm0;\n\t" \ "psrlq $(8-7), %%xmm3;\n\t" \ "pxor %%xmm2, %%xmm0;\n\t" \ "pxor %%xmm5, %%xmm3;\n\t" \ "psrlq $(19-6), %%xmm0;\n\t" \ "psrlq $(7-1), %%xmm3;\n\t" \ "pxor %%xmm2, %%xmm0;\n\t" \ "pxor %%xmm5, %%xmm3;\n\t" \ "psrlq $6, %%xmm0;\n\t" \ "psrlq $1, %%xmm3;\n\t" \ "movdqa %%xmm2, %%xmm1;\n\t" \ "movdqa %%xmm5, %%xmm4;\n\t" \ "psllq $(61-19), %%xmm1;\n\t" \ "psllq $(8-1), %%xmm4;\n\t" \ "pxor %%xmm2, %%xmm1;\n\t" \ "pxor %%xmm5, %%xmm4;\n\t" \ "psllq $(64-61), %%xmm1;\n\t" \ "psllq $(64-8), %%xmm4;\n\t" \ "pxor %%xmm1, %%xmm0;\n\t" \ "movdqu %[w_t_m_16], %%xmm2;\n\t" \ "pxor %%xmm4, %%xmm3;\n\t" \ "movdqu %[w_t_m_7], %%xmm1;\n\t" \ - "paddq %%xmm3, %%xmm0;\n\t" \ + : \ + : [w_t_m_7] "m" (w[(i)-7]), \ + [w_t_m_16] "m" (w[(i)-16]) \ + : "memory" ); \ + asm volatile ("paddq %%xmm3, %%xmm0;\n\t" \ "paddq %%xmm2, %%xmm0;\n\t" \ "paddq %%xmm1, %%xmm0;\n\t" \ "movdqu %%xmm0, %[w_t_m_0];\n\t" \ "paddq %[k], %%xmm0;\n\t" \ : [w_t_m_0] "=m" (w[(i)-0]) \ - : [k] "m" (K[i]), \ - [w_t_m_2] "m" (w[(i)-2]), \ - [w_t_m_7] "m" (w[(i)-7]), \ - [w_t_m_15] "m" (w[(i)-15]), \ - [w_t_m_16] "m" (w[(i)-16]) \ + : [k] "m" (K[i]) \ : "memory" ) unsigned int ASM_FUNC_ATTR _gcry_sha512_transform_i386_ssse3(u64 state[8], const unsigned char *data, size_t nblks) { unsigned int t; u64 w[80]; /* Load state to MMX registers. 
*/ asm volatile ("movq 8*0(%[state]), "RA";\n\t" "movq 8*1(%[state]), "RB";\n\t" "movq 8*2(%[state]), "RC";\n\t" "movq 8*3(%[state]), "RD";\n\t" "movq 8*4(%[state]), "RE";\n\t" "movq 8*5(%[state]), "RF";\n\t" "movq 8*6(%[state]), "RG";\n\t" "movq 8*7(%[state]), "RH";\n\t" : : [state] "r" (state) : "memory" ); asm volatile ("movdqa %[bshuf_mask], %%xmm6;\n\t" : : [bshuf_mask] "m" (*bshuf_mask) : "memory" ); while (nblks) { I2(0); Rx(RA, RB, RC, RD, RE, RF, RG, RH, WK0); Rx(RH, RA, RB, RC, RD, RE, RF, RG, WK1); I2(2); Rx(RG, RH, RA, RB, RC, RD, RE, RF, WK0); Rx(RF, RG, RH, RA, RB, RC, RD, RE, WK1); I2(4); Rx(RE, RF, RG, RH, RA, RB, RC, RD, WK0); Rx(RD, RE, RF, RG, RH, RA, RB, RC, WK1); I2(6); Rx(RC, RD, RE, RF, RG, RH, RA, RB, WK0); Rx(RB, RC, RD, RE, RF, RG, RH, RA, WK1); I2(8); Rx(RA, RB, RC, RD, RE, RF, RG, RH, WK0); Rx(RH, RA, RB, RC, RD, RE, RF, RG, WK1); I2(10); Rx(RG, RH, RA, RB, RC, RD, RE, RF, WK0); Rx(RF, RG, RH, RA, RB, RC, RD, RE, WK1); I2(12); Rx(RE, RF, RG, RH, RA, RB, RC, RD, WK0); Rx(RD, RE, RF, RG, RH, RA, RB, RC, WK1); I2(14); Rx(RC, RD, RE, RF, RG, RH, RA, RB, WK0); Rx(RB, RC, RD, RE, RF, RG, RH, RA, WK1); data += 128; for (t = 16; t < 80; t += 16) { W2(t + 0); Rx(RA, RB, RC, RD, RE, RF, RG, RH, WK0); Rx(RH, RA, RB, RC, RD, RE, RF, RG, WK1); W2(t + 2); Rx(RG, RH, RA, RB, RC, RD, RE, RF, WK0); Rx(RF, RG, RH, RA, RB, RC, RD, RE, WK1); W2(t + 4); Rx(RE, RF, RG, RH, RA, RB, RC, RD, WK0); Rx(RD, RE, RF, RG, RH, RA, RB, RC, WK1); W2(t + 6); Rx(RC, RD, RE, RF, RG, RH, RA, RB, WK0); Rx(RB, RC, RD, RE, RF, RG, RH, RA, WK1); W2(t + 8); Rx(RA, RB, RC, RD, RE, RF, RG, RH, WK0); Rx(RH, RA, RB, RC, RD, RE, RF, RG, WK1); W2(t + 10); Rx(RG, RH, RA, RB, RC, RD, RE, RF, WK0); Rx(RF, RG, RH, RA, RB, RC, RD, RE, WK1); W2(t + 12); Rx(RE, RF, RG, RH, RA, RB, RC, RD, WK0); Rx(RD, RE, RF, RG, RH, RA, RB, RC, WK1); W2(t + 14); Rx(RC, RD, RE, RF, RG, RH, RA, RB, WK0); Rx(RB, RC, RD, RE, RF, RG, RH, RA, WK1); } asm volatile ("paddq 8*0(%[state]), "RA";\n\t" "paddq 8*1(%[state]), "RB";\n\t" "paddq 8*2(%[state]), "RC";\n\t" "paddq 8*3(%[state]), "RD";\n\t" "paddq 8*4(%[state]), "RE";\n\t" "paddq 8*5(%[state]), "RF";\n\t" "paddq 8*6(%[state]), "RG";\n\t" "paddq 8*7(%[state]), "RH";\n\t" "movq "RA", 8*0(%[state]);\n\t" "movq "RB", 8*1(%[state]);\n\t" "movq "RC", 8*2(%[state]);\n\t" "movq "RD", 8*3(%[state]);\n\t" "movq "RE", 8*4(%[state]);\n\t" "movq "RF", 8*5(%[state]);\n\t" "movq "RG", 8*6(%[state]);\n\t" "movq "RH", 8*7(%[state]);\n\t" : : [state] "r" (state) : "memory" ); nblks--; } /* Clear registers */ asm volatile ("pxor %%xmm0, %%xmm0;\n\t" "pxor %%xmm1, %%xmm1;\n\t" "pxor %%xmm2, %%xmm2;\n\t" "pxor %%xmm3, %%xmm3;\n\t" "pxor %%xmm4, %%xmm4;\n\t" "pxor %%xmm5, %%xmm5;\n\t" "pxor %%xmm6, %%xmm6;\n\t" "pxor %%mm0, %%mm0;\n\t" "pxor %%mm1, %%mm1;\n\t" "pxor %%mm2, %%mm2;\n\t" "pxor %%mm3, %%mm3;\n\t" "pxor %%mm4, %%mm4;\n\t" "pxor %%mm5, %%mm5;\n\t" "pxor %%mm6, %%mm6;\n\t" "pxor %%mm7, %%mm7;\n\t" "emms;\n\t" : : : "memory" ); return sizeof(w); } #if __clang__ # pragma clang attribute pop # pragma clang attribute pop #endif #endif