diff --git a/cipher/cipher-internal.h b/cipher/cipher-internal.h index 79de140d..5ece774e 100644 --- a/cipher/cipher-internal.h +++ b/cipher/cipher-internal.h @@ -1,764 +1,763 @@ /* cipher-internal.h - Internal defs for cipher.c * Copyright (C) 2011 Free Software Foundation, Inc. * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser general Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . */ #ifndef G10_CIPHER_INTERNAL_H #define G10_CIPHER_INTERNAL_H #include "./poly1305-internal.h" /* The maximum supported size of a block in bytes. */ #define MAX_BLOCKSIZE 16 /* The length for an OCB block. Although OCB supports any block length it does not make sense to use a 64 bit blocklen (and cipher) because this reduces the security margin to an unacceptable state. Thus we require a cipher with 128 bit blocklength. */ #define OCB_BLOCK_LEN (128/8) /* The size of the pre-computed L table for OCB. This takes the same size as the table used for GCM and thus we don't save anything by not using such a table. */ #define OCB_L_TABLE_SIZE 16 /* Check the above constants. */ #if OCB_BLOCK_LEN > MAX_BLOCKSIZE # error OCB_BLOCKLEN > MAX_BLOCKSIZE #endif /* Magic values for the context structure. */ #define CTX_MAGIC_NORMAL 0x24091964 #define CTX_MAGIC_SECURE 0x46919042 /* Try to use 16 byte aligned cipher context for better performance. We use the aligned attribute, thus it is only possible to implement this with gcc. */ #undef NEED_16BYTE_ALIGNED_CONTEXT #ifdef HAVE_GCC_ATTRIBUTE_ALIGNED # define NEED_16BYTE_ALIGNED_CONTEXT 1 #endif /* Undef this symbol to trade GCM speed for 256 bytes of memory per context */ #define GCM_USE_TABLES 1 /* GCM_USE_INTEL_PCLMUL indicates whether to compile GCM with Intel PCLMUL code. */ #undef GCM_USE_INTEL_PCLMUL #if defined(ENABLE_PCLMUL_SUPPORT) && defined(GCM_USE_TABLES) # if ((defined(__i386__) && SIZEOF_UNSIGNED_LONG == 4) || defined(__x86_64__)) # if __GNUC__ >= 4 # define GCM_USE_INTEL_PCLMUL 1 # endif # endif #endif /* GCM_USE_INTEL_PCLMUL */ /* GCM_USE_ARM_PMULL indicates whether to compile GCM with ARMv8 PMULL code. */ #undef GCM_USE_ARM_PMULL #if defined(ENABLE_ARM_CRYPTO_SUPPORT) && defined(GCM_USE_TABLES) # if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) \ && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) \ && defined(HAVE_GCC_INLINE_ASM_AARCH32_CRYPTO) # define GCM_USE_ARM_PMULL 1 # elif defined(__AARCH64EL__) && \ defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \ defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO) # define GCM_USE_ARM_PMULL 1 # endif #endif /* GCM_USE_ARM_PMULL */ typedef unsigned int (*ghash_fn_t) (gcry_cipher_hd_t c, byte *result, const byte *buf, size_t nblocks); /* A VIA processor with the Padlock engine as well as the Intel AES_NI instructions require an alignment of most data on a 16 byte boundary. Because we trick out the compiler while allocating the context, the align attribute as used in rijndael.c does not work on its own. Thus we need to make sure that the entire context structure is a aligned on that boundary. We achieve this by defining a new type and use that instead of our usual alignment type. */ typedef union { PROPERLY_ALIGNED_TYPE foo; #ifdef NEED_16BYTE_ALIGNED_CONTEXT char bar[16] __attribute__ ((aligned (16))); #endif char c[1]; } cipher_context_alignment_t; /* Storage structure for CMAC, for CMAC and EAX modes. */ typedef struct { /* The initialization vector. Also contains tag after finalization. */ union { cipher_context_alignment_t iv_align; unsigned char iv[MAX_BLOCKSIZE]; } u_iv; /* Subkeys for tag creation, not cleared by gcry_cipher_reset. */ unsigned char subkeys[2][MAX_BLOCKSIZE]; /* Space to save partial input lengths for MAC. */ unsigned char macbuf[MAX_BLOCKSIZE]; int mac_unused; /* Number of unprocessed bytes in MACBUF. */ unsigned int tag:1; /* Set to 1 if tag has been finalized. */ } gcry_cmac_context_t; /* The handle structure. */ struct gcry_cipher_handle { int magic; size_t actual_handle_size; /* Allocated size of this handle. */ size_t handle_offset; /* Offset to the malloced block. */ gcry_cipher_spec_t *spec; /* The algorithm id. This is a hack required because the module interface does not easily allow to retrieve this value. */ int algo; /* A structure with function pointers for mode operations. */ struct { gcry_err_code_t (*encrypt)(gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen, const unsigned char *inbuf, size_t inbuflen); gcry_err_code_t (*decrypt)(gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen, const unsigned char *inbuf, size_t inbuflen); gcry_err_code_t (*setiv)(gcry_cipher_hd_t c, const unsigned char *iv, size_t ivlen); gcry_err_code_t (*authenticate)(gcry_cipher_hd_t c, const unsigned char *abuf, size_t abuflen); gcry_err_code_t (*get_tag)(gcry_cipher_hd_t c, unsigned char *outtag, size_t taglen); gcry_err_code_t (*check_tag)(gcry_cipher_hd_t c, const unsigned char *intag, size_t taglen); } mode_ops; /* A structure with function pointers for bulk operations. Due to limitations of the module system (we don't want to change the API) we need to keep these function pointers here. The cipher open function initializes them and the actual encryption routines use them if they are not NULL. */ struct { void (*cfb_enc)(void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks); void (*cfb_dec)(void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks); void (*cbc_enc)(void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int cbc_mac); void (*cbc_dec)(void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks); void (*ctr_enc)(void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks); size_t (*ocb_crypt)(gcry_cipher_hd_t c, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int encrypt); size_t (*ocb_auth)(gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks); void (*xts_crypt)(void *context, unsigned char *tweak, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int encrypt); } bulk; int mode; unsigned int flags; struct { unsigned int key:1; /* Set to 1 if a key has been set. */ unsigned int iv:1; /* Set to 1 if a IV has been set. */ unsigned int tag:1; /* Set to 1 if a tag is finalized. */ unsigned int finalize:1; /* Next encrypt/decrypt has the final data. */ } marks; /* The initialization vector. For best performance we make sure that it is properly aligned. In particular some implementations of bulk operations expect an 16 byte aligned IV. IV is also used to store CBC-MAC in CCM mode; counter IV is stored in U_CTR. For OCB mode it is used for the offset value. */ union { cipher_context_alignment_t iv_align; unsigned char iv[MAX_BLOCKSIZE]; } u_iv; /* The counter for CTR mode. This field is also used by AESWRAP and thus we can't use the U_IV union. For OCB mode it is used for the checksum. */ union { cipher_context_alignment_t iv_align; unsigned char ctr[MAX_BLOCKSIZE]; } u_ctr; /* Space to save an IV or CTR for chaining operations. */ unsigned char lastiv[MAX_BLOCKSIZE]; int unused; /* Number of unused bytes in LASTIV. */ union { /* Mode specific storage for CCM mode. */ struct { u64 encryptlen; u64 aadlen; unsigned int authlen; /* Space to save partial input lengths for MAC. */ unsigned char macbuf[GCRY_CCM_BLOCK_LEN]; int mac_unused; /* Number of unprocessed bytes in MACBUF. */ unsigned char s0[GCRY_CCM_BLOCK_LEN]; unsigned int nonce:1; /* Set to 1 if nonce has been set. */ unsigned int lengths:1; /* Set to 1 if CCM length parameters has been processed. */ } ccm; /* Mode specific storage for Poly1305 mode. */ struct { /* byte counter for AAD. */ u32 aadcount[2]; /* byte counter for data. */ u32 datacount[2]; unsigned int aad_finalized:1; unsigned int bytecount_over_limits:1; poly1305_context_t ctx; } poly1305; /* Mode specific storage for CMAC mode. */ gcry_cmac_context_t cmac; /* Mode specific storage for EAX mode. */ struct { /* CMAC for header (AAD). */ gcry_cmac_context_t cmac_header; /* CMAC for ciphertext. */ gcry_cmac_context_t cmac_ciphertext; } eax; /* Mode specific storage for GCM mode. */ struct { /* The interim tag for GCM mode. */ union { cipher_context_alignment_t iv_align; unsigned char tag[MAX_BLOCKSIZE]; } u_tag; /* Space to save partial input lengths for MAC. */ unsigned char macbuf[GCRY_CCM_BLOCK_LEN]; int mac_unused; /* Number of unprocessed bytes in MACBUF. */ /* byte counters for GCM */ u32 aadlen[2]; u32 datalen[2]; /* encrypted tag counter */ unsigned char tagiv[MAX_BLOCKSIZE]; unsigned int ghash_data_finalized:1; unsigned int ghash_aad_finalized:1; unsigned int datalen_over_limits:1; unsigned int disallow_encryption_because_of_setiv_in_fips_mode:1; /* --- Following members are not cleared in gcry_cipher_reset --- */ /* GHASH multiplier from key. */ union { cipher_context_alignment_t iv_align; unsigned char key[MAX_BLOCKSIZE]; } u_ghash_key; /* GHASH implementation in use. */ ghash_fn_t ghash_fn; /* Pre-calculated table for GCM. */ #ifdef GCM_USE_TABLES #if (SIZEOF_UNSIGNED_LONG == 8 || defined(__x86_64__)) #define GCM_TABLES_USE_U64 1 u64 gcm_table[2 * 16]; #else #undef GCM_TABLES_USE_U64 u32 gcm_table[4 * 16]; #endif #endif } gcm; /* Mode specific storage for OCB mode. */ struct { /* --- Following members are not cleared in gcry_cipher_reset --- */ /* Helper variables and pre-computed table of L values. */ unsigned char L_star[OCB_BLOCK_LEN]; unsigned char L_dollar[OCB_BLOCK_LEN]; unsigned char L0L1[OCB_BLOCK_LEN]; - unsigned char L0L1L0[OCB_BLOCK_LEN]; unsigned char L[OCB_L_TABLE_SIZE][OCB_BLOCK_LEN]; /* --- Following members are cleared in gcry_cipher_reset --- */ /* The tag is valid if marks.tag has been set. */ unsigned char tag[OCB_BLOCK_LEN]; /* A buffer to hold the offset for the AAD processing. */ unsigned char aad_offset[OCB_BLOCK_LEN]; /* A buffer to hold the current sum of AAD processing. We can't use tag here because tag may already hold the preprocessed checksum of the data. */ unsigned char aad_sum[OCB_BLOCK_LEN]; /* A buffer to store AAD data not yet processed. */ unsigned char aad_leftover[OCB_BLOCK_LEN]; /* Number of data/aad blocks processed so far. */ u64 data_nblocks; u64 aad_nblocks; /* Number of valid bytes in AAD_LEFTOVER. */ unsigned char aad_nleftover; /* Length of the tag. Fixed for now but may eventually be specified using a set of gcry_cipher_flags. */ unsigned char taglen; /* Flags indicating that the final data/aad block has been processed. */ unsigned int data_finalized:1; unsigned int aad_finalized:1; } ocb; /* Mode specific storage for XTS mode. */ struct { /* Pointer to tweak cipher context, allocated after actual * cipher context. */ char *tweak_context; } xts; } u_mode; /* What follows are two contexts of the cipher in use. The first one needs to be aligned well enough for the cipher operation whereas the second one is a copy created by cipher_setkey and used by cipher_reset. That second copy has no need for proper aligment because it is only accessed by memcpy. */ cipher_context_alignment_t context; }; /*-- cipher-cbc.c --*/ gcry_err_code_t _gcry_cipher_cbc_encrypt /* */ (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen, const unsigned char *inbuf, size_t inbuflen); gcry_err_code_t _gcry_cipher_cbc_decrypt /* */ (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen, const unsigned char *inbuf, size_t inbuflen); gcry_err_code_t _gcry_cipher_cbc_cts_encrypt /* */ (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen, const unsigned char *inbuf, size_t inbuflen); gcry_err_code_t _gcry_cipher_cbc_cts_decrypt /* */ (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen, const unsigned char *inbuf, size_t inbuflen); /*-- cipher-cfb.c --*/ gcry_err_code_t _gcry_cipher_cfb_encrypt /* */ (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen, const unsigned char *inbuf, size_t inbuflen); gcry_err_code_t _gcry_cipher_cfb_decrypt /* */ (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen, const unsigned char *inbuf, size_t inbuflen); gcry_err_code_t _gcry_cipher_cfb8_encrypt /* */ (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen, const unsigned char *inbuf, size_t inbuflen); gcry_err_code_t _gcry_cipher_cfb8_decrypt /* */ (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen, const unsigned char *inbuf, size_t inbuflen); /*-- cipher-ofb.c --*/ gcry_err_code_t _gcry_cipher_ofb_encrypt /* */ (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen, const unsigned char *inbuf, size_t inbuflen); /*-- cipher-ctr.c --*/ gcry_err_code_t _gcry_cipher_ctr_encrypt /* */ (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen, const unsigned char *inbuf, size_t inbuflen); /*-- cipher-aeswrap.c --*/ gcry_err_code_t _gcry_cipher_aeswrap_encrypt /* */ (gcry_cipher_hd_t c, byte *outbuf, size_t outbuflen, const byte *inbuf, size_t inbuflen); gcry_err_code_t _gcry_cipher_aeswrap_decrypt /* */ (gcry_cipher_hd_t c, byte *outbuf, size_t outbuflen, const byte *inbuf, size_t inbuflen); /*-- cipher-ccm.c --*/ gcry_err_code_t _gcry_cipher_ccm_encrypt /* */ (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen, const unsigned char *inbuf, size_t inbuflen); gcry_err_code_t _gcry_cipher_ccm_decrypt /* */ (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen, const unsigned char *inbuf, size_t inbuflen); gcry_err_code_t _gcry_cipher_ccm_set_nonce /* */ (gcry_cipher_hd_t c, const unsigned char *nonce, size_t noncelen); gcry_err_code_t _gcry_cipher_ccm_authenticate /* */ (gcry_cipher_hd_t c, const unsigned char *abuf, size_t abuflen); gcry_err_code_t _gcry_cipher_ccm_set_lengths /* */ (gcry_cipher_hd_t c, u64 encryptedlen, u64 aadlen, u64 taglen); gcry_err_code_t _gcry_cipher_ccm_get_tag /* */ (gcry_cipher_hd_t c, unsigned char *outtag, size_t taglen); gcry_err_code_t _gcry_cipher_ccm_check_tag /* */ (gcry_cipher_hd_t c, const unsigned char *intag, size_t taglen); /*-- cipher-cmac.c --*/ gcry_err_code_t _gcry_cmac_generate_subkeys /* */ (gcry_cipher_hd_t c, gcry_cmac_context_t *ctx); gcry_err_code_t _gcry_cmac_write /* */ (gcry_cipher_hd_t c, gcry_cmac_context_t *ctx, const byte * inbuf, size_t inlen); gcry_err_code_t _gcry_cmac_final /* */ (gcry_cipher_hd_t c, gcry_cmac_context_t *ctx); void _gcry_cmac_reset (gcry_cmac_context_t *ctx); /*-- cipher-eax.c --*/ gcry_err_code_t _gcry_cipher_eax_encrypt /* */ (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen, const unsigned char *inbuf, size_t inbuflen); gcry_err_code_t _gcry_cipher_eax_decrypt /* */ (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen, const unsigned char *inbuf, size_t inbuflen); gcry_err_code_t _gcry_cipher_eax_set_nonce /* */ (gcry_cipher_hd_t c, const unsigned char *nonce, size_t noncelen); gcry_err_code_t _gcry_cipher_eax_authenticate /* */ (gcry_cipher_hd_t c, const unsigned char *aadbuf, size_t aadbuflen); gcry_err_code_t _gcry_cipher_eax_get_tag /* */ (gcry_cipher_hd_t c, unsigned char *outtag, size_t taglen); gcry_err_code_t _gcry_cipher_eax_check_tag /* */ (gcry_cipher_hd_t c, const unsigned char *intag, size_t taglen); gcry_err_code_t _gcry_cipher_eax_setkey /* */ (gcry_cipher_hd_t c); /*-- cipher-gcm.c --*/ gcry_err_code_t _gcry_cipher_gcm_encrypt /* */ (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen, const unsigned char *inbuf, size_t inbuflen); gcry_err_code_t _gcry_cipher_gcm_decrypt /* */ (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen, const unsigned char *inbuf, size_t inbuflen); gcry_err_code_t _gcry_cipher_gcm_setiv /* */ (gcry_cipher_hd_t c, const unsigned char *iv, size_t ivlen); gcry_err_code_t _gcry_cipher_gcm_authenticate /* */ (gcry_cipher_hd_t c, const unsigned char *aadbuf, size_t aadbuflen); gcry_err_code_t _gcry_cipher_gcm_get_tag /* */ (gcry_cipher_hd_t c, unsigned char *outtag, size_t taglen); gcry_err_code_t _gcry_cipher_gcm_check_tag /* */ (gcry_cipher_hd_t c, const unsigned char *intag, size_t taglen); void _gcry_cipher_gcm_setkey /* */ (gcry_cipher_hd_t c); /*-- cipher-poly1305.c --*/ gcry_err_code_t _gcry_cipher_poly1305_encrypt /* */ (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen, const unsigned char *inbuf, size_t inbuflen); gcry_err_code_t _gcry_cipher_poly1305_decrypt /* */ (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen, const unsigned char *inbuf, size_t inbuflen); gcry_err_code_t _gcry_cipher_poly1305_setiv /* */ (gcry_cipher_hd_t c, const unsigned char *iv, size_t ivlen); gcry_err_code_t _gcry_cipher_poly1305_authenticate /* */ (gcry_cipher_hd_t c, const unsigned char *aadbuf, size_t aadbuflen); gcry_err_code_t _gcry_cipher_poly1305_get_tag /* */ (gcry_cipher_hd_t c, unsigned char *outtag, size_t taglen); gcry_err_code_t _gcry_cipher_poly1305_check_tag /* */ (gcry_cipher_hd_t c, const unsigned char *intag, size_t taglen); void _gcry_cipher_poly1305_setkey /* */ (gcry_cipher_hd_t c); /*-- chacha20.c --*/ gcry_err_code_t _gcry_chacha20_poly1305_encrypt /* */ (gcry_cipher_hd_t c, byte *outbuf, const byte *inbuf, size_t length); gcry_err_code_t _gcry_chacha20_poly1305_decrypt /* */ (gcry_cipher_hd_t c, byte *outbuf, const byte *inbuf, size_t length); /*-- cipher-ocb.c --*/ gcry_err_code_t _gcry_cipher_ocb_encrypt /* */ (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen, const unsigned char *inbuf, size_t inbuflen); gcry_err_code_t _gcry_cipher_ocb_decrypt /* */ (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen, const unsigned char *inbuf, size_t inbuflen); gcry_err_code_t _gcry_cipher_ocb_set_nonce /* */ (gcry_cipher_hd_t c, const unsigned char *nonce, size_t noncelen); gcry_err_code_t _gcry_cipher_ocb_authenticate /* */ (gcry_cipher_hd_t c, const unsigned char *abuf, size_t abuflen); gcry_err_code_t _gcry_cipher_ocb_get_tag /* */ (gcry_cipher_hd_t c, unsigned char *outtag, size_t taglen); gcry_err_code_t _gcry_cipher_ocb_check_tag /* */ (gcry_cipher_hd_t c, const unsigned char *intag, size_t taglen); void _gcry_cipher_ocb_setkey /* */ (gcry_cipher_hd_t c); /*-- cipher-xts.c --*/ gcry_err_code_t _gcry_cipher_xts_encrypt /* */ (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen, const unsigned char *inbuf, size_t inbuflen); gcry_err_code_t _gcry_cipher_xts_decrypt /* */ (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen, const unsigned char *inbuf, size_t inbuflen); /* Return the L-value for block N. Note: 'cipher_ocb.c' ensures that N * will never be multiple of 65536 (1 << OCB_L_TABLE_SIZE), thus N can * be directly passed to _gcry_ctz() function and resulting index will * never overflow the table. */ static inline const unsigned char * ocb_get_l (gcry_cipher_hd_t c, u64 n) { unsigned long ntz; #if ((defined(__i386__) || defined(__x86_64__)) && __GNUC__ >= 4) /* Assumes that N != 0. */ asm ("rep;bsfl %k[low], %k[ntz]\n\t" : [ntz] "=r" (ntz) : [low] "r" ((unsigned long)n) : "cc"); #else ntz = _gcry_ctz (n); #endif return c->u_mode.ocb.L[ntz]; } /* Return bit-shift of blocksize. */ static inline unsigned int _gcry_blocksize_shift(gcry_cipher_hd_t c) { /* Only blocksizes 8 and 16 are used. Return value in such way * that compiler can optimize calling functions based on this. */ return c->spec->blocksize == 8 ? 3 : 4; } /* Optimized function for cipher block copying */ static inline void cipher_block_cpy(void *_dst, const void *_src, size_t blocksize) { byte *dst = _dst; const byte *src = _src; u64 s[2]; if (blocksize == 8) { buf_put_he64(dst + 0, buf_get_he64(src + 0)); } else /* blocksize == 16 */ { s[0] = buf_get_he64(src + 0); s[1] = buf_get_he64(src + 8); buf_put_he64(dst + 0, s[0]); buf_put_he64(dst + 8, s[1]); } } /* Optimized function for cipher block xoring */ static inline void cipher_block_xor(void *_dst, const void *_src1, const void *_src2, size_t blocksize) { byte *dst = _dst; const byte *src1 = _src1; const byte *src2 = _src2; u64 s1[2]; u64 s2[2]; if (blocksize == 8) { buf_put_he64(dst + 0, buf_get_he64(src1 + 0) ^ buf_get_he64(src2 + 0)); } else /* blocksize == 16 */ { s1[0] = buf_get_he64(src1 + 0); s1[1] = buf_get_he64(src1 + 8); s2[0] = buf_get_he64(src2 + 0); s2[1] = buf_get_he64(src2 + 8); buf_put_he64(dst + 0, s1[0] ^ s2[0]); buf_put_he64(dst + 8, s1[1] ^ s2[1]); } } /* Optimized function for in-place cipher block xoring */ static inline void cipher_block_xor_1(void *_dst, const void *_src, size_t blocksize) { cipher_block_xor (_dst, _dst, _src, blocksize); } /* Optimized function for cipher block xoring with two destination cipher blocks. Used mainly by CFB mode encryption. */ static inline void cipher_block_xor_2dst(void *_dst1, void *_dst2, const void *_src, size_t blocksize) { byte *dst1 = _dst1; byte *dst2 = _dst2; const byte *src = _src; u64 d2[2]; u64 s[2]; if (blocksize == 8) { d2[0] = buf_get_he64(dst2 + 0) ^ buf_get_he64(src + 0); buf_put_he64(dst2 + 0, d2[0]); buf_put_he64(dst1 + 0, d2[0]); } else /* blocksize == 16 */ { s[0] = buf_get_he64(src + 0); s[1] = buf_get_he64(src + 8); d2[0] = buf_get_he64(dst2 + 0); d2[1] = buf_get_he64(dst2 + 8); d2[0] = d2[0] ^ s[0]; d2[1] = d2[1] ^ s[1]; buf_put_he64(dst2 + 0, d2[0]); buf_put_he64(dst2 + 8, d2[1]); buf_put_he64(dst1 + 0, d2[0]); buf_put_he64(dst1 + 8, d2[1]); } } /* Optimized function for combined cipher block xoring and copying. Used by mainly CBC mode decryption. */ static inline void cipher_block_xor_n_copy_2(void *_dst_xor, const void *_src_xor, void *_srcdst_cpy, const void *_src_cpy, size_t blocksize) { byte *dst_xor = _dst_xor; byte *srcdst_cpy = _srcdst_cpy; const byte *src_xor = _src_xor; const byte *src_cpy = _src_cpy; u64 sc[2]; u64 sx[2]; u64 sdc[2]; if (blocksize == 8) { sc[0] = buf_get_he64(src_cpy + 0); buf_put_he64(dst_xor + 0, buf_get_he64(srcdst_cpy + 0) ^ buf_get_he64(src_xor + 0)); buf_put_he64(srcdst_cpy + 0, sc[0]); } else /* blocksize == 16 */ { sc[0] = buf_get_he64(src_cpy + 0); sc[1] = buf_get_he64(src_cpy + 8); sx[0] = buf_get_he64(src_xor + 0); sx[1] = buf_get_he64(src_xor + 8); sdc[0] = buf_get_he64(srcdst_cpy + 0); sdc[1] = buf_get_he64(srcdst_cpy + 8); sx[0] ^= sdc[0]; sx[1] ^= sdc[1]; buf_put_he64(dst_xor + 0, sx[0]); buf_put_he64(dst_xor + 8, sx[1]); buf_put_he64(srcdst_cpy + 0, sc[0]); buf_put_he64(srcdst_cpy + 8, sc[1]); } } /* Optimized function for combined cipher block xoring and copying. Used by mainly CFB mode decryption. */ static inline void cipher_block_xor_n_copy(void *_dst_xor, void *_srcdst_cpy, const void *_src, size_t blocksize) { cipher_block_xor_n_copy_2(_dst_xor, _src, _srcdst_cpy, _src, blocksize); } #endif /*G10_CIPHER_INTERNAL_H*/ diff --git a/cipher/cipher-ocb.c b/cipher/cipher-ocb.c index be6b8dff..308b0495 100644 --- a/cipher/cipher-ocb.c +++ b/cipher/cipher-ocb.c @@ -1,752 +1,750 @@ /* cipher-ocb.c - OCB cipher mode * Copyright (C) 2015, 2016 g10 Code GmbH * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser general Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . * * * OCB is covered by several patents but may be used freely by most * software. See http://web.cs.ucdavis.edu/~rogaway/ocb/license.htm . * In particular license 1 is suitable for Libgcrypt: See * http://web.cs.ucdavis.edu/~rogaway/ocb/license1.pdf for the full * license document; it basically says: * * License 1 — License for Open-Source Software Implementations of OCB * (Jan 9, 2013) * * Under this license, you are authorized to make, use, and * distribute open-source software implementations of OCB. This * license terminates for you if you sue someone over their * open-source software implementation of OCB claiming that you have * a patent covering their implementation. */ #include #include #include #include #include #include "g10lib.h" #include "cipher.h" #include "bufhelp.h" #include "./cipher-internal.h" /* Double the OCB_BLOCK_LEN sized block B in-place. */ static inline void double_block (unsigned char *b) { #if OCB_BLOCK_LEN != 16 unsigned char b_0 = b[0]; int i; for (i=0; i < OCB_BLOCK_LEN - 1; i++) b[i] = (b[i] << 1) | (b[i+1] >> 7); b[OCB_BLOCK_LEN-1] = (b[OCB_BLOCK_LEN-1] << 1) ^ ((b_0 >> 7) * 135); #else /* This is the generic code for 16 byte blocks. However it is not faster than the straight byte by byte implementation. */ u64 l_0, l, r; l = buf_get_be64 (b); r = buf_get_be64 (b + 8); l_0 = -(l >> 63); l = (l + l) ^ (r >> 63); r = (r + r) ^ (l_0 & 135); buf_put_be64 (b, l); buf_put_be64 (b+8, r); #endif } /* Double the OCB_BLOCK_LEN sized block S and store it at D. S and D may point to the same memory location but they may not overlap. */ static void double_block_cpy (unsigned char *d, const unsigned char *s) { if (d != s) cipher_block_cpy (d, s, OCB_BLOCK_LEN); double_block (d); } /* Copy NBYTES from buffer S starting at bit offset BITOFF to buffer D. */ static void bit_copy (unsigned char *d, const unsigned char *s, unsigned int bitoff, unsigned int nbytes) { unsigned int shift; s += bitoff / 8; shift = bitoff % 8; if (shift) { for (; nbytes; nbytes--, d++, s++) *d = (s[0] << shift) | (s[1] >> (8 - shift)); } else { for (; nbytes; nbytes--, d++, s++) *d = *s; } } /* Get L_big value for block N, where N is multiple of 65536. */ static void ocb_get_L_big (gcry_cipher_hd_t c, u64 n, unsigned char *l_buf) { int ntz = _gcry_ctz64 (n); gcry_assert(ntz >= OCB_L_TABLE_SIZE); double_block_cpy (l_buf, c->u_mode.ocb.L[OCB_L_TABLE_SIZE - 1]); for (ntz -= OCB_L_TABLE_SIZE; ntz; ntz--) double_block (l_buf); } /* Called after key has been set. Sets up L table. */ void _gcry_cipher_ocb_setkey (gcry_cipher_hd_t c) { unsigned char ktop[OCB_BLOCK_LEN]; unsigned int burn = 0; unsigned int nburn; int i; /* L_star = E(zero_128) */ memset (ktop, 0, OCB_BLOCK_LEN); nburn = c->spec->encrypt (&c->context.c, c->u_mode.ocb.L_star, ktop); burn = nburn > burn ? nburn : burn; /* L_dollar = double(L_star) */ double_block_cpy (c->u_mode.ocb.L_dollar, c->u_mode.ocb.L_star); /* L_0 = double(L_dollar), ... */ double_block_cpy (c->u_mode.ocb.L[0], c->u_mode.ocb.L_dollar); for (i = 1; i < OCB_L_TABLE_SIZE; i++) double_block_cpy (c->u_mode.ocb.L[i], c->u_mode.ocb.L[i-1]); - /* Precalculated offsets L0+L1, L0+L1+L0 */ + /* Precalculated offset L0+L1 */ cipher_block_xor (c->u_mode.ocb.L0L1, c->u_mode.ocb.L[0], c->u_mode.ocb.L[1], OCB_BLOCK_LEN); - cipher_block_xor (c->u_mode.ocb.L0L1L0, - c->u_mode.ocb.L[0], c->u_mode.ocb.L0L1, OCB_BLOCK_LEN); /* Cleanup */ wipememory (ktop, sizeof ktop); if (burn > 0) _gcry_burn_stack (burn + 4*sizeof(void*)); } /* Set the nonce for OCB. This requires that the key has been set. Using it again resets start a new encryption cycle using the same key. */ gcry_err_code_t _gcry_cipher_ocb_set_nonce (gcry_cipher_hd_t c, const unsigned char *nonce, size_t noncelen) { unsigned char ktop[OCB_BLOCK_LEN]; unsigned char stretch[OCB_BLOCK_LEN + 8]; unsigned int bottom; unsigned int burn = 0; unsigned int nburn; /* Check args. */ if (!c->marks.key) return GPG_ERR_INV_STATE; /* Key must have been set first. */ switch (c->u_mode.ocb.taglen) { case 8: case 12: case 16: break; default: return GPG_ERR_BUG; /* Invalid tag length. */ } if (c->spec->blocksize != OCB_BLOCK_LEN) return GPG_ERR_CIPHER_ALGO; if (!nonce) return GPG_ERR_INV_ARG; /* 120 bit is the allowed maximum. In addition we impose a minimum of 64 bit. */ if (noncelen > (120/8) || noncelen < (64/8) || noncelen >= OCB_BLOCK_LEN) return GPG_ERR_INV_LENGTH; /* Prepare the nonce. */ memset (ktop, 0, (OCB_BLOCK_LEN - noncelen)); buf_cpy (ktop + (OCB_BLOCK_LEN - noncelen), nonce, noncelen); ktop[0] = ((c->u_mode.ocb.taglen * 8) % 128) << 1; ktop[OCB_BLOCK_LEN - noncelen - 1] |= 1; bottom = ktop[OCB_BLOCK_LEN - 1] & 0x3f; ktop[OCB_BLOCK_LEN - 1] &= 0xc0; /* Zero the bottom bits. */ nburn = c->spec->encrypt (&c->context.c, ktop, ktop); burn = nburn > burn ? nburn : burn; /* Stretch = Ktop || (Ktop[1..64] xor Ktop[9..72]) */ cipher_block_cpy (stretch, ktop, OCB_BLOCK_LEN); cipher_block_xor (stretch + OCB_BLOCK_LEN, ktop, ktop + 1, 8); /* Offset_0 = Stretch[1+bottom..128+bottom] (We use the IV field to store the offset) */ bit_copy (c->u_iv.iv, stretch, bottom, OCB_BLOCK_LEN); c->marks.iv = 1; /* Checksum_0 = zeros(128) (We use the CTR field to store the checksum) */ memset (c->u_ctr.ctr, 0, OCB_BLOCK_LEN); /* Clear AAD buffer. */ memset (c->u_mode.ocb.aad_offset, 0, OCB_BLOCK_LEN); memset (c->u_mode.ocb.aad_sum, 0, OCB_BLOCK_LEN); /* Setup other values. */ memset (c->lastiv, 0, sizeof(c->lastiv)); c->unused = 0; c->marks.tag = 0; c->marks.finalize = 0; c->u_mode.ocb.data_nblocks = 0; c->u_mode.ocb.aad_nblocks = 0; c->u_mode.ocb.aad_nleftover = 0; c->u_mode.ocb.data_finalized = 0; c->u_mode.ocb.aad_finalized = 0; /* log_printhex ("L_* ", c->u_mode.ocb.L_star, OCB_BLOCK_LEN); */ /* log_printhex ("L_$ ", c->u_mode.ocb.L_dollar, OCB_BLOCK_LEN); */ /* log_printhex ("L_0 ", c->u_mode.ocb.L[0], OCB_BLOCK_LEN); */ /* log_printhex ("L_1 ", c->u_mode.ocb.L[1], OCB_BLOCK_LEN); */ /* log_debug ( "bottom : %u (decimal)\n", bottom); */ /* log_printhex ("Ktop ", ktop, OCB_BLOCK_LEN); */ /* log_printhex ("Stretch ", stretch, sizeof stretch); */ /* log_printhex ("Offset_0 ", c->u_iv.iv, OCB_BLOCK_LEN); */ /* Cleanup */ wipememory (ktop, sizeof ktop); wipememory (stretch, sizeof stretch); if (burn > 0) _gcry_burn_stack (burn + 4*sizeof(void*)); return 0; } /* Process additional authentication data. This implementation allows to add additional authentication data at any time before the final gcry_cipher_gettag. */ gcry_err_code_t _gcry_cipher_ocb_authenticate (gcry_cipher_hd_t c, const unsigned char *abuf, size_t abuflen) { const size_t table_maxblks = 1 << OCB_L_TABLE_SIZE; const u32 table_size_mask = ((1 << OCB_L_TABLE_SIZE) - 1); unsigned char l_tmp[OCB_BLOCK_LEN]; unsigned int burn = 0; unsigned int nburn; /* Check that a nonce and thus a key has been set and that we have not yet computed the tag. We also return an error if the aad has been finalized (i.e. a short block has been processed). */ if (!c->marks.iv || c->marks.tag || c->u_mode.ocb.aad_finalized) return GPG_ERR_INV_STATE; /* Check correct usage and arguments. */ if (c->spec->blocksize != OCB_BLOCK_LEN) return GPG_ERR_CIPHER_ALGO; /* Process remaining data from the last call first. */ if (c->u_mode.ocb.aad_nleftover) { for (; abuflen && c->u_mode.ocb.aad_nleftover < OCB_BLOCK_LEN; abuf++, abuflen--) c->u_mode.ocb.aad_leftover[c->u_mode.ocb.aad_nleftover++] = *abuf; if (c->u_mode.ocb.aad_nleftover == OCB_BLOCK_LEN) { c->u_mode.ocb.aad_nblocks++; if ((c->u_mode.ocb.aad_nblocks % table_maxblks) == 0) { /* Table overflow, L needs to be generated. */ ocb_get_L_big(c, c->u_mode.ocb.aad_nblocks + 1, l_tmp); } else { cipher_block_cpy (l_tmp, ocb_get_l (c, c->u_mode.ocb.aad_nblocks), OCB_BLOCK_LEN); } /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ cipher_block_xor_1 (c->u_mode.ocb.aad_offset, l_tmp, OCB_BLOCK_LEN); /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ cipher_block_xor (l_tmp, c->u_mode.ocb.aad_offset, c->u_mode.ocb.aad_leftover, OCB_BLOCK_LEN); nburn = c->spec->encrypt (&c->context.c, l_tmp, l_tmp); burn = nburn > burn ? nburn : burn; cipher_block_xor_1 (c->u_mode.ocb.aad_sum, l_tmp, OCB_BLOCK_LEN); c->u_mode.ocb.aad_nleftover = 0; } } if (!abuflen) { if (burn > 0) _gcry_burn_stack (burn + 4*sizeof(void*)); return 0; } /* Full blocks handling. */ while (abuflen >= OCB_BLOCK_LEN) { size_t nblks = abuflen / OCB_BLOCK_LEN; size_t nmaxblks; /* Check how many blocks to process till table overflow. */ nmaxblks = (c->u_mode.ocb.aad_nblocks + 1) % table_maxblks; nmaxblks = (table_maxblks - nmaxblks) % table_maxblks; if (nmaxblks == 0) { /* Table overflow, generate L and process one block. */ c->u_mode.ocb.aad_nblocks++; ocb_get_L_big(c, c->u_mode.ocb.aad_nblocks, l_tmp); /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ cipher_block_xor_1 (c->u_mode.ocb.aad_offset, l_tmp, OCB_BLOCK_LEN); /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ cipher_block_xor (l_tmp, c->u_mode.ocb.aad_offset, abuf, OCB_BLOCK_LEN); nburn = c->spec->encrypt (&c->context.c, l_tmp, l_tmp); burn = nburn > burn ? nburn : burn; cipher_block_xor_1 (c->u_mode.ocb.aad_sum, l_tmp, OCB_BLOCK_LEN); abuf += OCB_BLOCK_LEN; abuflen -= OCB_BLOCK_LEN; nblks--; /* With overflow handled, retry loop again. Next overflow will * happen after 65535 blocks. */ continue; } nblks = nblks < nmaxblks ? nblks : nmaxblks; /* Use a bulk method if available. */ if (nblks && c->bulk.ocb_auth) { size_t nleft; size_t ndone; nleft = c->bulk.ocb_auth (c, abuf, nblks); ndone = nblks - nleft; abuf += ndone * OCB_BLOCK_LEN; abuflen -= ndone * OCB_BLOCK_LEN; nblks = nleft; } /* Hash all full blocks. */ while (nblks) { c->u_mode.ocb.aad_nblocks++; gcry_assert(c->u_mode.ocb.aad_nblocks & table_size_mask); /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ cipher_block_xor_1 (c->u_mode.ocb.aad_offset, ocb_get_l (c, c->u_mode.ocb.aad_nblocks), OCB_BLOCK_LEN); /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ cipher_block_xor (l_tmp, c->u_mode.ocb.aad_offset, abuf, OCB_BLOCK_LEN); nburn = c->spec->encrypt (&c->context.c, l_tmp, l_tmp); burn = nburn > burn ? nburn : burn; cipher_block_xor_1 (c->u_mode.ocb.aad_sum, l_tmp, OCB_BLOCK_LEN); abuf += OCB_BLOCK_LEN; abuflen -= OCB_BLOCK_LEN; nblks--; } } /* Store away the remaining data. */ for (; abuflen && c->u_mode.ocb.aad_nleftover < OCB_BLOCK_LEN; abuf++, abuflen--) c->u_mode.ocb.aad_leftover[c->u_mode.ocb.aad_nleftover++] = *abuf; gcry_assert (!abuflen); if (burn > 0) _gcry_burn_stack (burn + 4*sizeof(void*)); return 0; } /* Hash final partial AAD block. */ static void ocb_aad_finalize (gcry_cipher_hd_t c) { unsigned char l_tmp[OCB_BLOCK_LEN]; unsigned int burn = 0; unsigned int nburn; /* Check that a nonce and thus a key has been set and that we have not yet computed the tag. We also skip this if the aad has been finalized. */ if (!c->marks.iv || c->marks.tag || c->u_mode.ocb.aad_finalized) return; if (c->spec->blocksize != OCB_BLOCK_LEN) return; /* Ooops. */ /* Hash final partial block if any. */ if (c->u_mode.ocb.aad_nleftover) { /* Offset_* = Offset_m xor L_* */ cipher_block_xor_1 (c->u_mode.ocb.aad_offset, c->u_mode.ocb.L_star, OCB_BLOCK_LEN); /* CipherInput = (A_* || 1 || zeros(127-bitlen(A_*))) xor Offset_* */ buf_cpy (l_tmp, c->u_mode.ocb.aad_leftover, c->u_mode.ocb.aad_nleftover); memset (l_tmp + c->u_mode.ocb.aad_nleftover, 0, OCB_BLOCK_LEN - c->u_mode.ocb.aad_nleftover); l_tmp[c->u_mode.ocb.aad_nleftover] = 0x80; cipher_block_xor_1 (l_tmp, c->u_mode.ocb.aad_offset, OCB_BLOCK_LEN); /* Sum = Sum_m xor ENCIPHER(K, CipherInput) */ nburn = c->spec->encrypt (&c->context.c, l_tmp, l_tmp); burn = nburn > burn ? nburn : burn; cipher_block_xor_1 (c->u_mode.ocb.aad_sum, l_tmp, OCB_BLOCK_LEN); c->u_mode.ocb.aad_nleftover = 0; } /* Mark AAD as finalized so that gcry_cipher_ocb_authenticate can * return an erro when called again. */ c->u_mode.ocb.aad_finalized = 1; if (burn > 0) _gcry_burn_stack (burn + 4*sizeof(void*)); } /* Checksumming for encrypt and decrypt. */ static void ocb_checksum (unsigned char *chksum, const unsigned char *plainbuf, size_t nblks) { while (nblks > 0) { /* Checksum_i = Checksum_{i-1} xor P_i */ cipher_block_xor_1(chksum, plainbuf, OCB_BLOCK_LEN); plainbuf += OCB_BLOCK_LEN; nblks--; } } /* Common code for encrypt and decrypt. */ static gcry_err_code_t ocb_crypt (gcry_cipher_hd_t c, int encrypt, unsigned char *outbuf, size_t outbuflen, const unsigned char *inbuf, size_t inbuflen) { const size_t table_maxblks = 1 << OCB_L_TABLE_SIZE; const u32 table_size_mask = ((1 << OCB_L_TABLE_SIZE) - 1); unsigned char l_tmp[OCB_BLOCK_LEN]; unsigned int burn = 0; unsigned int nburn; gcry_cipher_encrypt_t crypt_fn = encrypt ? c->spec->encrypt : c->spec->decrypt; /* Check that a nonce and thus a key has been set and that we are not yet in end of data state. */ if (!c->marks.iv || c->u_mode.ocb.data_finalized) return GPG_ERR_INV_STATE; /* Check correct usage and arguments. */ if (c->spec->blocksize != OCB_BLOCK_LEN) return GPG_ERR_CIPHER_ALGO; if (outbuflen < inbuflen) return GPG_ERR_BUFFER_TOO_SHORT; if (c->marks.finalize) ; /* Allow arbitarty length. */ else if ((inbuflen % OCB_BLOCK_LEN)) return GPG_ERR_INV_LENGTH; /* We support only full blocks for now. */ /* Full blocks handling. */ while (inbuflen >= OCB_BLOCK_LEN) { size_t nblks = inbuflen / OCB_BLOCK_LEN; size_t nmaxblks; /* Check how many blocks to process till table overflow. */ nmaxblks = (c->u_mode.ocb.data_nblocks + 1) % table_maxblks; nmaxblks = (table_maxblks - nmaxblks) % table_maxblks; if (nmaxblks == 0) { /* Table overflow, generate L and process one block. */ c->u_mode.ocb.data_nblocks++; ocb_get_L_big(c, c->u_mode.ocb.data_nblocks, l_tmp); if (encrypt) { /* Checksum_i = Checksum_{i-1} xor P_i */ ocb_checksum (c->u_ctr.ctr, inbuf, 1); } /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ cipher_block_xor_1 (c->u_iv.iv, l_tmp, OCB_BLOCK_LEN); /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ cipher_block_xor (outbuf, c->u_iv.iv, inbuf, OCB_BLOCK_LEN); nburn = crypt_fn (&c->context.c, outbuf, outbuf); burn = nburn > burn ? nburn : burn; cipher_block_xor_1 (outbuf, c->u_iv.iv, OCB_BLOCK_LEN); if (!encrypt) { /* Checksum_i = Checksum_{i-1} xor P_i */ ocb_checksum (c->u_ctr.ctr, outbuf, 1); } inbuf += OCB_BLOCK_LEN; inbuflen -= OCB_BLOCK_LEN; outbuf += OCB_BLOCK_LEN; outbuflen =- OCB_BLOCK_LEN; nblks--; /* With overflow handled, retry loop again. Next overflow will * happen after 65535 blocks. */ continue; } nblks = nblks < nmaxblks ? nblks : nmaxblks; /* Since checksum xoring is done before/after encryption/decryption, process input in 24KiB chunks to keep data loaded in L1 cache for checksumming. */ if (nblks > 24 * 1024 / OCB_BLOCK_LEN) nblks = 24 * 1024 / OCB_BLOCK_LEN; /* Use a bulk method if available. */ if (nblks && c->bulk.ocb_crypt) { size_t nleft; size_t ndone; nleft = c->bulk.ocb_crypt (c, outbuf, inbuf, nblks, encrypt); ndone = nblks - nleft; inbuf += ndone * OCB_BLOCK_LEN; outbuf += ndone * OCB_BLOCK_LEN; inbuflen -= ndone * OCB_BLOCK_LEN; outbuflen -= ndone * OCB_BLOCK_LEN; nblks = nleft; } if (nblks) { size_t nblks_chksum = nblks; if (encrypt) { /* Checksum_i = Checksum_{i-1} xor P_i */ ocb_checksum (c->u_ctr.ctr, inbuf, nblks_chksum); } /* Encrypt all full blocks. */ while (nblks) { c->u_mode.ocb.data_nblocks++; gcry_assert(c->u_mode.ocb.data_nblocks & table_size_mask); /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ cipher_block_xor_1 (c->u_iv.iv, ocb_get_l (c, c->u_mode.ocb.data_nblocks), OCB_BLOCK_LEN); /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ cipher_block_xor (outbuf, c->u_iv.iv, inbuf, OCB_BLOCK_LEN); nburn = crypt_fn (&c->context.c, outbuf, outbuf); burn = nburn > burn ? nburn : burn; cipher_block_xor_1 (outbuf, c->u_iv.iv, OCB_BLOCK_LEN); inbuf += OCB_BLOCK_LEN; inbuflen -= OCB_BLOCK_LEN; outbuf += OCB_BLOCK_LEN; outbuflen =- OCB_BLOCK_LEN; nblks--; } if (!encrypt) { /* Checksum_i = Checksum_{i-1} xor P_i */ ocb_checksum (c->u_ctr.ctr, outbuf - nblks_chksum * OCB_BLOCK_LEN, nblks_chksum); } } } /* Encrypt final partial block. Note that we expect INBUFLEN to be shorter than OCB_BLOCK_LEN (see above). */ if (inbuflen) { unsigned char pad[OCB_BLOCK_LEN]; /* Offset_* = Offset_m xor L_* */ cipher_block_xor_1 (c->u_iv.iv, c->u_mode.ocb.L_star, OCB_BLOCK_LEN); /* Pad = ENCIPHER(K, Offset_*) */ nburn = c->spec->encrypt (&c->context.c, pad, c->u_iv.iv); burn = nburn > burn ? nburn : burn; if (encrypt) { /* Checksum_* = Checksum_m xor (P_* || 1 || zeros(127-bitlen(P_*))) */ /* Note that INBUFLEN is less than OCB_BLOCK_LEN. */ buf_cpy (l_tmp, inbuf, inbuflen); memset (l_tmp + inbuflen, 0, OCB_BLOCK_LEN - inbuflen); l_tmp[inbuflen] = 0x80; cipher_block_xor_1 (c->u_ctr.ctr, l_tmp, OCB_BLOCK_LEN); /* C_* = P_* xor Pad[1..bitlen(P_*)] */ buf_xor (outbuf, inbuf, pad, inbuflen); } else { /* P_* = C_* xor Pad[1..bitlen(C_*)] */ /* Checksum_* = Checksum_m xor (P_* || 1 || zeros(127-bitlen(P_*))) */ cipher_block_cpy (l_tmp, pad, OCB_BLOCK_LEN); buf_cpy (l_tmp, inbuf, inbuflen); cipher_block_xor_1 (l_tmp, pad, OCB_BLOCK_LEN); l_tmp[inbuflen] = 0x80; buf_cpy (outbuf, l_tmp, inbuflen); cipher_block_xor_1 (c->u_ctr.ctr, l_tmp, OCB_BLOCK_LEN); } } /* Compute the tag if the finalize flag has been set. */ if (c->marks.finalize) { /* Tag = ENCIPHER(K, Checksum xor Offset xor L_$) xor HASH(K,A) */ cipher_block_xor (c->u_mode.ocb.tag, c->u_ctr.ctr, c->u_iv.iv, OCB_BLOCK_LEN); cipher_block_xor_1 (c->u_mode.ocb.tag, c->u_mode.ocb.L_dollar, OCB_BLOCK_LEN); nburn = c->spec->encrypt (&c->context.c, c->u_mode.ocb.tag, c->u_mode.ocb.tag); burn = nburn > burn ? nburn : burn; c->u_mode.ocb.data_finalized = 1; /* Note that the the final part of the tag computation is done by _gcry_cipher_ocb_get_tag. */ } if (burn > 0) _gcry_burn_stack (burn + 4*sizeof(void*)); return 0; } /* Encrypt (INBUF,INBUFLEN) in OCB mode to OUTBUF. OUTBUFLEN gives the allocated size of OUTBUF. This function accepts only multiples of a full block unless gcry_cipher_final has been called in which case the next block may have any length. */ gcry_err_code_t _gcry_cipher_ocb_encrypt (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen, const unsigned char *inbuf, size_t inbuflen) { return ocb_crypt (c, 1, outbuf, outbuflen, inbuf, inbuflen); } /* Decrypt (INBUF,INBUFLEN) in OCB mode to OUTBUF. OUTBUFLEN gives the allocated size of OUTBUF. This function accepts only multiples of a full block unless gcry_cipher_final has been called in which case the next block may have any length. */ gcry_err_code_t _gcry_cipher_ocb_decrypt (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen, const unsigned char *inbuf, size_t inbuflen) { return ocb_crypt (c, 0, outbuf, outbuflen, inbuf, inbuflen); } /* Compute the tag. The last data operation has already done some part of it. To allow adding AAD even after having done all data, we finish the tag computation only here. */ static void compute_tag_if_needed (gcry_cipher_hd_t c) { if (!c->marks.tag) { ocb_aad_finalize (c); cipher_block_xor_1 (c->u_mode.ocb.tag, c->u_mode.ocb.aad_sum, OCB_BLOCK_LEN); c->marks.tag = 1; } } /* Copy the already computed tag to OUTTAG. OUTTAGSIZE is the allocated size of OUTTAG; the function returns an error if that is too short to hold the tag. */ gcry_err_code_t _gcry_cipher_ocb_get_tag (gcry_cipher_hd_t c, unsigned char *outtag, size_t outtagsize) { if (c->u_mode.ocb.taglen > outtagsize) return GPG_ERR_BUFFER_TOO_SHORT; if (!c->u_mode.ocb.data_finalized) return GPG_ERR_INV_STATE; /* Data has not yet been finalized. */ compute_tag_if_needed (c); memcpy (outtag, c->u_mode.ocb.tag, c->u_mode.ocb.taglen); return 0; } /* Check that the tag (INTAG,TAGLEN) matches the computed tag for the handle C. */ gcry_err_code_t _gcry_cipher_ocb_check_tag (gcry_cipher_hd_t c, const unsigned char *intag, size_t taglen) { size_t n; if (!c->u_mode.ocb.data_finalized) return GPG_ERR_INV_STATE; /* Data has not yet been finalized. */ compute_tag_if_needed (c); n = c->u_mode.ocb.taglen; if (taglen < n) n = taglen; if (!buf_eq_const (intag, c->u_mode.ocb.tag, n) || c->u_mode.ocb.taglen != taglen) return GPG_ERR_CHECKSUM; return 0; } diff --git a/cipher/rijndael-aesni.c b/cipher/rijndael-aesni.c index ec9f4d4a..9883861a 100644 --- a/cipher/rijndael-aesni.c +++ b/cipher/rijndael-aesni.c @@ -1,3524 +1,3524 @@ /* AES-NI accelerated AES for Libgcrypt * Copyright (C) 2000, 2001, 2002, 2003, 2007, * 2008, 2011, 2012 Free Software Foundation, Inc. * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . */ #include #include #include #include /* for memcmp() */ #include "types.h" /* for byte and u32 typedefs */ #include "g10lib.h" #include "cipher.h" #include "bufhelp.h" #include "cipher-selftest.h" #include "rijndael-internal.h" #include "./cipher-internal.h" #ifdef USE_AESNI #if _GCRY_GCC_VERSION >= 40400 /* 4.4 */ /* Prevent compiler from issuing SSE instructions between asm blocks. */ # pragma GCC target("no-sse") #endif #if __clang__ # pragma clang attribute push (__attribute__((target("no-sse"))), apply_to = function) #endif #define ALWAYS_INLINE inline __attribute__((always_inline)) #define NO_INLINE __attribute__((noinline)) typedef struct u128_s { u32 a, b, c, d; } __attribute__((packed, aligned(1), may_alias)) u128_t; /* Copy of ocb_get_l needed here as GCC is unable to inline ocb_get_l because of 'pragma target'. */ static ALWAYS_INLINE const unsigned char * aes_ocb_get_l (gcry_cipher_hd_t c, u64 n) { unsigned long ntz; /* Assumes that N != 0. */ asm ("rep;bsfl %k[low], %k[ntz]\n\t" : [ntz] "=r" (ntz) : [low] "r" ((unsigned long)n) : "cc"); return c->u_mode.ocb.L[ntz]; } /* Two macros to be called prior and after the use of AESNI instructions. There should be no external function calls between the use of these macros. There purpose is to make sure that the SSE regsiters are cleared and won't reveal any information about the key or the data. */ #ifdef __WIN64__ /* XMM6-XMM15 are callee-saved registers on WIN64. */ # define aesni_prepare_2_7_variable char win64tmp[16 * 2] # define aesni_prepare_8_15_variable char win64tmp8_15[16 * 8] # define aesni_prepare() do { } while (0) # define aesni_prepare_2_7() \ do { asm volatile ("movdqu %%xmm6, %0\n\t" \ "movdqu %%xmm7, %1\n\t" \ : "=m" (*win64tmp), "=m" (*(win64tmp+16)) \ : \ : "memory"); \ } while (0) # define aesni_prepare_8_15() \ do { asm volatile ("movdqu %%xmm8, 0*16(%0)\n\t" \ "movdqu %%xmm9, 1*16(%0)\n\t" \ "movdqu %%xmm10, 2*16(%0)\n\t" \ "movdqu %%xmm11, 3*16(%0)\n\t" \ "movdqu %%xmm12, 4*16(%0)\n\t" \ "movdqu %%xmm13, 5*16(%0)\n\t" \ "movdqu %%xmm14, 6*16(%0)\n\t" \ "movdqu %%xmm15, 7*16(%0)\n\t" \ : \ : "r" (win64tmp8_15) \ : "memory"); \ } while (0) # define aesni_cleanup() \ do { asm volatile ("pxor %%xmm0, %%xmm0\n\t" \ "pxor %%xmm1, %%xmm1\n" :: ); \ } while (0) # define aesni_cleanup_2_7() \ do { asm volatile ("movdqu %0, %%xmm6\n\t" \ "movdqu %1, %%xmm7\n\t" \ "pxor %%xmm2, %%xmm2\n" \ "pxor %%xmm3, %%xmm3\n" \ "pxor %%xmm4, %%xmm4\n" \ "pxor %%xmm5, %%xmm5\n" \ : \ : "m" (*win64tmp), "m" (*(win64tmp+16)) \ : "memory"); \ } while (0) # define aesni_cleanup_8_15() \ do { asm volatile ("movdqu 0*16(%0), %%xmm8\n\t" \ "movdqu 1*16(%0), %%xmm9\n\t" \ "movdqu 2*16(%0), %%xmm10\n\t" \ "movdqu 3*16(%0), %%xmm11\n\t" \ "movdqu 4*16(%0), %%xmm12\n\t" \ "movdqu 5*16(%0), %%xmm13\n\t" \ "movdqu 6*16(%0), %%xmm14\n\t" \ "movdqu 7*16(%0), %%xmm15\n\t" \ : \ : "r" (win64tmp8_15) \ : "memory"); \ } while (0) #else # define aesni_prepare_2_7_variable # define aesni_prepare() do { } while (0) # define aesni_prepare_2_7() do { } while (0) # define aesni_cleanup() \ do { asm volatile ("pxor %%xmm0, %%xmm0\n\t" \ "pxor %%xmm1, %%xmm1\n" :: ); \ } while (0) # define aesni_cleanup_2_7() \ do { asm volatile ("pxor %%xmm7, %%xmm7\n\t" \ "pxor %%xmm2, %%xmm2\n\t" \ "pxor %%xmm3, %%xmm3\n" \ "pxor %%xmm4, %%xmm4\n" \ "pxor %%xmm5, %%xmm5\n" \ "pxor %%xmm6, %%xmm6\n":: ); \ } while (0) # ifdef __x86_64__ # define aesni_prepare_8_15_variable # define aesni_prepare_8_15() do { } while (0) # define aesni_cleanup_8_15() \ do { asm volatile ("pxor %%xmm8, %%xmm8\n" \ "pxor %%xmm9, %%xmm9\n" \ "pxor %%xmm10, %%xmm10\n" \ "pxor %%xmm11, %%xmm11\n" \ "pxor %%xmm12, %%xmm12\n" \ "pxor %%xmm13, %%xmm13\n" \ "pxor %%xmm14, %%xmm14\n" \ "pxor %%xmm15, %%xmm15\n":: ); \ } while (0) # endif #endif void _gcry_aes_aesni_do_setkey (RIJNDAEL_context *ctx, const byte *key) { aesni_prepare_2_7_variable; aesni_prepare(); aesni_prepare_2_7(); if (ctx->rounds < 12) { /* 128-bit key */ #define AESKEYGENASSIST_xmm1_xmm2(imm8) \ ".byte 0x66, 0x0f, 0x3a, 0xdf, 0xd1, " #imm8 " \n\t" #define AESKEY_EXPAND128 \ "pshufd $0xff, %%xmm2, %%xmm2\n\t" \ "movdqa %%xmm1, %%xmm3\n\t" \ "pslldq $4, %%xmm3\n\t" \ "pxor %%xmm3, %%xmm1\n\t" \ "pslldq $4, %%xmm3\n\t" \ "pxor %%xmm3, %%xmm1\n\t" \ "pslldq $4, %%xmm3\n\t" \ "pxor %%xmm3, %%xmm2\n\t" \ "pxor %%xmm2, %%xmm1\n\t" asm volatile ("movdqu (%[key]), %%xmm1\n\t" /* xmm1 := key */ "movdqa %%xmm1, (%[ksch])\n\t" /* ksch[0] := xmm1 */ AESKEYGENASSIST_xmm1_xmm2(0x01) AESKEY_EXPAND128 "movdqa %%xmm1, 0x10(%[ksch])\n\t" /* ksch[1] := xmm1 */ AESKEYGENASSIST_xmm1_xmm2(0x02) AESKEY_EXPAND128 "movdqa %%xmm1, 0x20(%[ksch])\n\t" /* ksch[2] := xmm1 */ AESKEYGENASSIST_xmm1_xmm2(0x04) AESKEY_EXPAND128 "movdqa %%xmm1, 0x30(%[ksch])\n\t" /* ksch[3] := xmm1 */ AESKEYGENASSIST_xmm1_xmm2(0x08) AESKEY_EXPAND128 "movdqa %%xmm1, 0x40(%[ksch])\n\t" /* ksch[4] := xmm1 */ AESKEYGENASSIST_xmm1_xmm2(0x10) AESKEY_EXPAND128 "movdqa %%xmm1, 0x50(%[ksch])\n\t" /* ksch[5] := xmm1 */ AESKEYGENASSIST_xmm1_xmm2(0x20) AESKEY_EXPAND128 "movdqa %%xmm1, 0x60(%[ksch])\n\t" /* ksch[6] := xmm1 */ AESKEYGENASSIST_xmm1_xmm2(0x40) AESKEY_EXPAND128 "movdqa %%xmm1, 0x70(%[ksch])\n\t" /* ksch[7] := xmm1 */ AESKEYGENASSIST_xmm1_xmm2(0x80) AESKEY_EXPAND128 "movdqa %%xmm1, 0x80(%[ksch])\n\t" /* ksch[8] := xmm1 */ AESKEYGENASSIST_xmm1_xmm2(0x1b) AESKEY_EXPAND128 "movdqa %%xmm1, 0x90(%[ksch])\n\t" /* ksch[9] := xmm1 */ AESKEYGENASSIST_xmm1_xmm2(0x36) AESKEY_EXPAND128 "movdqa %%xmm1, 0xa0(%[ksch])\n\t" /* ksch[10] := xmm1 */ : : [key] "r" (key), [ksch] "r" (ctx->keyschenc) : "cc", "memory" ); #undef AESKEYGENASSIST_xmm1_xmm2 #undef AESKEY_EXPAND128 } else if (ctx->rounds == 12) { /* 192-bit key */ #define AESKEYGENASSIST_xmm3_xmm2(imm8) \ ".byte 0x66, 0x0f, 0x3a, 0xdf, 0xd3, " #imm8 " \n\t" #define AESKEY_EXPAND192 \ "pshufd $0x55, %%xmm2, %%xmm2\n\t" \ "movdqu %%xmm1, %%xmm4\n\t" \ "pslldq $4, %%xmm4\n\t" \ "pxor %%xmm4, %%xmm1\n\t" \ "pslldq $4, %%xmm4\n\t" \ "pxor %%xmm4, %%xmm1\n\t" \ "pslldq $4, %%xmm4\n\t" \ "pxor %%xmm4, %%xmm1\n\t" \ "pxor %%xmm2, %%xmm1\n\t" \ "pshufd $0xff, %%xmm1, %%xmm2\n\t" \ "movdqu %%xmm3, %%xmm4\n\t" \ "pslldq $4, %%xmm4\n\t" \ "pxor %%xmm4, %%xmm3\n\t" \ "pxor %%xmm2, %%xmm3\n\t" asm volatile ("movdqu (%[key]), %%xmm1\n\t" /* xmm1 := key[0..15] */ "movq 16(%[key]), %%xmm3\n\t" /* xmm3 := key[16..23] */ "movdqa %%xmm1, (%[ksch])\n\t" /* ksch[0] := xmm1 */ "movdqa %%xmm3, %%xmm5\n\t" AESKEYGENASSIST_xmm3_xmm2(0x01) AESKEY_EXPAND192 "shufpd $0, %%xmm1, %%xmm5\n\t" "movdqa %%xmm5, 0x10(%[ksch])\n\t" /* ksch[1] := xmm5 */ "movdqa %%xmm1, %%xmm6\n\t" "shufpd $1, %%xmm3, %%xmm6\n\t" "movdqa %%xmm6, 0x20(%[ksch])\n\t" /* ksch[2] := xmm6 */ AESKEYGENASSIST_xmm3_xmm2(0x02) AESKEY_EXPAND192 "movdqa %%xmm1, 0x30(%[ksch])\n\t" /* ksch[3] := xmm1 */ "movdqa %%xmm3, %%xmm5\n\t" AESKEYGENASSIST_xmm3_xmm2(0x04) AESKEY_EXPAND192 "shufpd $0, %%xmm1, %%xmm5\n\t" "movdqa %%xmm5, 0x40(%[ksch])\n\t" /* ksch[4] := xmm5 */ "movdqa %%xmm1, %%xmm6\n\t" "shufpd $1, %%xmm3, %%xmm6\n\t" "movdqa %%xmm6, 0x50(%[ksch])\n\t" /* ksch[5] := xmm6 */ AESKEYGENASSIST_xmm3_xmm2(0x08) AESKEY_EXPAND192 "movdqa %%xmm1, 0x60(%[ksch])\n\t" /* ksch[6] := xmm1 */ "movdqa %%xmm3, %%xmm5\n\t" AESKEYGENASSIST_xmm3_xmm2(0x10) AESKEY_EXPAND192 "shufpd $0, %%xmm1, %%xmm5\n\t" "movdqa %%xmm5, 0x70(%[ksch])\n\t" /* ksch[7] := xmm5 */ "movdqa %%xmm1, %%xmm6\n\t" "shufpd $1, %%xmm3, %%xmm6\n\t" "movdqa %%xmm6, 0x80(%[ksch])\n\t" /* ksch[8] := xmm6 */ AESKEYGENASSIST_xmm3_xmm2(0x20) AESKEY_EXPAND192 "movdqa %%xmm1, 0x90(%[ksch])\n\t" /* ksch[9] := xmm1 */ "movdqa %%xmm3, %%xmm5\n\t" AESKEYGENASSIST_xmm3_xmm2(0x40) AESKEY_EXPAND192 "shufpd $0, %%xmm1, %%xmm5\n\t" "movdqa %%xmm5, 0xa0(%[ksch])\n\t" /* ksch[10] := xmm5 */ "movdqa %%xmm1, %%xmm6\n\t" "shufpd $1, %%xmm3, %%xmm6\n\t" "movdqa %%xmm6, 0xb0(%[ksch])\n\t" /* ksch[11] := xmm6 */ AESKEYGENASSIST_xmm3_xmm2(0x80) AESKEY_EXPAND192 "movdqa %%xmm1, 0xc0(%[ksch])\n\t" /* ksch[12] := xmm1 */ : : [key] "r" (key), [ksch] "r" (ctx->keyschenc) : "cc", "memory" ); #undef AESKEYGENASSIST_xmm3_xmm2 #undef AESKEY_EXPAND192 } else if (ctx->rounds > 12) { /* 256-bit key */ #define AESKEYGENASSIST_xmm1_xmm2(imm8) \ ".byte 0x66, 0x0f, 0x3a, 0xdf, 0xd1, " #imm8 " \n\t" #define AESKEYGENASSIST_xmm3_xmm2(imm8) \ ".byte 0x66, 0x0f, 0x3a, 0xdf, 0xd3, " #imm8 " \n\t" #define AESKEY_EXPAND256_A \ "pshufd $0xff, %%xmm2, %%xmm2\n\t" \ "movdqa %%xmm1, %%xmm4\n\t" \ "pslldq $4, %%xmm4\n\t" \ "pxor %%xmm4, %%xmm1\n\t" \ "pslldq $4, %%xmm4\n\t" \ "pxor %%xmm4, %%xmm1\n\t" \ "pslldq $4, %%xmm4\n\t" \ "pxor %%xmm4, %%xmm1\n\t" \ "pxor %%xmm2, %%xmm1\n\t" #define AESKEY_EXPAND256_B \ "pshufd $0xaa, %%xmm2, %%xmm2\n\t" \ "movdqa %%xmm3, %%xmm4\n\t" \ "pslldq $4, %%xmm4\n\t" \ "pxor %%xmm4, %%xmm3\n\t" \ "pslldq $4, %%xmm4\n\t" \ "pxor %%xmm4, %%xmm3\n\t" \ "pslldq $4, %%xmm4\n\t" \ "pxor %%xmm4, %%xmm3\n\t" \ "pxor %%xmm2, %%xmm3\n\t" asm volatile ("movdqu (%[key]), %%xmm1\n\t" /* xmm1 := key[0..15] */ "movdqu 16(%[key]), %%xmm3\n\t" /* xmm3 := key[16..31] */ "movdqa %%xmm1, (%[ksch])\n\t" /* ksch[0] := xmm1 */ "movdqa %%xmm3, 0x10(%[ksch])\n\t" /* ksch[1] := xmm3 */ AESKEYGENASSIST_xmm3_xmm2(0x01) AESKEY_EXPAND256_A "movdqa %%xmm1, 0x20(%[ksch])\n\t" /* ksch[2] := xmm1 */ AESKEYGENASSIST_xmm1_xmm2(0x00) AESKEY_EXPAND256_B "movdqa %%xmm3, 0x30(%[ksch])\n\t" /* ksch[3] := xmm3 */ AESKEYGENASSIST_xmm3_xmm2(0x02) AESKEY_EXPAND256_A "movdqa %%xmm1, 0x40(%[ksch])\n\t" /* ksch[4] := xmm1 */ AESKEYGENASSIST_xmm1_xmm2(0x00) AESKEY_EXPAND256_B "movdqa %%xmm3, 0x50(%[ksch])\n\t" /* ksch[5] := xmm3 */ AESKEYGENASSIST_xmm3_xmm2(0x04) AESKEY_EXPAND256_A "movdqa %%xmm1, 0x60(%[ksch])\n\t" /* ksch[6] := xmm1 */ AESKEYGENASSIST_xmm1_xmm2(0x00) AESKEY_EXPAND256_B "movdqa %%xmm3, 0x70(%[ksch])\n\t" /* ksch[7] := xmm3 */ AESKEYGENASSIST_xmm3_xmm2(0x08) AESKEY_EXPAND256_A "movdqa %%xmm1, 0x80(%[ksch])\n\t" /* ksch[8] := xmm1 */ AESKEYGENASSIST_xmm1_xmm2(0x00) AESKEY_EXPAND256_B "movdqa %%xmm3, 0x90(%[ksch])\n\t" /* ksch[9] := xmm3 */ AESKEYGENASSIST_xmm3_xmm2(0x10) AESKEY_EXPAND256_A "movdqa %%xmm1, 0xa0(%[ksch])\n\t" /* ksch[10] := xmm1 */ AESKEYGENASSIST_xmm1_xmm2(0x00) AESKEY_EXPAND256_B "movdqa %%xmm3, 0xb0(%[ksch])\n\t" /* ksch[11] := xmm3 */ AESKEYGENASSIST_xmm3_xmm2(0x20) AESKEY_EXPAND256_A "movdqa %%xmm1, 0xc0(%[ksch])\n\t" /* ksch[12] := xmm1 */ AESKEYGENASSIST_xmm1_xmm2(0x00) AESKEY_EXPAND256_B "movdqa %%xmm3, 0xd0(%[ksch])\n\t" /* ksch[13] := xmm3 */ AESKEYGENASSIST_xmm3_xmm2(0x40) AESKEY_EXPAND256_A "movdqa %%xmm1, 0xe0(%[ksch])\n\t" /* ksch[14] := xmm1 */ : : [key] "r" (key), [ksch] "r" (ctx->keyschenc) : "cc", "memory" ); #undef AESKEYGENASSIST_xmm1_xmm2 #undef AESKEYGENASSIST_xmm3_xmm2 #undef AESKEY_EXPAND256_A #undef AESKEY_EXPAND256_B } aesni_cleanup(); aesni_cleanup_2_7(); } /* Make a decryption key from an encryption key. */ static ALWAYS_INLINE void do_aesni_prepare_decryption (RIJNDAEL_context *ctx) { /* The AES-NI decrypt instructions use the Equivalent Inverse Cipher, thus we can't use the the standard decrypt key preparation. */ u128_t *ekey = (u128_t *)ctx->keyschenc; u128_t *dkey = (u128_t *)ctx->keyschdec; int rr; int r; #define DO_AESNI_AESIMC() \ asm volatile ("movdqa %[ekey], %%xmm1\n\t" \ /*"aesimc %%xmm1, %%xmm1\n\t"*/ \ ".byte 0x66, 0x0f, 0x38, 0xdb, 0xc9\n\t" \ "movdqa %%xmm1, %[dkey]" \ : [dkey] "=m" (dkey[r]) \ : [ekey] "m" (ekey[rr]) \ : "memory") dkey[0] = ekey[ctx->rounds]; r=1; rr=ctx->rounds-1; DO_AESNI_AESIMC(); r++; rr--; /* round 1 */ DO_AESNI_AESIMC(); r++; rr--; /* round 2 */ DO_AESNI_AESIMC(); r++; rr--; /* round 3 */ DO_AESNI_AESIMC(); r++; rr--; /* round 4 */ DO_AESNI_AESIMC(); r++; rr--; /* round 5 */ DO_AESNI_AESIMC(); r++; rr--; /* round 6 */ DO_AESNI_AESIMC(); r++; rr--; /* round 7 */ DO_AESNI_AESIMC(); r++; rr--; /* round 8 */ DO_AESNI_AESIMC(); r++; rr--; /* round 9 */ if (ctx->rounds > 10) { DO_AESNI_AESIMC(); r++; rr--; /* round 10 */ DO_AESNI_AESIMC(); r++; rr--; /* round 11 */ if (ctx->rounds > 12) { DO_AESNI_AESIMC(); r++; rr--; /* round 12 */ DO_AESNI_AESIMC(); r++; rr--; /* round 13 */ } } dkey[r] = ekey[0]; #undef DO_AESNI_AESIMC } void _gcry_aes_aesni_prepare_decryption (RIJNDAEL_context *ctx) { aesni_prepare(); do_aesni_prepare_decryption (ctx); aesni_cleanup(); } /* Encrypt one block using the Intel AES-NI instructions. Block is input * and output through SSE register xmm0. */ static ALWAYS_INLINE void do_aesni_enc (const RIJNDAEL_context *ctx) { #define aesenc_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xc1\n\t" #define aesenclast_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xc1\n\t" asm volatile ("movdqa (%[key]), %%xmm1\n\t" /* xmm1 := key[0] */ "pxor %%xmm1, %%xmm0\n\t" /* xmm0 ^= key[0] */ "movdqa 0x10(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 "movdqa 0x20(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 "movdqa 0x30(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 "movdqa 0x40(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 "movdqa 0x50(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 "movdqa 0x60(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 "movdqa 0x70(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 "movdqa 0x80(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 "movdqa 0x90(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 "movdqa 0xa0(%[key]), %%xmm1\n\t" "cmpl $10, %[rounds]\n\t" "jz .Lenclast%=\n\t" aesenc_xmm1_xmm0 "movdqa 0xb0(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 "movdqa 0xc0(%[key]), %%xmm1\n\t" "cmpl $12, %[rounds]\n\t" "jz .Lenclast%=\n\t" aesenc_xmm1_xmm0 "movdqa 0xd0(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 "movdqa 0xe0(%[key]), %%xmm1\n" ".Lenclast%=:\n\t" aesenclast_xmm1_xmm0 "\n" : : [key] "r" (ctx->keyschenc), [rounds] "r" (ctx->rounds) : "cc", "memory"); #undef aesenc_xmm1_xmm0 #undef aesenclast_xmm1_xmm0 } /* Decrypt one block using the Intel AES-NI instructions. Block is input * and output through SSE register xmm0. */ static ALWAYS_INLINE void do_aesni_dec (const RIJNDAEL_context *ctx) { #define aesdec_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xde, 0xc1\n\t" #define aesdeclast_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdf, 0xc1\n\t" asm volatile ("movdqa (%[key]), %%xmm1\n\t" "pxor %%xmm1, %%xmm0\n\t" /* xmm0 ^= key[0] */ "movdqa 0x10(%[key]), %%xmm1\n\t" aesdec_xmm1_xmm0 "movdqa 0x20(%[key]), %%xmm1\n\t" aesdec_xmm1_xmm0 "movdqa 0x30(%[key]), %%xmm1\n\t" aesdec_xmm1_xmm0 "movdqa 0x40(%[key]), %%xmm1\n\t" aesdec_xmm1_xmm0 "movdqa 0x50(%[key]), %%xmm1\n\t" aesdec_xmm1_xmm0 "movdqa 0x60(%[key]), %%xmm1\n\t" aesdec_xmm1_xmm0 "movdqa 0x70(%[key]), %%xmm1\n\t" aesdec_xmm1_xmm0 "movdqa 0x80(%[key]), %%xmm1\n\t" aesdec_xmm1_xmm0 "movdqa 0x90(%[key]), %%xmm1\n\t" aesdec_xmm1_xmm0 "movdqa 0xa0(%[key]), %%xmm1\n\t" "cmpl $10, %[rounds]\n\t" "jz .Ldeclast%=\n\t" aesdec_xmm1_xmm0 "movdqa 0xb0(%[key]), %%xmm1\n\t" aesdec_xmm1_xmm0 "movdqa 0xc0(%[key]), %%xmm1\n\t" "cmpl $12, %[rounds]\n\t" "jz .Ldeclast%=\n\t" aesdec_xmm1_xmm0 "movdqa 0xd0(%[key]), %%xmm1\n\t" aesdec_xmm1_xmm0 "movdqa 0xe0(%[key]), %%xmm1\n" ".Ldeclast%=:\n\t" aesdeclast_xmm1_xmm0 "\n" : : [key] "r" (ctx->keyschdec), [rounds] "r" (ctx->rounds) : "cc", "memory"); #undef aesdec_xmm1_xmm0 #undef aesdeclast_xmm1_xmm0 } /* Encrypt four blocks using the Intel AES-NI instructions. Blocks are input * and output through SSE registers xmm1 to xmm4. */ static ALWAYS_INLINE void do_aesni_enc_vec4 (const RIJNDAEL_context *ctx) { #define aesenc_xmm0_xmm1 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xc8\n\t" #define aesenc_xmm0_xmm2 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xd0\n\t" #define aesenc_xmm0_xmm3 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xd8\n\t" #define aesenc_xmm0_xmm4 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xe0\n\t" #define aesenclast_xmm0_xmm1 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xc8\n\t" #define aesenclast_xmm0_xmm2 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xd0\n\t" #define aesenclast_xmm0_xmm3 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xd8\n\t" #define aesenclast_xmm0_xmm4 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xe0\n\t" asm volatile ("movdqa (%[key]), %%xmm0\n\t" "pxor %%xmm0, %%xmm1\n\t" /* xmm1 ^= key[0] */ "pxor %%xmm0, %%xmm2\n\t" /* xmm2 ^= key[0] */ "pxor %%xmm0, %%xmm3\n\t" /* xmm3 ^= key[0] */ "pxor %%xmm0, %%xmm4\n\t" /* xmm4 ^= key[0] */ "movdqa 0x10(%[key]), %%xmm0\n\t" aesenc_xmm0_xmm1 aesenc_xmm0_xmm2 aesenc_xmm0_xmm3 aesenc_xmm0_xmm4 "movdqa 0x20(%[key]), %%xmm0\n\t" aesenc_xmm0_xmm1 aesenc_xmm0_xmm2 aesenc_xmm0_xmm3 aesenc_xmm0_xmm4 "movdqa 0x30(%[key]), %%xmm0\n\t" aesenc_xmm0_xmm1 aesenc_xmm0_xmm2 aesenc_xmm0_xmm3 aesenc_xmm0_xmm4 "movdqa 0x40(%[key]), %%xmm0\n\t" aesenc_xmm0_xmm1 aesenc_xmm0_xmm2 aesenc_xmm0_xmm3 aesenc_xmm0_xmm4 "movdqa 0x50(%[key]), %%xmm0\n\t" aesenc_xmm0_xmm1 aesenc_xmm0_xmm2 aesenc_xmm0_xmm3 aesenc_xmm0_xmm4 "movdqa 0x60(%[key]), %%xmm0\n\t" aesenc_xmm0_xmm1 aesenc_xmm0_xmm2 aesenc_xmm0_xmm3 aesenc_xmm0_xmm4 "movdqa 0x70(%[key]), %%xmm0\n\t" aesenc_xmm0_xmm1 aesenc_xmm0_xmm2 aesenc_xmm0_xmm3 aesenc_xmm0_xmm4 "movdqa 0x80(%[key]), %%xmm0\n\t" aesenc_xmm0_xmm1 aesenc_xmm0_xmm2 aesenc_xmm0_xmm3 aesenc_xmm0_xmm4 "movdqa 0x90(%[key]), %%xmm0\n\t" aesenc_xmm0_xmm1 aesenc_xmm0_xmm2 aesenc_xmm0_xmm3 aesenc_xmm0_xmm4 "movdqa 0xa0(%[key]), %%xmm0\n\t" "cmpl $10, %[rounds]\n\t" "jz .Ldeclast%=\n\t" aesenc_xmm0_xmm1 aesenc_xmm0_xmm2 aesenc_xmm0_xmm3 aesenc_xmm0_xmm4 "movdqa 0xb0(%[key]), %%xmm0\n\t" aesenc_xmm0_xmm1 aesenc_xmm0_xmm2 aesenc_xmm0_xmm3 aesenc_xmm0_xmm4 "movdqa 0xc0(%[key]), %%xmm0\n\t" "cmpl $12, %[rounds]\n\t" "jz .Ldeclast%=\n\t" aesenc_xmm0_xmm1 aesenc_xmm0_xmm2 aesenc_xmm0_xmm3 aesenc_xmm0_xmm4 "movdqa 0xd0(%[key]), %%xmm0\n\t" aesenc_xmm0_xmm1 aesenc_xmm0_xmm2 aesenc_xmm0_xmm3 aesenc_xmm0_xmm4 "movdqa 0xe0(%[key]), %%xmm0\n" ".Ldeclast%=:\n\t" aesenclast_xmm0_xmm1 aesenclast_xmm0_xmm2 aesenclast_xmm0_xmm3 aesenclast_xmm0_xmm4 : /* no output */ : [key] "r" (ctx->keyschenc), [rounds] "r" (ctx->rounds) : "cc", "memory"); #undef aesenc_xmm0_xmm1 #undef aesenc_xmm0_xmm2 #undef aesenc_xmm0_xmm3 #undef aesenc_xmm0_xmm4 #undef aesenclast_xmm0_xmm1 #undef aesenclast_xmm0_xmm2 #undef aesenclast_xmm0_xmm3 #undef aesenclast_xmm0_xmm4 } /* Decrypt four blocks using the Intel AES-NI instructions. Blocks are input * and output through SSE registers xmm1 to xmm4. */ static ALWAYS_INLINE void do_aesni_dec_vec4 (const RIJNDAEL_context *ctx) { #define aesdec_xmm0_xmm1 ".byte 0x66, 0x0f, 0x38, 0xde, 0xc8\n\t" #define aesdec_xmm0_xmm2 ".byte 0x66, 0x0f, 0x38, 0xde, 0xd0\n\t" #define aesdec_xmm0_xmm3 ".byte 0x66, 0x0f, 0x38, 0xde, 0xd8\n\t" #define aesdec_xmm0_xmm4 ".byte 0x66, 0x0f, 0x38, 0xde, 0xe0\n\t" #define aesdeclast_xmm0_xmm1 ".byte 0x66, 0x0f, 0x38, 0xdf, 0xc8\n\t" #define aesdeclast_xmm0_xmm2 ".byte 0x66, 0x0f, 0x38, 0xdf, 0xd0\n\t" #define aesdeclast_xmm0_xmm3 ".byte 0x66, 0x0f, 0x38, 0xdf, 0xd8\n\t" #define aesdeclast_xmm0_xmm4 ".byte 0x66, 0x0f, 0x38, 0xdf, 0xe0\n\t" asm volatile ("movdqa (%[key]), %%xmm0\n\t" "pxor %%xmm0, %%xmm1\n\t" /* xmm1 ^= key[0] */ "pxor %%xmm0, %%xmm2\n\t" /* xmm2 ^= key[0] */ "pxor %%xmm0, %%xmm3\n\t" /* xmm3 ^= key[0] */ "pxor %%xmm0, %%xmm4\n\t" /* xmm4 ^= key[0] */ "movdqa 0x10(%[key]), %%xmm0\n\t" aesdec_xmm0_xmm1 aesdec_xmm0_xmm2 aesdec_xmm0_xmm3 aesdec_xmm0_xmm4 "movdqa 0x20(%[key]), %%xmm0\n\t" aesdec_xmm0_xmm1 aesdec_xmm0_xmm2 aesdec_xmm0_xmm3 aesdec_xmm0_xmm4 "movdqa 0x30(%[key]), %%xmm0\n\t" aesdec_xmm0_xmm1 aesdec_xmm0_xmm2 aesdec_xmm0_xmm3 aesdec_xmm0_xmm4 "movdqa 0x40(%[key]), %%xmm0\n\t" aesdec_xmm0_xmm1 aesdec_xmm0_xmm2 aesdec_xmm0_xmm3 aesdec_xmm0_xmm4 "movdqa 0x50(%[key]), %%xmm0\n\t" aesdec_xmm0_xmm1 aesdec_xmm0_xmm2 aesdec_xmm0_xmm3 aesdec_xmm0_xmm4 "movdqa 0x60(%[key]), %%xmm0\n\t" aesdec_xmm0_xmm1 aesdec_xmm0_xmm2 aesdec_xmm0_xmm3 aesdec_xmm0_xmm4 "movdqa 0x70(%[key]), %%xmm0\n\t" aesdec_xmm0_xmm1 aesdec_xmm0_xmm2 aesdec_xmm0_xmm3 aesdec_xmm0_xmm4 "movdqa 0x80(%[key]), %%xmm0\n\t" aesdec_xmm0_xmm1 aesdec_xmm0_xmm2 aesdec_xmm0_xmm3 aesdec_xmm0_xmm4 "movdqa 0x90(%[key]), %%xmm0\n\t" aesdec_xmm0_xmm1 aesdec_xmm0_xmm2 aesdec_xmm0_xmm3 aesdec_xmm0_xmm4 "movdqa 0xa0(%[key]), %%xmm0\n\t" "cmpl $10, %[rounds]\n\t" "jz .Ldeclast%=\n\t" aesdec_xmm0_xmm1 aesdec_xmm0_xmm2 aesdec_xmm0_xmm3 aesdec_xmm0_xmm4 "movdqa 0xb0(%[key]), %%xmm0\n\t" aesdec_xmm0_xmm1 aesdec_xmm0_xmm2 aesdec_xmm0_xmm3 aesdec_xmm0_xmm4 "movdqa 0xc0(%[key]), %%xmm0\n\t" "cmpl $12, %[rounds]\n\t" "jz .Ldeclast%=\n\t" aesdec_xmm0_xmm1 aesdec_xmm0_xmm2 aesdec_xmm0_xmm3 aesdec_xmm0_xmm4 "movdqa 0xd0(%[key]), %%xmm0\n\t" aesdec_xmm0_xmm1 aesdec_xmm0_xmm2 aesdec_xmm0_xmm3 aesdec_xmm0_xmm4 "movdqa 0xe0(%[key]), %%xmm0\n" ".Ldeclast%=:\n\t" aesdeclast_xmm0_xmm1 aesdeclast_xmm0_xmm2 aesdeclast_xmm0_xmm3 aesdeclast_xmm0_xmm4 : /* no output */ : [key] "r" (ctx->keyschdec), [rounds] "r" (ctx->rounds) : "cc", "memory"); #undef aesdec_xmm0_xmm1 #undef aesdec_xmm0_xmm2 #undef aesdec_xmm0_xmm3 #undef aesdec_xmm0_xmm4 #undef aesdeclast_xmm0_xmm1 #undef aesdeclast_xmm0_xmm2 #undef aesdeclast_xmm0_xmm3 #undef aesdeclast_xmm0_xmm4 } #ifdef __x86_64__ /* Encrypt eight blocks using the Intel AES-NI instructions. Blocks are input * and output through SSE registers xmm1 to xmm4 and xmm8 to xmm11. */ static ALWAYS_INLINE void do_aesni_enc_vec8 (const RIJNDAEL_context *ctx) { asm volatile ("movdqa (%[key]), %%xmm0\n\t" "pxor %%xmm0, %%xmm1\n\t" /* xmm1 ^= key[0] */ "pxor %%xmm0, %%xmm2\n\t" /* xmm2 ^= key[0] */ "pxor %%xmm0, %%xmm3\n\t" /* xmm3 ^= key[0] */ "pxor %%xmm0, %%xmm4\n\t" /* xmm4 ^= key[0] */ "pxor %%xmm0, %%xmm8\n\t" /* xmm8 ^= key[0] */ "pxor %%xmm0, %%xmm9\n\t" /* xmm9 ^= key[0] */ "pxor %%xmm0, %%xmm10\n\t" /* xmm10 ^= key[0] */ "pxor %%xmm0, %%xmm11\n\t" /* xmm11 ^= key[0] */ "movdqa 0x10(%[key]), %%xmm0\n\t" "cmpl $12, %[rounds]\n\t" "aesenc %%xmm0, %%xmm1\n\t" "aesenc %%xmm0, %%xmm2\n\t" "aesenc %%xmm0, %%xmm3\n\t" "aesenc %%xmm0, %%xmm4\n\t" "aesenc %%xmm0, %%xmm8\n\t" "aesenc %%xmm0, %%xmm9\n\t" "aesenc %%xmm0, %%xmm10\n\t" "aesenc %%xmm0, %%xmm11\n\t" "movdqa 0x20(%[key]), %%xmm0\n\t" "aesenc %%xmm0, %%xmm1\n\t" "aesenc %%xmm0, %%xmm2\n\t" "aesenc %%xmm0, %%xmm3\n\t" "aesenc %%xmm0, %%xmm4\n\t" "aesenc %%xmm0, %%xmm8\n\t" "aesenc %%xmm0, %%xmm9\n\t" "aesenc %%xmm0, %%xmm10\n\t" "aesenc %%xmm0, %%xmm11\n\t" "movdqa 0x30(%[key]), %%xmm0\n\t" "aesenc %%xmm0, %%xmm1\n\t" "aesenc %%xmm0, %%xmm2\n\t" "aesenc %%xmm0, %%xmm3\n\t" "aesenc %%xmm0, %%xmm4\n\t" "aesenc %%xmm0, %%xmm8\n\t" "aesenc %%xmm0, %%xmm9\n\t" "aesenc %%xmm0, %%xmm10\n\t" "aesenc %%xmm0, %%xmm11\n\t" "movdqa 0x40(%[key]), %%xmm0\n\t" "aesenc %%xmm0, %%xmm1\n\t" "aesenc %%xmm0, %%xmm2\n\t" "aesenc %%xmm0, %%xmm3\n\t" "aesenc %%xmm0, %%xmm4\n\t" "aesenc %%xmm0, %%xmm8\n\t" "aesenc %%xmm0, %%xmm9\n\t" "aesenc %%xmm0, %%xmm10\n\t" "aesenc %%xmm0, %%xmm11\n\t" "movdqa 0x50(%[key]), %%xmm0\n\t" "aesenc %%xmm0, %%xmm1\n\t" "aesenc %%xmm0, %%xmm2\n\t" "aesenc %%xmm0, %%xmm3\n\t" "aesenc %%xmm0, %%xmm4\n\t" "aesenc %%xmm0, %%xmm8\n\t" "aesenc %%xmm0, %%xmm9\n\t" "aesenc %%xmm0, %%xmm10\n\t" "aesenc %%xmm0, %%xmm11\n\t" "movdqa 0x60(%[key]), %%xmm0\n\t" "aesenc %%xmm0, %%xmm1\n\t" "aesenc %%xmm0, %%xmm2\n\t" "aesenc %%xmm0, %%xmm3\n\t" "aesenc %%xmm0, %%xmm4\n\t" "aesenc %%xmm0, %%xmm8\n\t" "aesenc %%xmm0, %%xmm9\n\t" "aesenc %%xmm0, %%xmm10\n\t" "aesenc %%xmm0, %%xmm11\n\t" "movdqa 0x70(%[key]), %%xmm0\n\t" "aesenc %%xmm0, %%xmm1\n\t" "aesenc %%xmm0, %%xmm2\n\t" "aesenc %%xmm0, %%xmm3\n\t" "aesenc %%xmm0, %%xmm4\n\t" "aesenc %%xmm0, %%xmm8\n\t" "aesenc %%xmm0, %%xmm9\n\t" "aesenc %%xmm0, %%xmm10\n\t" "aesenc %%xmm0, %%xmm11\n\t" "movdqa 0x80(%[key]), %%xmm0\n\t" "aesenc %%xmm0, %%xmm1\n\t" "aesenc %%xmm0, %%xmm2\n\t" "aesenc %%xmm0, %%xmm3\n\t" "aesenc %%xmm0, %%xmm4\n\t" "aesenc %%xmm0, %%xmm8\n\t" "aesenc %%xmm0, %%xmm9\n\t" "aesenc %%xmm0, %%xmm10\n\t" "aesenc %%xmm0, %%xmm11\n\t" "movdqa 0x90(%[key]), %%xmm0\n\t" "aesenc %%xmm0, %%xmm1\n\t" "aesenc %%xmm0, %%xmm2\n\t" "aesenc %%xmm0, %%xmm3\n\t" "aesenc %%xmm0, %%xmm4\n\t" "aesenc %%xmm0, %%xmm8\n\t" "aesenc %%xmm0, %%xmm9\n\t" "aesenc %%xmm0, %%xmm10\n\t" "aesenc %%xmm0, %%xmm11\n\t" "movdqa 0xa0(%[key]), %%xmm0\n\t" "jb .Ldeclast%=\n\t" "aesenc %%xmm0, %%xmm1\n\t" "aesenc %%xmm0, %%xmm2\n\t" "aesenc %%xmm0, %%xmm3\n\t" "aesenc %%xmm0, %%xmm4\n\t" "aesenc %%xmm0, %%xmm8\n\t" "aesenc %%xmm0, %%xmm9\n\t" "aesenc %%xmm0, %%xmm10\n\t" "aesenc %%xmm0, %%xmm11\n\t" "movdqa 0xb0(%[key]), %%xmm0\n\t" "aesenc %%xmm0, %%xmm1\n\t" "aesenc %%xmm0, %%xmm2\n\t" "aesenc %%xmm0, %%xmm3\n\t" "aesenc %%xmm0, %%xmm4\n\t" "aesenc %%xmm0, %%xmm8\n\t" "aesenc %%xmm0, %%xmm9\n\t" "aesenc %%xmm0, %%xmm10\n\t" "aesenc %%xmm0, %%xmm11\n\t" "movdqa 0xc0(%[key]), %%xmm0\n\t" "je .Ldeclast%=\n\t" "aesenc %%xmm0, %%xmm1\n\t" "aesenc %%xmm0, %%xmm2\n\t" "aesenc %%xmm0, %%xmm3\n\t" "aesenc %%xmm0, %%xmm4\n\t" "aesenc %%xmm0, %%xmm8\n\t" "aesenc %%xmm0, %%xmm9\n\t" "aesenc %%xmm0, %%xmm10\n\t" "aesenc %%xmm0, %%xmm11\n\t" "movdqa 0xd0(%[key]), %%xmm0\n\t" "aesenc %%xmm0, %%xmm1\n\t" "aesenc %%xmm0, %%xmm2\n\t" "aesenc %%xmm0, %%xmm3\n\t" "aesenc %%xmm0, %%xmm4\n\t" "aesenc %%xmm0, %%xmm8\n\t" "aesenc %%xmm0, %%xmm9\n\t" "aesenc %%xmm0, %%xmm10\n\t" "aesenc %%xmm0, %%xmm11\n\t" "movdqa 0xe0(%[key]), %%xmm0\n" ".Ldeclast%=:\n\t" "aesenclast %%xmm0, %%xmm1\n\t" "aesenclast %%xmm0, %%xmm2\n\t" "aesenclast %%xmm0, %%xmm3\n\t" "aesenclast %%xmm0, %%xmm4\n\t" "aesenclast %%xmm0, %%xmm8\n\t" "aesenclast %%xmm0, %%xmm9\n\t" "aesenclast %%xmm0, %%xmm10\n\t" "aesenclast %%xmm0, %%xmm11\n\t" : /* no output */ : [key] "r" (ctx->keyschenc), [rounds] "r" (ctx->rounds) : "cc", "memory"); } /* Decrypt eight blocks using the Intel AES-NI instructions. Blocks are input * and output through SSE registers xmm1 to xmm4 and xmm8 to xmm11. */ static ALWAYS_INLINE void do_aesni_dec_vec8 (const RIJNDAEL_context *ctx) { asm volatile ("movdqa (%[key]), %%xmm0\n\t" "pxor %%xmm0, %%xmm1\n\t" /* xmm1 ^= key[0] */ "pxor %%xmm0, %%xmm2\n\t" /* xmm2 ^= key[0] */ "pxor %%xmm0, %%xmm3\n\t" /* xmm3 ^= key[0] */ "pxor %%xmm0, %%xmm4\n\t" /* xmm4 ^= key[0] */ "pxor %%xmm0, %%xmm8\n\t" /* xmm8 ^= key[0] */ "pxor %%xmm0, %%xmm9\n\t" /* xmm9 ^= key[0] */ "pxor %%xmm0, %%xmm10\n\t" /* xmm10 ^= key[0] */ "pxor %%xmm0, %%xmm11\n\t" /* xmm11 ^= key[0] */ "movdqa 0x10(%[key]), %%xmm0\n\t" "cmpl $12, %[rounds]\n\t" "aesdec %%xmm0, %%xmm1\n\t" "aesdec %%xmm0, %%xmm2\n\t" "aesdec %%xmm0, %%xmm3\n\t" "aesdec %%xmm0, %%xmm4\n\t" "aesdec %%xmm0, %%xmm8\n\t" "aesdec %%xmm0, %%xmm9\n\t" "aesdec %%xmm0, %%xmm10\n\t" "aesdec %%xmm0, %%xmm11\n\t" "movdqa 0x20(%[key]), %%xmm0\n\t" "aesdec %%xmm0, %%xmm1\n\t" "aesdec %%xmm0, %%xmm2\n\t" "aesdec %%xmm0, %%xmm3\n\t" "aesdec %%xmm0, %%xmm4\n\t" "aesdec %%xmm0, %%xmm8\n\t" "aesdec %%xmm0, %%xmm9\n\t" "aesdec %%xmm0, %%xmm10\n\t" "aesdec %%xmm0, %%xmm11\n\t" "movdqa 0x30(%[key]), %%xmm0\n\t" "aesdec %%xmm0, %%xmm1\n\t" "aesdec %%xmm0, %%xmm2\n\t" "aesdec %%xmm0, %%xmm3\n\t" "aesdec %%xmm0, %%xmm4\n\t" "aesdec %%xmm0, %%xmm8\n\t" "aesdec %%xmm0, %%xmm9\n\t" "aesdec %%xmm0, %%xmm10\n\t" "aesdec %%xmm0, %%xmm11\n\t" "movdqa 0x40(%[key]), %%xmm0\n\t" "aesdec %%xmm0, %%xmm1\n\t" "aesdec %%xmm0, %%xmm2\n\t" "aesdec %%xmm0, %%xmm3\n\t" "aesdec %%xmm0, %%xmm4\n\t" "aesdec %%xmm0, %%xmm8\n\t" "aesdec %%xmm0, %%xmm9\n\t" "aesdec %%xmm0, %%xmm10\n\t" "aesdec %%xmm0, %%xmm11\n\t" "movdqa 0x50(%[key]), %%xmm0\n\t" "aesdec %%xmm0, %%xmm1\n\t" "aesdec %%xmm0, %%xmm2\n\t" "aesdec %%xmm0, %%xmm3\n\t" "aesdec %%xmm0, %%xmm4\n\t" "aesdec %%xmm0, %%xmm8\n\t" "aesdec %%xmm0, %%xmm9\n\t" "aesdec %%xmm0, %%xmm10\n\t" "aesdec %%xmm0, %%xmm11\n\t" "movdqa 0x60(%[key]), %%xmm0\n\t" "aesdec %%xmm0, %%xmm1\n\t" "aesdec %%xmm0, %%xmm2\n\t" "aesdec %%xmm0, %%xmm3\n\t" "aesdec %%xmm0, %%xmm4\n\t" "aesdec %%xmm0, %%xmm8\n\t" "aesdec %%xmm0, %%xmm9\n\t" "aesdec %%xmm0, %%xmm10\n\t" "aesdec %%xmm0, %%xmm11\n\t" "movdqa 0x70(%[key]), %%xmm0\n\t" "aesdec %%xmm0, %%xmm1\n\t" "aesdec %%xmm0, %%xmm2\n\t" "aesdec %%xmm0, %%xmm3\n\t" "aesdec %%xmm0, %%xmm4\n\t" "aesdec %%xmm0, %%xmm8\n\t" "aesdec %%xmm0, %%xmm9\n\t" "aesdec %%xmm0, %%xmm10\n\t" "aesdec %%xmm0, %%xmm11\n\t" "movdqa 0x80(%[key]), %%xmm0\n\t" "aesdec %%xmm0, %%xmm1\n\t" "aesdec %%xmm0, %%xmm2\n\t" "aesdec %%xmm0, %%xmm3\n\t" "aesdec %%xmm0, %%xmm4\n\t" "aesdec %%xmm0, %%xmm8\n\t" "aesdec %%xmm0, %%xmm9\n\t" "aesdec %%xmm0, %%xmm10\n\t" "aesdec %%xmm0, %%xmm11\n\t" "movdqa 0x90(%[key]), %%xmm0\n\t" "aesdec %%xmm0, %%xmm1\n\t" "aesdec %%xmm0, %%xmm2\n\t" "aesdec %%xmm0, %%xmm3\n\t" "aesdec %%xmm0, %%xmm4\n\t" "aesdec %%xmm0, %%xmm8\n\t" "aesdec %%xmm0, %%xmm9\n\t" "aesdec %%xmm0, %%xmm10\n\t" "aesdec %%xmm0, %%xmm11\n\t" "movdqa 0xa0(%[key]), %%xmm0\n\t" "jb .Ldeclast%=\n\t" "aesdec %%xmm0, %%xmm1\n\t" "aesdec %%xmm0, %%xmm2\n\t" "aesdec %%xmm0, %%xmm3\n\t" "aesdec %%xmm0, %%xmm4\n\t" "aesdec %%xmm0, %%xmm8\n\t" "aesdec %%xmm0, %%xmm9\n\t" "aesdec %%xmm0, %%xmm10\n\t" "aesdec %%xmm0, %%xmm11\n\t" "movdqa 0xb0(%[key]), %%xmm0\n\t" "aesdec %%xmm0, %%xmm1\n\t" "aesdec %%xmm0, %%xmm2\n\t" "aesdec %%xmm0, %%xmm3\n\t" "aesdec %%xmm0, %%xmm4\n\t" "aesdec %%xmm0, %%xmm8\n\t" "aesdec %%xmm0, %%xmm9\n\t" "aesdec %%xmm0, %%xmm10\n\t" "aesdec %%xmm0, %%xmm11\n\t" "movdqa 0xc0(%[key]), %%xmm0\n\t" "je .Ldeclast%=\n\t" "aesdec %%xmm0, %%xmm1\n\t" "aesdec %%xmm0, %%xmm2\n\t" "aesdec %%xmm0, %%xmm3\n\t" "aesdec %%xmm0, %%xmm4\n\t" "aesdec %%xmm0, %%xmm8\n\t" "aesdec %%xmm0, %%xmm9\n\t" "aesdec %%xmm0, %%xmm10\n\t" "aesdec %%xmm0, %%xmm11\n\t" "movdqa 0xd0(%[key]), %%xmm0\n\t" "aesdec %%xmm0, %%xmm1\n\t" "aesdec %%xmm0, %%xmm2\n\t" "aesdec %%xmm0, %%xmm3\n\t" "aesdec %%xmm0, %%xmm4\n\t" "aesdec %%xmm0, %%xmm8\n\t" "aesdec %%xmm0, %%xmm9\n\t" "aesdec %%xmm0, %%xmm10\n\t" "aesdec %%xmm0, %%xmm11\n\t" "movdqa 0xe0(%[key]), %%xmm0\n" ".Ldeclast%=:\n\t" "aesdeclast %%xmm0, %%xmm1\n\t" "aesdeclast %%xmm0, %%xmm2\n\t" "aesdeclast %%xmm0, %%xmm3\n\t" "aesdeclast %%xmm0, %%xmm4\n\t" "aesdeclast %%xmm0, %%xmm8\n\t" "aesdeclast %%xmm0, %%xmm9\n\t" "aesdeclast %%xmm0, %%xmm10\n\t" "aesdeclast %%xmm0, %%xmm11\n\t" : /* no output */ : [key] "r" (ctx->keyschdec), [rounds] "r" (ctx->rounds) : "cc", "memory"); } #endif /* __x86_64__ */ /* Perform a CTR encryption round using the counter CTR and the input block A. Write the result to the output block B and update CTR. CTR needs to be a 16 byte aligned little-endian value. */ static void do_aesni_ctr (const RIJNDAEL_context *ctx, unsigned char *ctr, unsigned char *b, const unsigned char *a) { #define aesenc_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xc1\n\t" #define aesenclast_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xc1\n\t" asm volatile ("movdqa %%xmm5, %%xmm0\n\t" /* xmm0 := CTR (xmm5) */ "pcmpeqd %%xmm1, %%xmm1\n\t" "psrldq $8, %%xmm1\n\t" /* xmm1 = -1 */ "pshufb %%xmm6, %%xmm5\n\t" "psubq %%xmm1, %%xmm5\n\t" /* xmm5++ (big endian) */ /* detect if 64-bit carry handling is needed */ "cmpl $0xffffffff, 8(%[ctr])\n\t" "jne .Lno_carry%=\n\t" "cmpl $0xffffffff, 12(%[ctr])\n\t" "jne .Lno_carry%=\n\t" "pslldq $8, %%xmm1\n\t" /* move lower 64-bit to high */ "psubq %%xmm1, %%xmm5\n\t" /* add carry to upper 64bits */ ".Lno_carry%=:\n\t" "pshufb %%xmm6, %%xmm5\n\t" "movdqa %%xmm5, (%[ctr])\n\t" /* Update CTR (mem). */ "pxor (%[key]), %%xmm0\n\t" /* xmm1 ^= key[0] */ "movdqa 0x10(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 "movdqa 0x20(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 "movdqa 0x30(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 "movdqa 0x40(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 "movdqa 0x50(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 "movdqa 0x60(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 "movdqa 0x70(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 "movdqa 0x80(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 "movdqa 0x90(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 "movdqa 0xa0(%[key]), %%xmm1\n\t" "cmpl $10, %[rounds]\n\t" "jz .Lenclast%=\n\t" aesenc_xmm1_xmm0 "movdqa 0xb0(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 "movdqa 0xc0(%[key]), %%xmm1\n\t" "cmpl $12, %[rounds]\n\t" "jz .Lenclast%=\n\t" aesenc_xmm1_xmm0 "movdqa 0xd0(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 "movdqa 0xe0(%[key]), %%xmm1\n" ".Lenclast%=:\n\t" aesenclast_xmm1_xmm0 "movdqu %[src], %%xmm1\n\t" /* xmm1 := input */ "pxor %%xmm1, %%xmm0\n\t" /* EncCTR ^= input */ "movdqu %%xmm0, %[dst]" /* Store EncCTR. */ : [dst] "=m" (*b) : [src] "m" (*a), [ctr] "r" (ctr), [key] "r" (ctx->keyschenc), [rounds] "g" (ctx->rounds) : "cc", "memory"); #undef aesenc_xmm1_xmm0 #undef aesenclast_xmm1_xmm0 } /* Four blocks at a time variant of do_aesni_ctr. */ static void do_aesni_ctr_4 (const RIJNDAEL_context *ctx, unsigned char *ctr, unsigned char *b, const unsigned char *a) { static const byte bige_addb_const[4][16] __attribute__ ((aligned (16))) = { { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4 } }; const void *bige_addb = bige_addb_const; #define aesenc_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xc1\n\t" #define aesenc_xmm1_xmm2 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xd1\n\t" #define aesenc_xmm1_xmm3 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xd9\n\t" #define aesenc_xmm1_xmm4 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xe1\n\t" #define aesenclast_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xc1\n\t" #define aesenclast_xmm1_xmm2 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xd1\n\t" #define aesenclast_xmm1_xmm3 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xd9\n\t" #define aesenclast_xmm1_xmm4 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xe1\n\t" /* Register usage: [key] keyschedule xmm0 CTR-0 xmm1 temp / round key xmm2 CTR-1 xmm3 CTR-2 xmm4 CTR-3 xmm5 copy of *ctr xmm6 endian swapping mask */ asm volatile (/* detect if 8-bit carry handling is needed */ "cmpb $0xfb, 15(%[ctr])\n\t" "ja .Ladd32bit%=\n\t" "movdqa %%xmm5, %%xmm0\n\t" /* xmm0 := CTR (xmm5) */ "movdqa 0*16(%[addb]), %%xmm2\n\t" /* xmm2 := be(1) */ "movdqa 1*16(%[addb]), %%xmm3\n\t" /* xmm3 := be(2) */ "movdqa 2*16(%[addb]), %%xmm4\n\t" /* xmm4 := be(3) */ "movdqa 3*16(%[addb]), %%xmm5\n\t" /* xmm5 := be(4) */ "paddb %%xmm0, %%xmm2\n\t" /* xmm2 := be(1) + CTR (xmm0) */ "paddb %%xmm0, %%xmm3\n\t" /* xmm3 := be(2) + CTR (xmm0) */ "paddb %%xmm0, %%xmm4\n\t" /* xmm4 := be(3) + CTR (xmm0) */ "paddb %%xmm0, %%xmm5\n\t" /* xmm5 := be(4) + CTR (xmm0) */ "movdqa (%[key]), %%xmm1\n\t" /* xmm1 := key[0] */ "jmp .Lstore_ctr%=\n\t" ".Ladd32bit%=:\n\t" "movdqa %%xmm5, %%xmm0\n\t" /* xmm0, xmm2 := CTR (xmm5) */ "movdqa %%xmm0, %%xmm2\n\t" "pcmpeqd %%xmm1, %%xmm1\n\t" "psrldq $8, %%xmm1\n\t" /* xmm1 = -1 */ "pshufb %%xmm6, %%xmm2\n\t" /* xmm2 := le(xmm2) */ "psubq %%xmm1, %%xmm2\n\t" /* xmm2++ */ "movdqa %%xmm2, %%xmm3\n\t" /* xmm3 := xmm2 */ "psubq %%xmm1, %%xmm3\n\t" /* xmm3++ */ "movdqa %%xmm3, %%xmm4\n\t" /* xmm4 := xmm3 */ "psubq %%xmm1, %%xmm4\n\t" /* xmm4++ */ "movdqa %%xmm4, %%xmm5\n\t" /* xmm5 := xmm4 */ "psubq %%xmm1, %%xmm5\n\t" /* xmm5++ */ /* detect if 64-bit carry handling is needed */ "cmpl $0xffffffff, 8(%[ctr])\n\t" "jne .Lno_carry%=\n\t" "movl 12(%[ctr]), %%esi\n\t" "bswapl %%esi\n\t" "cmpl $0xfffffffc, %%esi\n\t" "jb .Lno_carry%=\n\t" /* no carry */ "pslldq $8, %%xmm1\n\t" /* move lower 64-bit to high */ "je .Lcarry_xmm5%=\n\t" /* esi == 0xfffffffc */ "cmpl $0xfffffffe, %%esi\n\t" "jb .Lcarry_xmm4%=\n\t" /* esi == 0xfffffffd */ "je .Lcarry_xmm3%=\n\t" /* esi == 0xfffffffe */ /* esi == 0xffffffff */ "psubq %%xmm1, %%xmm2\n\t" ".Lcarry_xmm3%=:\n\t" "psubq %%xmm1, %%xmm3\n\t" ".Lcarry_xmm4%=:\n\t" "psubq %%xmm1, %%xmm4\n\t" ".Lcarry_xmm5%=:\n\t" "psubq %%xmm1, %%xmm5\n\t" ".Lno_carry%=:\n\t" "movdqa (%[key]), %%xmm1\n\t" /* xmm1 := key[0] */ "pshufb %%xmm6, %%xmm2\n\t" /* xmm2 := be(xmm2) */ "pshufb %%xmm6, %%xmm3\n\t" /* xmm3 := be(xmm3) */ "pshufb %%xmm6, %%xmm4\n\t" /* xmm4 := be(xmm4) */ "pshufb %%xmm6, %%xmm5\n\t" /* xmm5 := be(xmm5) */ ".Lstore_ctr%=:\n\t" "movdqa %%xmm5, (%[ctr])\n\t" /* Update CTR (mem). */ : : [ctr] "r" (ctr), [key] "r" (ctx->keyschenc), [addb] "r" (bige_addb) : "%esi", "cc", "memory"); asm volatile ("pxor %%xmm1, %%xmm0\n\t" /* xmm0 ^= key[0] */ "pxor %%xmm1, %%xmm2\n\t" /* xmm2 ^= key[0] */ "pxor %%xmm1, %%xmm3\n\t" /* xmm3 ^= key[0] */ "pxor %%xmm1, %%xmm4\n\t" /* xmm4 ^= key[0] */ "movdqa 0x10(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 aesenc_xmm1_xmm2 aesenc_xmm1_xmm3 aesenc_xmm1_xmm4 "movdqa 0x20(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 aesenc_xmm1_xmm2 aesenc_xmm1_xmm3 aesenc_xmm1_xmm4 "movdqa 0x30(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 aesenc_xmm1_xmm2 aesenc_xmm1_xmm3 aesenc_xmm1_xmm4 "movdqa 0x40(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 aesenc_xmm1_xmm2 aesenc_xmm1_xmm3 aesenc_xmm1_xmm4 "movdqa 0x50(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 aesenc_xmm1_xmm2 aesenc_xmm1_xmm3 aesenc_xmm1_xmm4 "movdqa 0x60(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 aesenc_xmm1_xmm2 aesenc_xmm1_xmm3 aesenc_xmm1_xmm4 "movdqa 0x70(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 aesenc_xmm1_xmm2 aesenc_xmm1_xmm3 aesenc_xmm1_xmm4 "movdqa 0x80(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 aesenc_xmm1_xmm2 aesenc_xmm1_xmm3 aesenc_xmm1_xmm4 "movdqa 0x90(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 aesenc_xmm1_xmm2 aesenc_xmm1_xmm3 aesenc_xmm1_xmm4 "movdqa 0xa0(%[key]), %%xmm1\n\t" "cmpl $10, %[rounds]\n\t" "jz .Lenclast%=\n\t" aesenc_xmm1_xmm0 aesenc_xmm1_xmm2 aesenc_xmm1_xmm3 aesenc_xmm1_xmm4 "movdqa 0xb0(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 aesenc_xmm1_xmm2 aesenc_xmm1_xmm3 aesenc_xmm1_xmm4 "movdqa 0xc0(%[key]), %%xmm1\n\t" "cmpl $12, %[rounds]\n\t" "jz .Lenclast%=\n\t" aesenc_xmm1_xmm0 aesenc_xmm1_xmm2 aesenc_xmm1_xmm3 aesenc_xmm1_xmm4 "movdqa 0xd0(%[key]), %%xmm1\n\t" aesenc_xmm1_xmm0 aesenc_xmm1_xmm2 aesenc_xmm1_xmm3 aesenc_xmm1_xmm4 "movdqa 0xe0(%[key]), %%xmm1\n" ".Lenclast%=:\n\t" aesenclast_xmm1_xmm0 aesenclast_xmm1_xmm2 aesenclast_xmm1_xmm3 aesenclast_xmm1_xmm4 : : [key] "r" (ctx->keyschenc), [rounds] "r" (ctx->rounds) : "cc", "memory"); asm volatile ("movdqu (%[src]), %%xmm1\n\t" /* Get block 1. */ "pxor %%xmm1, %%xmm0\n\t" /* EncCTR-1 ^= input */ "movdqu %%xmm0, (%[dst])\n\t" /* Store block 1 */ "movdqu 16(%[src]), %%xmm1\n\t" /* Get block 2. */ "pxor %%xmm1, %%xmm2\n\t" /* EncCTR-2 ^= input */ "movdqu %%xmm2, 16(%[dst])\n\t" /* Store block 2. */ "movdqu 32(%[src]), %%xmm1\n\t" /* Get block 3. */ "pxor %%xmm1, %%xmm3\n\t" /* EncCTR-3 ^= input */ "movdqu %%xmm3, 32(%[dst])\n\t" /* Store block 3. */ "movdqu 48(%[src]), %%xmm1\n\t" /* Get block 4. */ "pxor %%xmm1, %%xmm4\n\t" /* EncCTR-4 ^= input */ "movdqu %%xmm4, 48(%[dst])" /* Store block 4. */ : : [src] "r" (a), [dst] "r" (b) : "memory"); #undef aesenc_xmm1_xmm0 #undef aesenc_xmm1_xmm2 #undef aesenc_xmm1_xmm3 #undef aesenc_xmm1_xmm4 #undef aesenclast_xmm1_xmm0 #undef aesenclast_xmm1_xmm2 #undef aesenclast_xmm1_xmm3 #undef aesenclast_xmm1_xmm4 } #ifdef __x86_64__ /* Eight blocks at a time variant of do_aesni_ctr. */ static void do_aesni_ctr_8 (const RIJNDAEL_context *ctx, unsigned char *ctr, unsigned char *b, const unsigned char *a) { static const byte bige_addb_const[8][16] __attribute__ ((aligned (16))) = { { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8 } }; const void *bige_addb = bige_addb_const; /* Register usage: [key] keyschedule xmm0 CTR-0 xmm1 temp / round key xmm2 CTR-1 xmm3 CTR-2 xmm4 CTR-3 xmm5 copy of *ctr xmm6 endian swapping mask xmm8 CTR-4 xmm9 CTR-5 xmm10 CTR-6 xmm11 CTR-7 xmm12 temp xmm13 temp xmm14 temp xmm15 temp */ asm volatile (/* detect if 8-bit carry handling is needed */ "cmpb $0xf7, 15(%[ctr])\n\t" "ja .Ladd32bit%=\n\t" "movdqa %%xmm5, %%xmm0\n\t" /* xmm0 := CTR (xmm5) */ "movdqa 0*16(%[addb]), %%xmm2\n\t" /* xmm2 := be(1) */ "movdqa 1*16(%[addb]), %%xmm3\n\t" /* xmm3 := be(2) */ "movdqa 2*16(%[addb]), %%xmm4\n\t" /* xmm4 := be(3) */ "movdqa 3*16(%[addb]), %%xmm8\n\t" /* xmm8 := be(4) */ "movdqa 4*16(%[addb]), %%xmm9\n\t" /* xmm9 := be(5) */ "movdqa 5*16(%[addb]), %%xmm10\n\t" /* xmm10 := be(6) */ "movdqa 6*16(%[addb]), %%xmm11\n\t" /* xmm11 := be(7) */ "movdqa 7*16(%[addb]), %%xmm5\n\t" /* xmm5 := be(8) */ "movdqa (%[key]), %%xmm1\n\t" /* xmm1 := key[0] */ "paddb %%xmm0, %%xmm2\n\t" /* xmm2 := be(1) + CTR (xmm0) */ "paddb %%xmm0, %%xmm3\n\t" /* xmm3 := be(2) + CTR (xmm0) */ "paddb %%xmm0, %%xmm4\n\t" /* xmm4 := be(3) + CTR (xmm0) */ "paddb %%xmm0, %%xmm8\n\t" /* xmm8 := be(4) + CTR (xmm0) */ "paddb %%xmm0, %%xmm9\n\t" /* xmm9 := be(5) + CTR (xmm0) */ "paddb %%xmm0, %%xmm10\n\t" /* xmm10 := be(6) + CTR (xmm0) */ "paddb %%xmm0, %%xmm11\n\t" /* xmm11 := be(7) + CTR (xmm0) */ "paddb %%xmm0, %%xmm5\n\t" /* xmm5 := be(8) + CTR (xmm0) */ "jmp .Lstore_ctr%=\n\t" ".Ladd32bit%=:\n\t" "movdqa %%xmm5, %%xmm0\n\t" /* xmm0, xmm2 := CTR (xmm5) */ "movdqa %%xmm0, %%xmm2\n\t" "pcmpeqd %%xmm1, %%xmm1\n\t" "psrldq $8, %%xmm1\n\t" /* xmm1 = -1 */ "pshufb %%xmm6, %%xmm2\n\t" /* xmm2 := le(xmm2) */ "psubq %%xmm1, %%xmm2\n\t" /* xmm2++ */ "movdqa %%xmm2, %%xmm3\n\t" /* xmm3 := xmm2 */ "psubq %%xmm1, %%xmm3\n\t" /* xmm3++ */ "movdqa %%xmm3, %%xmm4\n\t" /* xmm4 := xmm3 */ "psubq %%xmm1, %%xmm4\n\t" /* xmm4++ */ "movdqa %%xmm4, %%xmm8\n\t" /* xmm8 := xmm4 */ "psubq %%xmm1, %%xmm8\n\t" /* xmm8++ */ "movdqa %%xmm8, %%xmm9\n\t" /* xmm9 := xmm8 */ "psubq %%xmm1, %%xmm9\n\t" /* xmm9++ */ "movdqa %%xmm9, %%xmm10\n\t" /* xmm10 := xmm9 */ "psubq %%xmm1, %%xmm10\n\t" /* xmm10++ */ "movdqa %%xmm10, %%xmm11\n\t" /* xmm11 := xmm10 */ "psubq %%xmm1, %%xmm11\n\t" /* xmm11++ */ "movdqa %%xmm11, %%xmm5\n\t" /* xmm5 := xmm11 */ "psubq %%xmm1, %%xmm5\n\t" /* xmm5++ */ /* detect if 64-bit carry handling is needed */ "cmpl $0xffffffff, 8(%[ctr])\n\t" "jne .Lno_carry%=\n\t" "movl 12(%[ctr]), %%esi\n\t" "bswapl %%esi\n\t" "cmpl $0xfffffff8, %%esi\n\t" "jb .Lno_carry%=\n\t" /* no carry */ "pslldq $8, %%xmm1\n\t" /* move lower 64-bit to high */ "je .Lcarry_xmm5%=\n\t" /* esi == 0xfffffff8 */ "cmpl $0xfffffffa, %%esi\n\t" "jb .Lcarry_xmm11%=\n\t" /* esi == 0xfffffff9 */ "je .Lcarry_xmm10%=\n\t" /* esi == 0xfffffffa */ "cmpl $0xfffffffc, %%esi\n\t" "jb .Lcarry_xmm9%=\n\t" /* esi == 0xfffffffb */ "je .Lcarry_xmm8%=\n\t" /* esi == 0xfffffffc */ "cmpl $0xfffffffe, %%esi\n\t" "jb .Lcarry_xmm4%=\n\t" /* esi == 0xfffffffd */ "je .Lcarry_xmm3%=\n\t" /* esi == 0xfffffffe */ /* esi == 0xffffffff */ "psubq %%xmm1, %%xmm2\n\t" ".Lcarry_xmm3%=:\n\t" "psubq %%xmm1, %%xmm3\n\t" ".Lcarry_xmm4%=:\n\t" "psubq %%xmm1, %%xmm4\n\t" ".Lcarry_xmm8%=:\n\t" "psubq %%xmm1, %%xmm8\n\t" ".Lcarry_xmm9%=:\n\t" "psubq %%xmm1, %%xmm9\n\t" ".Lcarry_xmm10%=:\n\t" "psubq %%xmm1, %%xmm10\n\t" ".Lcarry_xmm11%=:\n\t" "psubq %%xmm1, %%xmm11\n\t" ".Lcarry_xmm5%=:\n\t" "psubq %%xmm1, %%xmm5\n\t" ".Lno_carry%=:\n\t" "movdqa (%[key]), %%xmm1\n\t" /* xmm1 := key[0] */ "pshufb %%xmm6, %%xmm2\n\t" /* xmm2 := be(xmm2) */ "pshufb %%xmm6, %%xmm3\n\t" /* xmm3 := be(xmm3) */ "pshufb %%xmm6, %%xmm4\n\t" /* xmm4 := be(xmm4) */ "pshufb %%xmm6, %%xmm5\n\t" /* xmm5 := be(xmm5) */ "pshufb %%xmm6, %%xmm8\n\t" /* xmm8 := be(xmm8) */ "pshufb %%xmm6, %%xmm9\n\t" /* xmm9 := be(xmm9) */ "pshufb %%xmm6, %%xmm10\n\t" /* xmm10 := be(xmm10) */ "pshufb %%xmm6, %%xmm11\n\t" /* xmm11 := be(xmm11) */ ".Lstore_ctr%=:\n\t" "movdqa %%xmm5, (%[ctr])\n\t" /* Update CTR (mem). */ : : [ctr] "r" (ctr), [key] "r" (ctx->keyschenc), [addb] "r" (bige_addb) : "%esi", "cc", "memory"); asm volatile ("pxor %%xmm1, %%xmm0\n\t" /* xmm0 ^= key[0] */ "pxor %%xmm1, %%xmm2\n\t" /* xmm2 ^= key[0] */ "pxor %%xmm1, %%xmm3\n\t" /* xmm3 ^= key[0] */ "pxor %%xmm1, %%xmm4\n\t" /* xmm4 ^= key[0] */ "pxor %%xmm1, %%xmm8\n\t" /* xmm8 ^= key[0] */ "pxor %%xmm1, %%xmm9\n\t" /* xmm9 ^= key[0] */ "pxor %%xmm1, %%xmm10\n\t" /* xmm10 ^= key[0] */ "pxor %%xmm1, %%xmm11\n\t" /* xmm11 ^= key[0] */ "movdqa 0x10(%[key]), %%xmm1\n\t" "cmpl $12, %[rounds]\n\t" "aesenc %%xmm1, %%xmm0\n\t" "aesenc %%xmm1, %%xmm2\n\t" "aesenc %%xmm1, %%xmm3\n\t" "aesenc %%xmm1, %%xmm4\n\t" "aesenc %%xmm1, %%xmm8\n\t" "aesenc %%xmm1, %%xmm9\n\t" "aesenc %%xmm1, %%xmm10\n\t" "aesenc %%xmm1, %%xmm11\n\t" "movdqa 0x20(%[key]), %%xmm1\n\t" "aesenc %%xmm1, %%xmm0\n\t" "aesenc %%xmm1, %%xmm2\n\t" "aesenc %%xmm1, %%xmm3\n\t" "aesenc %%xmm1, %%xmm4\n\t" "aesenc %%xmm1, %%xmm8\n\t" "aesenc %%xmm1, %%xmm9\n\t" "aesenc %%xmm1, %%xmm10\n\t" "aesenc %%xmm1, %%xmm11\n\t" "movdqa 0x30(%[key]), %%xmm1\n\t" "aesenc %%xmm1, %%xmm0\n\t" "aesenc %%xmm1, %%xmm2\n\t" "aesenc %%xmm1, %%xmm3\n\t" "aesenc %%xmm1, %%xmm4\n\t" "aesenc %%xmm1, %%xmm8\n\t" "aesenc %%xmm1, %%xmm9\n\t" "aesenc %%xmm1, %%xmm10\n\t" "aesenc %%xmm1, %%xmm11\n\t" "movdqa 0x40(%[key]), %%xmm1\n\t" "aesenc %%xmm1, %%xmm0\n\t" "aesenc %%xmm1, %%xmm2\n\t" "aesenc %%xmm1, %%xmm3\n\t" "aesenc %%xmm1, %%xmm4\n\t" "aesenc %%xmm1, %%xmm8\n\t" "aesenc %%xmm1, %%xmm9\n\t" "aesenc %%xmm1, %%xmm10\n\t" "aesenc %%xmm1, %%xmm11\n\t" "movdqa 0x50(%[key]), %%xmm1\n\t" "aesenc %%xmm1, %%xmm0\n\t" "aesenc %%xmm1, %%xmm2\n\t" "aesenc %%xmm1, %%xmm3\n\t" "aesenc %%xmm1, %%xmm4\n\t" "aesenc %%xmm1, %%xmm8\n\t" "aesenc %%xmm1, %%xmm9\n\t" "aesenc %%xmm1, %%xmm10\n\t" "aesenc %%xmm1, %%xmm11\n\t" "movdqa 0x60(%[key]), %%xmm1\n\t" "aesenc %%xmm1, %%xmm0\n\t" "aesenc %%xmm1, %%xmm2\n\t" "aesenc %%xmm1, %%xmm3\n\t" "aesenc %%xmm1, %%xmm4\n\t" "aesenc %%xmm1, %%xmm8\n\t" "aesenc %%xmm1, %%xmm9\n\t" "aesenc %%xmm1, %%xmm10\n\t" "aesenc %%xmm1, %%xmm11\n\t" "movdqa 0x70(%[key]), %%xmm1\n\t" "aesenc %%xmm1, %%xmm0\n\t" "aesenc %%xmm1, %%xmm2\n\t" "aesenc %%xmm1, %%xmm3\n\t" "aesenc %%xmm1, %%xmm4\n\t" "aesenc %%xmm1, %%xmm8\n\t" "aesenc %%xmm1, %%xmm9\n\t" "aesenc %%xmm1, %%xmm10\n\t" "aesenc %%xmm1, %%xmm11\n\t" "movdqa 0x80(%[key]), %%xmm1\n\t" "aesenc %%xmm1, %%xmm0\n\t" "aesenc %%xmm1, %%xmm2\n\t" "aesenc %%xmm1, %%xmm3\n\t" "aesenc %%xmm1, %%xmm4\n\t" "aesenc %%xmm1, %%xmm8\n\t" "aesenc %%xmm1, %%xmm9\n\t" "aesenc %%xmm1, %%xmm10\n\t" "aesenc %%xmm1, %%xmm11\n\t" "movdqa 0x90(%[key]), %%xmm1\n\t" "aesenc %%xmm1, %%xmm0\n\t" "aesenc %%xmm1, %%xmm2\n\t" "aesenc %%xmm1, %%xmm3\n\t" "aesenc %%xmm1, %%xmm4\n\t" "aesenc %%xmm1, %%xmm8\n\t" "aesenc %%xmm1, %%xmm9\n\t" "aesenc %%xmm1, %%xmm10\n\t" "aesenc %%xmm1, %%xmm11\n\t" "movdqa 0xa0(%[key]), %%xmm1\n\t" "jb .Lenclast%=\n\t" "aesenc %%xmm1, %%xmm0\n\t" "aesenc %%xmm1, %%xmm2\n\t" "aesenc %%xmm1, %%xmm3\n\t" "aesenc %%xmm1, %%xmm4\n\t" "aesenc %%xmm1, %%xmm8\n\t" "aesenc %%xmm1, %%xmm9\n\t" "aesenc %%xmm1, %%xmm10\n\t" "aesenc %%xmm1, %%xmm11\n\t" "movdqa 0xb0(%[key]), %%xmm1\n\t" "aesenc %%xmm1, %%xmm0\n\t" "aesenc %%xmm1, %%xmm2\n\t" "aesenc %%xmm1, %%xmm3\n\t" "aesenc %%xmm1, %%xmm4\n\t" "aesenc %%xmm1, %%xmm8\n\t" "aesenc %%xmm1, %%xmm9\n\t" "aesenc %%xmm1, %%xmm10\n\t" "aesenc %%xmm1, %%xmm11\n\t" "movdqa 0xc0(%[key]), %%xmm1\n\t" "je .Lenclast%=\n\t" "aesenc %%xmm1, %%xmm0\n\t" "aesenc %%xmm1, %%xmm2\n\t" "aesenc %%xmm1, %%xmm3\n\t" "aesenc %%xmm1, %%xmm4\n\t" "aesenc %%xmm1, %%xmm8\n\t" "aesenc %%xmm1, %%xmm9\n\t" "aesenc %%xmm1, %%xmm10\n\t" "aesenc %%xmm1, %%xmm11\n\t" "movdqa 0xd0(%[key]), %%xmm1\n\t" "aesenc %%xmm1, %%xmm0\n\t" "aesenc %%xmm1, %%xmm2\n\t" "aesenc %%xmm1, %%xmm3\n\t" "aesenc %%xmm1, %%xmm4\n\t" "aesenc %%xmm1, %%xmm8\n\t" "aesenc %%xmm1, %%xmm9\n\t" "aesenc %%xmm1, %%xmm10\n\t" "aesenc %%xmm1, %%xmm11\n\t" "movdqa 0xe0(%[key]), %%xmm1\n" ".Lenclast%=:\n\t" : : [key] "r" (ctx->keyschenc), [rounds] "r" (ctx->rounds) : "cc", "memory"); asm volatile ("movdqu 0*16(%[src]), %%xmm12\n\t" /* Get block 1. */ "movdqu 1*16(%[src]), %%xmm13\n\t" /* Get block 2. */ "movdqu 2*16(%[src]), %%xmm14\n\t" /* Get block 3. */ "movdqu 3*16(%[src]), %%xmm15\n\t" /* Get block 4. */ "movdqu 4*16(%[src]), %%xmm7\n\t" /* Get block 5. */ "pxor %%xmm1, %%xmm12\n\t" /* block1 ^= lastkey */ "aesenclast %%xmm12, %%xmm0\n\t" "movdqu 5*16(%[src]), %%xmm12\n\t" /* Get block 6. */ "pxor %%xmm1, %%xmm13\n\t" /* block2 ^= lastkey */ "aesenclast %%xmm13, %%xmm2\n\t" "movdqu 6*16(%[src]), %%xmm13\n\t" /* Get block 7. */ "pxor %%xmm1, %%xmm14\n\t" /* block3 ^= lastkey */ "aesenclast %%xmm14, %%xmm3\n\t" "movdqu 7*16(%[src]), %%xmm14\n\t" /* Get block 8. */ "pxor %%xmm1, %%xmm15\n\t" /* block4 ^= lastkey */ "aesenclast %%xmm15, %%xmm4\n\t" "movdqu %%xmm0, 0*16(%[dst])\n\t" /* Store block 1 */ "pxor %%xmm1, %%xmm7\n\t" /* block5 ^= lastkey */ "aesenclast %%xmm7, %%xmm8\n\t" "movdqu %%xmm0, 0*16(%[dst])\n\t" /* Store block 1 */ "pxor %%xmm1, %%xmm12\n\t" /* block6 ^= lastkey */ "aesenclast %%xmm12, %%xmm9\n\t" "movdqu %%xmm2, 1*16(%[dst])\n\t" /* Store block 2. */ "pxor %%xmm1, %%xmm13\n\t" /* block7 ^= lastkey */ "aesenclast %%xmm13, %%xmm10\n\t" "movdqu %%xmm3, 2*16(%[dst])\n\t" /* Store block 3. */ "pxor %%xmm1, %%xmm14\n\t" /* block8 ^= lastkey */ "aesenclast %%xmm14, %%xmm11\n\t" "movdqu %%xmm4, 3*16(%[dst])\n\t" /* Store block 4. */ "movdqu %%xmm8, 4*16(%[dst])\n\t" /* Store block 8. */ "movdqu %%xmm9, 5*16(%[dst])\n\t" /* Store block 9. */ "movdqu %%xmm10, 6*16(%[dst])\n\t" /* Store block 10. */ "movdqu %%xmm11, 7*16(%[dst])\n\t" /* Store block 11. */ : : [src] "r" (a), [dst] "r" (b) : "memory"); } #endif /* __x86_64__ */ unsigned int _gcry_aes_aesni_encrypt (const RIJNDAEL_context *ctx, unsigned char *dst, const unsigned char *src) { aesni_prepare (); asm volatile ("movdqu %[src], %%xmm0\n\t" : : [src] "m" (*src) : "memory" ); do_aesni_enc (ctx); asm volatile ("movdqu %%xmm0, %[dst]\n\t" : [dst] "=m" (*dst) : : "memory" ); aesni_cleanup (); return 0; } void _gcry_aes_aesni_cfb_enc (RIJNDAEL_context *ctx, unsigned char *iv, unsigned char *outbuf, const unsigned char *inbuf, size_t nblocks) { aesni_prepare (); asm volatile ("movdqu %[iv], %%xmm0\n\t" : /* No output */ : [iv] "m" (*iv) : "memory" ); for ( ;nblocks; nblocks-- ) { do_aesni_enc (ctx); asm volatile ("movdqu %[inbuf], %%xmm1\n\t" "pxor %%xmm1, %%xmm0\n\t" "movdqu %%xmm0, %[outbuf]\n\t" : [outbuf] "=m" (*outbuf) : [inbuf] "m" (*inbuf) : "memory" ); outbuf += BLOCKSIZE; inbuf += BLOCKSIZE; } asm volatile ("movdqu %%xmm0, %[iv]\n\t" : [iv] "=m" (*iv) : : "memory" ); aesni_cleanup (); } void _gcry_aes_aesni_cbc_enc (RIJNDAEL_context *ctx, unsigned char *iv, unsigned char *outbuf, const unsigned char *inbuf, size_t nblocks, int cbc_mac) { aesni_prepare_2_7_variable; aesni_prepare (); aesni_prepare_2_7(); asm volatile ("movdqu %[iv], %%xmm5\n\t" : /* No output */ : [iv] "m" (*iv) : "memory" ); for ( ;nblocks; nblocks-- ) { asm volatile ("movdqu %[inbuf], %%xmm0\n\t" "pxor %%xmm5, %%xmm0\n\t" : /* No output */ : [inbuf] "m" (*inbuf) : "memory" ); do_aesni_enc (ctx); asm volatile ("movdqa %%xmm0, %%xmm5\n\t" "movdqu %%xmm0, %[outbuf]\n\t" : [outbuf] "=m" (*outbuf) : : "memory" ); inbuf += BLOCKSIZE; if (!cbc_mac) outbuf += BLOCKSIZE; } asm volatile ("movdqu %%xmm5, %[iv]\n\t" : [iv] "=m" (*iv) : : "memory" ); aesni_cleanup (); aesni_cleanup_2_7 (); } void _gcry_aes_aesni_ctr_enc (RIJNDAEL_context *ctx, unsigned char *ctr, unsigned char *outbuf, const unsigned char *inbuf, size_t nblocks) { static const unsigned char be_mask[16] __attribute__ ((aligned (16))) = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; aesni_prepare_2_7_variable; aesni_prepare (); aesni_prepare_2_7(); asm volatile ("movdqa %[mask], %%xmm6\n\t" /* Preload mask */ "movdqa %[ctr], %%xmm5\n\t" /* Preload CTR */ : /* No output */ : [mask] "m" (*be_mask), [ctr] "m" (*ctr) : "memory"); #ifdef __x86_64__ if (nblocks >= 8) { aesni_prepare_8_15_variable; aesni_prepare_8_15(); for ( ;nblocks >= 8 ; nblocks -= 8 ) { do_aesni_ctr_8 (ctx, ctr, outbuf, inbuf); outbuf += 8*BLOCKSIZE; inbuf += 8*BLOCKSIZE; } aesni_cleanup_8_15(); } #endif for ( ;nblocks >= 4 ; nblocks -= 4 ) { do_aesni_ctr_4 (ctx, ctr, outbuf, inbuf); outbuf += 4*BLOCKSIZE; inbuf += 4*BLOCKSIZE; } for ( ;nblocks; nblocks-- ) { do_aesni_ctr (ctx, ctr, outbuf, inbuf); outbuf += BLOCKSIZE; inbuf += BLOCKSIZE; } aesni_cleanup (); aesni_cleanup_2_7 (); } unsigned int _gcry_aes_aesni_decrypt (const RIJNDAEL_context *ctx, unsigned char *dst, const unsigned char *src) { aesni_prepare (); asm volatile ("movdqu %[src], %%xmm0\n\t" : : [src] "m" (*src) : "memory" ); do_aesni_dec (ctx); asm volatile ("movdqu %%xmm0, %[dst]\n\t" : [dst] "=m" (*dst) : : "memory" ); aesni_cleanup (); return 0; } void _gcry_aes_aesni_cfb_dec (RIJNDAEL_context *ctx, unsigned char *iv, unsigned char *outbuf, const unsigned char *inbuf, size_t nblocks) { aesni_prepare_2_7_variable; aesni_prepare (); aesni_prepare_2_7(); asm volatile ("movdqu %[iv], %%xmm6\n\t" : /* No output */ : [iv] "m" (*iv) : "memory" ); /* CFB decryption can be parallelized */ #ifdef __x86_64__ if (nblocks >= 8) { aesni_prepare_8_15_variable; aesni_prepare_8_15(); for ( ;nblocks >= 8; nblocks -= 8) { asm volatile ("movdqu %%xmm6, %%xmm1\n\t" /* load input blocks */ "movdqu 0*16(%[inbuf]), %%xmm2\n\t" "movdqu 1*16(%[inbuf]), %%xmm3\n\t" "movdqu 2*16(%[inbuf]), %%xmm4\n\t" "movdqu 3*16(%[inbuf]), %%xmm8\n\t" "movdqu 4*16(%[inbuf]), %%xmm9\n\t" "movdqu 5*16(%[inbuf]), %%xmm10\n\t" "movdqu 6*16(%[inbuf]), %%xmm11\n\t" "movdqu 7*16(%[inbuf]), %%xmm6\n\t" /* update IV */ "movdqa %%xmm2, %%xmm12\n\t" "movdqa %%xmm3, %%xmm13\n\t" "movdqa %%xmm4, %%xmm14\n\t" "movdqa %%xmm8, %%xmm15\n\t" : /* No output */ : [inbuf] "r" (inbuf) : "memory"); do_aesni_enc_vec8 (ctx); asm volatile ( "pxor %%xmm12, %%xmm1\n\t" "movdqu 4*16(%[inbuf]), %%xmm12\n\t" "pxor %%xmm13, %%xmm2\n\t" "movdqu 5*16(%[inbuf]), %%xmm13\n\t" "pxor %%xmm14, %%xmm3\n\t" "movdqu 6*16(%[inbuf]), %%xmm14\n\t" "pxor %%xmm15, %%xmm4\n\t" "movdqu 7*16(%[inbuf]), %%xmm15\n\t" "pxor %%xmm12, %%xmm8\n\t" "movdqu %%xmm1, 0*16(%[outbuf])\n\t" "pxor %%xmm13, %%xmm9\n\t" "movdqu %%xmm2, 1*16(%[outbuf])\n\t" "pxor %%xmm14, %%xmm10\n\t" "movdqu %%xmm3, 2*16(%[outbuf])\n\t" "pxor %%xmm15, %%xmm11\n\t" "movdqu %%xmm4, 3*16(%[outbuf])\n\t" "movdqu %%xmm8, 4*16(%[outbuf])\n\t" "movdqu %%xmm9, 5*16(%[outbuf])\n\t" "movdqu %%xmm10, 6*16(%[outbuf])\n\t" "movdqu %%xmm11, 7*16(%[outbuf])\n\t" : /* No output */ : [inbuf] "r" (inbuf), [outbuf] "r" (outbuf) : "memory"); outbuf += 8*BLOCKSIZE; inbuf += 8*BLOCKSIZE; } aesni_cleanup_8_15(); } #endif for ( ;nblocks >= 4; nblocks -= 4) { asm volatile ("movdqu %%xmm6, %%xmm1\n\t" /* load input blocks */ "movdqu 0*16(%[inbuf]), %%xmm2\n\t" "movdqu 1*16(%[inbuf]), %%xmm3\n\t" "movdqu 2*16(%[inbuf]), %%xmm4\n\t" "movdqu 3*16(%[inbuf]), %%xmm6\n\t" /* update IV */ : /* No output */ : [inbuf] "r" (inbuf) : "memory"); do_aesni_enc_vec4 (ctx); asm volatile ("movdqu 0*16(%[inbuf]), %%xmm5\n\t" "pxor %%xmm5, %%xmm1\n\t" "movdqu %%xmm1, 0*16(%[outbuf])\n\t" "movdqu 1*16(%[inbuf]), %%xmm5\n\t" "pxor %%xmm5, %%xmm2\n\t" "movdqu %%xmm2, 1*16(%[outbuf])\n\t" "movdqu 2*16(%[inbuf]), %%xmm5\n\t" "pxor %%xmm5, %%xmm3\n\t" "movdqu %%xmm3, 2*16(%[outbuf])\n\t" "movdqu 3*16(%[inbuf]), %%xmm5\n\t" "pxor %%xmm5, %%xmm4\n\t" "movdqu %%xmm4, 3*16(%[outbuf])\n\t" : /* No output */ : [inbuf] "r" (inbuf), [outbuf] "r" (outbuf) : "memory"); outbuf += 4*BLOCKSIZE; inbuf += 4*BLOCKSIZE; } asm volatile ("movdqu %%xmm6, %%xmm0\n\t" ::: "cc"); for ( ;nblocks; nblocks-- ) { do_aesni_enc (ctx); asm volatile ("movdqa %%xmm0, %%xmm6\n\t" "movdqu %[inbuf], %%xmm0\n\t" "pxor %%xmm0, %%xmm6\n\t" "movdqu %%xmm6, %[outbuf]\n\t" : [outbuf] "=m" (*outbuf) : [inbuf] "m" (*inbuf) : "memory" ); outbuf += BLOCKSIZE; inbuf += BLOCKSIZE; } asm volatile ("movdqu %%xmm0, %[iv]\n\t" : [iv] "=m" (*iv) : : "memory" ); aesni_cleanup (); aesni_cleanup_2_7 (); } void _gcry_aes_aesni_cbc_dec (RIJNDAEL_context *ctx, unsigned char *iv, unsigned char *outbuf, const unsigned char *inbuf, size_t nblocks) { aesni_prepare_2_7_variable; aesni_prepare (); aesni_prepare_2_7(); if ( !ctx->decryption_prepared ) { do_aesni_prepare_decryption ( ctx ); ctx->decryption_prepared = 1; } asm volatile ("movdqu %[iv], %%xmm5\n\t" /* use xmm5 as fast IV storage */ : /* No output */ : [iv] "m" (*iv) : "memory"); #ifdef __x86_64__ if (nblocks >= 8) { aesni_prepare_8_15_variable; aesni_prepare_8_15(); for ( ;nblocks >= 8 ; nblocks -= 8 ) { asm volatile ("movdqu 0*16(%[inbuf]), %%xmm1\n\t" /* load input blocks */ "movdqu 1*16(%[inbuf]), %%xmm2\n\t" "movdqu 2*16(%[inbuf]), %%xmm3\n\t" "movdqu 3*16(%[inbuf]), %%xmm4\n\t" "movdqu 4*16(%[inbuf]), %%xmm8\n\t" "movdqu 5*16(%[inbuf]), %%xmm9\n\t" "movdqu 6*16(%[inbuf]), %%xmm10\n\t" "movdqu 7*16(%[inbuf]), %%xmm11\n\t" "movdqa %%xmm1, %%xmm12\n\t" "movdqa %%xmm2, %%xmm13\n\t" "movdqa %%xmm3, %%xmm14\n\t" "movdqa %%xmm4, %%xmm15\n\t" : /* No output */ : [inbuf] "r" (inbuf) : "memory"); do_aesni_dec_vec8 (ctx); asm volatile ("pxor %%xmm5, %%xmm1\n\t" /* xor IV with output */ "pxor %%xmm12, %%xmm2\n\t" /* xor IV with output */ "movdqu 4*16(%[inbuf]), %%xmm12\n\t" "pxor %%xmm13, %%xmm3\n\t" /* xor IV with output */ "movdqu 5*16(%[inbuf]), %%xmm13\n\t" "pxor %%xmm14, %%xmm4\n\t" /* xor IV with output */ "movdqu 6*16(%[inbuf]), %%xmm14\n\t" "pxor %%xmm15, %%xmm8\n\t" /* xor IV with output */ "movdqu 7*16(%[inbuf]), %%xmm5\n\t" "pxor %%xmm12, %%xmm9\n\t" /* xor IV with output */ "movdqu %%xmm1, 0*16(%[outbuf])\n\t" "pxor %%xmm13, %%xmm10\n\t" /* xor IV with output */ "movdqu %%xmm2, 1*16(%[outbuf])\n\t" "pxor %%xmm14, %%xmm11\n\t" /* xor IV with output */ "movdqu %%xmm3, 2*16(%[outbuf])\n\t" "movdqu %%xmm4, 3*16(%[outbuf])\n\t" "movdqu %%xmm8, 4*16(%[outbuf])\n\t" "movdqu %%xmm9, 5*16(%[outbuf])\n\t" "movdqu %%xmm10, 6*16(%[outbuf])\n\t" "movdqu %%xmm11, 7*16(%[outbuf])\n\t" : /* No output */ : [inbuf] "r" (inbuf), [outbuf] "r" (outbuf) : "memory"); outbuf += 8*BLOCKSIZE; inbuf += 8*BLOCKSIZE; } aesni_cleanup_8_15(); } #endif for ( ;nblocks >= 4 ; nblocks -= 4 ) { asm volatile ("movdqu 0*16(%[inbuf]), %%xmm1\n\t" /* load input blocks */ "movdqu 1*16(%[inbuf]), %%xmm2\n\t" "movdqu 2*16(%[inbuf]), %%xmm3\n\t" "movdqu 3*16(%[inbuf]), %%xmm4\n\t" : /* No output */ : [inbuf] "r" (inbuf) : "memory"); do_aesni_dec_vec4 (ctx); asm volatile ("pxor %%xmm5, %%xmm1\n\t" /* xor IV with output */ "movdqu 0*16(%[inbuf]), %%xmm5\n\t" /* load new IV */ "movdqu %%xmm1, 0*16(%[outbuf])\n\t" "pxor %%xmm5, %%xmm2\n\t" /* xor IV with output */ "movdqu 1*16(%[inbuf]), %%xmm5\n\t" /* load new IV */ "movdqu %%xmm2, 1*16(%[outbuf])\n\t" "pxor %%xmm5, %%xmm3\n\t" /* xor IV with output */ "movdqu 2*16(%[inbuf]), %%xmm5\n\t" /* load new IV */ "movdqu %%xmm3, 2*16(%[outbuf])\n\t" "pxor %%xmm5, %%xmm4\n\t" /* xor IV with output */ "movdqu 3*16(%[inbuf]), %%xmm5\n\t" /* load new IV */ "movdqu %%xmm4, 3*16(%[outbuf])\n\t" : /* No output */ : [inbuf] "r" (inbuf), [outbuf] "r" (outbuf) : "memory"); outbuf += 4*BLOCKSIZE; inbuf += 4*BLOCKSIZE; } for ( ;nblocks; nblocks-- ) { asm volatile ("movdqu %[inbuf], %%xmm0\n\t" "movdqa %%xmm0, %%xmm2\n\t" /* use xmm2 as savebuf */ : /* No output */ : [inbuf] "m" (*inbuf) : "memory"); /* uses only xmm0 and xmm1 */ do_aesni_dec (ctx); asm volatile ("pxor %%xmm5, %%xmm0\n\t" /* xor IV with output */ "movdqu %%xmm0, %[outbuf]\n\t" "movdqu %%xmm2, %%xmm5\n\t" /* store savebuf as new IV */ : [outbuf] "=m" (*outbuf) : : "memory"); outbuf += BLOCKSIZE; inbuf += BLOCKSIZE; } asm volatile ("movdqu %%xmm5, %[iv]\n\t" /* store IV */ : /* No output */ : [iv] "m" (*iv) : "memory"); aesni_cleanup (); aesni_cleanup_2_7 (); } static ALWAYS_INLINE void aesni_ocb_checksum (gcry_cipher_hd_t c, const unsigned char *plaintext, size_t nblocks) { RIJNDAEL_context *ctx = (void *)&c->context.c; /* Calculate checksum */ asm volatile ("movdqu %[checksum], %%xmm6\n\t" "pxor %%xmm1, %%xmm1\n\t" "pxor %%xmm2, %%xmm2\n\t" "pxor %%xmm3, %%xmm3\n\t" : :[checksum] "m" (*c->u_ctr.ctr) : "memory" ); if (0) {} #if defined(HAVE_GCC_INLINE_ASM_AVX2) else if (nblocks >= 16 && ctx->use_avx2) { /* Use wider 256-bit registers for fast xoring of plaintext. */ asm volatile ("vzeroupper\n\t" "vpxor %%xmm0, %%xmm0, %%xmm0\n\t" "vpxor %%xmm4, %%xmm4, %%xmm4\n\t" "vpxor %%xmm5, %%xmm5, %%xmm5\n\t" "vpxor %%xmm7, %%xmm7, %%xmm7\n\t" : : : "memory"); for (;nblocks >= 16; nblocks -= 16) { asm volatile ("vpxor %[ptr0], %%ymm6, %%ymm6\n\t" "vpxor %[ptr1], %%ymm1, %%ymm1\n\t" "vpxor %[ptr2], %%ymm2, %%ymm2\n\t" "vpxor %[ptr3], %%ymm3, %%ymm3\n\t" "vpxor %[ptr4], %%ymm0, %%ymm0\n\t" "vpxor %[ptr5], %%ymm4, %%ymm4\n\t" "vpxor %[ptr6], %%ymm5, %%ymm5\n\t" "vpxor %[ptr7], %%ymm7, %%ymm7\n\t" : : [ptr0] "m" (*(plaintext + 0 * BLOCKSIZE * 2)), [ptr1] "m" (*(plaintext + 1 * BLOCKSIZE * 2)), [ptr2] "m" (*(plaintext + 2 * BLOCKSIZE * 2)), [ptr3] "m" (*(plaintext + 3 * BLOCKSIZE * 2)), [ptr4] "m" (*(plaintext + 4 * BLOCKSIZE * 2)), [ptr5] "m" (*(plaintext + 5 * BLOCKSIZE * 2)), [ptr6] "m" (*(plaintext + 6 * BLOCKSIZE * 2)), [ptr7] "m" (*(plaintext + 7 * BLOCKSIZE * 2)) : "memory" ); plaintext += BLOCKSIZE * 16; } asm volatile ("vpxor %%ymm0, %%ymm6, %%ymm6\n\t" "vpxor %%ymm4, %%ymm1, %%ymm1\n\t" "vpxor %%ymm5, %%ymm2, %%ymm2\n\t" "vpxor %%ymm7, %%ymm3, %%ymm3\n\t" "vextracti128 $1, %%ymm6, %%xmm0\n\t" "vextracti128 $1, %%ymm1, %%xmm4\n\t" "vextracti128 $1, %%ymm2, %%xmm5\n\t" "vextracti128 $1, %%ymm3, %%xmm7\n\t" "vpxor %%xmm0, %%xmm6, %%xmm6\n\t" "vpxor %%xmm4, %%xmm1, %%xmm1\n\t" "vpxor %%xmm5, %%xmm2, %%xmm2\n\t" "vpxor %%xmm7, %%xmm3, %%xmm3\n\t" "vzeroupper\n\t" : : : "memory" ); } #endif #if defined(HAVE_GCC_INLINE_ASM_AVX) else if (nblocks >= 16 && ctx->use_avx) { /* Same as AVX2, except using 256-bit floating point instructions. */ asm volatile ("vzeroupper\n\t" "vxorpd %%xmm0, %%xmm0, %%xmm0\n\t" "vxorpd %%xmm4, %%xmm4, %%xmm4\n\t" "vxorpd %%xmm5, %%xmm5, %%xmm5\n\t" "vxorpd %%xmm7, %%xmm7, %%xmm7\n\t" : : : "memory"); for (;nblocks >= 16; nblocks -= 16) { asm volatile ("vxorpd %[ptr0], %%ymm6, %%ymm6\n\t" "vxorpd %[ptr1], %%ymm1, %%ymm1\n\t" "vxorpd %[ptr2], %%ymm2, %%ymm2\n\t" "vxorpd %[ptr3], %%ymm3, %%ymm3\n\t" "vxorpd %[ptr4], %%ymm0, %%ymm0\n\t" "vxorpd %[ptr5], %%ymm4, %%ymm4\n\t" "vxorpd %[ptr6], %%ymm5, %%ymm5\n\t" "vxorpd %[ptr7], %%ymm7, %%ymm7\n\t" : : [ptr0] "m" (*(plaintext + 0 * BLOCKSIZE * 2)), [ptr1] "m" (*(plaintext + 1 * BLOCKSIZE * 2)), [ptr2] "m" (*(plaintext + 2 * BLOCKSIZE * 2)), [ptr3] "m" (*(plaintext + 3 * BLOCKSIZE * 2)), [ptr4] "m" (*(plaintext + 4 * BLOCKSIZE * 2)), [ptr5] "m" (*(plaintext + 5 * BLOCKSIZE * 2)), [ptr6] "m" (*(plaintext + 6 * BLOCKSIZE * 2)), [ptr7] "m" (*(plaintext + 7 * BLOCKSIZE * 2)) : "memory" ); plaintext += BLOCKSIZE * 16; } asm volatile ("vxorpd %%ymm0, %%ymm6, %%ymm6\n\t" "vxorpd %%ymm4, %%ymm1, %%ymm1\n\t" "vxorpd %%ymm5, %%ymm2, %%ymm2\n\t" "vxorpd %%ymm7, %%ymm3, %%ymm3\n\t" "vextractf128 $1, %%ymm6, %%xmm0\n\t" "vextractf128 $1, %%ymm1, %%xmm4\n\t" "vextractf128 $1, %%ymm2, %%xmm5\n\t" "vextractf128 $1, %%ymm3, %%xmm7\n\t" "vxorpd %%xmm0, %%xmm6, %%xmm6\n\t" "vxorpd %%xmm4, %%xmm1, %%xmm1\n\t" "vxorpd %%xmm5, %%xmm2, %%xmm2\n\t" "vxorpd %%xmm7, %%xmm3, %%xmm3\n\t" "vzeroupper\n\t" : : : "memory" ); } #endif for (;nblocks >= 4; nblocks -= 4) { asm volatile ("movdqu %[ptr0], %%xmm0\n\t" "movdqu %[ptr1], %%xmm4\n\t" "movdqu %[ptr2], %%xmm5\n\t" "movdqu %[ptr3], %%xmm7\n\t" "pxor %%xmm0, %%xmm6\n\t" "pxor %%xmm4, %%xmm1\n\t" "pxor %%xmm5, %%xmm2\n\t" "pxor %%xmm7, %%xmm3\n\t" : : [ptr0] "m" (*(plaintext + 0 * BLOCKSIZE)), [ptr1] "m" (*(plaintext + 1 * BLOCKSIZE)), [ptr2] "m" (*(plaintext + 2 * BLOCKSIZE)), [ptr3] "m" (*(plaintext + 3 * BLOCKSIZE)) : "memory" ); plaintext += BLOCKSIZE * 4; } for (;nblocks >= 1; nblocks -= 1) { asm volatile ("movdqu %[ptr0], %%xmm0\n\t" "pxor %%xmm0, %%xmm6\n\t" : : [ptr0] "m" (*(plaintext + 0 * BLOCKSIZE)) : "memory" ); plaintext += BLOCKSIZE; } asm volatile ("pxor %%xmm1, %%xmm6\n\t" "pxor %%xmm2, %%xmm6\n\t" "pxor %%xmm3, %%xmm6\n\t" "movdqu %%xmm6, %[checksum]\n\t" : [checksum] "=m" (*c->u_ctr.ctr) : : "memory" ); } static unsigned int NO_INLINE aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg, const void *inbuf_arg, size_t nblocks) { RIJNDAEL_context *ctx = (void *)&c->context.c; unsigned char *outbuf = outbuf_arg; const unsigned char *inbuf = inbuf_arg; u64 n = c->u_mode.ocb.data_nblocks; const unsigned char *l; aesni_prepare_2_7_variable; aesni_prepare (); aesni_prepare_2_7 (); aesni_ocb_checksum (c, inbuf_arg, nblocks); /* Preload Offset */ asm volatile ("movdqu %[iv], %%xmm5\n\t" : /* No output */ : [iv] "m" (*c->u_iv.iv) : "memory" ); for ( ;nblocks && n % 4; nblocks-- ) { l = aes_ocb_get_l(c, ++n); /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ asm volatile ("movdqu %[l], %%xmm1\n\t" "movdqu %[inbuf], %%xmm0\n\t" "pxor %%xmm1, %%xmm5\n\t" "pxor %%xmm5, %%xmm0\n\t" : : [l] "m" (*l), [inbuf] "m" (*inbuf) : "memory" ); do_aesni_enc (ctx); asm volatile ("pxor %%xmm5, %%xmm0\n\t" "movdqu %%xmm0, %[outbuf]\n\t" : [outbuf] "=m" (*outbuf) : : "memory" ); inbuf += BLOCKSIZE; outbuf += BLOCKSIZE; } #ifdef __x86_64__ if (nblocks >= 8) { aesni_prepare_8_15_variable; aesni_prepare_8_15(); asm volatile ("movdqu %[l0], %%xmm6\n\t" : : [l0] "m" (*c->u_mode.ocb.L[0]) : "memory" ); for ( ;nblocks >= 8 ; nblocks -= 8 ) { n += 4; l = aes_ocb_get_l(c, n); asm volatile ("movdqu %[l0l1], %%xmm10\n\t" - "movdqu %[l0l1l0], %%xmm11\n\t" + "movdqu %[l1], %%xmm11\n\t" "movdqu %[l3], %%xmm15\n\t" : : [l0l1] "m" (*c->u_mode.ocb.L0L1), - [l0l1l0] "m" (*c->u_mode.ocb.L0L1L0), + [l1] "m" (*c->u_mode.ocb.L[1]), [l3] "m" (*l) : "memory" ); n += 4; l = aes_ocb_get_l(c, n); /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* P_i = Offset_i xor ENCIPHER(K, C_i xor Offset_i) */ asm volatile ("movdqu %[inbuf0], %%xmm1\n\t" "movdqu %[inbuf1], %%xmm2\n\t" "movdqu %[inbuf2], %%xmm3\n\t" : : [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE)), [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE)), [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE)) : "memory" ); asm volatile ("movdqu %[inbuf3], %%xmm4\n\t" "movdqu %[inbuf4], %%xmm8\n\t" "movdqu %[inbuf5], %%xmm9\n\t" : : [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE)), [inbuf4] "m" (*(inbuf + 4 * BLOCKSIZE)), [inbuf5] "m" (*(inbuf + 5 * BLOCKSIZE)) : "memory" ); asm volatile ("movdqa %%xmm6, %%xmm12\n\t" "pxor %%xmm5, %%xmm12\n\t" "pxor %%xmm12, %%xmm1\n\t" "movdqa %%xmm10, %%xmm13\n\t" "pxor %%xmm5, %%xmm13\n\t" "pxor %%xmm13, %%xmm2\n\t" "movdqa %%xmm11, %%xmm14\n\t" "pxor %%xmm5, %%xmm14\n\t" "pxor %%xmm14, %%xmm3\n\t" "pxor %%xmm11, %%xmm5\n\t" "pxor %%xmm15, %%xmm5\n\t" "pxor %%xmm5, %%xmm4\n\t" "movdqa %%xmm5, %%xmm15\n\t" "movdqa %%xmm5, %%xmm0\n\t" "pxor %%xmm6, %%xmm0\n\t" "pxor %%xmm0, %%xmm8\n\t" "movdqu %%xmm0, %[outbuf4]\n\t" "movdqa %%xmm10, %%xmm0\n\t" "pxor %%xmm5, %%xmm0\n\t" "pxor %%xmm0, %%xmm9\n\t" "movdqu %%xmm0, %[outbuf5]\n\t" : [outbuf4] "=m" (*(outbuf + 4 * BLOCKSIZE)), [outbuf5] "=m" (*(outbuf + 5 * BLOCKSIZE)) : : "memory" ); asm volatile ("movdqu %[inbuf6], %%xmm10\n\t" "movdqa %%xmm11, %%xmm0\n\t" "pxor %%xmm5, %%xmm0\n\t" "pxor %%xmm0, %%xmm10\n\t" "movdqu %%xmm0, %[outbuf6]\n\t" : [outbuf6] "=m" (*(outbuf + 6 * BLOCKSIZE)) : [inbuf6] "m" (*(inbuf + 6 * BLOCKSIZE)) : "memory" ); asm volatile ("movdqu %[l7], %%xmm0\n\t" "pxor %%xmm11, %%xmm5\n\t" "pxor %%xmm0, %%xmm5\n\t" "movdqu %[inbuf7], %%xmm11\n\t" "pxor %%xmm5, %%xmm11\n\t" : : [l7] "m" (*l), [inbuf7] "m" (*(inbuf + 7 * BLOCKSIZE)) : "memory" ); do_aesni_enc_vec8 (ctx); asm volatile ("pxor %%xmm12, %%xmm1\n\t" "pxor %%xmm13, %%xmm2\n\t" "movdqu %[outbuf4],%%xmm0\n\t" "movdqu %[outbuf5],%%xmm12\n\t" "movdqu %[outbuf6],%%xmm13\n\t" "pxor %%xmm14, %%xmm3\n\t" "pxor %%xmm15, %%xmm4\n\t" "pxor %%xmm0, %%xmm8\n\t" "pxor %%xmm12, %%xmm9\n\t" "pxor %%xmm13, %%xmm10\n\t" "pxor %%xmm5, %%xmm11\n\t" "movdqu %%xmm1, %[outbuf0]\n\t" "movdqu %%xmm2, %[outbuf1]\n\t" "movdqu %%xmm3, %[outbuf2]\n\t" "movdqu %%xmm4, %[outbuf3]\n\t" "movdqu %%xmm8, %[outbuf4]\n\t" "movdqu %%xmm9, %[outbuf5]\n\t" "movdqu %%xmm10, %[outbuf6]\n\t" "movdqu %%xmm11, %[outbuf7]\n\t" : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE)), [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE)), [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE)), [outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE)), [outbuf4] "+m" (*(outbuf + 4 * BLOCKSIZE)), [outbuf5] "+m" (*(outbuf + 5 * BLOCKSIZE)), [outbuf6] "+m" (*(outbuf + 6 * BLOCKSIZE)), [outbuf7] "=m" (*(outbuf + 7 * BLOCKSIZE)) : : "memory" ); outbuf += 8*BLOCKSIZE; inbuf += 8*BLOCKSIZE; } aesni_cleanup_8_15(); } #endif for ( ;nblocks >= 4 ; nblocks -= 4 ) { n += 4; l = aes_ocb_get_l(c, n); /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ asm volatile ("movdqu %[l0], %%xmm0\n\t" "movdqu %[inbuf0], %%xmm1\n\t" "movdqu %[l0l1], %%xmm3\n\t" : : [l0] "m" (*c->u_mode.ocb.L[0]), [l0l1] "m" (*c->u_mode.ocb.L0L1), [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE)) : "memory" ); - asm volatile ("movdqu %[l0l1l0], %%xmm4\n\t" + asm volatile ("movdqu %[l1], %%xmm4\n\t" "movdqu %[l3], %%xmm6\n\t" "pxor %%xmm5, %%xmm0\n\t" "pxor %%xmm0, %%xmm1\n\t" "movdqu %%xmm0, %[outbuf0]\n\t" : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE)) - : [l0l1l0] "m" (*c->u_mode.ocb.L0L1L0), + : [l1] "m" (*c->u_mode.ocb.L[1]), [l3] "m" (*l) : "memory" ); asm volatile ("movdqu %[inbuf1], %%xmm2\n\t" "pxor %%xmm5, %%xmm3\n\t" "pxor %%xmm3, %%xmm2\n\t" "movdqu %%xmm3, %[outbuf1]\n\t" : [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE)) : [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE)) : "memory" ); asm volatile ("movdqa %%xmm4, %%xmm0\n\t" "movdqu %[inbuf2], %%xmm3\n\t" "pxor %%xmm5, %%xmm0\n\t" "pxor %%xmm0, %%xmm3\n\t" "movdqu %%xmm0, %[outbuf2]\n\t" : [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE)) : [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE)) : "memory" ); asm volatile ("pxor %%xmm6, %%xmm5\n\t" "pxor %%xmm4, %%xmm5\n\t" "movdqu %[inbuf3], %%xmm4\n\t" "pxor %%xmm5, %%xmm4\n\t" : : [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE)) : "memory" ); do_aesni_enc_vec4 (ctx); asm volatile ("movdqu %[outbuf0],%%xmm0\n\t" "pxor %%xmm0, %%xmm1\n\t" "movdqu %%xmm1, %[outbuf0]\n\t" "movdqu %[outbuf1],%%xmm0\n\t" "pxor %%xmm0, %%xmm2\n\t" "movdqu %%xmm2, %[outbuf1]\n\t" "movdqu %[outbuf2],%%xmm0\n\t" "pxor %%xmm0, %%xmm3\n\t" "movdqu %%xmm3, %[outbuf2]\n\t" "pxor %%xmm5, %%xmm4\n\t" "movdqu %%xmm4, %[outbuf3]\n\t" : [outbuf0] "+m" (*(outbuf + 0 * BLOCKSIZE)), [outbuf1] "+m" (*(outbuf + 1 * BLOCKSIZE)), [outbuf2] "+m" (*(outbuf + 2 * BLOCKSIZE)), [outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE)) : : "memory" ); outbuf += 4*BLOCKSIZE; inbuf += 4*BLOCKSIZE; } for ( ;nblocks; nblocks-- ) { l = aes_ocb_get_l(c, ++n); /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ asm volatile ("movdqu %[l], %%xmm1\n\t" "movdqu %[inbuf], %%xmm0\n\t" "pxor %%xmm1, %%xmm5\n\t" "pxor %%xmm5, %%xmm0\n\t" : : [l] "m" (*l), [inbuf] "m" (*inbuf) : "memory" ); do_aesni_enc (ctx); asm volatile ("pxor %%xmm5, %%xmm0\n\t" "movdqu %%xmm0, %[outbuf]\n\t" : [outbuf] "=m" (*outbuf) : : "memory" ); inbuf += BLOCKSIZE; outbuf += BLOCKSIZE; } c->u_mode.ocb.data_nblocks = n; asm volatile ("movdqu %%xmm5, %[iv]\n\t" : [iv] "=m" (*c->u_iv.iv) : : "memory" ); aesni_cleanup (); aesni_cleanup_2_7 (); return 0; } static unsigned int NO_INLINE aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg, const void *inbuf_arg, size_t nblocks_arg) { RIJNDAEL_context *ctx = (void *)&c->context.c; unsigned char *outbuf = outbuf_arg; const unsigned char *inbuf = inbuf_arg; u64 n = c->u_mode.ocb.data_nblocks; const unsigned char *l; size_t nblocks = nblocks_arg; aesni_prepare_2_7_variable; aesni_prepare (); aesni_prepare_2_7 (); if ( !ctx->decryption_prepared ) { do_aesni_prepare_decryption ( ctx ); ctx->decryption_prepared = 1; } /* Preload Offset */ asm volatile ("movdqu %[iv], %%xmm5\n\t" : /* No output */ : [iv] "m" (*c->u_iv.iv) : "memory" ); for ( ;nblocks && n % 4; nblocks-- ) { l = aes_ocb_get_l(c, ++n); /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */ asm volatile ("movdqu %[l], %%xmm1\n\t" "movdqu %[inbuf], %%xmm0\n\t" "pxor %%xmm1, %%xmm5\n\t" "pxor %%xmm5, %%xmm0\n\t" : : [l] "m" (*l), [inbuf] "m" (*inbuf) : "memory" ); do_aesni_dec (ctx); asm volatile ("pxor %%xmm5, %%xmm0\n\t" "movdqu %%xmm0, %[outbuf]\n\t" : [outbuf] "=m" (*outbuf) : : "memory" ); inbuf += BLOCKSIZE; outbuf += BLOCKSIZE; } #ifdef __x86_64__ if (nblocks >= 8) { aesni_prepare_8_15_variable; aesni_prepare_8_15(); asm volatile ("movdqu %[l0], %%xmm6\n\t" : : [l0] "m" (*c->u_mode.ocb.L[0]) : "memory" ); for ( ;nblocks >= 8 ; nblocks -= 8 ) { n += 4; l = aes_ocb_get_l(c, n); asm volatile ("movdqu %[l0l1], %%xmm10\n\t" - "movdqu %[l0l1l0], %%xmm11\n\t" + "movdqu %[l1], %%xmm11\n\t" "movdqu %[l3], %%xmm15\n\t" : : [l0l1] "m" (*c->u_mode.ocb.L0L1), - [l0l1l0] "m" (*c->u_mode.ocb.L0L1L0), + [l1] "m" (*c->u_mode.ocb.L[1]), [l3] "m" (*l) : "memory" ); n += 4; l = aes_ocb_get_l(c, n); /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */ asm volatile ("movdqu %[inbuf0], %%xmm1\n\t" "movdqu %[inbuf1], %%xmm2\n\t" "movdqu %[inbuf2], %%xmm3\n\t" : : [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE)), [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE)), [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE)) : "memory" ); asm volatile ("movdqu %[inbuf3], %%xmm4\n\t" "movdqu %[inbuf4], %%xmm8\n\t" "movdqu %[inbuf5], %%xmm9\n\t" : : [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE)), [inbuf4] "m" (*(inbuf + 4 * BLOCKSIZE)), [inbuf5] "m" (*(inbuf + 5 * BLOCKSIZE)) : "memory" ); asm volatile ("movdqa %%xmm6, %%xmm12\n\t" "pxor %%xmm5, %%xmm12\n\t" "pxor %%xmm12, %%xmm1\n\t" "movdqa %%xmm10, %%xmm13\n\t" "pxor %%xmm5, %%xmm13\n\t" "pxor %%xmm13, %%xmm2\n\t" "movdqa %%xmm11, %%xmm14\n\t" "pxor %%xmm5, %%xmm14\n\t" "pxor %%xmm14, %%xmm3\n\t" "pxor %%xmm11, %%xmm5\n\t" "pxor %%xmm15, %%xmm5\n\t" "pxor %%xmm5, %%xmm4\n\t" "movdqa %%xmm5, %%xmm15\n\t" "movdqa %%xmm5, %%xmm0\n\t" "pxor %%xmm6, %%xmm0\n\t" "pxor %%xmm0, %%xmm8\n\t" "movdqu %%xmm0, %[outbuf4]\n\t" "movdqa %%xmm10, %%xmm0\n\t" "pxor %%xmm5, %%xmm0\n\t" "pxor %%xmm0, %%xmm9\n\t" "movdqu %%xmm0, %[outbuf5]\n\t" : [outbuf4] "=m" (*(outbuf + 4 * BLOCKSIZE)), [outbuf5] "=m" (*(outbuf + 5 * BLOCKSIZE)) : : "memory" ); asm volatile ("movdqu %[inbuf6], %%xmm10\n\t" "movdqa %%xmm11, %%xmm0\n\t" "pxor %%xmm5, %%xmm0\n\t" "pxor %%xmm0, %%xmm10\n\t" "movdqu %%xmm0, %[outbuf6]\n\t" : [outbuf6] "=m" (*(outbuf + 6 * BLOCKSIZE)) : [inbuf6] "m" (*(inbuf + 6 * BLOCKSIZE)) : "memory" ); asm volatile ("movdqu %[l7], %%xmm0\n\t" "pxor %%xmm11, %%xmm5\n\t" "pxor %%xmm0, %%xmm5\n\t" "movdqu %[inbuf7], %%xmm11\n\t" "pxor %%xmm5, %%xmm11\n\t" : : [l7] "m" (*l), [inbuf7] "m" (*(inbuf + 7 * BLOCKSIZE)) : "memory" ); do_aesni_dec_vec8 (ctx); asm volatile ("pxor %%xmm12, %%xmm1\n\t" "pxor %%xmm13, %%xmm2\n\t" "movdqu %[outbuf4],%%xmm0\n\t" "movdqu %[outbuf5],%%xmm12\n\t" "movdqu %[outbuf6],%%xmm13\n\t" "pxor %%xmm14, %%xmm3\n\t" "pxor %%xmm15, %%xmm4\n\t" "pxor %%xmm0, %%xmm8\n\t" "pxor %%xmm12, %%xmm9\n\t" "pxor %%xmm13, %%xmm10\n\t" "pxor %%xmm5, %%xmm11\n\t" "movdqu %%xmm1, %[outbuf0]\n\t" "movdqu %%xmm2, %[outbuf1]\n\t" "movdqu %%xmm3, %[outbuf2]\n\t" "movdqu %%xmm4, %[outbuf3]\n\t" "movdqu %%xmm8, %[outbuf4]\n\t" "movdqu %%xmm9, %[outbuf5]\n\t" "movdqu %%xmm10, %[outbuf6]\n\t" "movdqu %%xmm11, %[outbuf7]\n\t" : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE)), [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE)), [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE)), [outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE)), [outbuf4] "+m" (*(outbuf + 4 * BLOCKSIZE)), [outbuf5] "+m" (*(outbuf + 5 * BLOCKSIZE)), [outbuf6] "+m" (*(outbuf + 6 * BLOCKSIZE)), [outbuf7] "=m" (*(outbuf + 7 * BLOCKSIZE)) : : "memory" ); outbuf += 8*BLOCKSIZE; inbuf += 8*BLOCKSIZE; } aesni_cleanup_8_15(); } #endif for ( ;nblocks >= 4 ; nblocks -= 4 ) { n += 4; l = aes_ocb_get_l(c, n); /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* C_i = Offset_i xor DECIPHER(K, P_i xor Offset_i) */ asm volatile ("movdqu %[l0], %%xmm0\n\t" "movdqu %[inbuf0], %%xmm1\n\t" "movdqu %[l0l1], %%xmm3\n\t" : : [l0] "m" (*c->u_mode.ocb.L[0]), [l0l1] "m" (*c->u_mode.ocb.L0L1), [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE)) : "memory" ); - asm volatile ("movdqu %[l0l1l0], %%xmm4\n\t" + asm volatile ("movdqu %[l1], %%xmm4\n\t" "movdqu %[l3], %%xmm6\n\t" "pxor %%xmm5, %%xmm0\n\t" "pxor %%xmm0, %%xmm1\n\t" "movdqu %%xmm0, %[outbuf0]\n\t" : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE)) - : [l0l1l0] "m" (*c->u_mode.ocb.L0L1L0), + : [l1] "m" (*c->u_mode.ocb.L[1]), [l3] "m" (*l) : "memory" ); asm volatile ("movdqu %[inbuf1], %%xmm2\n\t" "pxor %%xmm5, %%xmm3\n\t" "pxor %%xmm3, %%xmm2\n\t" "movdqu %%xmm3, %[outbuf1]\n\t" : [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE)) : [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE)) : "memory" ); asm volatile ("movdqa %%xmm4, %%xmm0\n\t" "movdqu %[inbuf2], %%xmm3\n\t" "pxor %%xmm5, %%xmm0\n\t" "pxor %%xmm0, %%xmm3\n\t" "movdqu %%xmm0, %[outbuf2]\n\t" : [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE)) : [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE)) : "memory" ); asm volatile ("pxor %%xmm6, %%xmm5\n\t" "pxor %%xmm4, %%xmm5\n\t" "movdqu %[inbuf3], %%xmm4\n\t" "pxor %%xmm5, %%xmm4\n\t" : : [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE)) : "memory" ); do_aesni_dec_vec4 (ctx); asm volatile ("movdqu %[outbuf0],%%xmm0\n\t" "pxor %%xmm0, %%xmm1\n\t" "movdqu %%xmm1, %[outbuf0]\n\t" "movdqu %[outbuf1],%%xmm0\n\t" "pxor %%xmm0, %%xmm2\n\t" "movdqu %%xmm2, %[outbuf1]\n\t" "movdqu %[outbuf2],%%xmm0\n\t" "pxor %%xmm0, %%xmm3\n\t" "movdqu %%xmm3, %[outbuf2]\n\t" "pxor %%xmm5, %%xmm4\n\t" "movdqu %%xmm4, %[outbuf3]\n\t" : [outbuf0] "+m" (*(outbuf + 0 * BLOCKSIZE)), [outbuf1] "+m" (*(outbuf + 1 * BLOCKSIZE)), [outbuf2] "+m" (*(outbuf + 2 * BLOCKSIZE)), [outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE)) : : "memory" ); outbuf += 4*BLOCKSIZE; inbuf += 4*BLOCKSIZE; } for ( ;nblocks; nblocks-- ) { l = aes_ocb_get_l(c, ++n); /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */ /* Checksum_i = Checksum_{i-1} xor P_i */ asm volatile ("movdqu %[l], %%xmm1\n\t" "movdqu %[inbuf], %%xmm0\n\t" "pxor %%xmm1, %%xmm5\n\t" "pxor %%xmm5, %%xmm0\n\t" : : [l] "m" (*l), [inbuf] "m" (*inbuf) : "memory" ); do_aesni_dec (ctx); asm volatile ("pxor %%xmm5, %%xmm0\n\t" "movdqu %%xmm0, %[outbuf]\n\t" : [outbuf] "=m" (*outbuf) : : "memory" ); inbuf += BLOCKSIZE; outbuf += BLOCKSIZE; } c->u_mode.ocb.data_nblocks = n; asm volatile ("movdqu %%xmm5, %[iv]\n\t" : [iv] "=m" (*c->u_iv.iv) : : "memory" ); aesni_ocb_checksum (c, outbuf_arg, nblocks_arg); aesni_cleanup (); aesni_cleanup_2_7 (); return 0; } size_t _gcry_aes_aesni_ocb_crypt(gcry_cipher_hd_t c, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int encrypt) { if (encrypt) return aesni_ocb_enc(c, outbuf_arg, inbuf_arg, nblocks); else return aesni_ocb_dec(c, outbuf_arg, inbuf_arg, nblocks); } size_t _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks) { RIJNDAEL_context *ctx = (void *)&c->context.c; const unsigned char *abuf = abuf_arg; u64 n = c->u_mode.ocb.aad_nblocks; const unsigned char *l; aesni_prepare_2_7_variable; aesni_prepare (); aesni_prepare_2_7 (); /* Preload Offset and Sum */ asm volatile ("movdqu %[iv], %%xmm5\n\t" "movdqu %[ctr], %%xmm6\n\t" : /* No output */ : [iv] "m" (*c->u_mode.ocb.aad_offset), [ctr] "m" (*c->u_mode.ocb.aad_sum) : "memory" ); for ( ;nblocks && n % 4; nblocks-- ) { l = aes_ocb_get_l(c, ++n); /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ asm volatile ("movdqu %[l], %%xmm1\n\t" "movdqu %[abuf], %%xmm0\n\t" "pxor %%xmm1, %%xmm5\n\t" "pxor %%xmm5, %%xmm0\n\t" : : [l] "m" (*l), [abuf] "m" (*abuf) : "memory" ); do_aesni_enc (ctx); asm volatile ("pxor %%xmm0, %%xmm6\n\t" : : : "memory" ); abuf += BLOCKSIZE; } #ifdef __x86_64__ if (nblocks >= 8) { aesni_prepare_8_15_variable; aesni_prepare_8_15(); asm volatile ("movdqu %[l0], %%xmm7\n\t" "movdqu %[l0l1], %%xmm12\n\t" - "movdqu %[l0l1l0], %%xmm13\n\t" + "movdqu %[l1], %%xmm13\n\t" : : [l0] "m" (*c->u_mode.ocb.L[0]), [l0l1] "m" (*c->u_mode.ocb.L0L1), - [l0l1l0] "m" (*c->u_mode.ocb.L0L1L0) + [l1] "m" (*c->u_mode.ocb.L[1]) : "memory" ); for ( ;nblocks >= 8 ; nblocks -= 8 ) { n += 4; l = aes_ocb_get_l(c, n); asm volatile ("movdqu %[l3], %%xmm0\n\t" "pxor %%xmm13, %%xmm0\n\t" : : [l3] "m" (*l) : "memory" ); n += 4; l = aes_ocb_get_l(c, n); asm volatile ("movdqu %[l7], %%xmm14\n\t" "pxor %%xmm13, %%xmm14\n\t" : : [l7] "m" (*l) : "memory" ); /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ asm volatile ("movdqu %[abuf0], %%xmm1\n\t" "movdqu %[abuf1], %%xmm2\n\t" "movdqu %[abuf2], %%xmm3\n\t" "movdqu %[abuf3], %%xmm4\n\t" "movdqu %[abuf4], %%xmm8\n\t" "movdqu %[abuf5], %%xmm9\n\t" "movdqu %[abuf6], %%xmm10\n\t" "movdqu %[abuf7], %%xmm11\n\t" : : [abuf0] "m" (*(abuf + 0 * BLOCKSIZE)), [abuf1] "m" (*(abuf + 1 * BLOCKSIZE)), [abuf2] "m" (*(abuf + 2 * BLOCKSIZE)), [abuf3] "m" (*(abuf + 3 * BLOCKSIZE)), [abuf4] "m" (*(abuf + 4 * BLOCKSIZE)), [abuf5] "m" (*(abuf + 5 * BLOCKSIZE)), [abuf6] "m" (*(abuf + 6 * BLOCKSIZE)), [abuf7] "m" (*(abuf + 7 * BLOCKSIZE)) : "memory" ); asm volatile ("pxor %%xmm7, %%xmm1\n\t" "pxor %%xmm5, %%xmm1\n\t" "pxor %%xmm12, %%xmm2\n\t" "pxor %%xmm5, %%xmm2\n\t" "pxor %%xmm13, %%xmm3\n\t" "pxor %%xmm5, %%xmm3\n\t" "pxor %%xmm0, %%xmm5\n\t" "pxor %%xmm5, %%xmm4\n\t" "pxor %%xmm7, %%xmm8\n\t" "pxor %%xmm5, %%xmm8\n\t" "pxor %%xmm12, %%xmm9\n\t" "pxor %%xmm5, %%xmm9\n\t" "pxor %%xmm13, %%xmm10\n\t" "pxor %%xmm5, %%xmm10\n\t" "pxor %%xmm14, %%xmm5\n\t" "pxor %%xmm5, %%xmm11\n\t" : : : "memory" ); do_aesni_enc_vec8 (ctx); asm volatile ("pxor %%xmm2, %%xmm1\n\t" "pxor %%xmm3, %%xmm1\n\t" "pxor %%xmm4, %%xmm1\n\t" "pxor %%xmm8, %%xmm1\n\t" "pxor %%xmm9, %%xmm6\n\t" "pxor %%xmm10, %%xmm6\n\t" "pxor %%xmm11, %%xmm6\n\t" "pxor %%xmm1, %%xmm6\n\t" : : : "memory" ); abuf += 8*BLOCKSIZE; } aesni_cleanup_8_15(); } #endif for ( ;nblocks >= 4 ; nblocks -= 4 ) { n += 4; l = aes_ocb_get_l(c, n); /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ asm volatile ("movdqu %[l0], %%xmm0\n\t" "movdqu %[abuf0], %%xmm1\n\t" "movdqu %[l0l1], %%xmm3\n\t" : : [l0] "m" (*c->u_mode.ocb.L[0]), [l0l1] "m" (*c->u_mode.ocb.L0L1), [abuf0] "m" (*(abuf + 0 * BLOCKSIZE)) : "memory" ); - asm volatile ("movdqu %[l0l1l0], %%xmm4\n\t" + asm volatile ("movdqu %[l1], %%xmm4\n\t" "movdqu %[l3], %%xmm7\n\t" "pxor %%xmm5, %%xmm0\n\t" "pxor %%xmm0, %%xmm1\n\t" : - : [l0l1l0] "m" (*c->u_mode.ocb.L0L1L0), + : [l1] "m" (*c->u_mode.ocb.L[1]), [l3] "m" (*l) : "memory" ); asm volatile ("movdqu %[abuf1], %%xmm2\n\t" "pxor %%xmm5, %%xmm3\n\t" "pxor %%xmm3, %%xmm2\n\t" : : [abuf1] "m" (*(abuf + 1 * BLOCKSIZE)) : "memory" ); asm volatile ("movdqa %%xmm4, %%xmm0\n\t" "movdqu %[abuf2], %%xmm3\n\t" "pxor %%xmm5, %%xmm0\n\t" "pxor %%xmm0, %%xmm3\n\t" : : [abuf2] "m" (*(abuf + 2 * BLOCKSIZE)) : "memory" ); asm volatile ("pxor %%xmm7, %%xmm5\n\t" "pxor %%xmm4, %%xmm5\n\t" "movdqu %[abuf3], %%xmm4\n\t" "pxor %%xmm5, %%xmm4\n\t" : : [abuf3] "m" (*(abuf + 3 * BLOCKSIZE)) : "memory" ); do_aesni_enc_vec4 (ctx); asm volatile ("pxor %%xmm1, %%xmm6\n\t" "pxor %%xmm2, %%xmm6\n\t" "pxor %%xmm3, %%xmm6\n\t" "pxor %%xmm4, %%xmm6\n\t" : : : "memory" ); abuf += 4*BLOCKSIZE; } for ( ;nblocks; nblocks-- ) { l = aes_ocb_get_l(c, ++n); /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ asm volatile ("movdqu %[l], %%xmm1\n\t" "movdqu %[abuf], %%xmm0\n\t" "pxor %%xmm1, %%xmm5\n\t" "pxor %%xmm5, %%xmm0\n\t" : : [l] "m" (*l), [abuf] "m" (*abuf) : "memory" ); do_aesni_enc (ctx); asm volatile ("pxor %%xmm0, %%xmm6\n\t" : : : "memory" ); abuf += BLOCKSIZE; } c->u_mode.ocb.aad_nblocks = n; asm volatile ("movdqu %%xmm5, %[iv]\n\t" "movdqu %%xmm6, %[ctr]\n\t" : [iv] "=m" (*c->u_mode.ocb.aad_offset), [ctr] "=m" (*c->u_mode.ocb.aad_sum) : : "memory" ); aesni_cleanup (); aesni_cleanup_2_7 (); return 0; } static const u64 xts_gfmul_const[16] __attribute__ ((aligned (16))) = { 0x87, 0x01 }; static void _gcry_aes_aesni_xts_enc (RIJNDAEL_context *ctx, unsigned char *tweak, unsigned char *outbuf, const unsigned char *inbuf, size_t nblocks) { aesni_prepare_2_7_variable; aesni_prepare (); aesni_prepare_2_7 (); /* Preload Tweak */ asm volatile ("movdqu %[tweak], %%xmm5\n\t" "movdqa %[gfmul], %%xmm6\n\t" : : [tweak] "m" (*tweak), [gfmul] "m" (*xts_gfmul_const) : "memory" ); for ( ;nblocks >= 4; nblocks -= 4 ) { asm volatile ("pshufd $0x13, %%xmm5, %%xmm4\n\t" "movdqu %[inbuf0], %%xmm1\n\t" "pxor %%xmm5, %%xmm1\n\t" "movdqu %%xmm5, %[outbuf0]\n\t" "movdqa %%xmm4, %%xmm0\n\t" "paddd %%xmm4, %%xmm4\n\t" "psrad $31, %%xmm0\n\t" "paddq %%xmm5, %%xmm5\n\t" "pand %%xmm6, %%xmm0\n\t" "pxor %%xmm0, %%xmm5\n\t" : [outbuf0] "=m" (*(outbuf + 0 * 16)) : [inbuf0] "m" (*(inbuf + 0 * 16)) : "memory" ); asm volatile ("movdqu %[inbuf1], %%xmm2\n\t" "pxor %%xmm5, %%xmm2\n\t" "movdqu %%xmm5, %[outbuf1]\n\t" "movdqa %%xmm4, %%xmm0\n\t" "paddd %%xmm4, %%xmm4\n\t" "psrad $31, %%xmm0\n\t" "paddq %%xmm5, %%xmm5\n\t" "pand %%xmm6, %%xmm0\n\t" "pxor %%xmm0, %%xmm5\n\t" : [outbuf1] "=m" (*(outbuf + 1 * 16)) : [inbuf1] "m" (*(inbuf + 1 * 16)) : "memory" ); asm volatile ("movdqu %[inbuf2], %%xmm3\n\t" "pxor %%xmm5, %%xmm3\n\t" "movdqu %%xmm5, %[outbuf2]\n\t" "movdqa %%xmm4, %%xmm0\n\t" "paddd %%xmm4, %%xmm4\n\t" "psrad $31, %%xmm0\n\t" "paddq %%xmm5, %%xmm5\n\t" "pand %%xmm6, %%xmm0\n\t" "pxor %%xmm0, %%xmm5\n\t" : [outbuf2] "=m" (*(outbuf + 2 * 16)) : [inbuf2] "m" (*(inbuf + 2 * 16)) : "memory" ); asm volatile ("movdqa %%xmm4, %%xmm0\n\t" "movdqu %[inbuf3], %%xmm4\n\t" "pxor %%xmm5, %%xmm4\n\t" "movdqu %%xmm5, %[outbuf3]\n\t" "psrad $31, %%xmm0\n\t" "paddq %%xmm5, %%xmm5\n\t" "pand %%xmm6, %%xmm0\n\t" "pxor %%xmm0, %%xmm5\n\t" : [outbuf3] "=m" (*(outbuf + 3 * 16)) : [inbuf3] "m" (*(inbuf + 3 * 16)) : "memory" ); do_aesni_enc_vec4 (ctx); asm volatile ("movdqu %[outbuf0], %%xmm0\n\t" "pxor %%xmm0, %%xmm1\n\t" "movdqu %[outbuf1], %%xmm0\n\t" "movdqu %%xmm1, %[outbuf0]\n\t" "movdqu %[outbuf2], %%xmm1\n\t" "pxor %%xmm0, %%xmm2\n\t" "movdqu %[outbuf3], %%xmm0\n\t" "pxor %%xmm1, %%xmm3\n\t" "pxor %%xmm0, %%xmm4\n\t" "movdqu %%xmm2, %[outbuf1]\n\t" "movdqu %%xmm3, %[outbuf2]\n\t" "movdqu %%xmm4, %[outbuf3]\n\t" : [outbuf0] "+m" (*(outbuf + 0 * 16)), [outbuf1] "+m" (*(outbuf + 1 * 16)), [outbuf2] "+m" (*(outbuf + 2 * 16)), [outbuf3] "+m" (*(outbuf + 3 * 16)) : : "memory" ); outbuf += BLOCKSIZE * 4; inbuf += BLOCKSIZE * 4; } for ( ;nblocks; nblocks-- ) { asm volatile ("movdqu %[inbuf], %%xmm0\n\t" "pxor %%xmm5, %%xmm0\n\t" "movdqa %%xmm5, %%xmm4\n\t" "pshufd $0x13, %%xmm5, %%xmm1\n\t" "psrad $31, %%xmm1\n\t" "paddq %%xmm5, %%xmm5\n\t" "pand %%xmm6, %%xmm1\n\t" "pxor %%xmm1, %%xmm5\n\t" : : [inbuf] "m" (*inbuf) : "memory" ); do_aesni_enc (ctx); asm volatile ("pxor %%xmm4, %%xmm0\n\t" "movdqu %%xmm0, %[outbuf]\n\t" : [outbuf] "=m" (*outbuf) : : "memory" ); outbuf += BLOCKSIZE; inbuf += BLOCKSIZE; } asm volatile ("movdqu %%xmm5, %[tweak]\n\t" : [tweak] "=m" (*tweak) : : "memory" ); aesni_cleanup (); aesni_cleanup_2_7 (); } static void _gcry_aes_aesni_xts_dec (RIJNDAEL_context *ctx, unsigned char *tweak, unsigned char *outbuf, const unsigned char *inbuf, size_t nblocks) { aesni_prepare_2_7_variable; aesni_prepare (); aesni_prepare_2_7 (); if ( !ctx->decryption_prepared ) { do_aesni_prepare_decryption ( ctx ); ctx->decryption_prepared = 1; } /* Preload Tweak */ asm volatile ("movdqu %[tweak], %%xmm5\n\t" "movdqa %[gfmul], %%xmm6\n\t" : : [tweak] "m" (*tweak), [gfmul] "m" (*xts_gfmul_const) : "memory" ); for ( ;nblocks >= 4; nblocks -= 4 ) { asm volatile ("pshufd $0x13, %%xmm5, %%xmm4\n\t" "movdqu %[inbuf0], %%xmm1\n\t" "pxor %%xmm5, %%xmm1\n\t" "movdqu %%xmm5, %[outbuf0]\n\t" "movdqa %%xmm4, %%xmm0\n\t" "paddd %%xmm4, %%xmm4\n\t" "psrad $31, %%xmm0\n\t" "paddq %%xmm5, %%xmm5\n\t" "pand %%xmm6, %%xmm0\n\t" "pxor %%xmm0, %%xmm5\n\t" : [outbuf0] "=m" (*(outbuf + 0 * 16)) : [inbuf0] "m" (*(inbuf + 0 * 16)) : "memory" ); asm volatile ("movdqu %[inbuf1], %%xmm2\n\t" "pxor %%xmm5, %%xmm2\n\t" "movdqu %%xmm5, %[outbuf1]\n\t" "movdqa %%xmm4, %%xmm0\n\t" "paddd %%xmm4, %%xmm4\n\t" "psrad $31, %%xmm0\n\t" "paddq %%xmm5, %%xmm5\n\t" "pand %%xmm6, %%xmm0\n\t" "pxor %%xmm0, %%xmm5\n\t" : [outbuf1] "=m" (*(outbuf + 1 * 16)) : [inbuf1] "m" (*(inbuf + 1 * 16)) : "memory" ); asm volatile ("movdqu %[inbuf2], %%xmm3\n\t" "pxor %%xmm5, %%xmm3\n\t" "movdqu %%xmm5, %[outbuf2]\n\t" "movdqa %%xmm4, %%xmm0\n\t" "paddd %%xmm4, %%xmm4\n\t" "psrad $31, %%xmm0\n\t" "paddq %%xmm5, %%xmm5\n\t" "pand %%xmm6, %%xmm0\n\t" "pxor %%xmm0, %%xmm5\n\t" : [outbuf2] "=m" (*(outbuf + 2 * 16)) : [inbuf2] "m" (*(inbuf + 2 * 16)) : "memory" ); asm volatile ("movdqa %%xmm4, %%xmm0\n\t" "movdqu %[inbuf3], %%xmm4\n\t" "pxor %%xmm5, %%xmm4\n\t" "movdqu %%xmm5, %[outbuf3]\n\t" "psrad $31, %%xmm0\n\t" "paddq %%xmm5, %%xmm5\n\t" "pand %%xmm6, %%xmm0\n\t" "pxor %%xmm0, %%xmm5\n\t" : [outbuf3] "=m" (*(outbuf + 3 * 16)) : [inbuf3] "m" (*(inbuf + 3 * 16)) : "memory" ); do_aesni_dec_vec4 (ctx); asm volatile ("movdqu %[outbuf0], %%xmm0\n\t" "pxor %%xmm0, %%xmm1\n\t" "movdqu %[outbuf1], %%xmm0\n\t" "movdqu %%xmm1, %[outbuf0]\n\t" "movdqu %[outbuf2], %%xmm1\n\t" "pxor %%xmm0, %%xmm2\n\t" "movdqu %[outbuf3], %%xmm0\n\t" "pxor %%xmm1, %%xmm3\n\t" "pxor %%xmm0, %%xmm4\n\t" "movdqu %%xmm2, %[outbuf1]\n\t" "movdqu %%xmm3, %[outbuf2]\n\t" "movdqu %%xmm4, %[outbuf3]\n\t" : [outbuf0] "+m" (*(outbuf + 0 * 16)), [outbuf1] "+m" (*(outbuf + 1 * 16)), [outbuf2] "+m" (*(outbuf + 2 * 16)), [outbuf3] "+m" (*(outbuf + 3 * 16)) : : "memory" ); outbuf += BLOCKSIZE * 4; inbuf += BLOCKSIZE * 4; } for ( ;nblocks; nblocks-- ) { asm volatile ("movdqu %[inbuf], %%xmm0\n\t" "pxor %%xmm5, %%xmm0\n\t" "movdqa %%xmm5, %%xmm4\n\t" "pshufd $0x13, %%xmm5, %%xmm1\n\t" "psrad $31, %%xmm1\n\t" "paddq %%xmm5, %%xmm5\n\t" "pand %%xmm6, %%xmm1\n\t" "pxor %%xmm1, %%xmm5\n\t" : : [inbuf] "m" (*inbuf) : "memory" ); do_aesni_dec (ctx); asm volatile ("pxor %%xmm4, %%xmm0\n\t" "movdqu %%xmm0, %[outbuf]\n\t" : [outbuf] "=m" (*outbuf) : : "memory" ); outbuf += BLOCKSIZE; inbuf += BLOCKSIZE; } asm volatile ("movdqu %%xmm5, %[tweak]\n\t" : [tweak] "=m" (*tweak) : : "memory" ); aesni_cleanup (); aesni_cleanup_2_7 (); } void _gcry_aes_aesni_xts_crypt (RIJNDAEL_context *ctx, unsigned char *tweak, unsigned char *outbuf, const unsigned char *inbuf, size_t nblocks, int encrypt) { if (encrypt) _gcry_aes_aesni_xts_enc(ctx, tweak, outbuf, inbuf, nblocks); else _gcry_aes_aesni_xts_dec(ctx, tweak, outbuf, inbuf, nblocks); } #if __clang__ # pragma clang attribute pop #endif #endif /* USE_AESNI */