diff --git a/cipher/rijndael-armv8-aarch32-ce.S b/cipher/rijndael-armv8-aarch32-ce.S index 5c8fa3c0..66440bd4 100644 --- a/cipher/rijndael-armv8-aarch32-ce.S +++ b/cipher/rijndael-armv8-aarch32-ce.S @@ -1,1556 +1,1867 @@ /* rijndael-armv8-aarch32-ce.S - ARMv8/CE accelerated AES * Copyright (C) 2016 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . */ #include #if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) && \ defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \ defined(HAVE_GCC_INLINE_ASM_AARCH32_CRYPTO) .syntax unified .arch armv8-a .fpu crypto-neon-fp-armv8 .arm .text #ifdef __PIC__ # define GET_DATA_POINTER(reg, name, rtmp) \ ldr reg, 1f; \ ldr rtmp, 2f; \ b 3f; \ 1: .word _GLOBAL_OFFSET_TABLE_-(3f+8); \ 2: .word name(GOT); \ 3: add reg, pc, reg; \ ldr reg, [reg, rtmp]; #else # define GET_DATA_POINTER(reg, name, rtmp) ldr reg, =name #endif /* AES macros */ #define aes_preload_keys(keysched, rekeysched) \ vldmia keysched!, {q5-q7}; \ mov rekeysched, keysched; \ vldmialo keysched!, {q8-q15}; /* 128-bit */ \ addeq keysched, #(2*16); \ vldmiaeq keysched!, {q10-q15}; /* 192-bit */ \ addhi keysched, #(4*16); \ vldmiahi keysched!, {q12-q15}; /* 256-bit */ \ #define do_aes_one128(ed, mcimc, qo, qb) \ aes##ed.8 qb, q5; \ aes##mcimc.8 qb, qb; \ aes##ed.8 qb, q6; \ aes##mcimc.8 qb, qb; \ aes##ed.8 qb, q7; \ aes##mcimc.8 qb, qb; \ aes##ed.8 qb, q8; \ aes##mcimc.8 qb, qb; \ aes##ed.8 qb, q9; \ aes##mcimc.8 qb, qb; \ aes##ed.8 qb, q10; \ aes##mcimc.8 qb, qb; \ aes##ed.8 qb, q11; \ aes##mcimc.8 qb, qb; \ aes##ed.8 qb, q12; \ aes##mcimc.8 qb, qb; \ aes##ed.8 qb, q13; \ aes##mcimc.8 qb, qb; \ aes##ed.8 qb, q14; \ veor qo, qb, q15; #define do_aes_one128re(ed, mcimc, qo, qb, keysched, rekeysched) \ vldm rekeysched, {q8-q9}; \ do_aes_one128(ed, mcimc, qo, qb); #define do_aes_one192(ed, mcimc, qo, qb, keysched, rekeysched) \ vldm rekeysched!, {q8}; \ aes##ed.8 qb, q5; \ aes##mcimc.8 qb, qb; \ vldm rekeysched, {q9}; \ aes##ed.8 qb, q6; \ aes##mcimc.8 qb, qb; \ aes##ed.8 qb, q7; \ aes##mcimc.8 qb, qb; \ aes##ed.8 qb, q8; \ aes##mcimc.8 qb, qb; \ vldmia keysched!, {q8}; \ aes##ed.8 qb, q9; \ aes##mcimc.8 qb, qb; \ sub rekeysched, #(1*16); \ aes##ed.8 qb, q10; \ aes##mcimc.8 qb, qb; \ vldm keysched, {q9}; \ aes##ed.8 qb, q11; \ aes##mcimc.8 qb, qb; \ aes##ed.8 qb, q12; \ aes##mcimc.8 qb, qb; \ sub keysched, #16; \ aes##ed.8 qb, q13; \ aes##mcimc.8 qb, qb; \ aes##ed.8 qb, q14; \ aes##mcimc.8 qb, qb; \ aes##ed.8 qb, q15; \ aes##mcimc.8 qb, qb; \ aes##ed.8 qb, q8; \ veor qo, qb, q9; \ #define do_aes_one256(ed, mcimc, qo, qb, keysched, rekeysched) \ vldmia rekeysched!, {q8}; \ aes##ed.8 qb, q5; \ aes##mcimc.8 qb, qb; \ vldmia rekeysched!, {q9}; \ aes##ed.8 qb, q6; \ aes##mcimc.8 qb, qb; \ vldmia rekeysched!, {q10}; \ aes##ed.8 qb, q7; \ aes##mcimc.8 qb, qb; \ vldm rekeysched, {q11}; \ aes##ed.8 qb, q8; \ aes##mcimc.8 qb, qb; \ vldmia keysched!, {q8}; \ aes##ed.8 qb, q9; \ aes##mcimc.8 qb, qb; \ 
aes##ed.8 qb, q10; \ aes##mcimc.8 qb, qb; \ vldmia keysched!, {q9}; \ aes##ed.8 qb, q11; \ aes##mcimc.8 qb, qb; \ sub rekeysched, #(3*16); \ aes##ed.8 qb, q12; \ aes##mcimc.8 qb, qb; \ vldmia keysched!, {q10}; \ aes##ed.8 qb, q13; \ aes##mcimc.8 qb, qb; \ aes##ed.8 qb, q14; \ aes##mcimc.8 qb, qb; \ vldm keysched, {q11}; \ aes##ed.8 qb, q15; \ aes##mcimc.8 qb, qb; \ aes##ed.8 qb, q8; \ aes##mcimc.8 qb, qb; \ aes##ed.8 qb, q9; \ aes##mcimc.8 qb, qb; \ aes##ed.8 qb, q10; \ veor qo, qb, q11; \ sub keysched, #(3*16); \ #define aes_round_4(ed, mcimc, b0, b1, b2, b3, key) \ aes##ed.8 b0, key; \ aes##mcimc.8 b0, b0; \ aes##ed.8 b1, key; \ aes##mcimc.8 b1, b1; \ aes##ed.8 b2, key; \ aes##mcimc.8 b2, b2; \ aes##ed.8 b3, key; \ aes##mcimc.8 b3, b3; #define do_aes_4_128(ed, mcimc, b0, b1, b2, b3) \ aes_round_4(ed, mcimc, b0, b1, b2, b3, q5); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, q6); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, q7); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, q8); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, q9); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, q10); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, q11); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, q12); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, q13); \ aes##ed.8 b0, q14; \ veor b0, b0, q15; \ aes##ed.8 b1, q14; \ veor b1, b1, q15; \ aes##ed.8 b2, q14; \ veor b2, b2, q15; \ aes##ed.8 b3, q14; \ veor b3, b3, q15; #define do_aes_4_128re(ed, mcimc, b0, b1, b2, b3, keysched, rekeysched) \ vldm rekeysched, {q8-q9}; \ do_aes_4_128(ed, mcimc, b0, b1, b2, b3); #define do_aes_4_192(ed, mcimc, b0, b1, b2, b3, keysched, rekeysched) \ vldm rekeysched!, {q8}; \ aes_round_4(ed, mcimc, b0, b1, b2, b3, q5); \ vldm rekeysched, {q9}; \ aes_round_4(ed, mcimc, b0, b1, b2, b3, q6); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, q7); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, q8); \ vldmia keysched!, {q8}; \ aes_round_4(ed, mcimc, b0, b1, b2, b3, q9); \ sub rekeysched, #(1*16); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, q10); \ vldm keysched, {q9}; \ aes_round_4(ed, mcimc, b0, b1, b2, b3, q11); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, q12); \ sub keysched, #16; \ aes_round_4(ed, mcimc, b0, b1, b2, b3, q13); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, q14); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, q15); \ aes##ed.8 b0, q8; \ veor b0, b0, q9; \ aes##ed.8 b1, q8; \ veor b1, b1, q9; \ aes##ed.8 b2, q8; \ veor b2, b2, q9; \ aes##ed.8 b3, q8; \ veor b3, b3, q9; #define do_aes_4_256(ed, mcimc, b0, b1, b2, b3, keysched, rekeysched) \ vldmia rekeysched!, {q8}; \ aes_round_4(ed, mcimc, b0, b1, b2, b3, q5); \ vldmia rekeysched!, {q9}; \ aes_round_4(ed, mcimc, b0, b1, b2, b3, q6); \ vldmia rekeysched!, {q10}; \ aes_round_4(ed, mcimc, b0, b1, b2, b3, q7); \ vldm rekeysched, {q11}; \ aes_round_4(ed, mcimc, b0, b1, b2, b3, q8); \ vldmia keysched!, {q8}; \ aes_round_4(ed, mcimc, b0, b1, b2, b3, q9); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, q10); \ vldmia keysched!, {q9}; \ aes_round_4(ed, mcimc, b0, b1, b2, b3, q11); \ sub rekeysched, #(3*16); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, q12); \ vldmia keysched!, {q10}; \ aes_round_4(ed, mcimc, b0, b1, b2, b3, q13); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, q14); \ vldm keysched, {q11}; \ aes_round_4(ed, mcimc, b0, b1, b2, b3, q15); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, q8); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, q9); \ sub keysched, #(3*16); \ aes##ed.8 b0, q10; \ veor b0, b0, q11; \ aes##ed.8 b1, q10; \ veor b1, b1, q11; \ aes##ed.8 b2, q10; \ veor b2, b2, q11; \ aes##ed.8 b3, q10; \ veor b3, b3, q11; /* Other functional macros */ #define 
CLEAR_REG(reg) veor reg, reg; /* * unsigned int _gcry_aes_enc_armv8_ce(void *keysched, byte *dst, * const byte *src, * unsigned int nrounds); */ .align 3 .globl _gcry_aes_enc_armv8_ce .type _gcry_aes_enc_armv8_ce,%function; _gcry_aes_enc_armv8_ce: /* input: * r0: keysched * r1: dst * r2: src * r3: nrounds */ vldmia r0!, {q1-q3} /* load 3 round keys */ cmp r3, #12 vld1.8 {q0}, [r2] bhi .Lenc1_256 beq .Lenc1_192 .Lenc1_128: .Lenc1_tail: vldmia r0, {q8-q15} /* load 8 round keys */ aese.8 q0, q1 aesmc.8 q0, q0 CLEAR_REG(q1) aese.8 q0, q2 aesmc.8 q0, q0 CLEAR_REG(q2) aese.8 q0, q3 aesmc.8 q0, q0 CLEAR_REG(q3) aese.8 q0, q8 aesmc.8 q0, q0 CLEAR_REG(q8) aese.8 q0, q9 aesmc.8 q0, q0 CLEAR_REG(q9) aese.8 q0, q10 aesmc.8 q0, q0 CLEAR_REG(q10) aese.8 q0, q11 aesmc.8 q0, q0 CLEAR_REG(q11) aese.8 q0, q12 aesmc.8 q0, q0 CLEAR_REG(q12) aese.8 q0, q13 aesmc.8 q0, q0 CLEAR_REG(q13) aese.8 q0, q14 veor q0, q15 CLEAR_REG(q14) CLEAR_REG(q15) vst1.8 {q0}, [r1] CLEAR_REG(q0) mov r0, #0 bx lr .Lenc1_192: aese.8 q0, q1 aesmc.8 q0, q0 vmov q1, q3 aese.8 q0, q2 aesmc.8 q0, q0 vldm r0!, {q2-q3} /* load 3 round keys */ b .Lenc1_tail .Lenc1_256: vldm r0!, {q15} /* load 1 round key */ aese.8 q0, q1 aesmc.8 q0, q0 aese.8 q0, q2 aesmc.8 q0, q0 aese.8 q0, q3 aesmc.8 q0, q0 vldm r0!, {q1-q3} /* load 3 round keys */ aese.8 q0, q15 aesmc.8 q0, q0 b .Lenc1_tail .size _gcry_aes_enc_armv8_ce,.-_gcry_aes_enc_armv8_ce; /* * unsigned int _gcry_aes_dec_armv8_ce(void *keysched, byte *dst, * const byte *src, * unsigned int nrounds); */ .align 3 .globl _gcry_aes_dec_armv8_ce .type _gcry_aes_dec_armv8_ce,%function; _gcry_aes_dec_armv8_ce: /* input: * r0: keysched * r1: dst * r2: src * r3: nrounds */ vldmia r0!, {q1-q3} /* load 3 round keys */ cmp r3, #12 vld1.8 {q0}, [r2] bhi .Ldec1_256 beq .Ldec1_192 .Ldec1_128: .Ldec1_tail: vldmia r0, {q8-q15} /* load 8 round keys */ aesd.8 q0, q1 aesimc.8 q0, q0 CLEAR_REG(q1) aesd.8 q0, q2 aesimc.8 q0, q0 CLEAR_REG(q2) aesd.8 q0, q3 aesimc.8 q0, q0 CLEAR_REG(q3) aesd.8 q0, q8 aesimc.8 q0, q0 CLEAR_REG(q8) aesd.8 q0, q9 aesimc.8 q0, q0 CLEAR_REG(q9) aesd.8 q0, q10 aesimc.8 q0, q0 CLEAR_REG(q10) aesd.8 q0, q11 aesimc.8 q0, q0 CLEAR_REG(q11) aesd.8 q0, q12 aesimc.8 q0, q0 CLEAR_REG(q12) aesd.8 q0, q13 aesimc.8 q0, q0 CLEAR_REG(q13) aesd.8 q0, q14 veor q0, q15 CLEAR_REG(q14) CLEAR_REG(q15) vst1.8 {q0}, [r1] CLEAR_REG(q0) mov r0, #0 bx lr .Ldec1_192: aesd.8 q0, q1 aesimc.8 q0, q0 vmov q1, q3 aesd.8 q0, q2 aesimc.8 q0, q0 vldm r0!, {q2-q3} /* load 3 round keys */ b .Ldec1_tail .Ldec1_256: vldm r0!, {q15} /* load 1 round key */ aesd.8 q0, q1 aesimc.8 q0, q0 aesd.8 q0, q2 aesimc.8 q0, q0 aesd.8 q0, q3 aesimc.8 q0, q0 vldm r0!, {q1-q3} /* load 3 round keys */ aesd.8 q0, q15 aesimc.8 q0, q0 b .Ldec1_tail .size _gcry_aes_dec_armv8_ce,.-_gcry_aes_dec_armv8_ce; /* * void _gcry_aes_cbc_enc_armv8_ce (const void *keysched, * unsigned char *outbuf, * const unsigned char *inbuf, * unsigned char *iv, size_t nblocks, * int cbc_mac, unsigned int nrounds); */ .align 3 .globl _gcry_aes_cbc_enc_armv8_ce .type _gcry_aes_cbc_enc_armv8_ce,%function; _gcry_aes_cbc_enc_armv8_ce: /* input: * r0: keysched * r1: outbuf * r2: inbuf * r3: iv * %st+0: nblocks => r4 * %st+4: cbc_mac => r5 * %st+8: nrounds => r6 */ push {r4-r6,lr} /* 4*4 = 16b */ ldr r4, [sp, #(16+0)] ldr r5, [sp, #(16+4)] cmp r4, #0 ldr r6, [sp, #(16+8)] beq .Lcbc_enc_skip cmp r5, #0 vpush {q4-q7} moveq r5, #16 movne r5, #0 cmp r6, #12 vld1.8 {q1}, [r3] /* load IV */ aes_preload_keys(r0, lr); beq .Lcbc_enc_loop192 bhi .Lcbc_enc_loop256 #define CBC_ENC(bits, ...) 
\ .Lcbc_enc_loop##bits: \ vld1.8 {q0}, [r2]!; /* load plaintext */ \ veor q1, q0, q1; \ subs r4, r4, #1; \ \ do_aes_one##bits(e, mc, q1, q1, ##__VA_ARGS__); \ \ vst1.8 {q1}, [r1], r5; /* store ciphertext */ \ \ bne .Lcbc_enc_loop##bits; \ b .Lcbc_enc_done; CBC_ENC(128) CBC_ENC(192, r0, lr) CBC_ENC(256, r0, lr) #undef CBC_ENC .Lcbc_enc_done: vst1.8 {q1}, [r3] /* store IV */ CLEAR_REG(q0) CLEAR_REG(q1) CLEAR_REG(q2) CLEAR_REG(q3) CLEAR_REG(q8) CLEAR_REG(q9) vpop {q4-q7} CLEAR_REG(q10) CLEAR_REG(q11) CLEAR_REG(q12) CLEAR_REG(q13) CLEAR_REG(q14) .Lcbc_enc_skip: pop {r4-r6,pc} .size _gcry_aes_cbc_enc_armv8_ce,.-_gcry_aes_cbc_enc_armv8_ce; /* * void _gcry_aes_cbc_dec_armv8_ce (const void *keysched, * unsigned char *outbuf, * const unsigned char *inbuf, * unsigned char *iv, unsigned int nrounds); */ .align 3 .globl _gcry_aes_cbc_dec_armv8_ce .type _gcry_aes_cbc_dec_armv8_ce,%function; _gcry_aes_cbc_dec_armv8_ce: /* input: * r0: keysched * r1: outbuf * r2: inbuf * r3: iv * %st+0: nblocks => r4 * %st+4: nrounds => r5 */ push {r4-r6,lr} /* 4*4 = 16b */ ldr r4, [sp, #(16+0)] ldr r5, [sp, #(16+4)] cmp r4, #0 beq .Lcbc_dec_skip vpush {q4-q7} cmp r5, #12 vld1.8 {q0}, [r3] /* load IV */ aes_preload_keys(r0, r6); beq .Lcbc_dec_entry_192 bhi .Lcbc_dec_entry_256 #define CBC_DEC(bits, ...) \ .Lcbc_dec_entry_##bits: \ cmp r4, #4; \ blo .Lcbc_dec_loop_##bits; \ \ .Lcbc_dec_loop4_##bits: \ \ vld1.8 {q1-q2}, [r2]!; /* load ciphertext */ \ sub r4, r4, #4; \ vld1.8 {q3-q4}, [r2]; /* load ciphertext */ \ cmp r4, #4; \ sub r2, #32; \ \ do_aes_4_##bits(d, imc, q1, q2, q3, q4, ##__VA_ARGS__); \ \ veor q1, q1, q0; \ vld1.8 {q0}, [r2]!; /* load next IV */ \ veor q2, q2, q0; \ vld1.8 {q0}, [r2]!; /* load next IV */ \ vst1.8 {q1-q2}, [r1]!; /* store plaintext */ \ veor q3, q3, q0; \ vld1.8 {q0}, [r2]!; /* load next IV */ \ veor q4, q4, q0; \ vld1.8 {q0}, [r2]!; /* load next IV */ \ vst1.8 {q3-q4}, [r1]!; /* store plaintext */ \ \ bhs .Lcbc_dec_loop4_##bits; \ cmp r4, #0; \ beq .Lcbc_dec_done; \ \ .Lcbc_dec_loop_##bits: \ vld1.8 {q1}, [r2]!; /* load ciphertext */ \ subs r4, r4, #1; \ vmov q2, q1; \ \ do_aes_one##bits(d, imc, q1, q1, ##__VA_ARGS__); \ \ veor q1, q1, q0; \ vmov q0, q2; \ vst1.8 {q1}, [r1]!; /* store plaintext */ \ \ bne .Lcbc_dec_loop_##bits; \ b .Lcbc_dec_done; CBC_DEC(128) CBC_DEC(192, r0, r6) CBC_DEC(256, r0, r6) #undef CBC_DEC .Lcbc_dec_done: vst1.8 {q0}, [r3] /* store IV */ CLEAR_REG(q0) CLEAR_REG(q1) CLEAR_REG(q2) CLEAR_REG(q3) CLEAR_REG(q8) CLEAR_REG(q9) vpop {q4-q7} CLEAR_REG(q10) CLEAR_REG(q11) CLEAR_REG(q12) CLEAR_REG(q13) CLEAR_REG(q14) .Lcbc_dec_skip: pop {r4-r6,pc} .size _gcry_aes_cbc_dec_armv8_ce,.-_gcry_aes_cbc_dec_armv8_ce; /* * void _gcry_aes_cfb_enc_armv8_ce (const void *keysched, * unsigned char *outbuf, * const unsigned char *inbuf, * unsigned char *iv, unsigned int nrounds); */ .align 3 .globl _gcry_aes_cfb_enc_armv8_ce .type _gcry_aes_cfb_enc_armv8_ce,%function; _gcry_aes_cfb_enc_armv8_ce: /* input: * r0: keysched * r1: outbuf * r2: inbuf * r3: iv * %st+0: nblocks => r4 * %st+4: nrounds => r5 */ push {r4-r6,lr} /* 4*4 = 16b */ ldr r4, [sp, #(16+0)] ldr r5, [sp, #(16+4)] cmp r4, #0 beq .Lcfb_enc_skip vpush {q4-q7} cmp r5, #12 vld1.8 {q0}, [r3] /* load IV */ aes_preload_keys(r0, r6); beq .Lcfb_enc_entry_192 bhi .Lcfb_enc_entry_256 #define CFB_ENC(bits, ...) 
\ .Lcfb_enc_entry_##bits: \ .Lcfb_enc_loop_##bits: \ vld1.8 {q1}, [r2]!; /* load plaintext */ \ subs r4, r4, #1; \ \ do_aes_one##bits(e, mc, q0, q0, ##__VA_ARGS__); \ \ veor q0, q1, q0; \ vst1.8 {q0}, [r1]!; /* store ciphertext */ \ \ bne .Lcfb_enc_loop_##bits; \ b .Lcfb_enc_done; CFB_ENC(128) CFB_ENC(192, r0, r6) CFB_ENC(256, r0, r6) #undef CFB_ENC .Lcfb_enc_done: vst1.8 {q0}, [r3] /* store IV */ CLEAR_REG(q0) CLEAR_REG(q1) CLEAR_REG(q2) CLEAR_REG(q3) CLEAR_REG(q8) CLEAR_REG(q9) vpop {q4-q7} CLEAR_REG(q10) CLEAR_REG(q11) CLEAR_REG(q12) CLEAR_REG(q13) CLEAR_REG(q14) .Lcfb_enc_skip: pop {r4-r6,pc} .size _gcry_aes_cfb_enc_armv8_ce,.-_gcry_aes_cfb_enc_armv8_ce; /* * void _gcry_aes_cfb_dec_armv8_ce (const void *keysched, * unsigned char *outbuf, * const unsigned char *inbuf, * unsigned char *iv, unsigned int nrounds); */ .align 3 .globl _gcry_aes_cfb_dec_armv8_ce .type _gcry_aes_cfb_dec_armv8_ce,%function; _gcry_aes_cfb_dec_armv8_ce: /* input: * r0: keysched * r1: outbuf * r2: inbuf * r3: iv * %st+0: nblocks => r4 * %st+4: nrounds => r5 */ push {r4-r6,lr} /* 4*4 = 16b */ ldr r4, [sp, #(16+0)] ldr r5, [sp, #(16+4)] cmp r4, #0 beq .Lcfb_dec_skip vpush {q4-q7} cmp r5, #12 vld1.8 {q0}, [r3] /* load IV */ aes_preload_keys(r0, r6); beq .Lcfb_dec_entry_192 bhi .Lcfb_dec_entry_256 #define CFB_DEC(bits, ...) \ .Lcfb_dec_entry_##bits: \ cmp r4, #4; \ blo .Lcfb_dec_loop_##bits; \ \ .Lcfb_dec_loop4_##bits: \ \ vld1.8 {q2-q3}, [r2]!; /* load ciphertext */ \ vmov q1, q0; \ sub r4, r4, #4; \ vld1.8 {q4}, [r2]; /* load ciphertext */ \ sub r2, #32; \ cmp r4, #4; \ \ do_aes_4_##bits(e, mc, q1, q2, q3, q4, ##__VA_ARGS__); \ \ vld1.8 {q0}, [r2]!; /* load ciphertext */ \ veor q1, q1, q0; \ vld1.8 {q0}, [r2]!; /* load ciphertext */ \ veor q2, q2, q0; \ vst1.8 {q1-q2}, [r1]!; /* store plaintext */ \ vld1.8 {q0}, [r2]!; \ veor q3, q3, q0; \ vld1.8 {q0}, [r2]!; /* load next IV / ciphertext */ \ veor q4, q4, q0; \ vst1.8 {q3-q4}, [r1]!; /* store plaintext */ \ \ bhs .Lcfb_dec_loop4_##bits; \ cmp r4, #0; \ beq .Lcfb_dec_done; \ \ .Lcfb_dec_loop_##bits: \ \ vld1.8 {q1}, [r2]!; /* load ciphertext */ \ \ subs r4, r4, #1; \ \ do_aes_one##bits(e, mc, q0, q0, ##__VA_ARGS__); \ \ veor q2, q1, q0; \ vmov q0, q1; \ vst1.8 {q2}, [r1]!; /* store plaintext */ \ \ bne .Lcfb_dec_loop_##bits; \ b .Lcfb_dec_done; CFB_DEC(128) CFB_DEC(192, r0, r6) CFB_DEC(256, r0, r6) #undef CFB_DEC .Lcfb_dec_done: vst1.8 {q0}, [r3] /* store IV */ CLEAR_REG(q0) CLEAR_REG(q1) CLEAR_REG(q2) CLEAR_REG(q3) CLEAR_REG(q8) CLEAR_REG(q9) vpop {q4-q7} CLEAR_REG(q10) CLEAR_REG(q11) CLEAR_REG(q12) CLEAR_REG(q13) CLEAR_REG(q14) .Lcfb_dec_skip: pop {r4-r6,pc} .size _gcry_aes_cfb_dec_armv8_ce,.-_gcry_aes_cfb_dec_armv8_ce; /* * void _gcry_aes_ctr_enc_armv8_ce (const void *keysched, * unsigned char *outbuf, * const unsigned char *inbuf, * unsigned char *iv, unsigned int nrounds); */ .align 3 .globl _gcry_aes_ctr_enc_armv8_ce .type _gcry_aes_ctr_enc_armv8_ce,%function; _gcry_aes_ctr_enc_armv8_ce: /* input: * r0: keysched * r1: outbuf * r2: inbuf * r3: iv * %st+0: nblocks => r4 * %st+4: nrounds => r5 */ vpush {q4-q7} push {r4-r12,lr} /* 4*16 + 4*10 = 104b */ ldr r4, [sp, #(104+0)] ldr r5, [sp, #(104+4)] cmp r4, #0 beq .Lctr_enc_skip cmp r5, #12 ldm r3, {r7-r10} vld1.8 {q0}, [r3] /* load IV */ rev r7, r7 rev r8, r8 rev r9, r9 rev r10, r10 aes_preload_keys(r0, r6); beq .Lctr_enc_entry_192 bhi .Lctr_enc_entry_256 #define CTR_ENC(bits, ...) 
\ .Lctr_enc_entry_##bits: \ cmp r4, #4; \ blo .Lctr_enc_loop_##bits; \ \ .Lctr_enc_loop4_##bits: \ cmp r10, #0xfffffffc; \ sub r4, r4, #4; \ blo .Lctr_enc_loop4_##bits##_nocarry; \ cmp r9, #0xffffffff; \ bne .Lctr_enc_loop4_##bits##_nocarry; \ \ adds r10, #1; \ vmov q1, q0; \ blcs .Lctr_overflow_one; \ rev r11, r10; \ vmov.32 d1[1], r11; \ \ adds r10, #1; \ vmov q2, q0; \ blcs .Lctr_overflow_one; \ rev r11, r10; \ vmov.32 d1[1], r11; \ \ adds r10, #1; \ vmov q3, q0; \ blcs .Lctr_overflow_one; \ rev r11, r10; \ vmov.32 d1[1], r11; \ \ adds r10, #1; \ vmov q4, q0; \ blcs .Lctr_overflow_one; \ rev r11, r10; \ vmov.32 d1[1], r11; \ \ b .Lctr_enc_loop4_##bits##_store_ctr; \ \ .Lctr_enc_loop4_##bits##_nocarry: \ \ veor q2, q2; \ vrev64.8 q1, q0; \ vceq.u32 d5, d5; \ vadd.u64 q3, q2, q2; \ vadd.u64 q4, q3, q2; \ vadd.u64 q0, q3, q3; \ vsub.u64 q2, q1, q2; \ vsub.u64 q3, q1, q3; \ vsub.u64 q4, q1, q4; \ vsub.u64 q0, q1, q0; \ vrev64.8 q1, q1; \ vrev64.8 q2, q2; \ vrev64.8 q3, q3; \ vrev64.8 q0, q0; \ vrev64.8 q4, q4; \ add r10, #4; \ \ .Lctr_enc_loop4_##bits##_store_ctr: \ \ vst1.8 {q0}, [r3]; \ cmp r4, #4; \ vld1.8 {q0}, [r2]!; /* load ciphertext */ \ \ do_aes_4_##bits(e, mc, q1, q2, q3, q4, ##__VA_ARGS__); \ \ veor q1, q1, q0; \ vld1.8 {q0}, [r2]!; /* load ciphertext */ \ vst1.8 {q1}, [r1]!; /* store plaintext */ \ vld1.8 {q1}, [r2]!; /* load ciphertext */ \ veor q2, q2, q0; \ veor q3, q3, q1; \ vld1.8 {q0}, [r2]!; /* load ciphertext */ \ vst1.8 {q2}, [r1]!; /* store plaintext */ \ veor q4, q4, q0; \ vld1.8 {q0}, [r3]; /* reload IV */ \ vst1.8 {q3-q4}, [r1]!; /* store plaintext */ \ \ bhs .Lctr_enc_loop4_##bits; \ cmp r4, #0; \ beq .Lctr_enc_done; \ \ .Lctr_enc_loop_##bits: \ \ adds r10, #1; \ vmov q1, q0; \ blcs .Lctr_overflow_one; \ rev r11, r10; \ subs r4, r4, #1; \ vld1.8 {q2}, [r2]!; /* load ciphertext */ \ vmov.32 d1[1], r11; \ \ do_aes_one##bits(e, mc, q1, q1, ##__VA_ARGS__); \ \ veor q1, q2, q1; \ vst1.8 {q1}, [r1]!; /* store plaintext */ \ \ bne .Lctr_enc_loop_##bits; \ b .Lctr_enc_done; CTR_ENC(128) CTR_ENC(192, r0, r6) CTR_ENC(256, r0, r6) #undef CTR_ENC .Lctr_enc_done: vst1.8 {q0}, [r3] /* store IV */ CLEAR_REG(q0) CLEAR_REG(q1) CLEAR_REG(q2) CLEAR_REG(q3) CLEAR_REG(q8) CLEAR_REG(q9) CLEAR_REG(q10) CLEAR_REG(q11) CLEAR_REG(q12) CLEAR_REG(q13) CLEAR_REG(q14) .Lctr_enc_skip: pop {r4-r12,lr} vpop {q4-q7} bx lr .Lctr_overflow_one: adcs r9, #0 adcs r8, #0 adc r7, #0 rev r11, r9 rev r12, r8 vmov.32 d1[0], r11 rev r11, r7 vmov.32 d0[1], r12 vmov.32 d0[0], r11 bx lr .size _gcry_aes_ctr_enc_armv8_ce,.-_gcry_aes_ctr_enc_armv8_ce; /* * void _gcry_aes_ocb_enc_armv8_ce (const void *keysched, * unsigned char *outbuf, * const unsigned char *inbuf, * unsigned char *offset, * unsigned char *checksum, * unsigned char *L_table, * size_t nblocks, * unsigned int nrounds, * unsigned int blkn); */ .align 3 .globl _gcry_aes_ocb_enc_armv8_ce .type _gcry_aes_ocb_enc_armv8_ce,%function; _gcry_aes_ocb_enc_armv8_ce: /* input: * r0: keysched * r1: outbuf * r2: inbuf * r3: offset * %st+0: checksum => r4 * %st+4: Ls => r5 * %st+8: nblocks => r6 (0 < nblocks <= 32) * %st+12: nrounds => r7 * %st+16: blkn => lr */ vpush {q4-q7} push {r4-r12,lr} /* 4*16 + 4*10 = 104b */ ldr r7, [sp, #(104+12)] ldr r4, [sp, #(104+0)] ldr r5, [sp, #(104+4)] ldr r6, [sp, #(104+8)] ldr lr, [sp, #(104+16)] cmp r7, #12 vld1.8 {q0}, [r3] /* load offset */ aes_preload_keys(r0, r12); beq .Locb_enc_entry_192 bhi .Locb_enc_entry_256 #define OCB_ENC(bits, ...) 
\ .Locb_enc_entry_##bits: \ cmp r6, #4; \ add lr, #1; \ blo .Locb_enc_loop_##bits; \ \ .Locb_enc_loop4_##bits: \ \ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \ /* Checksum_i = Checksum_{i-1} xor P_i */ \ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ \ \ add r9, lr, #1; \ add r10, lr, #2; \ add r11, lr, #3; \ rbit r8, lr; \ add lr, lr, #4; \ rbit r9, r9; \ rbit r10, r10; \ rbit r11, r11; \ clz r8, r8; /* ntz(i+0) */ \ clz r9, r9; /* ntz(i+1) */ \ clz r10, r10; /* ntz(i+2) */ \ clz r11, r11; /* ntz(i+3) */ \ add r8, r5, r8, lsl #4; \ add r9, r5, r9, lsl #4; \ add r10, r5, r10, lsl #4; \ add r11, r5, r11, lsl #4; \ \ sub r6, #4; \ \ vld1.8 {q9}, [r8]; /* load L_{ntz(i+0)} */ \ vld1.8 {q1-q2}, [r2]!; /* load P_i+<0-1> */ \ vld1.8 {q8}, [r4]; /* load Checksum_{i-1} */ \ veor q0, q0, q9; /* Offset_i+0 */ \ vld1.8 {q9}, [r9]; /* load L_{ntz(i+1)} */ \ veor q8, q8, q1; /* Checksum_i+0 */ \ veor q1, q1, q0; /* P_i+0 xor Offset_i+0 */\ vld1.8 {q3-q4}, [r2]!; /* load P_i+<2-3> */ \ vst1.8 {q0}, [r1]!; /* store Offset_i+0 */\ veor q0, q0, q9; /* Offset_i+1 */ \ vld1.8 {q9}, [r10]; /* load L_{ntz(i+2)} */ \ veor q8, q8, q2; /* Checksum_i+1 */ \ veor q2, q2, q0; /* P_i+1 xor Offset_i+1 */\ vst1.8 {q0}, [r1]!; /* store Offset_i+1 */\ veor q0, q0, q9; /* Offset_i+2 */ \ vld1.8 {q9}, [r11]; /* load L_{ntz(i+3)} */ \ veor q8, q8, q3; /* Checksum_i+2 */ \ veor q3, q3, q0; /* P_i+2 xor Offset_i+2 */\ vst1.8 {q0}, [r1]!; /* store Offset_i+2 */\ veor q0, q0, q9; /* Offset_i+3 */ \ veor q8, q8, q4; /* Checksum_i+3 */ \ veor q4, q4, q0; /* P_i+3 xor Offset_i+3 */\ vst1.8 {q0}, [r1]; /* store Offset_i+3 */\ sub r1, #(3*16); \ vst1.8 {q8}, [r4]; /* store Checksum_i+3 */\ \ cmp r6, #4; \ \ do_aes_4_##bits(e, mc, q1, q2, q3, q4, ##__VA_ARGS__); \ \ mov r8, r1; \ vld1.8 {q8-q9}, [r1]!; \ veor q1, q1, q8; \ veor q2, q2, q9; \ vld1.8 {q8-q9}, [r1]!; \ vst1.8 {q1-q2}, [r8]!; \ veor q3, q3, q8; \ veor q4, q4, q9; \ vst1.8 {q3-q4}, [r8]; \ \ bhs .Locb_enc_loop4_##bits; \ cmp r6, #0; \ beq .Locb_enc_done; \ \ .Locb_enc_loop_##bits: \ \ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \ /* Checksum_i = Checksum_{i-1} xor P_i */ \ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ \ \ rbit r8, lr; \ add lr, #1; \ clz r8, r8; /* ntz(i) */ \ add r8, r5, r8, lsl #4; \ \ vld1.8 {q1}, [r2]!; /* load plaintext */ \ vld1.8 {q2}, [r8]; /* load L_{ntz(i)} */ \ vld1.8 {q3}, [r4]; /* load checksum */ \ subs r6, #1; \ veor q0, q0, q2; \ veor q3, q3, q1; \ veor q1, q1, q0; \ vst1.8 {q3}, [r4]; /* store checksum */ \ \ do_aes_one##bits(e, mc, q1, q1, ##__VA_ARGS__); \ \ veor q1, q1, q0; \ vst1.8 {q1}, [r1]!; /* store ciphertext */ \ \ bne .Locb_enc_loop_##bits; \ b .Locb_enc_done; OCB_ENC(128re, r0, r12) OCB_ENC(192, r0, r12) OCB_ENC(256, r0, r12) #undef OCB_ENC .Locb_enc_done: vst1.8 {q0}, [r3] /* store offset */ CLEAR_REG(q0) CLEAR_REG(q1) CLEAR_REG(q2) CLEAR_REG(q3) CLEAR_REG(q8) CLEAR_REG(q9) CLEAR_REG(q10) CLEAR_REG(q11) CLEAR_REG(q12) CLEAR_REG(q13) CLEAR_REG(q14) pop {r4-r12,lr} vpop {q4-q7} bx lr .size _gcry_aes_ocb_enc_armv8_ce,.-_gcry_aes_ocb_enc_armv8_ce; /* * void _gcry_aes_ocb_dec_armv8_ce (const void *keysched, * unsigned char *outbuf, * const unsigned char *inbuf, * unsigned char *offset, * unsigned char *checksum, * unsigned char *L_table, * size_t nblocks, * unsigned int nrounds, * unsigned int blkn); */ .align 3 .globl _gcry_aes_ocb_dec_armv8_ce .type _gcry_aes_ocb_dec_armv8_ce,%function; _gcry_aes_ocb_dec_armv8_ce: /* input: * r0: keysched * r1: outbuf * r2: inbuf * r3: offset * %st+0: checksum => r4 * %st+4: 
Ls => r5 * %st+8: nblocks => r6 (0 < nblocks <= 32) * %st+12: nrounds => r7 * %st+16: blkn => lr */ vpush {q4-q7} push {r4-r12,lr} /* 4*16 + 4*10 = 104b */ ldr r7, [sp, #(104+12)] ldr r4, [sp, #(104+0)] ldr r5, [sp, #(104+4)] ldr r6, [sp, #(104+8)] ldr lr, [sp, #(104+16)] cmp r7, #12 vld1.8 {q0}, [r3] /* load offset */ aes_preload_keys(r0, r12); beq .Locb_dec_entry_192 bhi .Locb_dec_entry_256 #define OCB_DEC(bits, ...) \ .Locb_dec_entry_##bits: \ cmp r6, #4; \ add lr, #1; \ blo .Locb_dec_loop_##bits; \ \ .Locb_dec_loop4_##bits: \ \ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \ /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */ \ /* Checksum_i = Checksum_{i-1} xor P_i */ \ \ add r9, lr, #1; \ add r10, lr, #2; \ add r11, lr, #3; \ rbit r8, lr; \ add lr, lr, #4; \ rbit r9, r9; \ rbit r10, r10; \ rbit r11, r11; \ clz r8, r8; /* ntz(i+0) */ \ clz r9, r9; /* ntz(i+1) */ \ clz r10, r10; /* ntz(i+2) */ \ clz r11, r11; /* ntz(i+3) */ \ add r8, r5, r8, lsl #4; \ add r9, r5, r9, lsl #4; \ add r10, r5, r10, lsl #4; \ add r11, r5, r11, lsl #4; \ \ sub r6, #4; \ \ vld1.8 {q9}, [r8]; /* load L_{ntz(i+0)} */ \ vld1.8 {q1-q2}, [r2]!; /* load P_i+<0-1> */ \ veor q0, q0, q9; /* Offset_i+0 */ \ vld1.8 {q9}, [r9]; /* load L_{ntz(i+1)} */ \ veor q1, q1, q0; /* P_i+0 xor Offset_i+0 */\ vld1.8 {q3-q4}, [r2]!; /* load P_i+<2-3> */ \ vst1.8 {q0}, [r1]!; /* store Offset_i+0 */\ veor q0, q0, q9; /* Offset_i+1 */ \ vld1.8 {q9}, [r10]; /* load L_{ntz(i+2)} */ \ veor q2, q2, q0; /* P_i+1 xor Offset_i+1 */\ vst1.8 {q0}, [r1]!; /* store Offset_i+1 */\ veor q0, q0, q9; /* Offset_i+2 */ \ vld1.8 {q9}, [r11]; /* load L_{ntz(i+3)} */ \ veor q3, q3, q0; /* P_i+2 xor Offset_i+2 */\ vst1.8 {q0}, [r1]!; /* store Offset_i+2 */\ veor q0, q0, q9; /* Offset_i+3 */ \ veor q4, q4, q0; /* P_i+3 xor Offset_i+3 */\ vst1.8 {q0}, [r1]; /* store Offset_i+3 */\ sub r1, #(3*16); \ \ cmp r6, #4; \ \ do_aes_4_##bits(d, imc, q1, q2, q3, q4, ##__VA_ARGS__); \ \ mov r8, r1; \ vld1.8 {q8-q9}, [r1]!; \ veor q1, q1, q8; \ veor q2, q2, q9; \ vld1.8 {q8-q9}, [r1]!; \ vst1.8 {q1-q2}, [r8]!; \ veor q1, q1, q2; \ vld1.8 {q2}, [r4]; /* load Checksum_{i-1} */ \ veor q3, q3, q8; \ veor q1, q1, q3; \ veor q4, q4, q9; \ veor q1, q1, q4; \ vst1.8 {q3-q4}, [r8]; \ veor q2, q2, q1; \ vst1.8 {q2}, [r4]; /* store Checksum_i+3 */ \ \ bhs .Locb_dec_loop4_##bits; \ cmp r6, #0; \ beq .Locb_dec_done; \ \ .Locb_dec_loop_##bits: \ \ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \ /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */ \ /* Checksum_i = Checksum_{i-1} xor P_i */ \ \ rbit r8, lr; \ add lr, #1; \ clz r8, r8; /* ntz(i) */ \ add r8, r5, r8, lsl #4; \ \ vld1.8 {q2}, [r8]; /* load L_{ntz(i)} */ \ vld1.8 {q1}, [r2]!; /* load ciphertext */ \ subs r6, #1; \ veor q0, q0, q2; \ veor q1, q1, q0; \ \ do_aes_one##bits(d, imc, q1, q1, ##__VA_ARGS__) \ \ vld1.8 {q2}, [r4]; /* load checksum */ \ veor q1, q1, q0; \ vst1.8 {q1}, [r1]!; /* store plaintext */ \ veor q2, q2, q1; \ vst1.8 {q2}, [r4]; /* store checksum */ \ \ bne .Locb_dec_loop_##bits; \ b .Locb_dec_done; OCB_DEC(128re, r0, r12) OCB_DEC(192, r0, r12) OCB_DEC(256, r0, r12) #undef OCB_DEC .Locb_dec_done: vst1.8 {q0}, [r3] /* store offset */ CLEAR_REG(q0) CLEAR_REG(q1) CLEAR_REG(q2) CLEAR_REG(q3) CLEAR_REG(q8) CLEAR_REG(q9) CLEAR_REG(q10) CLEAR_REG(q11) CLEAR_REG(q12) CLEAR_REG(q13) CLEAR_REG(q14) pop {r4-r12,lr} vpop {q4-q7} bx lr .size _gcry_aes_ocb_dec_armv8_ce,.-_gcry_aes_ocb_dec_armv8_ce; /* * void _gcry_aes_ocb_auth_armv8_ce (const void *keysched, * const unsigned char *abuf, * unsigned char *offset, * 
unsigned char *checksum, * unsigned char *L_table, * size_t nblocks, * unsigned int nrounds, * unsigned int blkn); */ .align 3 .globl _gcry_aes_ocb_auth_armv8_ce .type _gcry_aes_ocb_auth_armv8_ce,%function; _gcry_aes_ocb_auth_armv8_ce: /* input: * r0: keysched * r1: abuf * r2: offset * r3: checksum * %st+0: Ls => r5 * %st+4: nblocks => r6 (0 < nblocks <= 32) * %st+8: nrounds => r7 * %st+12: blkn => lr */ vpush {q4-q7} push {r4-r12,lr} /* 4*16 + 4*10 = 104b */ ldr r7, [sp, #(104+8)] ldr r5, [sp, #(104+0)] ldr r6, [sp, #(104+4)] ldr lr, [sp, #(104+12)] cmp r7, #12 vld1.8 {q0}, [r2] /* load offset */ aes_preload_keys(r0, r12); beq .Locb_auth_entry_192 bhi .Locb_auth_entry_256 #define OCB_AUTH(bits, ...) \ .Locb_auth_entry_##bits: \ cmp r6, #4; \ add lr, #1; \ blo .Locb_auth_loop_##bits; \ \ .Locb_auth_loop4_##bits: \ \ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ \ \ add r9, lr, #1; \ add r10, lr, #2; \ add r11, lr, #3; \ rbit r8, lr; \ add lr, lr, #4; \ rbit r9, r9; \ rbit r10, r10; \ rbit r11, r11; \ clz r8, r8; /* ntz(i+0) */ \ clz r9, r9; /* ntz(i+1) */ \ clz r10, r10; /* ntz(i+2) */ \ clz r11, r11; /* ntz(i+3) */ \ add r8, r5, r8, lsl #4; \ add r9, r5, r9, lsl #4; \ add r10, r5, r10, lsl #4; \ add r11, r5, r11, lsl #4; \ \ sub r6, #4; \ \ vld1.8 {q9}, [r8]; /* load L_{ntz(i+0)} */ \ vld1.8 {q1-q2}, [r1]!; /* load A_i+<0-1> */ \ veor q0, q0, q9; /* Offset_i+0 */ \ vld1.8 {q9}, [r9]; /* load L_{ntz(i+1)} */ \ veor q1, q1, q0; /* A_i+0 xor Offset_i+0 */\ vld1.8 {q3-q4}, [r1]!; /* load A_i+<2-3> */ \ veor q0, q0, q9; /* Offset_i+1 */ \ vld1.8 {q9}, [r10]; /* load L_{ntz(i+2)} */ \ veor q2, q2, q0; /* A_i+1 xor Offset_i+1 */\ veor q0, q0, q9; /* Offset_i+2 */ \ vld1.8 {q9}, [r11]; /* load L_{ntz(i+3)} */ \ veor q3, q3, q0; /* A_i+2 xor Offset_i+2 */\ veor q0, q0, q9; /* Offset_i+3 */ \ veor q4, q4, q0; /* A_i+3 xor Offset_i+3 */\ \ cmp r6, #4; \ \ do_aes_4_##bits(e, mc, q1, q2, q3, q4, ##__VA_ARGS__); \ \ veor q1, q1, q2; \ veor q3, q3, q4; \ vld1.8 {q2}, [r3]; \ veor q1, q1, q3; \ veor q2, q2, q1; \ vst1.8 {q2}, [r3]; \ \ bhs .Locb_auth_loop4_##bits; \ cmp r6, #0; \ beq .Locb_auth_done; \ \ .Locb_auth_loop_##bits: \ \ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ \ \ rbit r8, lr; \ add lr, #1; \ clz r8, r8; /* ntz(i) */ \ add r8, r5, r8, lsl #4; \ \ vld1.8 {q2}, [r8]; /* load L_{ntz(i)} */ \ vld1.8 {q1}, [r1]!; /* load aadtext */ \ subs r6, #1; \ veor q0, q0, q2; \ vld1.8 {q2}, [r3]; /* load checksum */ \ veor q1, q1, q0; \ \ do_aes_one##bits(e, mc, q1, q1, ##__VA_ARGS__) \ \ veor q2, q2, q1; \ vst1.8 {q2}, [r3]; /* store checksum */ \ \ bne .Locb_auth_loop_##bits; \ b .Locb_auth_done; OCB_AUTH(128re, r0, r12) OCB_AUTH(192, r0, r12) OCB_AUTH(256, r0, r12) #undef OCB_AUTH .Locb_auth_done: vst1.8 {q0}, [r2] /* store offset */ CLEAR_REG(q0) CLEAR_REG(q1) CLEAR_REG(q2) CLEAR_REG(q3) CLEAR_REG(q8) CLEAR_REG(q9) CLEAR_REG(q10) CLEAR_REG(q11) CLEAR_REG(q12) CLEAR_REG(q13) CLEAR_REG(q14) pop {r4-r12,lr} vpop {q4-q7} bx lr .size _gcry_aes_ocb_auth_armv8_ce,.-_gcry_aes_ocb_auth_armv8_ce; + +/* + * void _gcry_aes_xts_enc_armv8_ce (const void *keysched, + * unsigned char *outbuf, + * const unsigned char *inbuf, + * unsigned char *iv, unsigned int nrounds); + */ + +.align 3 +.globl _gcry_aes_xts_enc_armv8_ce +.type _gcry_aes_xts_enc_armv8_ce,%function; +_gcry_aes_xts_enc_armv8_ce: + /* input: + * r0: keysched + * r1: outbuf + * r2: inbuf + * r3: iv + * %st+0: nblocks => r4 + * %st+4: 
nrounds => r5 + */ + + vpush {q4-q7} + push {r4-r12,lr} /* 4*16 + 4*10 = 104b */ + ldr r4, [sp, #(104+0)] + ldr r5, [sp, #(104+4)] + cmp r4, #0 + beq .Lxts_enc_skip + + cmp r5, #12 + + vld1.8 {q0}, [r3] /* load tweak */ + mov r7, #0x87; + + aes_preload_keys(r0, r6); + + beq .Lxts_enc_entry_192 + bhi .Lxts_enc_entry_256 + +#define CTR_XTS(bits, ...) \ + .Lxts_enc_entry_##bits: \ + cmp r4, #4; \ + blo .Lxts_enc_loop_##bits; \ + \ + .Lxts_enc_loop4_##bits: \ + sub r4, r4, #4; \ + veor q9, q9, q9; \ + \ + vld1.8 {q1-q2}, [r2]!; /* load plaintext */ \ + veor q1, q1, q0; \ + cmp r4, #4; \ + vmov.u32 d18[0], r7; \ + vst1.8 {q0}, [r1]!; /* store tweak0 to temp */ \ + \ + vshr.s64 d16, d1, #63; \ + vshr.u64 d17, d0, #63; \ + vadd.u64 q0, q0, q0; \ + vand d16, d16, d18; \ + veor q0, q0, q8; \ + \ + vld1.8 {q3-q4}, [r2]!; /* load plaintext */ \ + veor q2, q2, q0; \ + vst1.8 {q0}, [r1]!; /* store tweak1 to temp */ \ + \ + vshr.s64 d16, d1, #63; \ + vshr.u64 d17, d0, #63; \ + vadd.u64 q0, q0, q0; \ + vand d16, d16, d18; \ + veor q0, q0, q8; \ + \ + veor q3, q3, q0; \ + vst1.8 {q0}, [r1]!; /* store tweak2 to temp */ \ + \ + vshr.s64 d16, d1, #63; \ + vshr.u64 d17, d0, #63; \ + vadd.u64 q0, q0, q0; \ + vand d16, d16, d18; \ + veor q0, q0, q8; \ + \ + veor q4, q4, q0; \ + vst1.8 {q0}, [r1]; /* store tweak3 to temp */ \ + sub r1, r1, #48; \ + \ + vshr.s64 d16, d1, #63; \ + vshr.u64 d17, d0, #63; \ + vadd.u64 q0, q0, q0; \ + vand d16, d16, d18; \ + veor q0, q0, q8; \ + \ + do_aes_4_##bits(e, mc, q1, q2, q3, q4, ##__VA_ARGS__); \ + \ + vld1.8 {q8-q9}, [r1]!; /* load tweak from temp */ \ + veor q1, q1, q8; \ + veor q2, q2, q9; \ + vld1.8 {q8-q9}, [r1]; /* load tweak from temp */ \ + sub r1, r1, #32; \ + veor q3, q3, q8; \ + veor q4, q4, q9; \ + vst1.8 {q1-q2}, [r1]!; /* store plaintext */ \ + vst1.8 {q3-q4}, [r1]!; /* store plaintext */ \ + \ + bhs .Lxts_enc_loop4_##bits; \ + cmp r4, #0; \ + beq .Lxts_enc_done; \ + \ + .Lxts_enc_loop_##bits: \ + \ + vld1.8 {q1}, [r2]!; /* load ciphertext */ \ + \ + veor q9, q9, q9; \ + veor q1, q1, q0; \ + vmov.u32 d18[0], r7; \ + vmov q2, q0; \ + \ + vshr.s64 d16, d1, #63; \ + vshr.u64 d17, d0, #63; \ + vadd.u64 q0, q0, q0; \ + vand d16, d16, d18; \ + veor q0, q0, q8; \ + subs r4, r4, #1; \ + \ + do_aes_one##bits(e, mc, q1, q1, ##__VA_ARGS__); \ + \ + veor q1, q1, q2; \ + vst1.8 {q1}, [r1]!; /* store plaintext */ \ + \ + bne .Lxts_enc_loop_##bits; \ + b .Lxts_enc_done; + + CTR_XTS(128re, r0, r6) + CTR_XTS(192, r0, r6) + CTR_XTS(256, r0, r6) + +#undef CTR_XTS + +.Lxts_enc_done: + vst1.8 {q0}, [r3] /* store tweak */ + + CLEAR_REG(q0) + CLEAR_REG(q1) + CLEAR_REG(q2) + CLEAR_REG(q3) + CLEAR_REG(q8) + CLEAR_REG(q9) + CLEAR_REG(q10) + CLEAR_REG(q11) + CLEAR_REG(q12) + CLEAR_REG(q13) + CLEAR_REG(q14) + +.Lxts_enc_skip: + pop {r4-r12,lr} + vpop {q4-q7} + bx lr +.size _gcry_aes_xts_enc_armv8_ce,.-_gcry_aes_xts_enc_armv8_ce; + + +/* + * void _gcry_aes_xts_dec_armv8_ce (const void *keysched, + * unsigned char *outbuf, + * const unsigned char *inbuf, + * unsigned char *iv, unsigned int nrounds); + */ + +.align 3 +.globl _gcry_aes_xts_dec_armv8_ce +.type _gcry_aes_xts_dec_armv8_ce,%function; +_gcry_aes_xts_dec_armv8_ce: + /* input: + * r0: keysched + * r1: outbuf + * r2: inbuf + * r3: iv + * %st+0: nblocks => r4 + * %st+4: nrounds => r5 + */ + + vpush {q4-q7} + push {r4-r12,lr} /* 4*16 + 4*10 = 104b */ + ldr r4, [sp, #(104+0)] + ldr r5, [sp, #(104+4)] + cmp r4, #0 + beq .Lxts_dec_skip + + cmp r5, #12 + + vld1.8 {q0}, [r3] /* load tweak */ + mov r7, #0x87; + + aes_preload_keys(r0, r6); + 
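+	/* The tweak update used in the loops below multiplies the 128-bit
+	 * tweak by x in GF(2^128) (low-byte feedback constant 0x87, kept in
+	 * r7).  A rough C sketch of what each vshr/vadd/vand/veor group
+	 * computes on the two little-endian 64-bit tweak halves (variable
+	 * names here are illustrative only, not from this patch):
+	 *
+	 *   uint64_t carry = (uint64_t)((int64_t)hi >> 63) & 0x87;
+	 *   hi = (hi << 1) ^ (lo >> 63);
+	 *   lo = (lo << 1) ^ carry;
+	 */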
+ beq .Lxts_dec_entry_192 + bhi .Lxts_dec_entry_256 + +#define CTR_XTS(bits, ...) \ + .Lxts_dec_entry_##bits: \ + cmp r4, #4; \ + blo .Lxts_dec_loop_##bits; \ + \ + .Lxts_dec_loop4_##bits: \ + sub r4, r4, #4; \ + veor q9, q9, q9; \ + \ + vld1.8 {q1-q2}, [r2]!; /* load plaintext */ \ + veor q1, q1, q0; \ + cmp r4, #4; \ + vmov.u32 d18[0], r7; \ + vst1.8 {q0}, [r1]!; /* store tweak0 to temp */ \ + \ + vshr.s64 d16, d1, #63; \ + vshr.u64 d17, d0, #63; \ + vadd.u64 q0, q0, q0; \ + vand d16, d16, d18; \ + veor q0, q0, q8; \ + \ + vld1.8 {q3-q4}, [r2]!; /* load plaintext */ \ + veor q2, q2, q0; \ + vst1.8 {q0}, [r1]!; /* store tweak1 to temp */ \ + \ + vshr.s64 d16, d1, #63; \ + vshr.u64 d17, d0, #63; \ + vadd.u64 q0, q0, q0; \ + vand d16, d16, d18; \ + veor q0, q0, q8; \ + \ + veor q3, q3, q0; \ + vst1.8 {q0}, [r1]!; /* store tweak2 to temp */ \ + \ + vshr.s64 d16, d1, #63; \ + vshr.u64 d17, d0, #63; \ + vadd.u64 q0, q0, q0; \ + vand d16, d16, d18; \ + veor q0, q0, q8; \ + \ + veor q4, q4, q0; \ + vst1.8 {q0}, [r1]; /* store tweak3 to temp */ \ + sub r1, r1, #48; \ + \ + vshr.s64 d16, d1, #63; \ + vshr.u64 d17, d0, #63; \ + vadd.u64 q0, q0, q0; \ + vand d16, d16, d18; \ + veor q0, q0, q8; \ + \ + do_aes_4_##bits(d, imc, q1, q2, q3, q4, ##__VA_ARGS__); \ + \ + vld1.8 {q8-q9}, [r1]!; /* load tweak from temp */ \ + veor q1, q1, q8; \ + veor q2, q2, q9; \ + vld1.8 {q8-q9}, [r1]; /* load tweak from temp */ \ + sub r1, r1, #32; \ + veor q3, q3, q8; \ + veor q4, q4, q9; \ + vst1.8 {q1-q2}, [r1]!; /* store plaintext */ \ + vst1.8 {q3-q4}, [r1]!; /* store plaintext */ \ + \ + bhs .Lxts_dec_loop4_##bits; \ + cmp r4, #0; \ + beq .Lxts_dec_done; \ + \ + .Lxts_dec_loop_##bits: \ + \ + vld1.8 {q1}, [r2]!; /* load ciphertext */ \ + \ + veor q9, q9, q9; \ + veor q1, q1, q0; \ + vmov.u32 d18[0], r7; \ + vmov q2, q0; \ + \ + vshr.s64 d16, d1, #63; \ + vshr.u64 d17, d0, #63; \ + vadd.u64 q0, q0, q0; \ + vand d16, d16, d18; \ + veor q0, q0, q8; \ + subs r4, r4, #1; \ + \ + do_aes_one##bits(d, imc, q1, q1, ##__VA_ARGS__); \ + \ + veor q1, q1, q2; \ + vst1.8 {q1}, [r1]!; /* store plaintext */ \ + \ + bne .Lxts_dec_loop_##bits; \ + b .Lxts_dec_done; + + CTR_XTS(128re, r0, r6) + CTR_XTS(192, r0, r6) + CTR_XTS(256, r0, r6) + +#undef CTR_XTS + +.Lxts_dec_done: + vst1.8 {q0}, [r3] /* store tweak */ + + CLEAR_REG(q0) + CLEAR_REG(q1) + CLEAR_REG(q2) + CLEAR_REG(q3) + CLEAR_REG(q8) + CLEAR_REG(q9) + CLEAR_REG(q10) + CLEAR_REG(q11) + CLEAR_REG(q12) + CLEAR_REG(q13) + CLEAR_REG(q14) + +.Lxts_dec_skip: + pop {r4-r12,lr} + vpop {q4-q7} + bx lr +.size _gcry_aes_xts_dec_armv8_ce,.-_gcry_aes_xts_dec_armv8_ce; + + /* * u32 _gcry_aes_sbox4_armv8_ce(u32 in4b); */ .align 3 .globl _gcry_aes_sbox4_armv8_ce .type _gcry_aes_sbox4_armv8_ce,%function; _gcry_aes_sbox4_armv8_ce: /* See "Gouvêa, C. P. L. & López, J. Implementing GCM on ARMv8. Topics in * Cryptology — CT-RSA 2015" for details. 
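 *
 * Roughly: AESE with an all-zero round key computes
 * ShiftRows(SubBytes(state)).  The vector is pre-filled with 0x52, whose
 * S-box value is 0x00, so after AESE only the four input bytes are
 * non-zero, merely scattered by ShiftRows; the veor/vpadd below fold the
 * four 32-bit state words back into a single word.  Conceptually
 * (illustrative C only):
 *
 *   for (i = 0; i < 4; i++)
 *     out_byte[i] = sbox[in_byte[i]];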
*/ vmov.i8 q0, #0x52 vmov.i8 q1, #0 vmov s0, r0 aese.8 q0, q1 veor d0, d1 vpadd.i32 d0, d0, d1 vmov r0, s0 CLEAR_REG(q0) bx lr .size _gcry_aes_sbox4_armv8_ce,.-_gcry_aes_sbox4_armv8_ce; /* * void _gcry_aes_invmixcol_armv8_ce(void *dst, const void *src); */ .align 3 .globl _gcry_aes_invmixcol_armv8_ce .type _gcry_aes_invmixcol_armv8_ce,%function; _gcry_aes_invmixcol_armv8_ce: vld1.8 {q0}, [r1] aesimc.8 q0, q0 vst1.8 {q0}, [r0] CLEAR_REG(q0) bx lr .size _gcry_aes_invmixcol_armv8_ce,.-_gcry_aes_invmixcol_armv8_ce; #endif diff --git a/cipher/rijndael-armv8-aarch64-ce.S b/cipher/rijndael-armv8-aarch64-ce.S index 708ef340..40097a71 100644 --- a/cipher/rijndael-armv8-aarch64-ce.S +++ b/cipher/rijndael-armv8-aarch64-ce.S @@ -1,1314 +1,1588 @@ /* rijndael-armv8-aarch64-ce.S - ARMv8/CE accelerated AES * Copyright (C) 2016 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . */ #include #if defined(__AARCH64EL__) && \ defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \ defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO) .cpu generic+simd+crypto .text #define GET_DATA_POINTER(reg, name) \ adrp reg, :got:name ; \ ldr reg, [reg, #:got_lo12:name] ; /* Register macros */ #define vk0 v17 #define vk1 v18 #define vk2 v19 #define vk3 v20 #define vk4 v21 #define vk5 v22 #define vk6 v23 #define vk7 v24 #define vk8 v25 #define vk9 v26 #define vk10 v27 #define vk11 v28 #define vk12 v29 #define vk13 v30 #define vk14 v31 /* AES macros */ #define aes_preload_keys(keysched, nrounds) \ cmp nrounds, #12; \ ld1 {vk0.16b-vk3.16b}, [keysched], #64; \ ld1 {vk4.16b-vk7.16b}, [keysched], #64; \ ld1 {vk8.16b-vk10.16b}, [keysched], #48; \ b.lo 1f; \ ld1 {vk11.16b-vk12.16b}, [keysched], #32; \ b.eq 1f; \ ld1 {vk13.16b-vk14.16b}, [keysched]; \ 1: ; #define do_aes_one128(ed, mcimc, vo, vb) \ aes##ed vb.16b, vk0.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk1.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk2.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk3.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk4.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk5.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk6.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk7.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk8.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk9.16b; \ eor vo.16b, vb.16b, vk10.16b; #define do_aes_one192(ed, mcimc, vo, vb) \ aes##ed vb.16b, vk0.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk1.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk2.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk3.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk4.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk5.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk6.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk7.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk8.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk9.16b; \ aes##mcimc vb.16b, 
vb.16b; \ aes##ed vb.16b, vk10.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk11.16b; \ eor vo.16b, vb.16b, vk12.16b; #define do_aes_one256(ed, mcimc, vo, vb) \ aes##ed vb.16b, vk0.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk1.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk2.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk3.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk4.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk5.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk6.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk7.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk8.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk9.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk10.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk11.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk12.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk13.16b; \ eor vo.16b, vb.16b, vk14.16b; #define aes_round_4(ed, mcimc, b0, b1, b2, b3, key) \ aes##ed b0.16b, key.16b; \ aes##mcimc b0.16b, b0.16b; \ aes##ed b1.16b, key.16b; \ aes##mcimc b1.16b, b1.16b; \ aes##ed b2.16b, key.16b; \ aes##mcimc b2.16b, b2.16b; \ aes##ed b3.16b, key.16b; \ aes##mcimc b3.16b, b3.16b; #define aes_lastround_4(ed, b0, b1, b2, b3, key1, key2) \ aes##ed b0.16b, key1.16b; \ eor b0.16b, b0.16b, key2.16b; \ aes##ed b1.16b, key1.16b; \ eor b1.16b, b1.16b, key2.16b; \ aes##ed b2.16b, key1.16b; \ eor b2.16b, b2.16b, key2.16b; \ aes##ed b3.16b, key1.16b; \ eor b3.16b, b3.16b, key2.16b; #define do_aes_4_128(ed, mcimc, b0, b1, b2, b3) \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk0); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk1); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk2); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk3); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk4); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk5); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk6); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk7); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk8); \ aes_lastround_4(ed, b0, b1, b2, b3, vk9, vk10); #define do_aes_4_192(ed, mcimc, b0, b1, b2, b3) \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk0); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk1); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk2); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk3); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk4); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk5); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk6); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk7); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk8); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk9); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk10); \ aes_lastround_4(ed, b0, b1, b2, b3, vk11, vk12); #define do_aes_4_256(ed, mcimc, b0, b1, b2, b3) \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk0); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk1); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk2); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk3); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk4); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk5); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk6); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk7); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk8); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk9); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk10); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk11); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk12); \ aes_lastround_4(ed, b0, b1, b2, b3, vk13, vk14); /* Other functional macros */ #define CLEAR_REG(reg) eor reg.16b, reg.16b, reg.16b; #define aes_clear_keys(nrounds) \ cmp nrounds, #12; \ CLEAR_REG(vk0); \ CLEAR_REG(vk1); \ 
CLEAR_REG(vk2); \ CLEAR_REG(vk3); \ CLEAR_REG(vk4); \ CLEAR_REG(vk5); \ CLEAR_REG(vk6); \ CLEAR_REG(vk7); \ CLEAR_REG(vk9); \ CLEAR_REG(vk8); \ CLEAR_REG(vk10); \ b.lo 1f; \ CLEAR_REG(vk11); \ CLEAR_REG(vk12); \ b.eq 1f; \ CLEAR_REG(vk13); \ CLEAR_REG(vk14); \ 1: ; /* * unsigned int _gcry_aes_enc_armv8_ce(void *keysched, byte *dst, * const byte *src, * unsigned int nrounds); */ .align 3 .globl _gcry_aes_enc_armv8_ce .type _gcry_aes_enc_armv8_ce,%function; _gcry_aes_enc_armv8_ce: /* input: * x0: keysched * x1: dst * x2: src * w3: nrounds */ aes_preload_keys(x0, w3); ld1 {v0.16b}, [x2] b.hi .Lenc1_256 b.eq .Lenc1_192 .Lenc1_128: do_aes_one128(e, mc, v0, v0); .Lenc1_tail: CLEAR_REG(vk0) CLEAR_REG(vk1) CLEAR_REG(vk2) CLEAR_REG(vk3) CLEAR_REG(vk4) CLEAR_REG(vk5) CLEAR_REG(vk6) CLEAR_REG(vk7) CLEAR_REG(vk8) CLEAR_REG(vk9) CLEAR_REG(vk10) st1 {v0.16b}, [x1] CLEAR_REG(v0) mov x0, #0 ret .Lenc1_192: do_aes_one192(e, mc, v0, v0); CLEAR_REG(vk11) CLEAR_REG(vk12) b .Lenc1_tail .Lenc1_256: do_aes_one256(e, mc, v0, v0); CLEAR_REG(vk11) CLEAR_REG(vk12) CLEAR_REG(vk13) CLEAR_REG(vk14) b .Lenc1_tail .size _gcry_aes_enc_armv8_ce,.-_gcry_aes_enc_armv8_ce; /* * unsigned int _gcry_aes_dec_armv8_ce(void *keysched, byte *dst, * const byte *src, * unsigned int nrounds); */ .align 3 .globl _gcry_aes_dec_armv8_ce .type _gcry_aes_dec_armv8_ce,%function; _gcry_aes_dec_armv8_ce: /* input: * x0: keysched * x1: dst * x2: src * w3: nrounds */ aes_preload_keys(x0, w3); ld1 {v0.16b}, [x2] b.hi .Ldec1_256 b.eq .Ldec1_192 .Ldec1_128: do_aes_one128(d, imc, v0, v0); .Ldec1_tail: CLEAR_REG(vk0) CLEAR_REG(vk1) CLEAR_REG(vk2) CLEAR_REG(vk3) CLEAR_REG(vk4) CLEAR_REG(vk5) CLEAR_REG(vk6) CLEAR_REG(vk7) CLEAR_REG(vk8) CLEAR_REG(vk9) CLEAR_REG(vk10) st1 {v0.16b}, [x1] CLEAR_REG(v0) mov x0, #0 ret .Ldec1_192: do_aes_one192(d, imc, v0, v0); CLEAR_REG(vk11) CLEAR_REG(vk12) b .Ldec1_tail .Ldec1_256: do_aes_one256(d, imc, v0, v0); CLEAR_REG(vk11) CLEAR_REG(vk12) CLEAR_REG(vk13) CLEAR_REG(vk14) b .Ldec1_tail .size _gcry_aes_dec_armv8_ce,.-_gcry_aes_dec_armv8_ce; /* * void _gcry_aes_cbc_enc_armv8_ce (const void *keysched, * unsigned char *outbuf, * const unsigned char *inbuf, * unsigned char *iv, size_t nblocks, * int cbc_mac, unsigned int nrounds); */ .align 3 .globl _gcry_aes_cbc_enc_armv8_ce .type _gcry_aes_cbc_enc_armv8_ce,%function; _gcry_aes_cbc_enc_armv8_ce: /* input: * x0: keysched * x1: outbuf * x2: inbuf * x3: iv * x4: nblocks * w5: cbc_mac * w6: nrounds */ cbz x4, .Lcbc_enc_skip cmp w5, #0 ld1 {v1.16b}, [x3] /* load IV */ cset x5, eq aes_preload_keys(x0, w6); lsl x5, x5, #4 b.eq .Lcbc_enc_loop192 b.hi .Lcbc_enc_loop256 #define CBC_ENC(bits) \ .Lcbc_enc_loop##bits: \ ld1 {v0.16b}, [x2], #16; /* load plaintext */ \ eor v1.16b, v0.16b, v1.16b; \ sub x4, x4, #1; \ \ do_aes_one##bits(e, mc, v1, v1); \ \ st1 {v1.16b}, [x1], x5; /* store ciphertext */ \ \ cbnz x4, .Lcbc_enc_loop##bits; \ b .Lcbc_enc_done; CBC_ENC(128) CBC_ENC(192) CBC_ENC(256) #undef CBC_ENC .Lcbc_enc_done: aes_clear_keys(w6) st1 {v1.16b}, [x3] /* store IV */ CLEAR_REG(v1) CLEAR_REG(v0) .Lcbc_enc_skip: ret .size _gcry_aes_cbc_enc_armv8_ce,.-_gcry_aes_cbc_enc_armv8_ce; /* * void _gcry_aes_cbc_dec_armv8_ce (const void *keysched, * unsigned char *outbuf, * const unsigned char *inbuf, * unsigned char *iv, unsigned int nrounds); */ .align 3 .globl _gcry_aes_cbc_dec_armv8_ce .type _gcry_aes_cbc_dec_armv8_ce,%function; _gcry_aes_cbc_dec_armv8_ce: /* input: * x0: keysched * x1: outbuf * x2: inbuf * x3: iv * x4: nblocks * w5: nrounds */ cbz x4, .Lcbc_dec_skip ld1 {v0.16b}, [x3] 
/* load IV */ aes_preload_keys(x0, w5); b.eq .Lcbc_dec_entry_192 b.hi .Lcbc_dec_entry_256 #define CBC_DEC(bits) \ .Lcbc_dec_entry_##bits: \ cmp x4, #4; \ b.lo .Lcbc_dec_loop_##bits; \ \ .Lcbc_dec_loop4_##bits: \ \ ld1 {v1.16b-v4.16b}, [x2], #64; /* load ciphertext */ \ sub x4, x4, #4; \ mov v5.16b, v1.16b; \ mov v6.16b, v2.16b; \ mov v7.16b, v3.16b; \ mov v16.16b, v4.16b; \ cmp x4, #4; \ \ do_aes_4_##bits(d, imc, v1, v2, v3, v4); \ \ eor v1.16b, v1.16b, v0.16b; \ eor v2.16b, v2.16b, v5.16b; \ st1 {v1.16b-v2.16b}, [x1], #32; /* store plaintext */ \ eor v3.16b, v3.16b, v6.16b; \ eor v4.16b, v4.16b, v7.16b; \ mov v0.16b, v16.16b; /* next IV */ \ st1 {v3.16b-v4.16b}, [x1], #32; /* store plaintext */ \ \ b.hs .Lcbc_dec_loop4_##bits; \ CLEAR_REG(v3); \ CLEAR_REG(v4); \ CLEAR_REG(v5); \ CLEAR_REG(v6); \ CLEAR_REG(v7); \ CLEAR_REG(v16); \ cbz x4, .Lcbc_dec_done; \ \ .Lcbc_dec_loop_##bits: \ ld1 {v1.16b}, [x2], #16; /* load ciphertext */ \ sub x4, x4, #1; \ mov v2.16b, v1.16b; \ \ do_aes_one##bits(d, imc, v1, v1); \ \ eor v1.16b, v1.16b, v0.16b; \ mov v0.16b, v2.16b; \ st1 {v1.16b}, [x1], #16; /* store plaintext */ \ \ cbnz x4, .Lcbc_dec_loop_##bits; \ b .Lcbc_dec_done; CBC_DEC(128) CBC_DEC(192) CBC_DEC(256) #undef CBC_DEC .Lcbc_dec_done: aes_clear_keys(w5) st1 {v0.16b}, [x3] /* store IV */ CLEAR_REG(v0) CLEAR_REG(v1) CLEAR_REG(v2) .Lcbc_dec_skip: ret .size _gcry_aes_cbc_dec_armv8_ce,.-_gcry_aes_cbc_dec_armv8_ce; /* * void _gcry_aes_ctr_enc_armv8_ce (const void *keysched, * unsigned char *outbuf, * const unsigned char *inbuf, * unsigned char *iv, unsigned int nrounds); */ .align 3 .globl _gcry_aes_ctr_enc_armv8_ce .type _gcry_aes_ctr_enc_armv8_ce,%function; _gcry_aes_ctr_enc_armv8_ce: /* input: * r0: keysched * r1: outbuf * r2: inbuf * r3: iv * x4: nblocks * w5: nrounds */ cbz x4, .Lctr_enc_skip mov x6, #1 movi v16.16b, #0 mov v16.D[1], x6 /* load IV */ ldp x9, x10, [x3] ld1 {v0.16b}, [x3] rev x9, x9 rev x10, x10 aes_preload_keys(x0, w5); b.eq .Lctr_enc_entry_192 b.hi .Lctr_enc_entry_256 #define CTR_ENC(bits) \ .Lctr_enc_entry_##bits: \ cmp x4, #4; \ b.lo .Lctr_enc_loop_##bits; \ \ .Lctr_enc_loop4_##bits: \ cmp x10, #0xfffffffffffffffc; \ sub x4, x4, #4; \ b.lo .Lctr_enc_loop4_##bits##_nocarry; \ \ adds x10, x10, #1; \ mov v1.16b, v0.16b; \ adc x9, x9, xzr; \ mov v2.D[1], x10; \ mov v2.D[0], x9; \ \ adds x10, x10, #1; \ rev64 v2.16b, v2.16b; \ adc x9, x9, xzr; \ mov v3.D[1], x10; \ mov v3.D[0], x9; \ \ adds x10, x10, #1; \ rev64 v3.16b, v3.16b; \ adc x9, x9, xzr; \ mov v4.D[1], x10; \ mov v4.D[0], x9; \ \ adds x10, x10, #1; \ rev64 v4.16b, v4.16b; \ adc x9, x9, xzr; \ mov v0.D[1], x10; \ mov v0.D[0], x9; \ rev64 v0.16b, v0.16b; \ \ b .Lctr_enc_loop4_##bits##_store_ctr; \ \ .Lctr_enc_loop4_##bits##_nocarry: \ \ add v3.2d, v16.2d, v16.2d; /* 2 */ \ rev64 v6.16b, v0.16b; \ add x10, x10, #4; \ add v4.2d, v3.2d, v16.2d; /* 3 */ \ add v0.2d, v3.2d, v3.2d; /* 4 */ \ rev64 v1.16b, v6.16b; \ add v2.2d, v6.2d, v16.2d; \ add v3.2d, v6.2d, v3.2d; \ add v4.2d, v6.2d, v4.2d; \ add v0.2d, v6.2d, v0.2d; \ rev64 v2.16b, v2.16b; \ rev64 v3.16b, v3.16b; \ rev64 v0.16b, v0.16b; \ rev64 v4.16b, v4.16b; \ \ .Lctr_enc_loop4_##bits##_store_ctr: \ \ st1 {v0.16b}, [x3]; \ cmp x4, #4; \ ld1 {v5.16b-v7.16b}, [x2], #48; /* preload ciphertext */ \ \ do_aes_4_##bits(e, mc, v1, v2, v3, v4); \ \ eor v1.16b, v1.16b, v5.16b; \ ld1 {v5.16b}, [x2], #16; /* load ciphertext */ \ eor v2.16b, v2.16b, v6.16b; \ eor v3.16b, v3.16b, v7.16b; \ eor v4.16b, v4.16b, v5.16b; \ st1 {v1.16b-v4.16b}, [x1], #64; /* store plaintext */ \ \ b.hs 
.Lctr_enc_loop4_##bits; \ CLEAR_REG(v3); \ CLEAR_REG(v4); \ CLEAR_REG(v5); \ CLEAR_REG(v6); \ CLEAR_REG(v7); \ cbz x4, .Lctr_enc_done; \ \ .Lctr_enc_loop_##bits: \ \ adds x10, x10, #1; \ mov v1.16b, v0.16b; \ adc x9, x9, xzr; \ mov v0.D[1], x10; \ mov v0.D[0], x9; \ sub x4, x4, #1; \ ld1 {v2.16b}, [x2], #16; /* load ciphertext */ \ rev64 v0.16b, v0.16b; \ \ do_aes_one##bits(e, mc, v1, v1); \ \ eor v1.16b, v2.16b, v1.16b; \ st1 {v1.16b}, [x1], #16; /* store plaintext */ \ \ cbnz x4, .Lctr_enc_loop_##bits; \ b .Lctr_enc_done; CTR_ENC(128) CTR_ENC(192) CTR_ENC(256) #undef CTR_ENC .Lctr_enc_done: aes_clear_keys(w5) st1 {v0.16b}, [x3] /* store IV */ CLEAR_REG(v0) CLEAR_REG(v1) CLEAR_REG(v2) .Lctr_enc_skip: ret .size _gcry_aes_ctr_enc_armv8_ce,.-_gcry_aes_ctr_enc_armv8_ce; /* * void _gcry_aes_cfb_enc_armv8_ce (const void *keysched, * unsigned char *outbuf, * const unsigned char *inbuf, * unsigned char *iv, unsigned int nrounds); */ .align 3 .globl _gcry_aes_cfb_enc_armv8_ce .type _gcry_aes_cfb_enc_armv8_ce,%function; _gcry_aes_cfb_enc_armv8_ce: /* input: * r0: keysched * r1: outbuf * r2: inbuf * r3: iv * x4: nblocks * w5: nrounds */ cbz x4, .Lcfb_enc_skip /* load IV */ ld1 {v0.16b}, [x3] aes_preload_keys(x0, w5); b.eq .Lcfb_enc_entry_192 b.hi .Lcfb_enc_entry_256 #define CFB_ENC(bits) \ .Lcfb_enc_entry_##bits: \ .Lcfb_enc_loop_##bits: \ ld1 {v1.16b}, [x2], #16; /* load plaintext */ \ sub x4, x4, #1; \ \ do_aes_one##bits(e, mc, v0, v0); \ \ eor v0.16b, v1.16b, v0.16b; \ st1 {v0.16b}, [x1], #16; /* store ciphertext */ \ \ cbnz x4, .Lcfb_enc_loop_##bits; \ b .Lcfb_enc_done; CFB_ENC(128) CFB_ENC(192) CFB_ENC(256) #undef CFB_ENC .Lcfb_enc_done: aes_clear_keys(w5) st1 {v0.16b}, [x3] /* store IV */ CLEAR_REG(v0) CLEAR_REG(v1) .Lcfb_enc_skip: ret .size _gcry_aes_cfb_enc_armv8_ce,.-_gcry_aes_cfb_enc_armv8_ce; /* * void _gcry_aes_cfb_dec_armv8_ce (const void *keysched, * unsigned char *outbuf, * const unsigned char *inbuf, * unsigned char *iv, unsigned int nrounds); */ .align 3 .globl _gcry_aes_cfb_dec_armv8_ce .type _gcry_aes_cfb_dec_armv8_ce,%function; _gcry_aes_cfb_dec_armv8_ce: /* input: * r0: keysched * r1: outbuf * r2: inbuf * r3: iv * x4: nblocks * w5: nrounds */ cbz x4, .Lcfb_dec_skip /* load IV */ ld1 {v0.16b}, [x3] aes_preload_keys(x0, w5); b.eq .Lcfb_dec_entry_192 b.hi .Lcfb_dec_entry_256 #define CFB_DEC(bits) \ .Lcfb_dec_entry_##bits: \ cmp x4, #4; \ b.lo .Lcfb_dec_loop_##bits; \ \ .Lcfb_dec_loop4_##bits: \ \ ld1 {v2.16b-v4.16b}, [x2], #48; /* load ciphertext */ \ mov v1.16b, v0.16b; \ sub x4, x4, #4; \ cmp x4, #4; \ mov v5.16b, v2.16b; \ mov v6.16b, v3.16b; \ mov v7.16b, v4.16b; \ ld1 {v0.16b}, [x2], #16; /* load next IV / ciphertext */ \ \ do_aes_4_##bits(e, mc, v1, v2, v3, v4); \ \ eor v1.16b, v1.16b, v5.16b; \ eor v2.16b, v2.16b, v6.16b; \ eor v3.16b, v3.16b, v7.16b; \ eor v4.16b, v4.16b, v0.16b; \ st1 {v1.16b-v4.16b}, [x1], #64; /* store plaintext */ \ \ b.hs .Lcfb_dec_loop4_##bits; \ CLEAR_REG(v3); \ CLEAR_REG(v4); \ CLEAR_REG(v5); \ CLEAR_REG(v6); \ CLEAR_REG(v7); \ cbz x4, .Lcfb_dec_done; \ \ .Lcfb_dec_loop_##bits: \ \ ld1 {v1.16b}, [x2], #16; /* load ciphertext */ \ \ sub x4, x4, #1; \ \ do_aes_one##bits(e, mc, v0, v0); \ \ eor v2.16b, v1.16b, v0.16b; \ mov v0.16b, v1.16b; \ st1 {v2.16b}, [x1], #16; /* store plaintext */ \ \ cbnz x4, .Lcfb_dec_loop_##bits; \ b .Lcfb_dec_done; CFB_DEC(128) CFB_DEC(192) CFB_DEC(256) #undef CFB_DEC .Lcfb_dec_done: aes_clear_keys(w5) st1 {v0.16b}, [x3] /* store IV */ CLEAR_REG(v0) CLEAR_REG(v1) CLEAR_REG(v2) .Lcfb_dec_skip: ret .size 
_gcry_aes_cfb_dec_armv8_ce,.-_gcry_aes_cfb_dec_armv8_ce; /* * void _gcry_aes_ocb_enc_armv8_ce (const void *keysched, * unsigned char *outbuf, * const unsigned char *inbuf, * unsigned char *offset, * unsigned char *checksum, * unsigned char *L_table, * size_t nblocks, * unsigned int nrounds, * unsigned int blkn); */ .align 3 .globl _gcry_aes_ocb_enc_armv8_ce .type _gcry_aes_ocb_enc_armv8_ce,%function; _gcry_aes_ocb_enc_armv8_ce: /* input: * x0: keysched * x1: outbuf * x2: inbuf * x3: offset * x4: checksum * x5: Ltable * x6: nblocks (0 < nblocks <= 32) * w7: nrounds * %st+0: blkn => w12 */ ldr w12, [sp] ld1 {v0.16b}, [x3] /* load offset */ ld1 {v16.16b}, [x4] /* load checksum */ aes_preload_keys(x0, w7); b.eq .Locb_enc_entry_192 b.hi .Locb_enc_entry_256 #define OCB_ENC(bits, ...) \ .Locb_enc_entry_##bits: \ cmp x6, #4; \ add x12, x12, #1; \ b.lo .Locb_enc_loop_##bits; \ \ .Locb_enc_loop4_##bits: \ \ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \ /* Checksum_i = Checksum_{i-1} xor P_i */ \ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ \ \ add w9, w12, #1; \ add w10, w12, #2; \ add w11, w12, #3; \ rbit w8, w12; \ add w12, w12, #4; \ rbit w9, w9; \ rbit w10, w10; \ rbit w11, w11; \ clz w8, w8; /* ntz(i+0) */ \ clz w9, w9; /* ntz(i+1) */ \ clz w10, w10; /* ntz(i+2) */ \ clz w11, w11; /* ntz(i+3) */ \ add x8, x5, x8, lsl #4; \ ld1 {v1.16b-v4.16b}, [x2], #64; /* load P_i+<0-3> */ \ add x9, x5, x9, lsl #4; \ add x10, x5, x10, lsl #4; \ add x11, x5, x11, lsl #4; \ \ sub x6, x6, #4; \ \ ld1 {v5.16b}, [x8]; /* load L_{ntz(i+0)} */ \ eor v16.16b, v16.16b, v1.16b; /* Checksum_i+0 */ \ ld1 {v6.16b}, [x9]; /* load L_{ntz(i+1)} */ \ eor v16.16b, v16.16b, v2.16b; /* Checksum_i+1 */ \ ld1 {v7.16b}, [x10]; /* load L_{ntz(i+2)} */ \ eor v16.16b, v16.16b, v3.16b; /* Checksum_i+2 */ \ eor v5.16b, v5.16b, v0.16b; /* Offset_i+0 */ \ ld1 {v0.16b}, [x11]; /* load L_{ntz(i+3)} */ \ eor v16.16b, v16.16b, v4.16b; /* Checksum_i+3 */ \ eor v6.16b, v6.16b, v5.16b; /* Offset_i+1 */ \ eor v1.16b, v1.16b, v5.16b; /* P_i+0 xor Offset_i+0 */ \ eor v7.16b, v7.16b, v6.16b; /* Offset_i+2 */ \ eor v2.16b, v2.16b, v6.16b; /* P_i+1 xor Offset_i+1 */ \ eor v0.16b, v0.16b, v7.16b; /* Offset_i+3 */ \ cmp x6, #4; \ eor v3.16b, v3.16b, v7.16b; /* P_i+2 xor Offset_i+2 */ \ eor v4.16b, v4.16b, v0.16b; /* P_i+3 xor Offset_i+3 */ \ \ do_aes_4_##bits(e, mc, v1, v2, v3, v4); \ \ eor v1.16b, v1.16b, v5.16b; /* xor Offset_i+0 */ \ eor v2.16b, v2.16b, v6.16b; /* xor Offset_i+1 */ \ eor v3.16b, v3.16b, v7.16b; /* xor Offset_i+2 */ \ eor v4.16b, v4.16b, v0.16b; /* xor Offset_i+3 */ \ st1 {v1.16b-v4.16b}, [x1], #64; \ \ b.hs .Locb_enc_loop4_##bits; \ CLEAR_REG(v3); \ CLEAR_REG(v4); \ CLEAR_REG(v5); \ CLEAR_REG(v6); \ CLEAR_REG(v7); \ cbz x6, .Locb_enc_done; \ \ .Locb_enc_loop_##bits: \ \ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \ /* Checksum_i = Checksum_{i-1} xor P_i */ \ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ \ \ rbit x8, x12; \ add x12, x12, #1; \ clz x8, x8; /* ntz(i) */ \ add x8, x5, x8, lsl #4; \ \ ld1 {v1.16b}, [x2], #16; /* load plaintext */ \ ld1 {v2.16b}, [x8]; /* load L_{ntz(i)} */ \ sub x6, x6, #1; \ eor v0.16b, v0.16b, v2.16b; \ eor v16.16b, v16.16b, v1.16b; \ eor v1.16b, v1.16b, v0.16b; \ \ do_aes_one##bits(e, mc, v1, v1); \ \ eor v1.16b, v1.16b, v0.16b; \ st1 {v1.16b}, [x1], #16; /* store ciphertext */ \ \ cbnz x6, .Locb_enc_loop_##bits; \ b .Locb_enc_done; OCB_ENC(128) OCB_ENC(192) OCB_ENC(256) #undef OCB_ENC .Locb_enc_done: aes_clear_keys(w7) st1 {v16.16b}, [x4] /* store checksum */ st1 {v0.16b}, [x3] /* 
store offset */ CLEAR_REG(v0) CLEAR_REG(v1) CLEAR_REG(v2) CLEAR_REG(v16) ret .size _gcry_aes_ocb_enc_armv8_ce,.-_gcry_aes_ocb_enc_armv8_ce; /* * void _gcry_aes_ocb_dec_armv8_ce (const void *keysched, * unsigned char *outbuf, * const unsigned char *inbuf, * unsigned char *offset, * unsigned char *checksum, * unsigned char *L_table, * size_t nblocks, * unsigned int nrounds, * unsigned int blkn); */ .align 3 .globl _gcry_aes_ocb_dec_armv8_ce .type _gcry_aes_ocb_dec_armv8_ce,%function; _gcry_aes_ocb_dec_armv8_ce: /* input: * x0: keysched * x1: outbuf * x2: inbuf * x3: offset * x4: checksum * x5: Ltable * x6: nblocks (0 < nblocks <= 32) * w7: nrounds * %st+0: blkn => w12 */ ldr w12, [sp] ld1 {v0.16b}, [x3] /* load offset */ ld1 {v16.16b}, [x4] /* load checksum */ aes_preload_keys(x0, w7); b.eq .Locb_dec_entry_192 b.hi .Locb_dec_entry_256 #define OCB_DEC(bits) \ .Locb_dec_entry_##bits: \ cmp x6, #4; \ add w12, w12, #1; \ b.lo .Locb_dec_loop_##bits; \ \ .Locb_dec_loop4_##bits: \ \ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \ /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */ \ /* Checksum_i = Checksum_{i-1} xor P_i */ \ \ add w9, w12, #1; \ add w10, w12, #2; \ add w11, w12, #3; \ rbit w8, w12; \ add w12, w12, #4; \ rbit w9, w9; \ rbit w10, w10; \ rbit w11, w11; \ clz w8, w8; /* ntz(i+0) */ \ clz w9, w9; /* ntz(i+1) */ \ clz w10, w10; /* ntz(i+2) */ \ clz w11, w11; /* ntz(i+3) */ \ add x8, x5, x8, lsl #4; \ ld1 {v1.16b-v4.16b}, [x2], #64; /* load C_i+<0-3> */ \ add x9, x5, x9, lsl #4; \ add x10, x5, x10, lsl #4; \ add x11, x5, x11, lsl #4; \ \ sub x6, x6, #4; \ \ ld1 {v5.16b}, [x8]; /* load L_{ntz(i+0)} */ \ ld1 {v6.16b}, [x9]; /* load L_{ntz(i+1)} */ \ ld1 {v7.16b}, [x10]; /* load L_{ntz(i+2)} */ \ eor v5.16b, v5.16b, v0.16b; /* Offset_i+0 */ \ ld1 {v0.16b}, [x11]; /* load L_{ntz(i+3)} */ \ eor v6.16b, v6.16b, v5.16b; /* Offset_i+1 */ \ eor v1.16b, v1.16b, v5.16b; /* C_i+0 xor Offset_i+0 */ \ eor v7.16b, v7.16b, v6.16b; /* Offset_i+2 */ \ eor v2.16b, v2.16b, v6.16b; /* C_i+1 xor Offset_i+1 */ \ eor v0.16b, v0.16b, v7.16b; /* Offset_i+3 */ \ cmp x6, #4; \ eor v3.16b, v3.16b, v7.16b; /* C_i+2 xor Offset_i+2 */ \ eor v4.16b, v4.16b, v0.16b; /* C_i+3 xor Offset_i+3 */ \ \ do_aes_4_##bits(d, imc, v1, v2, v3, v4); \ \ eor v1.16b, v1.16b, v5.16b; /* xor Offset_i+0 */ \ eor v2.16b, v2.16b, v6.16b; /* xor Offset_i+1 */ \ eor v16.16b, v16.16b, v1.16b; /* Checksum_i+0 */ \ eor v3.16b, v3.16b, v7.16b; /* xor Offset_i+2 */ \ eor v16.16b, v16.16b, v2.16b; /* Checksum_i+1 */ \ eor v4.16b, v4.16b, v0.16b; /* xor Offset_i+3 */ \ eor v16.16b, v16.16b, v3.16b; /* Checksum_i+2 */ \ eor v16.16b, v16.16b, v4.16b; /* Checksum_i+3 */ \ st1 {v1.16b-v4.16b}, [x1], #64; \ \ b.hs .Locb_dec_loop4_##bits; \ CLEAR_REG(v3); \ CLEAR_REG(v4); \ CLEAR_REG(v5); \ CLEAR_REG(v6); \ CLEAR_REG(v7); \ cbz x6, .Locb_dec_done; \ \ .Locb_dec_loop_##bits: \ \ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \ /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */ \ /* Checksum_i = Checksum_{i-1} xor P_i */ \ \ rbit w8, w12; \ add w12, w12, #1; \ clz w8, w8; /* ntz(i) */ \ add x8, x5, x8, lsl #4; \ \ ld1 {v1.16b}, [x2], #16; /* load ciphertext */ \ ld1 {v2.16b}, [x8]; /* load L_{ntz(i)} */ \ sub x6, x6, #1; \ eor v0.16b, v0.16b, v2.16b; \ eor v1.16b, v1.16b, v0.16b; \ \ do_aes_one##bits(d, imc, v1, v1) \ \ eor v1.16b, v1.16b, v0.16b; \ st1 {v1.16b}, [x1], #16; /* store plaintext */ \ eor v16.16b, v16.16b, v1.16b; \ \ cbnz x6, .Locb_dec_loop_##bits; \ b .Locb_dec_done; OCB_DEC(128) OCB_DEC(192) OCB_DEC(256) #undef OCB_DEC .Locb_dec_done: 
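/*
 * Illustrative sketch (not part of the patch): the rbit/clz pairs in the OCB
 * loops above compute ntz(i), the number of trailing zero bits of the 1-based
 * block index, because reversing the bits turns trailing zeros into leading
 * zeros.  That value selects a 16-byte entry of L_table, which is XORed into
 * the running offset.  The helper names below (ntz32, ocb_update_offset) are
 * hypothetical and only restate the register-level code in C.
 */
#include <stddef.h>
#include <stdint.h>

static unsigned int ntz32 (uint32_t i)
{
  unsigned int n = 0;

  /* i is never zero here: OCB block numbering starts at blkn + 1. */
  while ((i & 1) == 0)
    {
      n++;
      i >>= 1;
    }
  return n;
}

static void ocb_update_offset (unsigned char offset[16],
                               const unsigned char *L_table, /* L_0 first */
                               uint32_t blk_index)           /* 1-based */
{
  const unsigned char *L = L_table + 16 * (size_t)ntz32 (blk_index);
  unsigned int j;

  /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
  for (j = 0; j < 16; j++)
    offset[j] ^= L[j];
}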
aes_clear_keys(w7) st1 {v16.16b}, [x4] /* store checksum */ st1 {v0.16b}, [x3] /* store offset */ CLEAR_REG(v0) CLEAR_REG(v1) CLEAR_REG(v2) CLEAR_REG(v16) ret .size _gcry_aes_ocb_dec_armv8_ce,.-_gcry_aes_ocb_dec_armv8_ce; /* * void _gcry_aes_ocb_auth_armv8_ce (const void *keysched, * const unsigned char *abuf, * unsigned char *offset, * unsigned char *checksum, * unsigned char *L_table, * size_t nblocks, * unsigned int nrounds, * unsigned int blkn); */ .align 3 .globl _gcry_aes_ocb_auth_armv8_ce .type _gcry_aes_ocb_auth_armv8_ce,%function; _gcry_aes_ocb_auth_armv8_ce: /* input: * x0: keysched * x1: abuf * x2: offset => x3 * x3: checksum => x4 * x4: Ltable => x5 * x5: nblocks => x6 (0 < nblocks <= 32) * w6: nrounds => w7 * w7: blkn => w12 */ mov x12, x7 mov x7, x6 mov x6, x5 mov x5, x4 mov x4, x3 mov x3, x2 aes_preload_keys(x0, w7); ld1 {v0.16b}, [x3] /* load offset */ ld1 {v16.16b}, [x4] /* load checksum */ beq .Locb_auth_entry_192 bhi .Locb_auth_entry_256 #define OCB_AUTH(bits) \ .Locb_auth_entry_##bits: \ cmp x6, #4; \ add w12, w12, #1; \ b.lo .Locb_auth_loop_##bits; \ \ .Locb_auth_loop4_##bits: \ \ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ \ \ add w9, w12, #1; \ add w10, w12, #2; \ add w11, w12, #3; \ rbit w8, w12; \ add w12, w12, #4; \ rbit w9, w9; \ rbit w10, w10; \ rbit w11, w11; \ clz w8, w8; /* ntz(i+0) */ \ clz w9, w9; /* ntz(i+1) */ \ clz w10, w10; /* ntz(i+2) */ \ clz w11, w11; /* ntz(i+3) */ \ add x8, x5, x8, lsl #4; \ ld1 {v1.16b-v4.16b}, [x1], #64; /* load A_i+<0-3> */ \ add x9, x5, x9, lsl #4; \ add x10, x5, x10, lsl #4; \ add x11, x5, x11, lsl #4; \ \ sub x6, x6, #4; \ \ ld1 {v5.16b}, [x8]; /* load L_{ntz(i+0)} */ \ ld1 {v6.16b}, [x9]; /* load L_{ntz(i+1)} */ \ ld1 {v7.16b}, [x10]; /* load L_{ntz(i+2)} */ \ eor v5.16b, v5.16b, v0.16b; /* Offset_i+0 */ \ ld1 {v0.16b}, [x11]; /* load L_{ntz(i+3)} */ \ eor v6.16b, v6.16b, v5.16b; /* Offset_i+1 */ \ eor v1.16b, v1.16b, v5.16b; /* A_i+0 xor Offset_i+0 */ \ eor v7.16b, v7.16b, v6.16b; /* Offset_i+2 */ \ eor v2.16b, v2.16b, v6.16b; /* A_i+1 xor Offset_i+1 */ \ eor v0.16b, v0.16b, v7.16b; /* Offset_i+3 */ \ cmp x6, #4; \ eor v3.16b, v3.16b, v7.16b; /* A_i+2 xor Offset_i+2 */ \ eor v4.16b, v4.16b, v0.16b; /* A_i+3 xor Offset_i+3 */ \ \ do_aes_4_##bits(e, mc, v1, v2, v3, v4); \ \ eor v1.16b, v1.16b, v2.16b; \ eor v16.16b, v16.16b, v3.16b; \ eor v1.16b, v1.16b, v4.16b; \ eor v16.16b, v16.16b, v1.16b; \ \ b.hs .Locb_auth_loop4_##bits; \ CLEAR_REG(v3); \ CLEAR_REG(v4); \ CLEAR_REG(v5); \ CLEAR_REG(v6); \ CLEAR_REG(v7); \ cbz x6, .Locb_auth_done; \ \ .Locb_auth_loop_##bits: \ \ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ \ \ rbit w8, w12; \ add w12, w12, #1; \ clz w8, w8; /* ntz(i) */ \ add x8, x5, x8, lsl #4; \ \ ld1 {v1.16b}, [x1], #16; /* load aadtext */ \ ld1 {v2.16b}, [x8]; /* load L_{ntz(i)} */ \ sub x6, x6, #1; \ eor v0.16b, v0.16b, v2.16b; \ eor v1.16b, v1.16b, v0.16b; \ \ do_aes_one##bits(e, mc, v1, v1) \ \ eor v16.16b, v16.16b, v1.16b; \ \ cbnz x6, .Locb_auth_loop_##bits; \ b .Locb_auth_done; OCB_AUTH(128) OCB_AUTH(192) OCB_AUTH(256) #undef OCB_AUTH .Locb_auth_done: aes_clear_keys(w7) st1 {v16.16b}, [x4] /* store checksum */ st1 {v0.16b}, [x3] /* store offset */ CLEAR_REG(v0) CLEAR_REG(v1) CLEAR_REG(v2) CLEAR_REG(v16) ret .size _gcry_aes_ocb_auth_armv8_ce,.-_gcry_aes_ocb_auth_armv8_ce; +/* + * void _gcry_aes_xts_enc_armv8_ce (const void *keysched, + * unsigned char *outbuf, + * const unsigned char *inbuf, 
+ * unsigned char *tweak, unsigned int nrounds); + */ + +.align 3 +.globl _gcry_aes_xts_enc_armv8_ce +.type _gcry_aes_xts_enc_armv8_ce,%function; +_gcry_aes_xts_enc_armv8_ce: + /* input: + * r0: keysched + * r1: outbuf + * r2: inbuf + * r3: tweak + * x4: nblocks + * w5: nrounds + */ + + cbz x4, .Lxts_enc_skip + + /* load tweak */ + ld1 {v0.16b}, [x3] + + /* load gfmul mask */ + mov x6, #0x87 + mov x7, #0x01 + mov v16.D[0], x6 + mov v16.D[1], x7 + + aes_preload_keys(x0, w5); + + b.eq .Lxts_enc_entry_192 + b.hi .Lxts_enc_entry_256 + +#define XTS_ENC(bits) \ + .Lxts_enc_entry_##bits: \ + cmp x4, #4; \ + b.lo .Lxts_enc_loop_##bits; \ + \ + .Lxts_enc_loop4_##bits: \ + \ + ext v4.16b, v0.16b, v0.16b, #8; \ + \ + sshr v2.2d, v4.2d, #63; \ + add v5.2d, v0.2d, v0.2d; \ + and v2.16b, v2.16b, v16.16b; \ + add v4.2d, v4.2d, v4.2d; \ + eor v5.16b, v5.16b, v2.16b; \ + \ + sshr v2.2d, v4.2d, #63; \ + add v6.2d, v5.2d, v5.2d; \ + and v2.16b, v2.16b, v16.16b; \ + add v4.2d, v4.2d, v4.2d; \ + eor v6.16b, v6.16b, v2.16b; \ + \ + sshr v2.2d, v4.2d, #63; \ + add v7.2d, v6.2d, v6.2d; \ + and v2.16b, v2.16b, v16.16b; \ + add v4.2d, v4.2d, v4.2d; \ + eor v7.16b, v7.16b, v2.16b; \ + \ + sshr v2.2d, v4.2d, #63; \ + add v3.2d, v7.2d, v7.2d; \ + and v2.16b, v2.16b, v16.16b; \ + add v4.2d, v4.2d, v4.2d; \ + eor v3.16b, v3.16b, v2.16b; \ + ld1 {v1.16b-v2.16b}, [x2], #32; /* load plaintext */ \ + st1 {v3.16b}, [x3]; \ + sub x4, x4, #4; \ + eor v1.16b, v1.16b, v0.16b; \ + \ + ld1 {v3.16b-v4.16b}, [x2], #32; /* load plaintext */ \ + cmp x4, #4; \ + eor v2.16b, v2.16b, v5.16b; \ + eor v3.16b, v3.16b, v6.16b; \ + eor v4.16b, v4.16b, v7.16b; \ + \ + do_aes_4_##bits(e, mc, v1, v2, v3, v4); \ + \ + eor v1.16b, v1.16b, v0.16b; \ + ld1 {v0.16b}, [x3]; \ + eor v2.16b, v2.16b, v5.16b; \ + eor v3.16b, v3.16b, v6.16b; \ + eor v4.16b, v4.16b, v7.16b; \ + st1 {v1.16b-v4.16b}, [x1], #64; /* store plaintext */ \ + \ + b.hs .Lxts_enc_loop4_##bits; \ + CLEAR_REG(v3); \ + CLEAR_REG(v4); \ + CLEAR_REG(v5); \ + CLEAR_REG(v6); \ + CLEAR_REG(v7); \ + cbz x4, .Lxts_enc_done; \ + \ + .Lxts_enc_loop_##bits: \ + \ + ld1 {v1.16b}, [x2], #16; /* load plaintext */ \ + ext v3.16b, v0.16b, v0.16b, #8; \ + mov v2.16b, v0.16b; \ + sshr v3.2d, v3.2d, #63; \ + add v0.2d, v0.2d, v0.2d; \ + and v3.16b, v3.16b, v16.16b; \ + eor v1.16b, v1.16b, v2.16b; \ + eor v0.16b, v0.16b, v3.16b; \ + sub x4, x4, #1; \ + \ + do_aes_one##bits(e, mc, v1, v1); \ + \ + eor v1.16b, v1.16b, v2.16b; \ + st1 {v1.16b}, [x1], #16; /* store ciphertext */ \ + \ + cbnz x4, .Lxts_enc_loop_##bits; \ + b .Lxts_enc_done; + + XTS_ENC(128) + XTS_ENC(192) + XTS_ENC(256) + +#undef XTS_ENC + +.Lxts_enc_done: + aes_clear_keys(w5) + + st1 {v0.16b}, [x3] /* store tweak */ + + CLEAR_REG(v0) + CLEAR_REG(v1) + CLEAR_REG(v2) + +.Lxts_enc_skip: + ret + +.size _gcry_aes_xts_enc_armv8_ce,.-_gcry_aes_xts_enc_armv8_ce; + + +/* + * void _gcry_aes_xts_dec_armv8_ce (const void *keysched, + * unsigned char *outbuf, + * const unsigned char *inbuf, + * unsigned char *tweak, unsigned int nrounds); + */ + +.align 3 +.globl _gcry_aes_xts_dec_armv8_ce +.type _gcry_aes_xts_dec_armv8_ce,%function; +_gcry_aes_xts_dec_armv8_ce: + /* input: + * r0: keysched + * r1: outbuf + * r2: inbuf + * r3: tweak + * x4: nblocks + * w5: nrounds + */ + + cbz x4, .Lxts_dec_skip + + /* load tweak */ + ld1 {v0.16b}, [x3] + + /* load gfmul mask */ + mov x6, #0x87 + mov x7, #0x01 + mov v16.D[0], x6 + mov v16.D[1], x7 + + aes_preload_keys(x0, w5); + + b.eq .Lxts_dec_entry_192 + b.hi .Lxts_dec_entry_256 + +#define XTS_DEC(bits) \ + 
.Lxts_dec_entry_##bits: \ + cmp x4, #4; \ + b.lo .Lxts_dec_loop_##bits; \ + \ + .Lxts_dec_loop4_##bits: \ + \ + ext v4.16b, v0.16b, v0.16b, #8; \ + \ + sshr v2.2d, v4.2d, #63; \ + add v5.2d, v0.2d, v0.2d; \ + and v2.16b, v2.16b, v16.16b; \ + add v4.2d, v4.2d, v4.2d; \ + eor v5.16b, v5.16b, v2.16b; \ + \ + sshr v2.2d, v4.2d, #63; \ + add v6.2d, v5.2d, v5.2d; \ + and v2.16b, v2.16b, v16.16b; \ + add v4.2d, v4.2d, v4.2d; \ + eor v6.16b, v6.16b, v2.16b; \ + \ + sshr v2.2d, v4.2d, #63; \ + add v7.2d, v6.2d, v6.2d; \ + and v2.16b, v2.16b, v16.16b; \ + add v4.2d, v4.2d, v4.2d; \ + eor v7.16b, v7.16b, v2.16b; \ + \ + sshr v2.2d, v4.2d, #63; \ + add v3.2d, v7.2d, v7.2d; \ + and v2.16b, v2.16b, v16.16b; \ + add v4.2d, v4.2d, v4.2d; \ + eor v3.16b, v3.16b, v2.16b; \ + ld1 {v1.16b-v2.16b}, [x2], #32; /* load plaintext */ \ + st1 {v3.16b}, [x3]; \ + sub x4, x4, #4; \ + eor v1.16b, v1.16b, v0.16b; \ + \ + ld1 {v3.16b-v4.16b}, [x2], #32; /* load plaintext */ \ + cmp x4, #4; \ + eor v2.16b, v2.16b, v5.16b; \ + eor v3.16b, v3.16b, v6.16b; \ + eor v4.16b, v4.16b, v7.16b; \ + \ + do_aes_4_##bits(d, imc, v1, v2, v3, v4); \ + \ + eor v1.16b, v1.16b, v0.16b; \ + ld1 {v0.16b}, [x3]; \ + eor v2.16b, v2.16b, v5.16b; \ + eor v3.16b, v3.16b, v6.16b; \ + eor v4.16b, v4.16b, v7.16b; \ + st1 {v1.16b-v4.16b}, [x1], #64; /* store plaintext */ \ + \ + b.hs .Lxts_dec_loop4_##bits; \ + CLEAR_REG(v3); \ + CLEAR_REG(v4); \ + CLEAR_REG(v5); \ + CLEAR_REG(v6); \ + CLEAR_REG(v7); \ + cbz x4, .Lxts_dec_done; \ + \ + .Lxts_dec_loop_##bits: \ + \ + ld1 {v1.16b}, [x2], #16; /* load plaintext */ \ + ext v3.16b, v0.16b, v0.16b, #8; \ + mov v2.16b, v0.16b; \ + sshr v3.2d, v3.2d, #63; \ + add v0.2d, v0.2d, v0.2d; \ + and v3.16b, v3.16b, v16.16b; \ + eor v1.16b, v1.16b, v2.16b; \ + eor v0.16b, v0.16b, v3.16b; \ + sub x4, x4, #1; \ + \ + do_aes_one##bits(d, imc, v1, v1); \ + \ + eor v1.16b, v1.16b, v2.16b; \ + st1 {v1.16b}, [x1], #16; /* store ciphertext */ \ + \ + cbnz x4, .Lxts_dec_loop_##bits; \ + b .Lxts_dec_done; + + XTS_DEC(128) + XTS_DEC(192) + XTS_DEC(256) + +#undef XTS_DEC + +.Lxts_dec_done: + aes_clear_keys(w5) + + st1 {v0.16b}, [x3] /* store tweak */ + + CLEAR_REG(v0) + CLEAR_REG(v1) + CLEAR_REG(v2) + +.Lxts_dec_skip: + ret + +.size _gcry_aes_xts_dec_armv8_ce,.-_gcry_aes_xts_dec_armv8_ce; + + /* * u32 _gcry_aes_sbox4_armv8_ce(u32 in4b); */ .align 3 .globl _gcry_aes_sbox4_armv8_ce .type _gcry_aes_sbox4_armv8_ce,%function; _gcry_aes_sbox4_armv8_ce: /* See "Gouvêa, C. P. L. & López, J. Implementing GCM on ARMv8. Topics in * Cryptology — CT-RSA 2015" for details. */ movi v0.16b, #0x52 movi v1.16b, #0 mov v0.S[0], w0 aese v0.16b, v1.16b addv s0, v0.4s mov w0, v0.S[0] CLEAR_REG(v0) ret .size _gcry_aes_sbox4_armv8_ce,.-_gcry_aes_sbox4_armv8_ce; /* * void _gcry_aes_invmixcol_armv8_ce(void *dst, const void *src); */ .align 3 .globl _gcry_aes_invmixcol_armv8_ce .type _gcry_aes_invmixcol_armv8_ce,%function; _gcry_aes_invmixcol_armv8_ce: ld1 {v0.16b}, [x1] aesimc v0.16b, v0.16b st1 {v0.16b}, [x0] CLEAR_REG(v0) ret .size _gcry_aes_invmixcol_armv8_ce,.-_gcry_aes_invmixcol_armv8_ce; #endif diff --git a/cipher/rijndael-armv8-ce.c b/cipher/rijndael-armv8-ce.c index 334cf684..6af7108f 100644 --- a/cipher/rijndael-armv8-ce.c +++ b/cipher/rijndael-armv8-ce.c @@ -1,364 +1,392 @@ /* ARMv8 Crypto Extension AES for Libgcrypt * Copyright (C) 2016 Jussi Kivilinna * * This file is part of Libgcrypt. 
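/*
 * Illustrative sketch (not part of the patch): the sshr/and/add/eor sequences
 * in the XTS loops above multiply the 128-bit tweak by x in GF(2^128).  The
 * tweak is shifted left by one bit, the bit that falls out of the low half is
 * carried into the high half, and if bit 127 was set the low half is reduced
 * with the polynomial constant 0x87; the v16 mask {0x87, 0x01} lets the
 * vector code apply both corrections with a single AND/EOR pair.  The
 * four-way loop applies this doubling three more times to obtain the tweaks
 * for blocks i+1..i+3 before running the AES rounds.  xts_mul2() below is a
 * hypothetical scalar equivalent, assuming a little-endian 16-byte tweak
 * split into two 64-bit halves.
 */
#include <stdint.h>

static void xts_mul2 (uint64_t tweak[2]) /* tweak[0] = low, tweak[1] = high */
{
  uint64_t carry_lo_to_hi = tweak[0] >> 63;            /* bit 63 -> bit 64 */
  uint64_t reduction = (tweak[1] >> 63) ? UINT64_C(0x87) : 0; /* bit 127 folds back */

  tweak[1] = (tweak[1] << 1) | carry_lo_to_hi;
  tweak[0] = (tweak[0] << 1) ^ reduction;
}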
* * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . * */ #include #include #include #include /* for memcmp() */ #include "types.h" /* for byte and u32 typedefs */ #include "g10lib.h" #include "cipher.h" #include "bufhelp.h" #include "cipher-selftest.h" #include "rijndael-internal.h" #include "./cipher-internal.h" #ifdef USE_ARM_CE typedef struct u128_s { u32 a, b, c, d; } u128_t; extern u32 _gcry_aes_sbox4_armv8_ce(u32 in4b); extern void _gcry_aes_invmixcol_armv8_ce(u128_t *dst, const u128_t *src); extern unsigned int _gcry_aes_enc_armv8_ce(const void *keysched, byte *dst, const byte *src, unsigned int nrounds); extern unsigned int _gcry_aes_dec_armv8_ce(const void *keysched, byte *dst, const byte *src, unsigned int nrounds); extern void _gcry_aes_cbc_enc_armv8_ce (const void *keysched, unsigned char *outbuf, const unsigned char *inbuf, unsigned char *iv, size_t nblocks, int cbc_mac, unsigned int nrounds); extern void _gcry_aes_cbc_dec_armv8_ce (const void *keysched, unsigned char *outbuf, const unsigned char *inbuf, unsigned char *iv, size_t nblocks, unsigned int nrounds); extern void _gcry_aes_cfb_enc_armv8_ce (const void *keysched, unsigned char *outbuf, const unsigned char *inbuf, unsigned char *iv, size_t nblocks, unsigned int nrounds); extern void _gcry_aes_cfb_dec_armv8_ce (const void *keysched, unsigned char *outbuf, const unsigned char *inbuf, unsigned char *iv, size_t nblocks, unsigned int nrounds); extern void _gcry_aes_ctr_enc_armv8_ce (const void *keysched, unsigned char *outbuf, const unsigned char *inbuf, unsigned char *iv, size_t nblocks, unsigned int nrounds); extern void _gcry_aes_ocb_enc_armv8_ce (const void *keysched, unsigned char *outbuf, const unsigned char *inbuf, unsigned char *offset, unsigned char *checksum, unsigned char *L_table, size_t nblocks, unsigned int nrounds, unsigned int blkn); extern void _gcry_aes_ocb_dec_armv8_ce (const void *keysched, unsigned char *outbuf, const unsigned char *inbuf, unsigned char *offset, unsigned char *checksum, unsigned char *L_table, size_t nblocks, unsigned int nrounds, unsigned int blkn); extern void _gcry_aes_ocb_auth_armv8_ce (const void *keysched, const unsigned char *abuf, unsigned char *offset, unsigned char *checksum, unsigned char *L_table, size_t nblocks, unsigned int nrounds, unsigned int blkn); +extern void _gcry_aes_xts_enc_armv8_ce (const void *keysched, + unsigned char *outbuf, + const unsigned char *inbuf, + unsigned char *tweak, + size_t nblocks, unsigned int nrounds); +extern void _gcry_aes_xts_dec_armv8_ce (const void *keysched, + unsigned char *outbuf, + const unsigned char *inbuf, + unsigned char *tweak, + size_t nblocks, unsigned int nrounds); typedef void (*ocb_crypt_fn_t) (const void *keysched, unsigned char *outbuf, const unsigned char *inbuf, unsigned char *offset, unsigned char *checksum, unsigned char *L_table, size_t nblocks, unsigned int nrounds, unsigned int blkn); +typedef void (*xts_crypt_fn_t) (const void *keysched, unsigned char 
*outbuf, + const unsigned char *inbuf, + unsigned char *tweak, size_t nblocks, + unsigned int nrounds); + void _gcry_aes_armv8_ce_setkey (RIJNDAEL_context *ctx, const byte *key) { union { PROPERLY_ALIGNED_TYPE dummy; byte data[MAXKC][4]; u32 data32[MAXKC]; } tkk[2]; unsigned int rounds = ctx->rounds; int KC = rounds - 6; unsigned int keylen = KC * 4; unsigned int i, r, t; byte rcon = 1; int j; #define k tkk[0].data #define k_u32 tkk[0].data32 #define tk tkk[1].data #define tk_u32 tkk[1].data32 #define W (ctx->keyschenc) #define W_u32 (ctx->keyschenc32) for (i = 0; i < keylen; i++) { k[i >> 2][i & 3] = key[i]; } for (j = KC-1; j >= 0; j--) { tk_u32[j] = k_u32[j]; } r = 0; t = 0; /* Copy values into round key array. */ for (j = 0; (j < KC) && (r < rounds + 1); ) { for (; (j < KC) && (t < 4); j++, t++) { W_u32[r][t] = le_bswap32(tk_u32[j]); } if (t == 4) { r++; t = 0; } } while (r < rounds + 1) { tk_u32[0] ^= _gcry_aes_sbox4_armv8_ce(rol(tk_u32[KC - 1], 24)) ^ rcon; if (KC != 8) { for (j = 1; j < KC; j++) { tk_u32[j] ^= tk_u32[j-1]; } } else { for (j = 1; j < KC/2; j++) { tk_u32[j] ^= tk_u32[j-1]; } tk_u32[KC/2] ^= _gcry_aes_sbox4_armv8_ce(tk_u32[KC/2 - 1]); for (j = KC/2 + 1; j < KC; j++) { tk_u32[j] ^= tk_u32[j-1]; } } /* Copy values into round key array. */ for (j = 0; (j < KC) && (r < rounds + 1); ) { for (; (j < KC) && (t < 4); j++, t++) { W_u32[r][t] = le_bswap32(tk_u32[j]); } if (t == 4) { r++; t = 0; } } rcon = (rcon << 1) ^ ((rcon >> 7) * 0x1b); } #undef W #undef tk #undef k #undef W_u32 #undef tk_u32 #undef k_u32 wipememory(&tkk, sizeof(tkk)); } /* Make a decryption key from an encryption key. */ void _gcry_aes_armv8_ce_prepare_decryption (RIJNDAEL_context *ctx) { u128_t *ekey = (u128_t *)(void *)ctx->keyschenc; u128_t *dkey = (u128_t *)(void *)ctx->keyschdec; int rounds = ctx->rounds; int rr; int r; #define DO_AESIMC() _gcry_aes_invmixcol_armv8_ce(&dkey[r], &ekey[rr]) dkey[0] = ekey[rounds]; r = 1; rr = rounds-1; DO_AESIMC(); r++; rr--; /* round 1 */ DO_AESIMC(); r++; rr--; /* round 2 */ DO_AESIMC(); r++; rr--; /* round 3 */ DO_AESIMC(); r++; rr--; /* round 4 */ DO_AESIMC(); r++; rr--; /* round 5 */ DO_AESIMC(); r++; rr--; /* round 6 */ DO_AESIMC(); r++; rr--; /* round 7 */ DO_AESIMC(); r++; rr--; /* round 8 */ DO_AESIMC(); r++; rr--; /* round 9 */ if (rounds >= 12) { if (rounds > 12) { DO_AESIMC(); r++; rr--; /* round 10 */ DO_AESIMC(); r++; rr--; /* round 11 */ } DO_AESIMC(); r++; rr--; /* round 12 / 10 */ DO_AESIMC(); r++; rr--; /* round 13 / 11 */ } dkey[r] = ekey[0]; #undef DO_AESIMC } unsigned int _gcry_aes_armv8_ce_encrypt (const RIJNDAEL_context *ctx, unsigned char *dst, const unsigned char *src) { const void *keysched = ctx->keyschenc32; unsigned int nrounds = ctx->rounds; return _gcry_aes_enc_armv8_ce(keysched, dst, src, nrounds); } unsigned int _gcry_aes_armv8_ce_decrypt (const RIJNDAEL_context *ctx, unsigned char *dst, const unsigned char *src) { const void *keysched = ctx->keyschdec32; unsigned int nrounds = ctx->rounds; return _gcry_aes_dec_armv8_ce(keysched, dst, src, nrounds); } void _gcry_aes_armv8_ce_cbc_enc (const RIJNDAEL_context *ctx, unsigned char *outbuf, const unsigned char *inbuf, unsigned char *iv, size_t nblocks, int cbc_mac) { const void *keysched = ctx->keyschenc32; unsigned int nrounds = ctx->rounds; _gcry_aes_cbc_enc_armv8_ce(keysched, outbuf, inbuf, iv, nblocks, cbc_mac, nrounds); } void _gcry_aes_armv8_ce_cbc_dec (RIJNDAEL_context *ctx, unsigned char *outbuf, const unsigned char *inbuf, unsigned char *iv, size_t nblocks) { const void *keysched = 
ctx->keyschdec32; unsigned int nrounds = ctx->rounds; _gcry_aes_cbc_dec_armv8_ce(keysched, outbuf, inbuf, iv, nblocks, nrounds); } void _gcry_aes_armv8_ce_cfb_enc (RIJNDAEL_context *ctx, unsigned char *outbuf, const unsigned char *inbuf, unsigned char *iv, size_t nblocks) { const void *keysched = ctx->keyschenc32; unsigned int nrounds = ctx->rounds; _gcry_aes_cfb_enc_armv8_ce(keysched, outbuf, inbuf, iv, nblocks, nrounds); } void _gcry_aes_armv8_ce_cfb_dec (RIJNDAEL_context *ctx, unsigned char *outbuf, const unsigned char *inbuf, unsigned char *iv, size_t nblocks) { const void *keysched = ctx->keyschenc32; unsigned int nrounds = ctx->rounds; _gcry_aes_cfb_dec_armv8_ce(keysched, outbuf, inbuf, iv, nblocks, nrounds); } void _gcry_aes_armv8_ce_ctr_enc (RIJNDAEL_context *ctx, unsigned char *outbuf, const unsigned char *inbuf, unsigned char *iv, size_t nblocks) { const void *keysched = ctx->keyschenc32; unsigned int nrounds = ctx->rounds; _gcry_aes_ctr_enc_armv8_ce(keysched, outbuf, inbuf, iv, nblocks, nrounds); } void _gcry_aes_armv8_ce_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int encrypt) { RIJNDAEL_context *ctx = (void *)&c->context.c; const void *keysched = encrypt ? ctx->keyschenc32 : ctx->keyschdec32; ocb_crypt_fn_t crypt_fn = encrypt ? _gcry_aes_ocb_enc_armv8_ce : _gcry_aes_ocb_dec_armv8_ce; unsigned char *outbuf = outbuf_arg; const unsigned char *inbuf = inbuf_arg; unsigned int nrounds = ctx->rounds; u64 blkn = c->u_mode.ocb.data_nblocks; c->u_mode.ocb.data_nblocks = blkn + nblocks; crypt_fn(keysched, outbuf, inbuf, c->u_iv.iv, c->u_ctr.ctr, c->u_mode.ocb.L[0], nblocks, nrounds, (unsigned int)blkn); } void _gcry_aes_armv8_ce_ocb_auth (gcry_cipher_hd_t c, void *abuf_arg, size_t nblocks) { RIJNDAEL_context *ctx = (void *)&c->context.c; const void *keysched = ctx->keyschenc32; const unsigned char *abuf = abuf_arg; unsigned int nrounds = ctx->rounds; u64 blkn = c->u_mode.ocb.aad_nblocks; c->u_mode.ocb.aad_nblocks = blkn + nblocks; _gcry_aes_ocb_auth_armv8_ce(keysched, abuf, c->u_mode.ocb.aad_offset, c->u_mode.ocb.aad_sum, c->u_mode.ocb.L[0], nblocks, nrounds, (unsigned int)blkn); } +void +_gcry_aes_armv8_ce_xts_crypt (RIJNDAEL_context *ctx, unsigned char *tweak, + unsigned char *outbuf, const unsigned char *inbuf, + size_t nblocks, int encrypt) +{ + const void *keysched = encrypt ? ctx->keyschenc32 : ctx->keyschdec32; + xts_crypt_fn_t crypt_fn = encrypt ? _gcry_aes_xts_enc_armv8_ce + : _gcry_aes_xts_dec_armv8_ce; + unsigned int nrounds = ctx->rounds; + + crypt_fn(keysched, outbuf, inbuf, tweak, nblocks, nrounds); +} + #endif /* USE_ARM_CE */ diff --git a/cipher/rijndael.c b/cipher/rijndael.c index df1363f2..0f676fe1 100644 --- a/cipher/rijndael.c +++ b/cipher/rijndael.c @@ -1,2068 +1,2080 @@ /* Rijndael (AES) for GnuPG * Copyright (C) 2000, 2001, 2002, 2003, 2007, * 2008, 2011, 2012 Free Software Foundation, Inc. * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . ******************************************************************* * The code here is based on the optimized implementation taken from * http://www.esat.kuleuven.ac.be/~rijmen/rijndael/ on Oct 2, 2000, * which carries this notice: *------------------------------------------ * rijndael-alg-fst.c v2.3 April '2000 * * Optimised ANSI C code * * authors: v1.0: Antoon Bosselaers * v2.0: Vincent Rijmen * v2.3: Paulo Barreto * * This code is placed in the public domain. *------------------------------------------ * * The SP800-38a document is available at: * http://csrc.nist.gov/publications/nistpubs/800-38a/sp800-38a.pdf * */ #include #include #include #include /* for memcmp() */ #include "types.h" /* for byte and u32 typedefs */ #include "g10lib.h" #include "cipher.h" #include "bufhelp.h" #include "cipher-selftest.h" #include "rijndael-internal.h" #include "./cipher-internal.h" #ifdef USE_AMD64_ASM /* AMD64 assembly implementations of AES */ extern unsigned int _gcry_aes_amd64_encrypt_block(const void *keysched_enc, unsigned char *out, const unsigned char *in, int rounds, const void *encT); extern unsigned int _gcry_aes_amd64_decrypt_block(const void *keysched_dec, unsigned char *out, const unsigned char *in, int rounds, const void *decT); #endif /*USE_AMD64_ASM*/ #ifdef USE_AESNI /* AES-NI (AMD64 & i386) accelerated implementations of AES */ extern void _gcry_aes_aesni_do_setkey(RIJNDAEL_context *ctx, const byte *key); extern void _gcry_aes_aesni_prepare_decryption(RIJNDAEL_context *ctx); extern unsigned int _gcry_aes_aesni_encrypt (const RIJNDAEL_context *ctx, unsigned char *dst, const unsigned char *src); extern unsigned int _gcry_aes_aesni_decrypt (const RIJNDAEL_context *ctx, unsigned char *dst, const unsigned char *src); extern void _gcry_aes_aesni_cfb_enc (RIJNDAEL_context *ctx, unsigned char *outbuf, const unsigned char *inbuf, unsigned char *iv, size_t nblocks); extern void _gcry_aes_aesni_cbc_enc (RIJNDAEL_context *ctx, unsigned char *outbuf, const unsigned char *inbuf, unsigned char *iv, size_t nblocks, int cbc_mac); extern void _gcry_aes_aesni_ctr_enc (RIJNDAEL_context *ctx, unsigned char *outbuf, const unsigned char *inbuf, unsigned char *ctr, size_t nblocks); extern void _gcry_aes_aesni_cfb_dec (RIJNDAEL_context *ctx, unsigned char *outbuf, const unsigned char *inbuf, unsigned char *iv, size_t nblocks); extern void _gcry_aes_aesni_cbc_dec (RIJNDAEL_context *ctx, unsigned char *outbuf, const unsigned char *inbuf, unsigned char *iv, size_t nblocks); extern void _gcry_aes_aesni_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int encrypt); extern void _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks); extern void _gcry_aes_aesni_xts_crypt (RIJNDAEL_context *ctx, unsigned char *tweak, unsigned char *outbuf, const unsigned char *inbuf, size_t nblocks, int encrypt); #endif #ifdef USE_SSSE3 /* SSSE3 (AMD64) vector permutation implementation of AES */ extern void _gcry_aes_ssse3_do_setkey(RIJNDAEL_context *ctx, const byte *key); extern void _gcry_aes_ssse3_prepare_decryption(RIJNDAEL_context *ctx); extern unsigned int _gcry_aes_ssse3_encrypt (const RIJNDAEL_context *ctx, unsigned char *dst, const unsigned char *src); extern unsigned int _gcry_aes_ssse3_decrypt (const RIJNDAEL_context *ctx, unsigned char *dst, const unsigned char *src); extern void _gcry_aes_ssse3_cfb_enc (RIJNDAEL_context *ctx, 
unsigned char *outbuf, const unsigned char *inbuf, unsigned char *iv, size_t nblocks); extern void _gcry_aes_ssse3_cbc_enc (RIJNDAEL_context *ctx, unsigned char *outbuf, const unsigned char *inbuf, unsigned char *iv, size_t nblocks, int cbc_mac); extern void _gcry_aes_ssse3_ctr_enc (RIJNDAEL_context *ctx, unsigned char *outbuf, const unsigned char *inbuf, unsigned char *ctr, size_t nblocks); extern void _gcry_aes_ssse3_cfb_dec (RIJNDAEL_context *ctx, unsigned char *outbuf, const unsigned char *inbuf, unsigned char *iv, size_t nblocks); extern void _gcry_aes_ssse3_cbc_dec (RIJNDAEL_context *ctx, unsigned char *outbuf, const unsigned char *inbuf, unsigned char *iv, size_t nblocks); extern void _gcry_aes_ssse3_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int encrypt); extern void _gcry_aes_ssse3_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks); #endif #ifdef USE_PADLOCK extern unsigned int _gcry_aes_padlock_encrypt (const RIJNDAEL_context *ctx, unsigned char *bx, const unsigned char *ax); extern unsigned int _gcry_aes_padlock_decrypt (const RIJNDAEL_context *ctx, unsigned char *bx, const unsigned char *ax); #endif #ifdef USE_ARM_ASM /* ARM assembly implementations of AES */ extern unsigned int _gcry_aes_arm_encrypt_block(const void *keysched_enc, unsigned char *out, const unsigned char *in, int rounds, const void *encT); extern unsigned int _gcry_aes_arm_decrypt_block(const void *keysched_dec, unsigned char *out, const unsigned char *in, int rounds, const void *decT); #endif /*USE_ARM_ASM*/ #ifdef USE_ARM_CE /* ARMv8 Crypto Extension implementations of AES */ extern void _gcry_aes_armv8_ce_setkey(RIJNDAEL_context *ctx, const byte *key); extern void _gcry_aes_armv8_ce_prepare_decryption(RIJNDAEL_context *ctx); extern unsigned int _gcry_aes_armv8_ce_encrypt(const RIJNDAEL_context *ctx, unsigned char *dst, const unsigned char *src); extern unsigned int _gcry_aes_armv8_ce_decrypt(const RIJNDAEL_context *ctx, unsigned char *dst, const unsigned char *src); extern void _gcry_aes_armv8_ce_cfb_enc (RIJNDAEL_context *ctx, unsigned char *outbuf, const unsigned char *inbuf, unsigned char *iv, size_t nblocks); extern void _gcry_aes_armv8_ce_cbc_enc (RIJNDAEL_context *ctx, unsigned char *outbuf, const unsigned char *inbuf, unsigned char *iv, size_t nblocks, int cbc_mac); extern void _gcry_aes_armv8_ce_ctr_enc (RIJNDAEL_context *ctx, unsigned char *outbuf, const unsigned char *inbuf, unsigned char *ctr, size_t nblocks); extern void _gcry_aes_armv8_ce_cfb_dec (RIJNDAEL_context *ctx, unsigned char *outbuf, const unsigned char *inbuf, unsigned char *iv, size_t nblocks); extern void _gcry_aes_armv8_ce_cbc_dec (RIJNDAEL_context *ctx, unsigned char *outbuf, const unsigned char *inbuf, unsigned char *iv, size_t nblocks); extern void _gcry_aes_armv8_ce_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int encrypt); extern void _gcry_aes_armv8_ce_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks); +extern void _gcry_aes_armv8_ce_xts_crypt (RIJNDAEL_context *ctx, + unsigned char *tweak, + unsigned char *outbuf, + const unsigned char *inbuf, + size_t nblocks, int encrypt); #endif /*USE_ARM_ASM*/ static unsigned int do_encrypt (const RIJNDAEL_context *ctx, unsigned char *bx, const unsigned char *ax); static unsigned int do_decrypt (const RIJNDAEL_context *ctx, unsigned char *bx, const unsigned char *ax); /* All the numbers. */ #include "rijndael-tables.h" /* Function prototypes. 
*/ static const char *selftest(void); /* Prefetching for encryption/decryption tables. */ static void prefetch_table(const volatile byte *tab, size_t len) { size_t i; for (i = 0; i < len; i += 8 * 32) { (void)tab[i + 0 * 32]; (void)tab[i + 1 * 32]; (void)tab[i + 2 * 32]; (void)tab[i + 3 * 32]; (void)tab[i + 4 * 32]; (void)tab[i + 5 * 32]; (void)tab[i + 6 * 32]; (void)tab[i + 7 * 32]; } (void)tab[len - 1]; } static void prefetch_enc(void) { prefetch_table((const void *)encT, sizeof(encT)); } static void prefetch_dec(void) { prefetch_table((const void *)&dec_tables, sizeof(dec_tables)); } /* Perform the key setup. */ static gcry_err_code_t do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen) { static int initialized = 0; static const char *selftest_failed = 0; int rounds; int i,j, r, t, rconpointer = 0; int KC; #if defined(USE_AESNI) || defined(USE_PADLOCK) || defined(USE_SSSE3) \ || defined(USE_ARM_CE) unsigned int hwfeatures; #endif /* The on-the-fly self tests are only run in non-fips mode. In fips mode explicit self-tests are required. Actually the on-the-fly self-tests are not fully thread-safe and it might happen that a failed self-test won't get noticed in another thread. FIXME: We might want to have a central registry of succeeded self-tests. */ if (!fips_mode () && !initialized) { initialized = 1; selftest_failed = selftest (); if (selftest_failed) log_error ("%s\n", selftest_failed ); } if (selftest_failed) return GPG_ERR_SELFTEST_FAILED; if( keylen == 128/8 ) { rounds = 10; KC = 4; } else if ( keylen == 192/8 ) { rounds = 12; KC = 6; } else if ( keylen == 256/8 ) { rounds = 14; KC = 8; } else return GPG_ERR_INV_KEYLEN; ctx->rounds = rounds; #if defined(USE_AESNI) || defined(USE_PADLOCK) || defined(USE_SSSE3) \ || defined(USE_ARM_CE) hwfeatures = _gcry_get_hw_features (); #endif ctx->decryption_prepared = 0; #ifdef USE_PADLOCK ctx->use_padlock = 0; #endif #ifdef USE_AESNI ctx->use_aesni = 0; #endif #ifdef USE_SSSE3 ctx->use_ssse3 = 0; #endif #ifdef USE_ARM_CE ctx->use_arm_ce = 0; #endif if (0) { ; } #ifdef USE_AESNI else if (hwfeatures & HWF_INTEL_AESNI) { ctx->encrypt_fn = _gcry_aes_aesni_encrypt; ctx->decrypt_fn = _gcry_aes_aesni_decrypt; ctx->prefetch_enc_fn = NULL; ctx->prefetch_dec_fn = NULL; ctx->use_aesni = 1; } #endif #ifdef USE_PADLOCK else if (hwfeatures & HWF_PADLOCK_AES && keylen == 128/8) { ctx->encrypt_fn = _gcry_aes_padlock_encrypt; ctx->decrypt_fn = _gcry_aes_padlock_decrypt; ctx->prefetch_enc_fn = NULL; ctx->prefetch_dec_fn = NULL; ctx->use_padlock = 1; memcpy (ctx->padlockkey, key, keylen); } #endif #ifdef USE_SSSE3 else if (hwfeatures & HWF_INTEL_SSSE3) { ctx->encrypt_fn = _gcry_aes_ssse3_encrypt; ctx->decrypt_fn = _gcry_aes_ssse3_decrypt; ctx->prefetch_enc_fn = NULL; ctx->prefetch_dec_fn = NULL; ctx->use_ssse3 = 1; } #endif #ifdef USE_ARM_CE else if (hwfeatures & HWF_ARM_AES) { ctx->encrypt_fn = _gcry_aes_armv8_ce_encrypt; ctx->decrypt_fn = _gcry_aes_armv8_ce_decrypt; ctx->prefetch_enc_fn = NULL; ctx->prefetch_dec_fn = NULL; ctx->use_arm_ce = 1; } #endif else { ctx->encrypt_fn = do_encrypt; ctx->decrypt_fn = do_decrypt; ctx->prefetch_enc_fn = prefetch_enc; ctx->prefetch_dec_fn = prefetch_dec; } /* NB: We don't yet support Padlock hardware key generation. 
*/ if (0) { ; } #ifdef USE_AESNI else if (ctx->use_aesni) _gcry_aes_aesni_do_setkey (ctx, key); #endif #ifdef USE_SSSE3 else if (ctx->use_ssse3) _gcry_aes_ssse3_do_setkey (ctx, key); #endif #ifdef USE_ARM_CE else if (ctx->use_arm_ce) _gcry_aes_armv8_ce_setkey (ctx, key); #endif else { const byte *sbox = ((const byte *)encT) + 1; union { PROPERLY_ALIGNED_TYPE dummy; byte data[MAXKC][4]; u32 data32[MAXKC]; } tkk[2]; #define k tkk[0].data #define k_u32 tkk[0].data32 #define tk tkk[1].data #define tk_u32 tkk[1].data32 #define W (ctx->keyschenc) #define W_u32 (ctx->keyschenc32) prefetch_enc(); for (i = 0; i < keylen; i++) { k[i >> 2][i & 3] = key[i]; } for (j = KC-1; j >= 0; j--) { tk_u32[j] = k_u32[j]; } r = 0; t = 0; /* Copy values into round key array. */ for (j = 0; (j < KC) && (r < rounds + 1); ) { for (; (j < KC) && (t < 4); j++, t++) { W_u32[r][t] = le_bswap32(tk_u32[j]); } if (t == 4) { r++; t = 0; } } while (r < rounds + 1) { /* While not enough round key material calculated calculate new values. */ tk[0][0] ^= sbox[tk[KC-1][1] * 4]; tk[0][1] ^= sbox[tk[KC-1][2] * 4]; tk[0][2] ^= sbox[tk[KC-1][3] * 4]; tk[0][3] ^= sbox[tk[KC-1][0] * 4]; tk[0][0] ^= rcon[rconpointer++]; if (KC != 8) { for (j = 1; j < KC; j++) { tk_u32[j] ^= tk_u32[j-1]; } } else { for (j = 1; j < KC/2; j++) { tk_u32[j] ^= tk_u32[j-1]; } tk[KC/2][0] ^= sbox[tk[KC/2 - 1][0] * 4]; tk[KC/2][1] ^= sbox[tk[KC/2 - 1][1] * 4]; tk[KC/2][2] ^= sbox[tk[KC/2 - 1][2] * 4]; tk[KC/2][3] ^= sbox[tk[KC/2 - 1][3] * 4]; for (j = KC/2 + 1; j < KC; j++) { tk_u32[j] ^= tk_u32[j-1]; } } /* Copy values into round key array. */ for (j = 0; (j < KC) && (r < rounds + 1); ) { for (; (j < KC) && (t < 4); j++, t++) { W_u32[r][t] = le_bswap32(tk_u32[j]); } if (t == 4) { r++; t = 0; } } } #undef W #undef tk #undef k #undef W_u32 #undef tk_u32 #undef k_u32 wipememory(&tkk, sizeof(tkk)); } return 0; } static gcry_err_code_t rijndael_setkey (void *context, const byte *key, const unsigned keylen) { RIJNDAEL_context *ctx = context; return do_setkey (ctx, key, keylen); } /* Make a decryption key from an encryption key. */ static void prepare_decryption( RIJNDAEL_context *ctx ) { int r; if (0) ; #ifdef USE_AESNI else if (ctx->use_aesni) { _gcry_aes_aesni_prepare_decryption (ctx); } #endif /*USE_AESNI*/ #ifdef USE_SSSE3 else if (ctx->use_ssse3) { _gcry_aes_ssse3_prepare_decryption (ctx); } #endif /*USE_SSSE3*/ #ifdef USE_ARM_CE else if (ctx->use_arm_ce) { _gcry_aes_armv8_ce_prepare_decryption (ctx); } #endif /*USE_SSSE3*/ #ifdef USE_PADLOCK else if (ctx->use_padlock) { /* Padlock does not need decryption subkeys. 
*/ } #endif /*USE_PADLOCK*/ else { const byte *sbox = ((const byte *)encT) + 1; prefetch_enc(); prefetch_dec(); ctx->keyschdec32[0][0] = ctx->keyschenc32[0][0]; ctx->keyschdec32[0][1] = ctx->keyschenc32[0][1]; ctx->keyschdec32[0][2] = ctx->keyschenc32[0][2]; ctx->keyschdec32[0][3] = ctx->keyschenc32[0][3]; for (r = 1; r < ctx->rounds; r++) { u32 *wi = ctx->keyschenc32[r]; u32 *wo = ctx->keyschdec32[r]; u32 wt; wt = wi[0]; wo[0] = rol(decT[sbox[(byte)(wt >> 0) * 4]], 8 * 0) ^ rol(decT[sbox[(byte)(wt >> 8) * 4]], 8 * 1) ^ rol(decT[sbox[(byte)(wt >> 16) * 4]], 8 * 2) ^ rol(decT[sbox[(byte)(wt >> 24) * 4]], 8 * 3); wt = wi[1]; wo[1] = rol(decT[sbox[(byte)(wt >> 0) * 4]], 8 * 0) ^ rol(decT[sbox[(byte)(wt >> 8) * 4]], 8 * 1) ^ rol(decT[sbox[(byte)(wt >> 16) * 4]], 8 * 2) ^ rol(decT[sbox[(byte)(wt >> 24) * 4]], 8 * 3); wt = wi[2]; wo[2] = rol(decT[sbox[(byte)(wt >> 0) * 4]], 8 * 0) ^ rol(decT[sbox[(byte)(wt >> 8) * 4]], 8 * 1) ^ rol(decT[sbox[(byte)(wt >> 16) * 4]], 8 * 2) ^ rol(decT[sbox[(byte)(wt >> 24) * 4]], 8 * 3); wt = wi[3]; wo[3] = rol(decT[sbox[(byte)(wt >> 0) * 4]], 8 * 0) ^ rol(decT[sbox[(byte)(wt >> 8) * 4]], 8 * 1) ^ rol(decT[sbox[(byte)(wt >> 16) * 4]], 8 * 2) ^ rol(decT[sbox[(byte)(wt >> 24) * 4]], 8 * 3); } ctx->keyschdec32[r][0] = ctx->keyschenc32[r][0]; ctx->keyschdec32[r][1] = ctx->keyschenc32[r][1]; ctx->keyschdec32[r][2] = ctx->keyschenc32[r][2]; ctx->keyschdec32[r][3] = ctx->keyschenc32[r][3]; } } #if !defined(USE_ARM_ASM) && !defined(USE_AMD64_ASM) /* Encrypt one block. A and B may be the same. */ static unsigned int do_encrypt_fn (const RIJNDAEL_context *ctx, unsigned char *b, const unsigned char *a) { #define rk (ctx->keyschenc32) const byte *sbox = ((const byte *)encT) + 1; int rounds = ctx->rounds; int r; u32 sa[4]; u32 sb[4]; sb[0] = buf_get_le32(a + 0); sb[1] = buf_get_le32(a + 4); sb[2] = buf_get_le32(a + 8); sb[3] = buf_get_le32(a + 12); sa[0] = sb[0] ^ rk[0][0]; sa[1] = sb[1] ^ rk[0][1]; sa[2] = sb[2] ^ rk[0][2]; sa[3] = sb[3] ^ rk[0][3]; sb[0] = rol(encT[(byte)(sa[0] >> (0 * 8))], (0 * 8)); sb[3] = rol(encT[(byte)(sa[0] >> (1 * 8))], (1 * 8)); sb[2] = rol(encT[(byte)(sa[0] >> (2 * 8))], (2 * 8)); sb[1] = rol(encT[(byte)(sa[0] >> (3 * 8))], (3 * 8)); sa[0] = rk[1][0] ^ sb[0]; sb[1] ^= rol(encT[(byte)(sa[1] >> (0 * 8))], (0 * 8)); sa[0] ^= rol(encT[(byte)(sa[1] >> (1 * 8))], (1 * 8)); sb[3] ^= rol(encT[(byte)(sa[1] >> (2 * 8))], (2 * 8)); sb[2] ^= rol(encT[(byte)(sa[1] >> (3 * 8))], (3 * 8)); sa[1] = rk[1][1] ^ sb[1]; sb[2] ^= rol(encT[(byte)(sa[2] >> (0 * 8))], (0 * 8)); sa[1] ^= rol(encT[(byte)(sa[2] >> (1 * 8))], (1 * 8)); sa[0] ^= rol(encT[(byte)(sa[2] >> (2 * 8))], (2 * 8)); sb[3] ^= rol(encT[(byte)(sa[2] >> (3 * 8))], (3 * 8)); sa[2] = rk[1][2] ^ sb[2]; sb[3] ^= rol(encT[(byte)(sa[3] >> (0 * 8))], (0 * 8)); sa[2] ^= rol(encT[(byte)(sa[3] >> (1 * 8))], (1 * 8)); sa[1] ^= rol(encT[(byte)(sa[3] >> (2 * 8))], (2 * 8)); sa[0] ^= rol(encT[(byte)(sa[3] >> (3 * 8))], (3 * 8)); sa[3] = rk[1][3] ^ sb[3]; for (r = 2; r < rounds; r++) { sb[0] = rol(encT[(byte)(sa[0] >> (0 * 8))], (0 * 8)); sb[3] = rol(encT[(byte)(sa[0] >> (1 * 8))], (1 * 8)); sb[2] = rol(encT[(byte)(sa[0] >> (2 * 8))], (2 * 8)); sb[1] = rol(encT[(byte)(sa[0] >> (3 * 8))], (3 * 8)); sa[0] = rk[r][0] ^ sb[0]; sb[1] ^= rol(encT[(byte)(sa[1] >> (0 * 8))], (0 * 8)); sa[0] ^= rol(encT[(byte)(sa[1] >> (1 * 8))], (1 * 8)); sb[3] ^= rol(encT[(byte)(sa[1] >> (2 * 8))], (2 * 8)); sb[2] ^= rol(encT[(byte)(sa[1] >> (3 * 8))], (3 * 8)); sa[1] = rk[r][1] ^ sb[1]; sb[2] ^= rol(encT[(byte)(sa[2] >> (0 * 8))], (0 * 8)); 
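/* Each rol(encT[...]) term in this loop looks up one state byte in the
 * combined S-box + MixColumns table and rotates the 32-bit entry into the
 * byte position it contributes to; four such lookups XORed with a round-key
 * word produce one output word.  sb[] accumulates the lookups, sa[] takes the
 * result once the round key is mixed in, and the loop body is unrolled two
 * rounds deep (note the extra r++ below). */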
sa[1] ^= rol(encT[(byte)(sa[2] >> (1 * 8))], (1 * 8)); sa[0] ^= rol(encT[(byte)(sa[2] >> (2 * 8))], (2 * 8)); sb[3] ^= rol(encT[(byte)(sa[2] >> (3 * 8))], (3 * 8)); sa[2] = rk[r][2] ^ sb[2]; sb[3] ^= rol(encT[(byte)(sa[3] >> (0 * 8))], (0 * 8)); sa[2] ^= rol(encT[(byte)(sa[3] >> (1 * 8))], (1 * 8)); sa[1] ^= rol(encT[(byte)(sa[3] >> (2 * 8))], (2 * 8)); sa[0] ^= rol(encT[(byte)(sa[3] >> (3 * 8))], (3 * 8)); sa[3] = rk[r][3] ^ sb[3]; r++; sb[0] = rol(encT[(byte)(sa[0] >> (0 * 8))], (0 * 8)); sb[3] = rol(encT[(byte)(sa[0] >> (1 * 8))], (1 * 8)); sb[2] = rol(encT[(byte)(sa[0] >> (2 * 8))], (2 * 8)); sb[1] = rol(encT[(byte)(sa[0] >> (3 * 8))], (3 * 8)); sa[0] = rk[r][0] ^ sb[0]; sb[1] ^= rol(encT[(byte)(sa[1] >> (0 * 8))], (0 * 8)); sa[0] ^= rol(encT[(byte)(sa[1] >> (1 * 8))], (1 * 8)); sb[3] ^= rol(encT[(byte)(sa[1] >> (2 * 8))], (2 * 8)); sb[2] ^= rol(encT[(byte)(sa[1] >> (3 * 8))], (3 * 8)); sa[1] = rk[r][1] ^ sb[1]; sb[2] ^= rol(encT[(byte)(sa[2] >> (0 * 8))], (0 * 8)); sa[1] ^= rol(encT[(byte)(sa[2] >> (1 * 8))], (1 * 8)); sa[0] ^= rol(encT[(byte)(sa[2] >> (2 * 8))], (2 * 8)); sb[3] ^= rol(encT[(byte)(sa[2] >> (3 * 8))], (3 * 8)); sa[2] = rk[r][2] ^ sb[2]; sb[3] ^= rol(encT[(byte)(sa[3] >> (0 * 8))], (0 * 8)); sa[2] ^= rol(encT[(byte)(sa[3] >> (1 * 8))], (1 * 8)); sa[1] ^= rol(encT[(byte)(sa[3] >> (2 * 8))], (2 * 8)); sa[0] ^= rol(encT[(byte)(sa[3] >> (3 * 8))], (3 * 8)); sa[3] = rk[r][3] ^ sb[3]; } /* Last round is special. */ sb[0] = (sbox[(byte)(sa[0] >> (0 * 8)) * 4]) << (0 * 8); sb[3] = (sbox[(byte)(sa[0] >> (1 * 8)) * 4]) << (1 * 8); sb[2] = (sbox[(byte)(sa[0] >> (2 * 8)) * 4]) << (2 * 8); sb[1] = (sbox[(byte)(sa[0] >> (3 * 8)) * 4]) << (3 * 8); sa[0] = rk[r][0] ^ sb[0]; sb[1] ^= (sbox[(byte)(sa[1] >> (0 * 8)) * 4]) << (0 * 8); sa[0] ^= (sbox[(byte)(sa[1] >> (1 * 8)) * 4]) << (1 * 8); sb[3] ^= (sbox[(byte)(sa[1] >> (2 * 8)) * 4]) << (2 * 8); sb[2] ^= (sbox[(byte)(sa[1] >> (3 * 8)) * 4]) << (3 * 8); sa[1] = rk[r][1] ^ sb[1]; sb[2] ^= (sbox[(byte)(sa[2] >> (0 * 8)) * 4]) << (0 * 8); sa[1] ^= (sbox[(byte)(sa[2] >> (1 * 8)) * 4]) << (1 * 8); sa[0] ^= (sbox[(byte)(sa[2] >> (2 * 8)) * 4]) << (2 * 8); sb[3] ^= (sbox[(byte)(sa[2] >> (3 * 8)) * 4]) << (3 * 8); sa[2] = rk[r][2] ^ sb[2]; sb[3] ^= (sbox[(byte)(sa[3] >> (0 * 8)) * 4]) << (0 * 8); sa[2] ^= (sbox[(byte)(sa[3] >> (1 * 8)) * 4]) << (1 * 8); sa[1] ^= (sbox[(byte)(sa[3] >> (2 * 8)) * 4]) << (2 * 8); sa[0] ^= (sbox[(byte)(sa[3] >> (3 * 8)) * 4]) << (3 * 8); sa[3] = rk[r][3] ^ sb[3]; buf_put_le32(b + 0, sa[0]); buf_put_le32(b + 4, sa[1]); buf_put_le32(b + 8, sa[2]); buf_put_le32(b + 12, sa[3]); #undef rk return (56 + 2*sizeof(int)); } #endif /*!USE_ARM_ASM && !USE_AMD64_ASM*/ static unsigned int do_encrypt (const RIJNDAEL_context *ctx, unsigned char *bx, const unsigned char *ax) { #ifdef USE_AMD64_ASM return _gcry_aes_amd64_encrypt_block(ctx->keyschenc, bx, ax, ctx->rounds, encT); #elif defined(USE_ARM_ASM) return _gcry_aes_arm_encrypt_block(ctx->keyschenc, bx, ax, ctx->rounds, encT); #else return do_encrypt_fn (ctx, bx, ax); #endif /* !USE_ARM_ASM && !USE_AMD64_ASM*/ } static unsigned int rijndael_encrypt (void *context, byte *b, const byte *a) { RIJNDAEL_context *ctx = context; if (ctx->prefetch_enc_fn) ctx->prefetch_enc_fn(); return ctx->encrypt_fn (ctx, b, a); } /* Bulk encryption of complete blocks in CFB mode. Caller needs to make sure that IV is aligned on an unsigned long boundary. This function is only intended for the bulk encryption feature of cipher.c. 
*/ void _gcry_aes_cfb_enc (void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks) { RIJNDAEL_context *ctx = context; unsigned char *outbuf = outbuf_arg; const unsigned char *inbuf = inbuf_arg; unsigned int burn_depth = 0; if (ctx->prefetch_enc_fn) ctx->prefetch_enc_fn(); if (0) ; #ifdef USE_AESNI else if (ctx->use_aesni) { _gcry_aes_aesni_cfb_enc (ctx, outbuf, inbuf, iv, nblocks); burn_depth = 0; } #endif /*USE_AESNI*/ #ifdef USE_SSSE3 else if (ctx->use_ssse3) { _gcry_aes_ssse3_cfb_enc (ctx, outbuf, inbuf, iv, nblocks); burn_depth = 0; } #endif /*USE_SSSE3*/ #ifdef USE_ARM_CE else if (ctx->use_arm_ce) { _gcry_aes_armv8_ce_cfb_enc (ctx, outbuf, inbuf, iv, nblocks); burn_depth = 0; } #endif /*USE_ARM_CE*/ else { rijndael_cryptfn_t encrypt_fn = ctx->encrypt_fn; for ( ;nblocks; nblocks-- ) { /* Encrypt the IV. */ burn_depth = encrypt_fn (ctx, iv, iv); /* XOR the input with the IV and store input into IV. */ buf_xor_2dst(outbuf, iv, inbuf, BLOCKSIZE); outbuf += BLOCKSIZE; inbuf += BLOCKSIZE; } } if (burn_depth) _gcry_burn_stack (burn_depth + 4 * sizeof(void *)); } /* Bulk encryption of complete blocks in CBC mode. Caller needs to make sure that IV is aligned on an unsigned long boundary. This function is only intended for the bulk encryption feature of cipher.c. */ void _gcry_aes_cbc_enc (void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int cbc_mac) { RIJNDAEL_context *ctx = context; unsigned char *outbuf = outbuf_arg; const unsigned char *inbuf = inbuf_arg; unsigned char *last_iv; unsigned int burn_depth = 0; if (ctx->prefetch_enc_fn) ctx->prefetch_enc_fn(); if (0) ; #ifdef USE_AESNI else if (ctx->use_aesni) { _gcry_aes_aesni_cbc_enc (ctx, outbuf, inbuf, iv, nblocks, cbc_mac); burn_depth = 0; } #endif /*USE_AESNI*/ #ifdef USE_SSSE3 else if (ctx->use_ssse3) { _gcry_aes_ssse3_cbc_enc (ctx, outbuf, inbuf, iv, nblocks, cbc_mac); burn_depth = 0; } #endif /*USE_SSSE3*/ #ifdef USE_ARM_CE else if (ctx->use_arm_ce) { _gcry_aes_armv8_ce_cbc_enc (ctx, outbuf, inbuf, iv, nblocks, cbc_mac); burn_depth = 0; } #endif /*USE_ARM_CE*/ else { rijndael_cryptfn_t encrypt_fn = ctx->encrypt_fn; last_iv = iv; for ( ;nblocks; nblocks-- ) { buf_xor(outbuf, inbuf, last_iv, BLOCKSIZE); burn_depth = encrypt_fn (ctx, outbuf, outbuf); last_iv = outbuf; inbuf += BLOCKSIZE; if (!cbc_mac) outbuf += BLOCKSIZE; } if (last_iv != iv) buf_cpy (iv, last_iv, BLOCKSIZE); } if (burn_depth) _gcry_burn_stack (burn_depth + 4 * sizeof(void *)); } /* Bulk encryption of complete blocks in CTR mode. Caller needs to make sure that CTR is aligned on a 16 byte boundary if AESNI; the minimum alignment is for an u32. This function is only intended for the bulk encryption feature of cipher.c. CTR is expected to be of size BLOCKSIZE. 
*/ void _gcry_aes_ctr_enc (void *context, unsigned char *ctr, void *outbuf_arg, const void *inbuf_arg, size_t nblocks) { RIJNDAEL_context *ctx = context; unsigned char *outbuf = outbuf_arg; const unsigned char *inbuf = inbuf_arg; unsigned int burn_depth = 0; int i; if (ctx->prefetch_enc_fn) ctx->prefetch_enc_fn(); if (0) ; #ifdef USE_AESNI else if (ctx->use_aesni) { _gcry_aes_aesni_ctr_enc (ctx, outbuf, inbuf, ctr, nblocks); burn_depth = 0; } #endif /*USE_AESNI*/ #ifdef USE_SSSE3 else if (ctx->use_ssse3) { _gcry_aes_ssse3_ctr_enc (ctx, outbuf, inbuf, ctr, nblocks); burn_depth = 0; } #endif /*USE_SSSE3*/ #ifdef USE_ARM_CE else if (ctx->use_arm_ce) { _gcry_aes_armv8_ce_ctr_enc (ctx, outbuf, inbuf, ctr, nblocks); burn_depth = 0; } #endif /*USE_ARM_CE*/ else { union { unsigned char x1[16] ATTR_ALIGNED_16; u32 x32[4]; } tmp; rijndael_cryptfn_t encrypt_fn = ctx->encrypt_fn; for ( ;nblocks; nblocks-- ) { /* Encrypt the counter. */ burn_depth = encrypt_fn (ctx, tmp.x1, ctr); /* XOR the input with the encrypted counter and store in output. */ buf_xor(outbuf, tmp.x1, inbuf, BLOCKSIZE); outbuf += BLOCKSIZE; inbuf += BLOCKSIZE; /* Increment the counter. */ for (i = BLOCKSIZE; i > 0; i--) { ctr[i-1]++; if (ctr[i-1]) break; } } wipememory(&tmp, sizeof(tmp)); } if (burn_depth) _gcry_burn_stack (burn_depth + 4 * sizeof(void *)); } #if !defined(USE_ARM_ASM) && !defined(USE_AMD64_ASM) /* Decrypt one block. A and B may be the same. */ static unsigned int do_decrypt_fn (const RIJNDAEL_context *ctx, unsigned char *b, const unsigned char *a) { #define rk (ctx->keyschdec32) int rounds = ctx->rounds; int r; u32 sa[4]; u32 sb[4]; sb[0] = buf_get_le32(a + 0); sb[1] = buf_get_le32(a + 4); sb[2] = buf_get_le32(a + 8); sb[3] = buf_get_le32(a + 12); sa[0] = sb[0] ^ rk[rounds][0]; sa[1] = sb[1] ^ rk[rounds][1]; sa[2] = sb[2] ^ rk[rounds][2]; sa[3] = sb[3] ^ rk[rounds][3]; for (r = rounds - 1; r > 1; r--) { sb[0] = rol(decT[(byte)(sa[0] >> (0 * 8))], (0 * 8)); sb[1] = rol(decT[(byte)(sa[0] >> (1 * 8))], (1 * 8)); sb[2] = rol(decT[(byte)(sa[0] >> (2 * 8))], (2 * 8)); sb[3] = rol(decT[(byte)(sa[0] >> (3 * 8))], (3 * 8)); sa[0] = rk[r][0] ^ sb[0]; sb[1] ^= rol(decT[(byte)(sa[1] >> (0 * 8))], (0 * 8)); sb[2] ^= rol(decT[(byte)(sa[1] >> (1 * 8))], (1 * 8)); sb[3] ^= rol(decT[(byte)(sa[1] >> (2 * 8))], (2 * 8)); sa[0] ^= rol(decT[(byte)(sa[1] >> (3 * 8))], (3 * 8)); sa[1] = rk[r][1] ^ sb[1]; sb[2] ^= rol(decT[(byte)(sa[2] >> (0 * 8))], (0 * 8)); sb[3] ^= rol(decT[(byte)(sa[2] >> (1 * 8))], (1 * 8)); sa[0] ^= rol(decT[(byte)(sa[2] >> (2 * 8))], (2 * 8)); sa[1] ^= rol(decT[(byte)(sa[2] >> (3 * 8))], (3 * 8)); sa[2] = rk[r][2] ^ sb[2]; sb[3] ^= rol(decT[(byte)(sa[3] >> (0 * 8))], (0 * 8)); sa[0] ^= rol(decT[(byte)(sa[3] >> (1 * 8))], (1 * 8)); sa[1] ^= rol(decT[(byte)(sa[3] >> (2 * 8))], (2 * 8)); sa[2] ^= rol(decT[(byte)(sa[3] >> (3 * 8))], (3 * 8)); sa[3] = rk[r][3] ^ sb[3]; r--; sb[0] = rol(decT[(byte)(sa[0] >> (0 * 8))], (0 * 8)); sb[1] = rol(decT[(byte)(sa[0] >> (1 * 8))], (1 * 8)); sb[2] = rol(decT[(byte)(sa[0] >> (2 * 8))], (2 * 8)); sb[3] = rol(decT[(byte)(sa[0] >> (3 * 8))], (3 * 8)); sa[0] = rk[r][0] ^ sb[0]; sb[1] ^= rol(decT[(byte)(sa[1] >> (0 * 8))], (0 * 8)); sb[2] ^= rol(decT[(byte)(sa[1] >> (1 * 8))], (1 * 8)); sb[3] ^= rol(decT[(byte)(sa[1] >> (2 * 8))], (2 * 8)); sa[0] ^= rol(decT[(byte)(sa[1] >> (3 * 8))], (3 * 8)); sa[1] = rk[r][1] ^ sb[1]; sb[2] ^= rol(decT[(byte)(sa[2] >> (0 * 8))], (0 * 8)); sb[3] ^= rol(decT[(byte)(sa[2] >> (1 * 8))], (1 * 8)); sa[0] ^= rol(decT[(byte)(sa[2] >> (2 * 8))], (2 * 8)); sa[1] 
^= rol(decT[(byte)(sa[2] >> (3 * 8))], (3 * 8)); sa[2] = rk[r][2] ^ sb[2]; sb[3] ^= rol(decT[(byte)(sa[3] >> (0 * 8))], (0 * 8)); sa[0] ^= rol(decT[(byte)(sa[3] >> (1 * 8))], (1 * 8)); sa[1] ^= rol(decT[(byte)(sa[3] >> (2 * 8))], (2 * 8)); sa[2] ^= rol(decT[(byte)(sa[3] >> (3 * 8))], (3 * 8)); sa[3] = rk[r][3] ^ sb[3]; } sb[0] = rol(decT[(byte)(sa[0] >> (0 * 8))], (0 * 8)); sb[1] = rol(decT[(byte)(sa[0] >> (1 * 8))], (1 * 8)); sb[2] = rol(decT[(byte)(sa[0] >> (2 * 8))], (2 * 8)); sb[3] = rol(decT[(byte)(sa[0] >> (3 * 8))], (3 * 8)); sa[0] = rk[1][0] ^ sb[0]; sb[1] ^= rol(decT[(byte)(sa[1] >> (0 * 8))], (0 * 8)); sb[2] ^= rol(decT[(byte)(sa[1] >> (1 * 8))], (1 * 8)); sb[3] ^= rol(decT[(byte)(sa[1] >> (2 * 8))], (2 * 8)); sa[0] ^= rol(decT[(byte)(sa[1] >> (3 * 8))], (3 * 8)); sa[1] = rk[1][1] ^ sb[1]; sb[2] ^= rol(decT[(byte)(sa[2] >> (0 * 8))], (0 * 8)); sb[3] ^= rol(decT[(byte)(sa[2] >> (1 * 8))], (1 * 8)); sa[0] ^= rol(decT[(byte)(sa[2] >> (2 * 8))], (2 * 8)); sa[1] ^= rol(decT[(byte)(sa[2] >> (3 * 8))], (3 * 8)); sa[2] = rk[1][2] ^ sb[2]; sb[3] ^= rol(decT[(byte)(sa[3] >> (0 * 8))], (0 * 8)); sa[0] ^= rol(decT[(byte)(sa[3] >> (1 * 8))], (1 * 8)); sa[1] ^= rol(decT[(byte)(sa[3] >> (2 * 8))], (2 * 8)); sa[2] ^= rol(decT[(byte)(sa[3] >> (3 * 8))], (3 * 8)); sa[3] = rk[1][3] ^ sb[3]; /* Last round is special. */ sb[0] = inv_sbox[(byte)(sa[0] >> (0 * 8))] << (0 * 8); sb[1] = inv_sbox[(byte)(sa[0] >> (1 * 8))] << (1 * 8); sb[2] = inv_sbox[(byte)(sa[0] >> (2 * 8))] << (2 * 8); sb[3] = inv_sbox[(byte)(sa[0] >> (3 * 8))] << (3 * 8); sa[0] = sb[0] ^ rk[0][0]; sb[1] ^= inv_sbox[(byte)(sa[1] >> (0 * 8))] << (0 * 8); sb[2] ^= inv_sbox[(byte)(sa[1] >> (1 * 8))] << (1 * 8); sb[3] ^= inv_sbox[(byte)(sa[1] >> (2 * 8))] << (2 * 8); sa[0] ^= inv_sbox[(byte)(sa[1] >> (3 * 8))] << (3 * 8); sa[1] = sb[1] ^ rk[0][1]; sb[2] ^= inv_sbox[(byte)(sa[2] >> (0 * 8))] << (0 * 8); sb[3] ^= inv_sbox[(byte)(sa[2] >> (1 * 8))] << (1 * 8); sa[0] ^= inv_sbox[(byte)(sa[2] >> (2 * 8))] << (2 * 8); sa[1] ^= inv_sbox[(byte)(sa[2] >> (3 * 8))] << (3 * 8); sa[2] = sb[2] ^ rk[0][2]; sb[3] ^= inv_sbox[(byte)(sa[3] >> (0 * 8))] << (0 * 8); sa[0] ^= inv_sbox[(byte)(sa[3] >> (1 * 8))] << (1 * 8); sa[1] ^= inv_sbox[(byte)(sa[3] >> (2 * 8))] << (2 * 8); sa[2] ^= inv_sbox[(byte)(sa[3] >> (3 * 8))] << (3 * 8); sa[3] = sb[3] ^ rk[0][3]; buf_put_le32(b + 0, sa[0]); buf_put_le32(b + 4, sa[1]); buf_put_le32(b + 8, sa[2]); buf_put_le32(b + 12, sa[3]); #undef rk return (56+2*sizeof(int)); } #endif /*!USE_ARM_ASM && !USE_AMD64_ASM*/ /* Decrypt one block. AX and BX may be the same. */ static unsigned int do_decrypt (const RIJNDAEL_context *ctx, unsigned char *bx, const unsigned char *ax) { #ifdef USE_AMD64_ASM return _gcry_aes_amd64_decrypt_block(ctx->keyschdec, bx, ax, ctx->rounds, &dec_tables); #elif defined(USE_ARM_ASM) return _gcry_aes_arm_decrypt_block(ctx->keyschdec, bx, ax, ctx->rounds, &dec_tables); #else return do_decrypt_fn (ctx, bx, ax); #endif /*!USE_ARM_ASM && !USE_AMD64_ASM*/ } static inline void check_decryption_preparation (RIJNDAEL_context *ctx) { if ( !ctx->decryption_prepared ) { prepare_decryption ( ctx ); ctx->decryption_prepared = 1; } } static unsigned int rijndael_decrypt (void *context, byte *b, const byte *a) { RIJNDAEL_context *ctx = context; check_decryption_preparation (ctx); if (ctx->prefetch_dec_fn) ctx->prefetch_dec_fn(); return ctx->decrypt_fn (ctx, b, a); } /* Bulk decryption of complete blocks in CFB mode. Caller needs to make sure that IV is aligned on an unsigned long boundary. 
This function is only intended for the bulk encryption feature of cipher.c. */ void _gcry_aes_cfb_dec (void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks) { RIJNDAEL_context *ctx = context; unsigned char *outbuf = outbuf_arg; const unsigned char *inbuf = inbuf_arg; unsigned int burn_depth = 0; if (ctx->prefetch_enc_fn) ctx->prefetch_enc_fn(); if (0) ; #ifdef USE_AESNI else if (ctx->use_aesni) { _gcry_aes_aesni_cfb_dec (ctx, outbuf, inbuf, iv, nblocks); burn_depth = 0; } #endif /*USE_AESNI*/ #ifdef USE_SSSE3 else if (ctx->use_ssse3) { _gcry_aes_ssse3_cfb_dec (ctx, outbuf, inbuf, iv, nblocks); burn_depth = 0; } #endif /*USE_SSSE3*/ #ifdef USE_ARM_CE else if (ctx->use_arm_ce) { _gcry_aes_armv8_ce_cfb_dec (ctx, outbuf, inbuf, iv, nblocks); burn_depth = 0; } #endif /*USE_ARM_CE*/ else { rijndael_cryptfn_t encrypt_fn = ctx->encrypt_fn; for ( ;nblocks; nblocks-- ) { burn_depth = encrypt_fn (ctx, iv, iv); buf_xor_n_copy(outbuf, iv, inbuf, BLOCKSIZE); outbuf += BLOCKSIZE; inbuf += BLOCKSIZE; } } if (burn_depth) _gcry_burn_stack (burn_depth + 4 * sizeof(void *)); } /* Bulk decryption of complete blocks in CBC mode. Caller needs to make sure that IV is aligned on an unsigned long boundary. This function is only intended for the bulk encryption feature of cipher.c. */ void _gcry_aes_cbc_dec (void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks) { RIJNDAEL_context *ctx = context; unsigned char *outbuf = outbuf_arg; const unsigned char *inbuf = inbuf_arg; unsigned int burn_depth = 0; check_decryption_preparation (ctx); if (ctx->prefetch_dec_fn) ctx->prefetch_dec_fn(); if (0) ; #ifdef USE_AESNI else if (ctx->use_aesni) { _gcry_aes_aesni_cbc_dec (ctx, outbuf, inbuf, iv, nblocks); burn_depth = 0; } #endif /*USE_AESNI*/ #ifdef USE_SSSE3 else if (ctx->use_ssse3) { _gcry_aes_ssse3_cbc_dec (ctx, outbuf, inbuf, iv, nblocks); burn_depth = 0; } #endif /*USE_SSSE3*/ #ifdef USE_ARM_CE else if (ctx->use_arm_ce) { _gcry_aes_armv8_ce_cbc_dec (ctx, outbuf, inbuf, iv, nblocks); burn_depth = 0; } #endif /*USE_ARM_CE*/ else { unsigned char savebuf[BLOCKSIZE] ATTR_ALIGNED_16; rijndael_cryptfn_t decrypt_fn = ctx->decrypt_fn; for ( ;nblocks; nblocks-- ) { /* INBUF is needed later and it may be identical to OUTBUF, so store the intermediate result to SAVEBUF. */ burn_depth = decrypt_fn (ctx, savebuf, inbuf); buf_xor_n_copy_2(outbuf, savebuf, iv, inbuf, BLOCKSIZE); inbuf += BLOCKSIZE; outbuf += BLOCKSIZE; } wipememory(savebuf, sizeof(savebuf)); } if (burn_depth) _gcry_burn_stack (burn_depth + 4 * sizeof(void *)); } /* Bulk encryption/decryption of complete blocks in OCB mode. 
*/ size_t _gcry_aes_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int encrypt) { RIJNDAEL_context *ctx = (void *)&c->context.c; unsigned char *outbuf = outbuf_arg; const unsigned char *inbuf = inbuf_arg; unsigned int burn_depth = 0; if (encrypt) { if (ctx->prefetch_enc_fn) ctx->prefetch_enc_fn(); } else { check_decryption_preparation (ctx); if (ctx->prefetch_dec_fn) ctx->prefetch_dec_fn(); } if (0) ; #ifdef USE_AESNI else if (ctx->use_aesni) { _gcry_aes_aesni_ocb_crypt (c, outbuf, inbuf, nblocks, encrypt); burn_depth = 0; } #endif /*USE_AESNI*/ #ifdef USE_SSSE3 else if (ctx->use_ssse3) { _gcry_aes_ssse3_ocb_crypt (c, outbuf, inbuf, nblocks, encrypt); burn_depth = 0; } #endif /*USE_SSSE3*/ #ifdef USE_ARM_CE else if (ctx->use_arm_ce) { _gcry_aes_armv8_ce_ocb_crypt (c, outbuf, inbuf, nblocks, encrypt); burn_depth = 0; } #endif /*USE_ARM_CE*/ else if (encrypt) { union { unsigned char x1[16] ATTR_ALIGNED_16; u32 x32[4]; } l_tmp; rijndael_cryptfn_t encrypt_fn = ctx->encrypt_fn; for ( ;nblocks; nblocks-- ) { u64 i = ++c->u_mode.ocb.data_nblocks; const unsigned char *l = ocb_get_l(c, i); /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ buf_xor_1 (c->u_iv.iv, l, BLOCKSIZE); buf_cpy (l_tmp.x1, inbuf, BLOCKSIZE); /* Checksum_i = Checksum_{i-1} xor P_i */ buf_xor_1 (c->u_ctr.ctr, l_tmp.x1, BLOCKSIZE); /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ buf_xor_1 (l_tmp.x1, c->u_iv.iv, BLOCKSIZE); burn_depth = encrypt_fn (ctx, l_tmp.x1, l_tmp.x1); buf_xor_1 (l_tmp.x1, c->u_iv.iv, BLOCKSIZE); buf_cpy (outbuf, l_tmp.x1, BLOCKSIZE); inbuf += BLOCKSIZE; outbuf += BLOCKSIZE; } } else { union { unsigned char x1[16] ATTR_ALIGNED_16; u32 x32[4]; } l_tmp; rijndael_cryptfn_t decrypt_fn = ctx->decrypt_fn; for ( ;nblocks; nblocks-- ) { u64 i = ++c->u_mode.ocb.data_nblocks; const unsigned char *l = ocb_get_l(c, i); /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ buf_xor_1 (c->u_iv.iv, l, BLOCKSIZE); buf_cpy (l_tmp.x1, inbuf, BLOCKSIZE); /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ buf_xor_1 (l_tmp.x1, c->u_iv.iv, BLOCKSIZE); burn_depth = decrypt_fn (ctx, l_tmp.x1, l_tmp.x1); buf_xor_1 (l_tmp.x1, c->u_iv.iv, BLOCKSIZE); /* Checksum_i = Checksum_{i-1} xor P_i */ buf_xor_1 (c->u_ctr.ctr, l_tmp.x1, BLOCKSIZE); buf_cpy (outbuf, l_tmp.x1, BLOCKSIZE); inbuf += BLOCKSIZE; outbuf += BLOCKSIZE; } } if (burn_depth) _gcry_burn_stack (burn_depth + 4 * sizeof(void *)); return 0; } /* Bulk authentication of complete blocks in OCB mode. 
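
   The fallback below hashes the additional authenticated data with the
   same offset machinery.  A hedged sketch of one AAD step
   (block_encrypt_fn as in the earlier sketches; the other names are
   illustrative):

     // One OCB authentication step for AAD block number i (1-based):
     //   Offset_i = Offset_{i-1} xor L_{ntz(i)}
     //   Sum_i    = Sum_{i-1} xor E_K(A_i xor Offset_i)
     static void
     ocb_auth_step_sketch (void *key, block_encrypt_fn enc,
                           uint8_t aad_offset[16], uint8_t aad_sum[16],
                           const uint8_t l_ntz_i[16],
                           const uint8_t a_in[16])
     {
       uint8_t tmp[16];
       int i;

       for (i = 0; i < 16; i++)
         aad_offset[i] ^= l_ntz_i[i];        // advance the AAD offset
       for (i = 0; i < 16; i++)
         tmp[i] = a_in[i] ^ aad_offset[i];   // A_i xor Offset_i
       enc (key, tmp, tmp);                  // E_K(...)
       for (i = 0; i < 16; i++)
         aad_sum[i] ^= tmp[i];               // fold into the running sum
     }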
*/ size_t _gcry_aes_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks) { RIJNDAEL_context *ctx = (void *)&c->context.c; const unsigned char *abuf = abuf_arg; unsigned int burn_depth = 0; if (ctx->prefetch_enc_fn) ctx->prefetch_enc_fn(); if (0) ; #ifdef USE_AESNI else if (ctx->use_aesni) { _gcry_aes_aesni_ocb_auth (c, abuf, nblocks); burn_depth = 0; } #endif /*USE_AESNI*/ #ifdef USE_SSSE3 else if (ctx->use_ssse3) { _gcry_aes_ssse3_ocb_auth (c, abuf, nblocks); burn_depth = 0; } #endif /*USE_SSSE3*/ #ifdef USE_ARM_CE else if (ctx->use_arm_ce) { _gcry_aes_armv8_ce_ocb_auth (c, abuf, nblocks); burn_depth = 0; } #endif /*USE_ARM_CE*/ else { union { unsigned char x1[16] ATTR_ALIGNED_16; u32 x32[4]; } l_tmp; rijndael_cryptfn_t encrypt_fn = ctx->encrypt_fn; for ( ;nblocks; nblocks-- ) { u64 i = ++c->u_mode.ocb.aad_nblocks; const unsigned char *l = ocb_get_l(c, i); /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ buf_xor_1 (c->u_mode.ocb.aad_offset, l, BLOCKSIZE); /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ buf_xor (l_tmp.x1, c->u_mode.ocb.aad_offset, abuf, BLOCKSIZE); burn_depth = encrypt_fn (ctx, l_tmp.x1, l_tmp.x1); buf_xor_1 (c->u_mode.ocb.aad_sum, l_tmp.x1, BLOCKSIZE); abuf += BLOCKSIZE; } wipememory(&l_tmp, sizeof(l_tmp)); } if (burn_depth) _gcry_burn_stack (burn_depth + 4 * sizeof(void *)); return 0; } /* Bulk encryption/decryption of complete blocks in XTS mode. */ void _gcry_aes_xts_crypt (void *context, unsigned char *tweak, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int encrypt) { RIJNDAEL_context *ctx = context; unsigned char *outbuf = outbuf_arg; const unsigned char *inbuf = inbuf_arg; unsigned int burn_depth = 0; rijndael_cryptfn_t crypt_fn; u64 tweak_lo, tweak_hi, tweak_next_lo, tweak_next_hi, tmp_lo, tmp_hi, carry; if (encrypt) { if (ctx->prefetch_enc_fn) ctx->prefetch_enc_fn(); crypt_fn = ctx->encrypt_fn; } else { check_decryption_preparation (ctx); if (ctx->prefetch_dec_fn) ctx->prefetch_dec_fn(); crypt_fn = ctx->decrypt_fn; } if (0) ; #ifdef USE_AESNI else if (ctx->use_aesni) { _gcry_aes_aesni_xts_crypt (ctx, tweak, outbuf, inbuf, nblocks, encrypt); burn_depth = 0; } #endif /*USE_AESNI*/ +#ifdef USE_ARM_CE + else if (ctx->use_arm_ce) + { + _gcry_aes_armv8_ce_xts_crypt (ctx, tweak, outbuf, inbuf, nblocks, encrypt); + burn_depth = 0; + } +#endif /*USE_ARM_CE*/ else { tweak_next_lo = buf_get_le64 (tweak + 0); tweak_next_hi = buf_get_le64 (tweak + 8); while (nblocks) { tweak_lo = tweak_next_lo; tweak_hi = tweak_next_hi; /* Xor-Encrypt/Decrypt-Xor block. */ tmp_lo = buf_get_le64 (inbuf + 0) ^ tweak_lo; tmp_hi = buf_get_le64 (inbuf + 8) ^ tweak_hi; buf_put_le64 (outbuf + 0, tmp_lo); buf_put_le64 (outbuf + 8, tmp_hi); /* Generate next tweak. */ carry = -(tweak_next_hi >> 63) & 0x87; tweak_next_hi = (tweak_next_hi << 1) + (tweak_next_lo >> 63); tweak_next_lo = (tweak_next_lo << 1) ^ carry; burn_depth = crypt_fn (ctx, outbuf, outbuf); buf_put_le64 (outbuf + 0, buf_get_le64 (outbuf + 0) ^ tweak_lo); buf_put_le64 (outbuf + 8, buf_get_le64 (outbuf + 8) ^ tweak_hi); outbuf += GCRY_XTS_BLOCK_LEN; inbuf += GCRY_XTS_BLOCK_LEN; nblocks--; } buf_put_le64 (tweak + 0, tweak_next_lo); buf_put_le64 (tweak + 8, tweak_next_hi); } if (burn_depth) _gcry_burn_stack (burn_depth + 5 * sizeof(void *)); } /* Run the self-tests for AES 128. Returns NULL on success. 
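
   A note on the XTS fallback above: the next tweak is obtained by
   multiplying the current 128-bit tweak by x in GF(2^128) with the
   reduction polynomial x^128 + x^7 + x^2 + x + 1 (hence the constant
   0x87).  A hedged standalone sketch of that update, using the same
   little-endian split into two 64-bit halves (the function name is
   illustrative):

     #include <stdint.h>

     // Multiply the tweak by x ("doubling") in GF(2^128).
     // lo holds bits 0..63, hi holds bits 64..127.
     static void
     xts_tweak_double_sketch (uint64_t *lo, uint64_t *hi)
     {
       // 0x87 if bit 127 is set, 0 otherwise, computed without a
       // data-dependent branch.
       uint64_t carry = -(*hi >> 63) & 0x87;

       *hi = (*hi << 1) + (*lo >> 63);   // shift the 128-bit value left
       *lo = (*lo << 1) ^ carry;         // fold the reduction back in
     }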
*/ static const char* selftest_basic_128 (void) { RIJNDAEL_context *ctx; unsigned char *ctxmem; unsigned char scratch[16]; /* The test vectors are from the AES supplied ones; more or less randomly taken from ecb_tbl.txt (I=42,81,14) */ #if 1 static const unsigned char plaintext_128[16] = { 0x01,0x4B,0xAF,0x22,0x78,0xA6,0x9D,0x33, 0x1D,0x51,0x80,0x10,0x36,0x43,0xE9,0x9A }; static const unsigned char key_128[16] = { 0xE8,0xE9,0xEA,0xEB,0xED,0xEE,0xEF,0xF0, 0xF2,0xF3,0xF4,0xF5,0xF7,0xF8,0xF9,0xFA }; static const unsigned char ciphertext_128[16] = { 0x67,0x43,0xC3,0xD1,0x51,0x9A,0xB4,0xF2, 0xCD,0x9A,0x78,0xAB,0x09,0xA5,0x11,0xBD }; #else /* Test vectors from fips-197, appendix C. */ # warning debug test vectors in use static const unsigned char plaintext_128[16] = { 0x00,0x11,0x22,0x33,0x44,0x55,0x66,0x77, 0x88,0x99,0xaa,0xbb,0xcc,0xdd,0xee,0xff }; static const unsigned char key_128[16] = { 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, 0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f /* 0x2b, 0x7e, 0x15, 0x16, 0x28, 0xae, 0xd2, 0xa6, */ /* 0xab, 0xf7, 0x15, 0x88, 0x09, 0xcf, 0x4f, 0x3c */ }; static const unsigned char ciphertext_128[16] = { 0x69,0xc4,0xe0,0xd8,0x6a,0x7b,0x04,0x30, 0xd8,0xcd,0xb7,0x80,0x70,0xb4,0xc5,0x5a }; #endif /* Because gcc/ld can only align the CTX struct on 8 bytes on the stack, we need to allocate that context on the heap. */ ctx = _gcry_cipher_selftest_alloc_ctx (sizeof *ctx, &ctxmem); if (!ctx) return "failed to allocate memory"; rijndael_setkey (ctx, key_128, sizeof (key_128)); rijndael_encrypt (ctx, scratch, plaintext_128); if (memcmp (scratch, ciphertext_128, sizeof (ciphertext_128))) { xfree (ctxmem); return "AES-128 test encryption failed."; } rijndael_decrypt (ctx, scratch, scratch); xfree (ctxmem); if (memcmp (scratch, plaintext_128, sizeof (plaintext_128))) return "AES-128 test decryption failed."; return NULL; } /* Run the self-tests for AES 192. Returns NULL on success. */ static const char* selftest_basic_192 (void) { RIJNDAEL_context *ctx; unsigned char *ctxmem; unsigned char scratch[16]; static unsigned char plaintext_192[16] = { 0x76,0x77,0x74,0x75,0xF1,0xF2,0xF3,0xF4, 0xF8,0xF9,0xE6,0xE7,0x77,0x70,0x71,0x72 }; static unsigned char key_192[24] = { 0x04,0x05,0x06,0x07,0x09,0x0A,0x0B,0x0C, 0x0E,0x0F,0x10,0x11,0x13,0x14,0x15,0x16, 0x18,0x19,0x1A,0x1B,0x1D,0x1E,0x1F,0x20 }; static const unsigned char ciphertext_192[16] = { 0x5D,0x1E,0xF2,0x0D,0xCE,0xD6,0xBC,0xBC, 0x12,0x13,0x1A,0xC7,0xC5,0x47,0x88,0xAA }; ctx = _gcry_cipher_selftest_alloc_ctx (sizeof *ctx, &ctxmem); if (!ctx) return "failed to allocate memory"; rijndael_setkey (ctx, key_192, sizeof(key_192)); rijndael_encrypt (ctx, scratch, plaintext_192); if (memcmp (scratch, ciphertext_192, sizeof (ciphertext_192))) { xfree (ctxmem); return "AES-192 test encryption failed."; } rijndael_decrypt (ctx, scratch, scratch); xfree (ctxmem); if (memcmp (scratch, plaintext_192, sizeof (plaintext_192))) return "AES-192 test decryption failed."; return NULL; } /* Run the self-tests for AES 256. Returns NULL on success. 
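
   The basic selftests above and below all follow the same pattern:
   allocate an aligned context on the heap, set the key, encrypt a
   known plaintext, compare, then decrypt in place and compare again.
   For reference, a hedged sketch of the same kind of known-answer
   check done through the public API, reusing the FIPS-197 appendix
   C.1 vector quoted in the disabled #else branch of
   selftest_basic_128 (standalone program, not part of this file):

     #include <string.h>
     #include <gcrypt.h>

     static const unsigned char key[16] =
       { 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
         0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f };
     static const unsigned char plain[16] =
       { 0x00,0x11,0x22,0x33,0x44,0x55,0x66,0x77,
         0x88,0x99,0xaa,0xbb,0xcc,0xdd,0xee,0xff };
     static const unsigned char expect[16] =
       { 0x69,0xc4,0xe0,0xd8,0x6a,0x7b,0x04,0x30,
         0xd8,0xcd,0xb7,0x80,0x70,0xb4,0xc5,0x5a };

     int
     main (void)
     {
       gcry_cipher_hd_t hd;
       unsigned char out[16];

       gcry_check_version (NULL);
       if (gcry_cipher_open (&hd, GCRY_CIPHER_AES128,
                             GCRY_CIPHER_MODE_ECB, 0)
           || gcry_cipher_setkey (hd, key, sizeof key)
           || gcry_cipher_encrypt (hd, out, sizeof out,
                                   plain, sizeof plain))
         return 1;
       gcry_cipher_close (hd);
       return memcmp (out, expect, 16) != 0;   // 0 on success
     }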
*/ static const char* selftest_basic_256 (void) { RIJNDAEL_context *ctx; unsigned char *ctxmem; unsigned char scratch[16]; static unsigned char plaintext_256[16] = { 0x06,0x9A,0x00,0x7F,0xC7,0x6A,0x45,0x9F, 0x98,0xBA,0xF9,0x17,0xFE,0xDF,0x95,0x21 }; static unsigned char key_256[32] = { 0x08,0x09,0x0A,0x0B,0x0D,0x0E,0x0F,0x10, 0x12,0x13,0x14,0x15,0x17,0x18,0x19,0x1A, 0x1C,0x1D,0x1E,0x1F,0x21,0x22,0x23,0x24, 0x26,0x27,0x28,0x29,0x2B,0x2C,0x2D,0x2E }; static const unsigned char ciphertext_256[16] = { 0x08,0x0E,0x95,0x17,0xEB,0x16,0x77,0x71, 0x9A,0xCF,0x72,0x80,0x86,0x04,0x0A,0xE3 }; ctx = _gcry_cipher_selftest_alloc_ctx (sizeof *ctx, &ctxmem); if (!ctx) return "failed to allocate memory"; rijndael_setkey (ctx, key_256, sizeof(key_256)); rijndael_encrypt (ctx, scratch, plaintext_256); if (memcmp (scratch, ciphertext_256, sizeof (ciphertext_256))) { xfree (ctxmem); return "AES-256 test encryption failed."; } rijndael_decrypt (ctx, scratch, scratch); xfree (ctxmem); if (memcmp (scratch, plaintext_256, sizeof (plaintext_256))) return "AES-256 test decryption failed."; return NULL; } /* Run the self-tests for AES-CTR-128, tests IV increment of bulk CTR encryption. Returns NULL on success. */ static const char* selftest_ctr_128 (void) { const int nblocks = 8+1; const int blocksize = BLOCKSIZE; const int context_size = sizeof(RIJNDAEL_context); return _gcry_selftest_helper_ctr("AES", &rijndael_setkey, &rijndael_encrypt, &_gcry_aes_ctr_enc, nblocks, blocksize, context_size); } /* Run the self-tests for AES-CBC-128, tests bulk CBC decryption. Returns NULL on success. */ static const char* selftest_cbc_128 (void) { const int nblocks = 8+2; const int blocksize = BLOCKSIZE; const int context_size = sizeof(RIJNDAEL_context); return _gcry_selftest_helper_cbc("AES", &rijndael_setkey, &rijndael_encrypt, &_gcry_aes_cbc_dec, nblocks, blocksize, context_size); } /* Run the self-tests for AES-CFB-128, tests bulk CFB decryption. Returns NULL on success. */ static const char* selftest_cfb_128 (void) { const int nblocks = 8+2; const int blocksize = BLOCKSIZE; const int context_size = sizeof(RIJNDAEL_context); return _gcry_selftest_helper_cfb("AES", &rijndael_setkey, &rijndael_encrypt, &_gcry_aes_cfb_dec, nblocks, blocksize, context_size); } /* Run all the self-tests and return NULL on success. This function is used for the on-the-fly self-tests. */ static const char * selftest (void) { const char *r; if ( (r = selftest_basic_128 ()) || (r = selftest_basic_192 ()) || (r = selftest_basic_256 ()) ) return r; if ( (r = selftest_ctr_128 ()) ) return r; if ( (r = selftest_cbc_128 ()) ) return r; if ( (r = selftest_cfb_128 ()) ) return r; return r; } /* SP800-38a.pdf for AES-128. 
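
   Both modes exercised by these vectors are built purely on the AES
   block encryption, which is why the bulk selftest helpers above only
   need rijndael_encrypt.  A hedged sketch of one full 16-byte segment
   of each (block_encrypt_fn as in the earlier sketches; names are
   illustrative):

     // CFB128 encryption of one block: O = E_K(IV); C = P xor O;
     // the next IV is C.  OFB: same keystream, but the next IV is O,
     // so OFB encryption and decryption are identical.
     static void
     cfb_ofb_segment_sketch (void *key, block_encrypt_fn enc, int ofb,
                             uint8_t iv[16], uint8_t out[16],
                             const uint8_t in[16])
     {
       uint8_t o[16];
       int i;

       enc (key, o, iv);                 // keystream block
       for (i = 0; i < 16; i++)
         out[i] = in[i] ^ o[i];
       memcpy (iv, ofb ? o : out, 16);   // chain for the next segment
     }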
*/ static const char * selftest_fips_128_38a (int requested_mode) { static const struct tv { int mode; const unsigned char key[16]; const unsigned char iv[16]; struct { const unsigned char input[16]; const unsigned char output[16]; } data[4]; } tv[2] = { { GCRY_CIPHER_MODE_CFB, /* F.3.13, CFB128-AES128 */ { 0x2b, 0x7e, 0x15, 0x16, 0x28, 0xae, 0xd2, 0xa6, 0xab, 0xf7, 0x15, 0x88, 0x09, 0xcf, 0x4f, 0x3c }, { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f }, { { { 0x6b, 0xc1, 0xbe, 0xe2, 0x2e, 0x40, 0x9f, 0x96, 0xe9, 0x3d, 0x7e, 0x11, 0x73, 0x93, 0x17, 0x2a }, { 0x3b, 0x3f, 0xd9, 0x2e, 0xb7, 0x2d, 0xad, 0x20, 0x33, 0x34, 0x49, 0xf8, 0xe8, 0x3c, 0xfb, 0x4a } }, { { 0xae, 0x2d, 0x8a, 0x57, 0x1e, 0x03, 0xac, 0x9c, 0x9e, 0xb7, 0x6f, 0xac, 0x45, 0xaf, 0x8e, 0x51 }, { 0xc8, 0xa6, 0x45, 0x37, 0xa0, 0xb3, 0xa9, 0x3f, 0xcd, 0xe3, 0xcd, 0xad, 0x9f, 0x1c, 0xe5, 0x8b } }, { { 0x30, 0xc8, 0x1c, 0x46, 0xa3, 0x5c, 0xe4, 0x11, 0xe5, 0xfb, 0xc1, 0x19, 0x1a, 0x0a, 0x52, 0xef }, { 0x26, 0x75, 0x1f, 0x67, 0xa3, 0xcb, 0xb1, 0x40, 0xb1, 0x80, 0x8c, 0xf1, 0x87, 0xa4, 0xf4, 0xdf } }, { { 0xf6, 0x9f, 0x24, 0x45, 0xdf, 0x4f, 0x9b, 0x17, 0xad, 0x2b, 0x41, 0x7b, 0xe6, 0x6c, 0x37, 0x10 }, { 0xc0, 0x4b, 0x05, 0x35, 0x7c, 0x5d, 0x1c, 0x0e, 0xea, 0xc4, 0xc6, 0x6f, 0x9f, 0xf7, 0xf2, 0xe6 } } } }, { GCRY_CIPHER_MODE_OFB, { 0x2b, 0x7e, 0x15, 0x16, 0x28, 0xae, 0xd2, 0xa6, 0xab, 0xf7, 0x15, 0x88, 0x09, 0xcf, 0x4f, 0x3c }, { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f }, { { { 0x6b, 0xc1, 0xbe, 0xe2, 0x2e, 0x40, 0x9f, 0x96, 0xe9, 0x3d, 0x7e, 0x11, 0x73, 0x93, 0x17, 0x2a }, { 0x3b, 0x3f, 0xd9, 0x2e, 0xb7, 0x2d, 0xad, 0x20, 0x33, 0x34, 0x49, 0xf8, 0xe8, 0x3c, 0xfb, 0x4a } }, { { 0xae, 0x2d, 0x8a, 0x57, 0x1e, 0x03, 0xac, 0x9c, 0x9e, 0xb7, 0x6f, 0xac, 0x45, 0xaf, 0x8e, 0x51 }, { 0x77, 0x89, 0x50, 0x8d, 0x16, 0x91, 0x8f, 0x03, 0xf5, 0x3c, 0x52, 0xda, 0xc5, 0x4e, 0xd8, 0x25 } }, { { 0x30, 0xc8, 0x1c, 0x46, 0xa3, 0x5c, 0xe4, 0x11, 0xe5, 0xfb, 0xc1, 0x19, 0x1a, 0x0a, 0x52, 0xef }, { 0x97, 0x40, 0x05, 0x1e, 0x9c, 0x5f, 0xec, 0xf6, 0x43, 0x44, 0xf7, 0xa8, 0x22, 0x60, 0xed, 0xcc } }, { { 0xf6, 0x9f, 0x24, 0x45, 0xdf, 0x4f, 0x9b, 0x17, 0xad, 0x2b, 0x41, 0x7b, 0xe6, 0x6c, 0x37, 0x10 }, { 0x30, 0x4c, 0x65, 0x28, 0xf6, 0x59, 0xc7, 0x78, 0x66, 0xa5, 0x10, 0xd9, 0xc1, 0xd6, 0xae, 0x5e } }, } } }; unsigned char scratch[16]; gpg_error_t err; int tvi, idx; gcry_cipher_hd_t hdenc = NULL; gcry_cipher_hd_t hddec = NULL; #define Fail(a) do { \ _gcry_cipher_close (hdenc); \ _gcry_cipher_close (hddec); \ return a; \ } while (0) gcry_assert (sizeof tv[0].data[0].input == sizeof scratch); gcry_assert (sizeof tv[0].data[0].output == sizeof scratch); for (tvi=0; tvi < DIM (tv); tvi++) if (tv[tvi].mode == requested_mode) break; if (tvi == DIM (tv)) Fail ("no test data for this mode"); err = _gcry_cipher_open (&hdenc, GCRY_CIPHER_AES, tv[tvi].mode, 0); if (err) Fail ("open"); err = _gcry_cipher_open (&hddec, GCRY_CIPHER_AES, tv[tvi].mode, 0); if (err) Fail ("open"); err = _gcry_cipher_setkey (hdenc, tv[tvi].key, sizeof tv[tvi].key); if (!err) err = _gcry_cipher_setkey (hddec, tv[tvi].key, sizeof tv[tvi].key); if (err) Fail ("set key"); err = _gcry_cipher_setiv (hdenc, tv[tvi].iv, sizeof tv[tvi].iv); if (!err) err = _gcry_cipher_setiv (hddec, tv[tvi].iv, sizeof tv[tvi].iv); if (err) Fail ("set IV"); for (idx=0; idx < DIM (tv[tvi].data); idx++) { err = _gcry_cipher_encrypt (hdenc, scratch, sizeof scratch, tv[tvi].data[idx].input, sizeof tv[tvi].data[idx].input); if 
(err) Fail ("encrypt command"); if (memcmp (scratch, tv[tvi].data[idx].output, sizeof scratch)) Fail ("encrypt mismatch"); err = _gcry_cipher_decrypt (hddec, scratch, sizeof scratch, tv[tvi].data[idx].output, sizeof tv[tvi].data[idx].output); if (err) Fail ("decrypt command"); if (memcmp (scratch, tv[tvi].data[idx].input, sizeof scratch)) Fail ("decrypt mismatch"); } #undef Fail _gcry_cipher_close (hdenc); _gcry_cipher_close (hddec); return NULL; } /* Complete selftest for AES-128 with all modes and driver code. */ static gpg_err_code_t selftest_fips_128 (int extended, selftest_report_func_t report) { const char *what; const char *errtxt; what = "low-level"; errtxt = selftest_basic_128 (); if (errtxt) goto failed; if (extended) { what = "cfb"; errtxt = selftest_fips_128_38a (GCRY_CIPHER_MODE_CFB); if (errtxt) goto failed; what = "ofb"; errtxt = selftest_fips_128_38a (GCRY_CIPHER_MODE_OFB); if (errtxt) goto failed; } return 0; /* Succeeded. */ failed: if (report) report ("cipher", GCRY_CIPHER_AES128, what, errtxt); return GPG_ERR_SELFTEST_FAILED; } /* Complete selftest for AES-192. */ static gpg_err_code_t selftest_fips_192 (int extended, selftest_report_func_t report) { const char *what; const char *errtxt; (void)extended; /* No extended tests available. */ what = "low-level"; errtxt = selftest_basic_192 (); if (errtxt) goto failed; return 0; /* Succeeded. */ failed: if (report) report ("cipher", GCRY_CIPHER_AES192, what, errtxt); return GPG_ERR_SELFTEST_FAILED; } /* Complete selftest for AES-256. */ static gpg_err_code_t selftest_fips_256 (int extended, selftest_report_func_t report) { const char *what; const char *errtxt; (void)extended; /* No extended tests available. */ what = "low-level"; errtxt = selftest_basic_256 (); if (errtxt) goto failed; return 0; /* Succeeded. */ failed: if (report) report ("cipher", GCRY_CIPHER_AES256, what, errtxt); return GPG_ERR_SELFTEST_FAILED; } /* Run a full self-test for ALGO and return 0 on success. 
*/
static gpg_err_code_t
run_selftests (int algo, int extended, selftest_report_func_t report)
{
  gpg_err_code_t ec;

  switch (algo)
    {
    case GCRY_CIPHER_AES128:
      ec = selftest_fips_128 (extended, report);
      break;
    case GCRY_CIPHER_AES192:
      ec = selftest_fips_192 (extended, report);
      break;
    case GCRY_CIPHER_AES256:
      ec = selftest_fips_256 (extended, report);
      break;
    default:
      ec = GPG_ERR_CIPHER_ALGO;
      break;
    }
  return ec;
}


static const char *rijndael_names[] =
  {
    "RIJNDAEL",
    "AES128",
    "AES-128",
    NULL
  };

static gcry_cipher_oid_spec_t rijndael_oids[] =
  {
    { "2.16.840.1.101.3.4.1.1", GCRY_CIPHER_MODE_ECB },
    { "2.16.840.1.101.3.4.1.2", GCRY_CIPHER_MODE_CBC },
    { "2.16.840.1.101.3.4.1.3", GCRY_CIPHER_MODE_OFB },
    { "2.16.840.1.101.3.4.1.4", GCRY_CIPHER_MODE_CFB },
    { NULL }
  };

gcry_cipher_spec_t _gcry_cipher_spec_aes =
  {
    GCRY_CIPHER_AES, {0, 1}, "AES", rijndael_names, rijndael_oids,
    16, 128, sizeof (RIJNDAEL_context),
    rijndael_setkey, rijndael_encrypt, rijndael_decrypt,
    NULL, NULL,
    run_selftests
  };


static const char *rijndael192_names[] =
  {
    "RIJNDAEL192",
    "AES-192",
    NULL
  };

static gcry_cipher_oid_spec_t rijndael192_oids[] =
  {
    { "2.16.840.1.101.3.4.1.21", GCRY_CIPHER_MODE_ECB },
    { "2.16.840.1.101.3.4.1.22", GCRY_CIPHER_MODE_CBC },
    { "2.16.840.1.101.3.4.1.23", GCRY_CIPHER_MODE_OFB },
    { "2.16.840.1.101.3.4.1.24", GCRY_CIPHER_MODE_CFB },
    { NULL }
  };

gcry_cipher_spec_t _gcry_cipher_spec_aes192 =
  {
    GCRY_CIPHER_AES192, {0, 1}, "AES192", rijndael192_names, rijndael192_oids,
    16, 192, sizeof (RIJNDAEL_context),
    rijndael_setkey, rijndael_encrypt, rijndael_decrypt,
    NULL, NULL,
    run_selftests
  };


static const char *rijndael256_names[] =
  {
    "RIJNDAEL256",
    "AES-256",
    NULL
  };

static gcry_cipher_oid_spec_t rijndael256_oids[] =
  {
    { "2.16.840.1.101.3.4.1.41", GCRY_CIPHER_MODE_ECB },
    { "2.16.840.1.101.3.4.1.42", GCRY_CIPHER_MODE_CBC },
    { "2.16.840.1.101.3.4.1.43", GCRY_CIPHER_MODE_OFB },
    { "2.16.840.1.101.3.4.1.44", GCRY_CIPHER_MODE_CFB },
    { NULL }
  };

gcry_cipher_spec_t _gcry_cipher_spec_aes256 =
  {
    GCRY_CIPHER_AES256, {0, 1}, "AES256", rijndael256_names, rijndael256_oids,
    16, 256, sizeof (RIJNDAEL_context),
    rijndael_setkey, rijndael_encrypt, rijndael_decrypt,
    NULL, NULL,
    run_selftests
  };
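
/* The name and OID tables above are what the public lookup helpers
   search.  A hedged usage sketch (standalone program; it assumes the
   documented gcry_cipher_map_name, gcry_cipher_algo_name,
   gcry_cipher_get_algo_keylen, gcry_cipher_get_algo_blklen and
   gcry_cipher_mode_from_oid interfaces):

     #include <stdio.h>
     #include <gcrypt.h>

     int
     main (void)
     {
       int algo;

       gcry_check_version (NULL);

       // "RIJNDAEL192" and "AES-192" are aliases registered above.
       algo = gcry_cipher_map_name ("RIJNDAEL192");
       printf ("%s: keylen=%u blklen=%u\n",
               gcry_cipher_algo_name (algo),
               (unsigned int) gcry_cipher_get_algo_keylen (algo),
               (unsigned int) gcry_cipher_get_algo_blklen (algo));

       // 2.16.840.1.101.3.4.1.42 is registered above as AES-256 in CBC.
       printf ("mode=%d\n",
               gcry_cipher_mode_from_oid ("2.16.840.1.101.3.4.1.42"));
       return 0;
     }
 */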