diff --git a/cipher/cast5-amd64.S b/cipher/cast5-amd64.S
index 608fb64e..c04015a2 100644
--- a/cipher/cast5-amd64.S
+++ b/cipher/cast5-amd64.S
@@ -1,592 +1,605 @@
 /* cast5-amd64.S  -  AMD64 assembly implementation of CAST5 cipher
  *
  * Copyright (C) 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
  *
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
  * Libgcrypt is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
  * License along with this program; if not, see <http://www.gnu.org/licenses/>.
  */
 
 #ifdef __x86_64
 #include <config.h>
 #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && defined(USE_CAST5)
 
 #if defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS) || !defined(__PIC__)
 #  define GET_EXTERN_POINTER(name, reg) movabsq $name, reg
 #else
-#  define GET_EXTERN_POINTER(name, reg) movq name@GOTPCREL(%rip), reg
+#  ifdef __code_model_large__
+#    define GET_EXTERN_POINTER(name, reg) \
+        pushq %r15; \
+        pushq %r14; \
+     1: leaq 1b(%rip), reg; \
+        movabsq $_GLOBAL_OFFSET_TABLE_-1b, %r14; \
+        movabsq $name@GOT, %r15; \
+        addq %r14, reg; \
+        popq %r14; \
+        movq (reg, %r15), reg; \
+        popq %r15;
+#  else
+#    define GET_EXTERN_POINTER(name, reg) movq name@GOTPCREL(%rip), reg
+#  endif
 #endif
 
 #ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS
 # define ELF(...) __VA_ARGS__
 #else
 # define ELF(...) /*_*/
 #endif
 
 .text
 
 .extern _gcry_cast5_s1to4;
 
 #define s1 0
 #define s2 (s1 + (4 * 256))
 #define s3 (s2 + (4 * 256))
 #define s4 (s3 + (4 * 256))
 
 /* structure of CAST5_context: */
 #define Km 0
 #define Kr (Km + (16 * 4))
 
 /* register macros */
 #define CTX %rdi
 #define RIO %rsi
 #define RTAB %r8
 
 #define RLR0 %r9
 #define RLR1 %r10
 #define RLR2 %r11
 #define RLR3 %r12
 
 #define RLR0d %r9d
 #define RLR1d %r10d
 #define RLR2d %r11d
 #define RLR3d %r12d
 
 #define RX0 %rax
 #define RX1 %rbx
 #define RX2 %rdx
 
 #define RX0d %eax
 #define RX1d %ebx
 #define RX2d %edx
 
 #define RX0bl %al
 #define RX1bl %bl
 #define RX2bl %dl
 
 #define RX0bh %ah
 #define RX1bh %bh
 #define RX2bh %dh
 
 #define RKR %rcx
 #define RKRd %ecx
 #define RKRbl %cl
 
 #define RT0 %rbp
 #define RT1 %rsi
 
 #define RT0d %ebp
 #define RT1d %esi
 
 #define RKM0d %r13d
 #define RKM1d %r14d
 
 /***********************************************************************
  * 1-way cast5
  ***********************************************************************/
 #define dummy(x)
 
 #define shr_kr(none) \
         shrq $8, RKR;
 
 #define F(km, load_next_kr, op0, op1, op2, op3) \
         op0 ## l RLR0d, km ## d; \
         roll RKRbl, km ## d; \
         rorq $32, RLR0; \
         movzbl km ## bh, RT0d; \
         movzbl km ## bl, RT1d; \
         roll $16, km ## d; \
         movl s1(RTAB,RT0,4), RT0d; \
         op1 ## l s2(RTAB,RT1,4), RT0d; \
         load_next_kr(kr_next); \
         movzbl km ## bh, RT1d; \
         movzbl km ## bl, km ## d; \
         op2 ## l s3(RTAB,RT1,4), RT0d; \
         op3 ## l s4(RTAB,km,4), RT0d; \
         xorq RT0, RLR0;
 
 #define F1(km, load_next_kr) \
         F(##km, load_next_kr, add, xor, sub, add)
 #define F2(km, load_next_kr) \
         F(##km, load_next_kr, xor, sub, add, xor)
 #define F3(km, load_next_kr) \
         F(##km, load_next_kr, sub, add, xor, sub)
 
 #define get_round_km(n, km) \
         movl Km+4*(n)(CTX), km;
 
 #define get_round_kr_enc(n) \
         movq $0x1010101010101010, RKR; \
         \
         /* merge rorl rk and rorl $16 */ \
         xorq Kr+(n)(CTX), RKR;
 
 #define get_round_kr_dec(n) \
         movq $0x1010101010101010, RKR; \
         \
         /* merge rorl rk and rorl $16 */ \
         xorq Kr+(n - 7)(CTX), RKR; \
         bswapq RKR;
 
 #define round_enc(n, FA, FB, fn1, fn2) \
         get_round_km(n + 1, RX2d); \
         FA(RX0, fn1); \
         get_round_km(n + 2, RX0d); \
         FB(RX2, fn2);
 
 #define round_enc_last(n, FXA, FXB) \
         get_round_km(n + 1, RX2d); \
         \
         FXA(RX0, shr_kr); \
         FXB(RX2, dummy);
 
 #define round_enc_1(n, FA, FB) \
         round_enc(n, FA, FB, shr_kr, shr_kr)
 
 #define round_enc_2(n, FA, FB) \
         round_enc(n, FA, FB, shr_kr, dummy)
 
 #define round_dec(n, FA, FB, fn1, fn2) \
         get_round_km(n - 1, RX2d); \
         FA(RX0, fn1); \
         get_round_km(n - 2, RX0d); \
         FB(RX2, fn2);
 
 #define round_dec_last(n, FXA, FXB) \
         get_round_km(n - 1, RX2d); \
         FXA(RX0, shr_kr); \
         FXB(RX2, dummy);
 
 #define round_dec_1(n, FA, FB) \
         round_dec(n, FA, FB, shr_kr, shr_kr)
 
 #define round_dec_2(n, FA, FB) \
         round_dec(n, FA, FB, shr_kr, dummy)
 
 #define read_block() \
         movq (RIO), RLR0; \
         bswapq RLR0;
 
 #define write_block() \
         bswapq RLR0; \
         rorq $32, RLR0; \
         movq RLR0, (RIO);
 
 .align 8
 .globl _gcry_cast5_amd64_encrypt_block
 ELF(.type _gcry_cast5_amd64_encrypt_block,@function;)
 
 _gcry_cast5_amd64_encrypt_block:
         /* input:
          *      %rdi: ctx, CTX
          *      %rsi: dst
          *      %rdx: src
          */
         pushq %rbp;
         pushq %rbx;
 
         movq %rsi, %r10;
 
         GET_EXTERN_POINTER(_gcry_cast5_s1to4, RTAB);
         movq %rdx, RIO;
         read_block();
 
         get_round_km(0, RX0d);
         get_round_kr_enc(0);
         round_enc_1(0, F1, F2);
         round_enc_1(2, F3, F1);
         round_enc_1(4, F2, F3);
         round_enc_2(6, F1, F2);
         get_round_kr_enc(8);
         round_enc_1(8, F3, F1);
         round_enc_1(10, F2, F3);
         round_enc_1(12, F1, F2);
         round_enc_last(14, F3, F1);
 
         movq %r10, RIO;
         write_block();
 
         popq %rbx;
         popq %rbp;
         ret;
 ELF(.size _gcry_cast5_amd64_encrypt_block,.-_gcry_cast5_amd64_encrypt_block;)
 
 .align 8
 .globl _gcry_cast5_amd64_decrypt_block
 ELF(.type _gcry_cast5_amd64_decrypt_block,@function;)
 
 _gcry_cast5_amd64_decrypt_block:
         /* input:
          *      %rdi: ctx, CTX
          *      %rsi: dst
          *      %rdx: src
          */
         pushq %rbp;
         pushq %rbx;
 
         movq %rsi, %r10;
 
         GET_EXTERN_POINTER(_gcry_cast5_s1to4, RTAB);
         movq %rdx, RIO;
         read_block();
 
         get_round_km(15, RX0d);
         get_round_kr_dec(15);
         round_dec_1(15, F1, F3);
         round_dec_1(13, F2, F1);
         round_dec_1(11, F3, F2);
         round_dec_2(9, F1, F3);
         get_round_kr_dec(7);
         round_dec_1(7, F2, F1);
         round_dec_1(5, F3, F2);
         round_dec_1(3, F1, F3);
         round_dec_last(1, F2, F1);
 
         movq %r10, RIO;
         write_block();
 
         popq %rbx;
         popq %rbp;
         ret;
 ELF(.size _gcry_cast5_amd64_decrypt_block,.-_gcry_cast5_amd64_decrypt_block;)
 
 /**********************************************************************
   4-way cast5, four blocks parallel
  **********************************************************************/
 #define F_tail(rlr, rx, op1, op2, op3) \
         movzbl rx ## bh, RT0d; \
         movzbl rx ## bl, RT1d; \
         roll $16, rx ## d; \
         movl s1(RTAB,RT0,4), RT0d; \
         op1 ## l s2(RTAB,RT1,4), RT0d; \
         movzbl rx ## bh, RT1d; \
         movzbl rx ## bl, rx ## d; \
         op2 ## l s3(RTAB,RT1,4), RT0d; \
         op3 ## l s4(RTAB,rx,4), RT0d; \
         xorq RT0, rlr;
 
 #define F4(km, load_next_kr, op0, op1, op2, op3) \
         movl km, RX0d; \
         op0 ## l RLR0d, RX0d; \
         roll RKRbl, RX0d; \
         rorq $32, RLR0; \
         \
         movl km, RX1d; \
         op0 ## l RLR1d, RX1d; \
         roll RKRbl, RX1d; \
         rorq $32, RLR1; \
         \
         movl km, RX2d; \
         op0 ## l RLR2d, RX2d; \
         roll RKRbl, RX2d; \
         rorq $32, RLR2; \
         \
         F_tail(RLR0, RX0, op1, op2, op3); \
         F_tail(RLR1, RX1, op1, op2, op3); \
         F_tail(RLR2, RX2, op1, op2, op3); \
         \
         movl km, RX0d; \
         op0 ## l RLR3d, RX0d; \
         roll RKRbl, RX0d; \
         load_next_kr(); \
         rorq $32, RLR3; \
         \
         F_tail(RLR3, RX0, op1, op2, op3);
 
 #define F4_1(km, load_next_kr) \
         F4(km, load_next_kr, add, xor, sub, add)
 #define F4_2(km, load_next_kr) \
         F4(km, load_next_kr, xor, sub, add, xor)
 #define F4_3(km, load_next_kr) \
         F4(km, load_next_kr, sub, add, xor, sub)
 
 #define round_enc4(n, FA, FB, fn1, fn2) \
         get_round_km(n + 1, RKM1d); \
         FA(RKM0d, fn1); \
         get_round_km(n + 2, RKM0d); \
         FB(RKM1d, fn2);
 
 #define round_enc_last4(n, FXA, FXB) \
         get_round_km(n + 1, RKM1d); \
         FXA(RKM0d, shr_kr); \
         FXB(RKM1d, dummy);
 
 #define round_enc4_1(n, FA, FB) \
         round_enc4(n, FA, FB, shr_kr, shr_kr);
 
 #define round_enc4_2(n, FA, FB) \
         round_enc4(n, FA, FB, shr_kr, dummy);
 
 #define round_dec4(n, FA, FB, fn1, fn2) \
         get_round_km(n - 1, RKM1d); \
         FA(RKM0d, fn1); \
         get_round_km(n - 2, RKM0d); \
         FB(RKM1d, fn2);
 
 #define round_dec_last4(n, FXA, FXB) \
         get_round_km(n - 1, RKM1d); \
         FXA(RKM0d, shr_kr); \
         FXB(RKM1d, dummy);
 
 #define round_dec4_1(n, FA, FB) \
         round_dec4(n, FA, FB, shr_kr, shr_kr);
 
 #define round_dec4_2(n, FA, FB) \
         round_dec4(n, FA, FB, shr_kr, dummy);
 
 #define inbswap_block4(a, b, c, d) \
         bswapq a; \
         bswapq b; \
         bswapq c; \
         bswapq d;
 
 #define outbswap_block4(a, b, c, d) \
         bswapq a; \
         bswapq b; \
         bswapq c; \
         bswapq d; \
         rorq $32, a; \
         rorq $32, b; \
         rorq $32, c; \
         rorq $32, d;
 
 .align 8
 ELF(.type __cast5_enc_blk4,@function;)
 
 __cast5_enc_blk4:
         /* input:
          *      %rdi: ctx, CTX
          *      RLR0,RLR1,RLR2,RLR3: four input plaintext blocks
          * output:
          *      RLR0,RLR1,RLR2,RLR3: four output ciphertext blocks
          */
         GET_EXTERN_POINTER(_gcry_cast5_s1to4, RTAB);
 
         get_round_km(0, RKM0d);
         get_round_kr_enc(0);
         round_enc4_1(0, F4_1, F4_2);
         round_enc4_1(2, F4_3, F4_1);
         round_enc4_1(4, F4_2, F4_3);
         round_enc4_2(6, F4_1, F4_2);
         get_round_kr_enc(8);
         round_enc4_1(8, F4_3, F4_1);
         round_enc4_1(10, F4_2, F4_3);
         round_enc4_1(12, F4_1, F4_2);
         round_enc_last4(14, F4_3, F4_1);
 
         outbswap_block4(RLR0, RLR1, RLR2, RLR3);
         ret;
 ELF(.size __cast5_enc_blk4,.-__cast5_enc_blk4;)
 
 .align 8
 ELF(.type __cast5_dec_blk4,@function;)
 
 __cast5_dec_blk4:
         /* input:
          *      %rdi: ctx, CTX
          *      RLR0,RLR1,RLR2,RLR3: four input ciphertext blocks
          * output:
          *      RLR0,RLR1,RLR2,RLR3: four output plaintext blocks
          */
         GET_EXTERN_POINTER(_gcry_cast5_s1to4, RTAB);
 
         inbswap_block4(RLR0, RLR1, RLR2, RLR3);
 
         get_round_km(15, RKM0d);
         get_round_kr_dec(15);
         round_dec4_1(15, F4_1, F4_3);
         round_dec4_1(13, F4_2, F4_1);
         round_dec4_1(11, F4_3, F4_2);
         round_dec4_2(9, F4_1, F4_3);
         get_round_kr_dec(7);
         round_dec4_1(7, F4_2, F4_1);
         round_dec4_1(5, F4_3, F4_2);
         round_dec4_1(3, F4_1, F4_3);
         round_dec_last4(1, F4_2, F4_1);
 
         outbswap_block4(RLR0, RLR1, RLR2, RLR3);
         ret;
 ELF(.size __cast5_dec_blk4,.-__cast5_dec_blk4;)
 
 .align 8
 .globl _gcry_cast5_amd64_ctr_enc
 ELF(.type _gcry_cast5_amd64_ctr_enc,@function;)
 _gcry_cast5_amd64_ctr_enc:
         /* input:
          *      %rdi: ctx, CTX
          *      %rsi: dst (4 blocks)
          *      %rdx: src (4 blocks)
          *      %rcx: iv (big endian, 64bit)
          */
 
         pushq %rbp;
         pushq %rbx;
         pushq %r12;
         pushq %r13;
         pushq %r14;
 
         pushq %rsi;
         pushq %rdx;
 
         /* load IV and byteswap */
         movq (%rcx), RX0;
         bswapq RX0;
         movq RX0, RLR0;
 
         /* construct IVs */
         leaq 1(RX0), RLR1;
         leaq 2(RX0), RLR2;
         leaq 3(RX0), RLR3;
         leaq 4(RX0), RX0;
         bswapq RX0;
 
         /* store new IV */
         movq RX0, (%rcx);
 
         call __cast5_enc_blk4;
 
         popq %r14; /*src*/
         popq %r13; /*dst*/
 
         /* XOR key-stream with plaintext */
         xorq 0 * 8(%r14), RLR0;
         xorq 1 * 8(%r14), RLR1;
         xorq 2 * 8(%r14), RLR2;
         xorq 3 * 8(%r14), RLR3;
         movq RLR0, 0 * 8(%r13);
         movq RLR1, 1 * 8(%r13);
         movq RLR2, 2 * 8(%r13);
         movq RLR3, 3 * 8(%r13);
 
         popq %r14;
         popq %r13;
         popq %r12;
         popq %rbx;
         popq %rbp;
         ret
 ELF(.size _gcry_cast5_amd64_ctr_enc,.-_gcry_cast5_amd64_ctr_enc;)
 
 .align 8
 .globl _gcry_cast5_amd64_cbc_dec
 ELF(.type _gcry_cast5_amd64_cbc_dec,@function;)
 _gcry_cast5_amd64_cbc_dec:
         /* input:
          *      %rdi: ctx, CTX
          *      %rsi: dst (4 blocks)
          *      %rdx: src (4 blocks)
          *      %rcx: iv (64bit)
          */
 
         pushq %rbp;
         pushq %rbx;
         pushq %r12;
         pushq %r13;
         pushq %r14;
 
         pushq %rcx;
         pushq %rsi;
         pushq %rdx;
 
         /* load input */
         movq 0 * 8(%rdx), RLR0;
         movq 1 * 8(%rdx), RLR1;
         movq 2 * 8(%rdx), RLR2;
         movq 3 * 8(%rdx), RLR3;
 
         call __cast5_dec_blk4;
 
         popq RX0; /*src*/
         popq RX1; /*dst*/
         popq RX2; /*iv*/
 
         movq 3 * 8(RX0), %r14;
         xorq (RX2), RLR0;
         xorq 0 * 8(RX0), RLR1;
         xorq 1 * 8(RX0), RLR2;
         xorq 2 * 8(RX0), RLR3;
         movq %r14, (RX2); /* store new IV */
 
         movq RLR0, 0 * 8(RX1);
         movq RLR1, 1 * 8(RX1);
         movq RLR2, 2 * 8(RX1);
         movq RLR3, 3 * 8(RX1);
 
         popq %r14;
         popq %r13;
         popq %r12;
         popq %rbx;
         popq %rbp;
         ret;
 ELF(.size _gcry_cast5_amd64_cbc_dec,.-_gcry_cast5_amd64_cbc_dec;)
 
 .align 8
 .globl _gcry_cast5_amd64_cfb_dec
 ELF(.type _gcry_cast5_amd64_cfb_dec,@function;)
 _gcry_cast5_amd64_cfb_dec:
         /* input:
          *      %rdi: ctx, CTX
          *      %rsi: dst (4 blocks)
          *      %rdx: src (4 blocks)
          *      %rcx: iv (64bit)
          */
 
         pushq %rbp;
         pushq %rbx;
         pushq %r12;
         pushq %r13;
         pushq %r14;
 
         pushq %rsi;
         pushq %rdx;
 
         /* Load input */
         movq (%rcx), RLR0;
         movq 0 * 8(%rdx), RLR1;
         movq 1 * 8(%rdx), RLR2;
         movq 2 * 8(%rdx), RLR3;
 
         inbswap_block4(RLR0, RLR1, RLR2, RLR3);
 
         /* Update IV */
         movq 3 * 8(%rdx), %rdx;
         movq %rdx, (%rcx);
 
         call __cast5_enc_blk4;
 
         popq %rdx; /*src*/
         popq %rcx; /*dst*/
 
         xorq 0 * 8(%rdx), RLR0;
         xorq 1 * 8(%rdx), RLR1;
         xorq 2 * 8(%rdx), RLR2;
         xorq 3 * 8(%rdx), RLR3;
         movq RLR0, 0 * 8(%rcx);
         movq RLR1, 1 * 8(%rcx);
         movq RLR2, 2 * 8(%rcx);
         movq RLR3, 3 * 8(%rcx);
 
         popq %r14;
         popq %r13;
         popq %r12;
         popq %rbx;
         popq %rbp;
         ret;
 ELF(.size _gcry_cast5_amd64_cfb_dec,.-_gcry_cast5_amd64_cfb_dec;)
 
 #endif /*defined(USE_CAST5)*/
 #endif /*__x86_64*/
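
For context on the only functional change in this diff: the old and new PIC branches of GET_EXTERN_POINTER do the same job, fetching the absolute address of _gcry_cast5_s1to4 from the GOT. Under the large code model the 32-bit PC-relative @GOTPCREL relocation cannot be assumed to reach the GOT, so the new __code_model_large__ branch rebuilds the GOT base at run time from a local anchor label and 64-bit relocations. GCC defines __code_model_large__ when compiling with -mcmodel=large, which is what selects that branch. The following is a minimal sketch of the same sequence in isolation, not part of the patch: it assumes a hypothetical external symbol ext_sym and a free scratch register %r11, whereas the macro itself saves and restores %r14/%r15 because it cannot assume any scratch register is available at its call sites.

        /* Sketch only: large-code-model GOT lookup for a hypothetical
         * external symbol ext_sym, result returned in %rax. */
     1: leaq    1b(%rip), %rax                   /* %rax = run-time address of label 1 */
        movabsq $_GLOBAL_OFFSET_TABLE_-1b, %r11  /* 64-bit offset from label 1 to the GOT base */
        addq    %r11, %rax                       /* %rax = run-time GOT base */
        movabsq $ext_sym@GOT, %r11               /* 64-bit offset of ext_sym's GOT slot */
        movq    (%rax, %r11), %rax               /* %rax = address of ext_sym, read from its slot */

Wrapping this sequence inside GET_EXTERN_POINTER keeps every call site in the file unchanged; the cost is two extra push/pop pairs per expansion, which is negligible next to the 16 CAST5 rounds that follow.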