diff --git a/cipher/chacha20-amd64-ssse3.S b/cipher/chacha20-amd64-ssse3.S
index f2372281..0e59ff98 100644
--- a/cipher/chacha20-amd64-ssse3.S
+++ b/cipher/chacha20-amd64-ssse3.S
@@ -1,341 +1,449 @@
/* chacha20-amd64-ssse3.S  -  SSSE3 implementation of ChaCha20 cipher
 *
 * Copyright (C) 2017,2018 Jussi Kivilinna
 *
 * This file is part of Libgcrypt.
 *
 * Libgcrypt is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * Libgcrypt is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, see <http://www.gnu.org/licenses/>.
 */

/*
 * Based on D. J. Bernstein reference implementation at
 * http://cr.yp.to/chacha.html:
 *
 * chacha-regs.c version 20080118
 * D. J. Bernstein
 * Public domain.
 */

#ifdef __x86_64
#include <config.h>
#if defined(HAVE_GCC_INLINE_ASM_SSSE3) && \
   (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
    defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))

.text

#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS
# define ELF(...) __VA_ARGS__
#else
# define ELF(...) /*_*/
#endif

#ifdef __PIC__
# define RIP (%rip)
#else
# define RIP
#endif

/* register macros */
#define INPUT %rdi
#define DST   %rsi
#define SRC   %rdx
#define NBLKS %rcx
#define ROUND %eax

/* stack structure */
#define STACK_VEC_X12 (16)
#define STACK_VEC_X13 (16 + STACK_VEC_X12)
#define STACK_TMP     (16 + STACK_VEC_X13)
#define STACK_TMP1    (16 + STACK_TMP)
#define STACK_TMP2    (16 + STACK_TMP1)
#define STACK_MAX     (16 + STACK_TMP2)

/* vector registers */
#define X0 %xmm0
#define X1 %xmm1
#define X2 %xmm2
#define X3 %xmm3
#define X4 %xmm4
#define X5 %xmm5
#define X6 %xmm6
#define X7 %xmm7
#define X8 %xmm8
#define X9 %xmm9
#define X10 %xmm10
#define X11 %xmm11
#define X12 %xmm12
#define X13 %xmm13
#define X14 %xmm14
#define X15 %xmm15

/**********************************************************************
  helper macros
 **********************************************************************/

/* 4x4 32-bit integer matrix transpose */
#define transpose_4x4(x0, x1, x2, x3, t1, t2, t3) \
        movdqa    x0, t2; \
        punpckhdq x1, t2; \
        punpckldq x1, x0; \
        \
        movdqa    x2, t1; \
        punpckldq x3, t1; \
        punpckhdq x3, x2; \
        \
        movdqa     x0, x1; \
        punpckhqdq t1, x1; \
        punpcklqdq t1, x0; \
        \
        movdqa     t2, x3; \
        punpckhqdq x2, x3; \
        punpcklqdq x2, t2; \
        movdqa     t2, x2;

/* fill xmm register with 32-bit value from memory */
#define pbroadcastd(mem32, xreg) \
        movd mem32, xreg; \
        pshufd $0, xreg, xreg;

/* xor with unaligned memory operand */
#define pxor_u(umem128, xreg, t) \
        movdqu umem128, t; \
        pxor t, xreg;

/* xor register with unaligned src and save to unaligned dst */
#define xor_src_dst(dst, src, offset, xreg, t) \
        pxor_u(offset(src), xreg, t); \
        movdqu xreg, offset(dst);

#define clear(x) pxor x,x;

/**********************************************************************
  4-way chacha20
 **********************************************************************/

#define ROTATE2(v1,v2,c,tmp1,tmp2) \
        movdqa v1, tmp1; \
        movdqa v2, tmp2; \
        psrld $(32 - (c)), v1; \
        pslld $(c), tmp1; \
        paddb tmp1, v1; \
        psrld $(32 - (c)), v2; \
        pslld $(c), tmp2; \
        paddb tmp2, v2;

#define ROTATE_SHUF_2(v1,v2,shuf) \
        pshufb shuf, v1; \
        pshufb shuf, v2;

#define XOR(ds,s) \
        pxor s, ds;

#define PLUS(ds,s) \
        paddd s, ds;

#define QUARTERROUND2(a1,b1,c1,d1,a2,b2,c2,d2,ign,tmp1,tmp2) \
        movdqa .Lshuf_rol16 RIP, tmp1; \
        PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2); \
            ROTATE_SHUF_2(d1, d2, tmp1); \
        PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \
            ROTATE2(b1, b2, 12, tmp1, tmp2); \
        movdqa .Lshuf_rol8 RIP, tmp1; \
        PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2); \
            ROTATE_SHUF_2(d1, d2, tmp1); \
        PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \
            ROTATE2(b1, b2, 7, tmp1, tmp2);
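Note (illustrative, not part of the patch): the macros above implement the standard ChaCha20 quarter round on two column groups at once. The 16- and 8-bit rotations go through pshufb byte shuffles (.Lshuf_rol16/.Lshuf_rol8), while the 12- and 7-bit rotations use a psrld/pslld pair whose halves are combined with paddb; the add acts as an OR here because the two partial results share no set bits, so no carries can occur. A scalar C sketch of what one lane computes:

  #include <stdint.h>

  /* Scalar equivalent of ROTATE2/ROTATE_SHUF_2 for one 32-bit lane.  */
  static uint32_t rotl32 (uint32_t v, unsigned int c)
  {
    return (v << c) | (v >> (32 - c));
  }

  /* One ChaCha20 quarter round, as QUARTERROUND2 performs per lane.  */
  static void quarterround (uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
  {
    *a += *b; *d ^= *a; *d = rotl32 (*d, 16);  /* pshufb .Lshuf_rol16 */
    *c += *d; *b ^= *c; *b = rotl32 (*b, 12);  /* shifts + paddb */
    *a += *b; *d ^= *a; *d = rotl32 (*d, 8);   /* pshufb .Lshuf_rol8 */
    *c += *d; *b ^= *c; *b = rotl32 (*b, 7);   /* shifts + paddb */
  }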
chacha20_data:
.align 16
.Lshuf_rol16:
        .byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13
.Lshuf_rol8:
        .byte 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14
+.Lcounter1:
+       .long 1,0,0,0
.Linc_counter:
        .long 0,1,2,3
.Lunsigned_cmp:
        .long 0x80000000,0x80000000,0x80000000,0x80000000

.align 8
.globl _gcry_chacha20_amd64_ssse3_blocks4
ELF(.type _gcry_chacha20_amd64_ssse3_blocks4,@function;)

_gcry_chacha20_amd64_ssse3_blocks4:
        /* input:
         *      %rdi: input
         *      %rsi: dst
         *      %rdx: src
         *      %rcx: nblks (multiple of 4)
         */

        pushq %rbp;
        movq %rsp, %rbp;

        subq $STACK_MAX, %rsp;
        andq $~15, %rsp;

.Loop4:
        mov $20, ROUND;

        /* Construct counter vectors X12 and X13 */
        movdqa .Linc_counter RIP, X0;
        movdqa .Lunsigned_cmp RIP, X2;
        pbroadcastd((12 * 4)(INPUT), X12);
        pbroadcastd((13 * 4)(INPUT), X13);
        paddd X0, X12;
        movdqa X12, X1;
        pxor X2, X0;
        pxor X2, X1;
        pcmpgtd X1, X0;
        psubd X0, X13;
        movdqa X12, (STACK_VEC_X12)(%rsp);
        movdqa X13, (STACK_VEC_X13)(%rsp);

        /* Load vectors */
        pbroadcastd((0 * 4)(INPUT), X0);
        pbroadcastd((1 * 4)(INPUT), X1);
        pbroadcastd((2 * 4)(INPUT), X2);
        pbroadcastd((3 * 4)(INPUT), X3);
        pbroadcastd((4 * 4)(INPUT), X4);
        pbroadcastd((5 * 4)(INPUT), X5);
        pbroadcastd((6 * 4)(INPUT), X6);
        pbroadcastd((7 * 4)(INPUT), X7);
        pbroadcastd((8 * 4)(INPUT), X8);
        pbroadcastd((9 * 4)(INPUT), X9);
        pbroadcastd((10 * 4)(INPUT), X10);
        pbroadcastd((11 * 4)(INPUT), X11);
        pbroadcastd((14 * 4)(INPUT), X14);
        pbroadcastd((15 * 4)(INPUT), X15);
        movdqa X11, (STACK_TMP)(%rsp);
        movdqa X15, (STACK_TMP1)(%rsp);

-.Lround2:
+.Lround2_4:
        QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15)
        movdqa (STACK_TMP)(%rsp), X11;
        movdqa (STACK_TMP1)(%rsp), X15;
        movdqa X8, (STACK_TMP)(%rsp);
        movdqa X9, (STACK_TMP1)(%rsp);
        QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9)
        QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9)
        movdqa (STACK_TMP)(%rsp), X8;
        movdqa (STACK_TMP1)(%rsp), X9;
        movdqa X11, (STACK_TMP)(%rsp);
        movdqa X15, (STACK_TMP1)(%rsp);
        QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15)
        sub $2, ROUND;
-       jnz .Lround2;
+       jnz .Lround2_4;

        /* tmp := X15 */
        movdqa (STACK_TMP)(%rsp), X11;
        pbroadcastd((0 * 4)(INPUT), X15);
        PLUS(X0, X15);
        pbroadcastd((1 * 4)(INPUT), X15);
        PLUS(X1, X15);
        pbroadcastd((2 * 4)(INPUT), X15);
        PLUS(X2, X15);
        pbroadcastd((3 * 4)(INPUT), X15);
        PLUS(X3, X15);
        pbroadcastd((4 * 4)(INPUT), X15);
        PLUS(X4, X15);
        pbroadcastd((5 * 4)(INPUT), X15);
        PLUS(X5, X15);
        pbroadcastd((6 * 4)(INPUT), X15);
        PLUS(X6, X15);
        pbroadcastd((7 * 4)(INPUT), X15);
        PLUS(X7, X15);
        pbroadcastd((8 * 4)(INPUT), X15);
        PLUS(X8, X15);
        pbroadcastd((9 * 4)(INPUT), X15);
        PLUS(X9, X15);
        pbroadcastd((10 * 4)(INPUT), X15);
        PLUS(X10, X15);
        pbroadcastd((11 * 4)(INPUT), X15);
        PLUS(X11, X15);
        movdqa (STACK_VEC_X12)(%rsp), X15;
        PLUS(X12, X15);
        movdqa (STACK_VEC_X13)(%rsp), X15;
        PLUS(X13, X15);
        movdqa X13, (STACK_TMP)(%rsp);
        pbroadcastd((14 * 4)(INPUT), X15);
        PLUS(X14, X15);
        movdqa (STACK_TMP1)(%rsp), X15;
        movdqa X14, (STACK_TMP1)(%rsp);
        pbroadcastd((15 * 4)(INPUT), X13);
        PLUS(X15, X13);
        movdqa X15, (STACK_TMP2)(%rsp);

        /* Update counter */
        addq $4, (12 * 4)(INPUT);
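Note (illustrative, not part of the patch): X12/X13 hold the low and high counter words for the four blocks processed in parallel. SSE has no unsigned dword compare, so both operands are biased by 0x80000000 (.Lunsigned_cmp) before the signed pcmpgtd; the resulting all-ones mask marks the lanes where the 32-bit low word wrapped, and psubd of that mask adds the carry into the high word. The addq just above then advances the stored counter by four blocks in a single 64-bit add, since state words 12 and 13 are adjacent little-endian words in memory. A C sketch of the per-lane counter setup:

  #include <stdint.h>

  /* Sketch of the X12/X13 construction: per-lane 64-bit counters, one per
     parallel block.  'input' is the 16-word ChaCha20 state; words 12 and 13
     hold the block counter.  */
  static void make_counter_lanes (const uint32_t input[16],
                                  uint32_t x12[4], uint32_t x13[4])
  {
    uint32_t lo = input[12], hi = input[13];
    int i;

    for (i = 0; i < 4; i++)
      {
        x12[i] = lo + (uint32_t)i;          /* paddd .Linc_counter */
        x13[i] = hi + (x12[i] < lo);        /* carry via pcmpgtd/psubd */
      }
  }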
        transpose_4x4(X0, X1, X2, X3, X13, X14, X15);
        xor_src_dst(DST, SRC, (64 * 0 + 16 * 0), X0, X15);
        xor_src_dst(DST, SRC, (64 * 1 + 16 * 0), X1, X15);
        xor_src_dst(DST, SRC, (64 * 2 + 16 * 0), X2, X15);
        xor_src_dst(DST, SRC, (64 * 3 + 16 * 0), X3, X15);
        transpose_4x4(X4, X5, X6, X7, X0, X1, X2);
        movdqa (STACK_TMP)(%rsp), X13;
        movdqa (STACK_TMP1)(%rsp), X14;
        movdqa (STACK_TMP2)(%rsp), X15;
        xor_src_dst(DST, SRC, (64 * 0 + 16 * 1), X4, X0);
        xor_src_dst(DST, SRC, (64 * 1 + 16 * 1), X5, X0);
        xor_src_dst(DST, SRC, (64 * 2 + 16 * 1), X6, X0);
        xor_src_dst(DST, SRC, (64 * 3 + 16 * 1), X7, X0);
        transpose_4x4(X8, X9, X10, X11, X0, X1, X2);
        xor_src_dst(DST, SRC, (64 * 0 + 16 * 2), X8, X0);
        xor_src_dst(DST, SRC, (64 * 1 + 16 * 2), X9, X0);
        xor_src_dst(DST, SRC, (64 * 2 + 16 * 2), X10, X0);
        xor_src_dst(DST, SRC, (64 * 3 + 16 * 2), X11, X0);
        transpose_4x4(X12, X13, X14, X15, X0, X1, X2);
        xor_src_dst(DST, SRC, (64 * 0 + 16 * 3), X12, X0);
        xor_src_dst(DST, SRC, (64 * 1 + 16 * 3), X13, X0);
        xor_src_dst(DST, SRC, (64 * 2 + 16 * 3), X14, X0);
        xor_src_dst(DST, SRC, (64 * 3 + 16 * 3), X15, X0);

        sub $4, NBLKS;
        lea (4 * 64)(DST), DST;
        lea (4 * 64)(SRC), SRC;
        jnz .Loop4;

        /* clear the used vector registers and stack */
        clear(X0);
        movdqa X0, (STACK_VEC_X12)(%rsp);
        movdqa X0, (STACK_VEC_X13)(%rsp);
        movdqa X0, (STACK_TMP)(%rsp);
        movdqa X0, (STACK_TMP1)(%rsp);
        movdqa X0, (STACK_TMP2)(%rsp);
        clear(X1);
        clear(X2);
        clear(X3);
        clear(X4);
        clear(X5);
        clear(X6);
        clear(X7);
        clear(X8);
        clear(X9);
        clear(X10);
        clear(X11);
        clear(X12);
        clear(X13);
        clear(X14);
        clear(X15);

        /* eax zeroed by round loop. */
        leave;
        ret;
ELF(.size _gcry_chacha20_amd64_ssse3_blocks4,
          .-_gcry_chacha20_amd64_ssse3_blocks4;)

+/**********************************************************************
+  1-way chacha20
+ **********************************************************************/
+
+#define ROTATE_SHUF(v1,shuf) \
+       pshufb shuf, v1;
+
+#define ROTATE(v1,c,tmp1) \
+       movdqa v1, tmp1; \
+       psrld $(32 - (c)), v1; \
+       pslld $(c), tmp1; \
+       paddb tmp1, v1;
+
+#define WORD_SHUF(v1,shuf) \
+       pshufd $shuf, v1, v1;
+
+#define QUARTERROUND4(x0,x1,x2,x3,shuf_rol8,shuf_rol16,tmp1,shuf_x1,\
+                     shuf_x2,shuf_x3) \
+       PLUS(x0, x1); XOR(x3, x0); ROTATE_SHUF(x3, shuf_rol16); \
+       PLUS(x2, x3); XOR(x1, x2); ROTATE(x1, 12, tmp1); \
+       PLUS(x0, x1); XOR(x3, x0); ROTATE_SHUF(x3, shuf_rol8); \
+       PLUS(x2, x3); \
+         WORD_SHUF(x3, shuf_x3); \
+                     XOR(x1, x2); \
+         WORD_SHUF(x2, shuf_x2); \
+                     ROTATE(x1, 7, tmp1); \
+         WORD_SHUF(x1, shuf_x1);
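Note (illustrative, not part of the patch): unlike the 4-way path, QUARTERROUND4 keeps each state row in one xmm register. WORD_SHUF with the immediates 0x39, 0x4e and 0x93 rotates a row's four 32-bit words by one, two and three positions, so after the first invocation the diagonals of the state have been lined up into columns; the second invocation therefore performs the diagonal quarter rounds, and its mirrored shuffle constants (0x93, 0x4e, 0x39) restore the original row order. A C sketch of the row rotation:

  #include <stdint.h>

  /* pshufd $0x39 / $0x4e / $0x93 correspond to n = 1, 2, 3: destination
     word i receives source word (i + n) & 3.  */
  static void rotate_row (uint32_t r[4], int n)
  {
    uint32_t t[4];
    int i;

    for (i = 0; i < 4; i++)
      t[i] = r[(i + n) & 3];
    for (i = 0; i < 4; i++)
      r[i] = t[i];
  }

  /* Round pair as done by the two QUARTERROUND4 invocations:
     column round, then rotate x1 by 1, x2 by 2, x3 by 3 (0x39, 0x4e, 0x93);
     the next "column" round now hits the diagonals, after which rotating
     x1 by 3, x2 by 2, x3 by 1 (0x93, 0x4e, 0x39) undoes the shuffle.  */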
+.align 8
+.globl _gcry_chacha20_amd64_ssse3_blocks1
+ELF(.type _gcry_chacha20_amd64_ssse3_blocks1,@function;)
+
+_gcry_chacha20_amd64_ssse3_blocks1:
+       /* input:
+        *      %rdi: input
+        *      %rsi: dst
+        *      %rdx: src
+        *      %rcx: nblks
+        */
+
+       /* Load constants */
+       movdqa .Lcounter1 RIP, X4;
+       movdqa .Lshuf_rol8 RIP, X5;
+       movdqa .Lshuf_rol16 RIP, X6;
+
+       /* Load state */
+       movdqu (0 * 4)(INPUT), X10;
+       movdqu (4 * 4)(INPUT), X11;
+       movdqu (8 * 4)(INPUT), X12;
+       movdqu (12 * 4)(INPUT), X13;
+
+.Loop1:
+       mov $20, ROUND;
+
+       movdqa X10, X0;
+       movdqa X11, X1;
+       movdqa X12, X2;
+       movdqa X13, X3;
+
+.Lround2_1:
+       QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93);
+       QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39);
+       sub $2, ROUND;
+       jnz .Lround2_1;
+
+       PLUS(X0, X10);
+       PLUS(X1, X11);
+       PLUS(X2, X12);
+       PLUS(X3, X13);
+
+       /* Update counter */
+       paddq X4, X13;
+
+       xor_src_dst(DST, SRC, 0 * 4, X0, X7);
+       xor_src_dst(DST, SRC, 4 * 4, X1, X7);
+       xor_src_dst(DST, SRC, 8 * 4, X2, X7);
+       xor_src_dst(DST, SRC, 12 * 4, X3, X7);
+
+       lea (64)(DST), DST;
+       lea (64)(SRC), SRC;
+
+       sub $1, NBLKS;
+       jnz .Loop1;
+
+       /* Store counter */
+       movdqu X13, (12 * 4)(INPUT);
+
+       /* clear the used vector registers */
+       clear(X0);
+       clear(X1);
+       clear(X2);
+       clear(X3);
+       clear(X4);
+       clear(X5);
+       clear(X6);
+       clear(X7);
+       clear(X10);
+       clear(X11);
+       clear(X12);
+       clear(X13);
+
+       /* eax zeroed by round loop. */
+       ret;
+ELF(.size _gcry_chacha20_amd64_ssse3_blocks1,
+         .-_gcry_chacha20_amd64_ssse3_blocks1;)
+
#endif /*defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)*/
#endif /*__x86_64*/
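Note (illustrative, not part of the patch): in the 1-way loop the saved input rows live in X10..X13 and the working copy in X0..X3. The block counter occupies the low 64 bits of row 3, so "paddq X4, X13" with .Lcounter1 = {1,0,0,0} increments it once per block with carry from word 12 into word 13, while the nonce words 14 and 15 sit in the untouched high lane; only that row is written back to the context at the end. In C terms:

  #include <stdint.h>

  /* Sketch of the per-block counter update on row 3 (state words 12..15). */
  static void bump_counter (uint32_t row3[4])
  {
    uint64_t ctr = (uint64_t)row3[0] | ((uint64_t)row3[1] << 32);

    ctr++;                         /* 64-bit add, carry from word 12 into 13 */
    row3[0] = (uint32_t)ctr;
    row3[1] = (uint32_t)(ctr >> 32);
    /* row3[2] and row3[3] (the nonce words) are left unchanged.  */
  }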
diff --git a/cipher/chacha20.c b/cipher/chacha20.c
index 84a9b2b8..f1afd18e 100644
--- a/cipher/chacha20.c
+++ b/cipher/chacha20.c
@@ -1,621 +1,640 @@
/* chacha20.c  -  Bernstein's ChaCha20 cipher
 * Copyright (C) 2014,2017,2018 Jussi Kivilinna
 *
 * This file is part of Libgcrypt.
 *
 * Libgcrypt is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * Libgcrypt is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, see <http://www.gnu.org/licenses/>.
 *
 * For a description of the algorithm, see:
 *   http://cr.yp.to/chacha.html
 */

/*
 * Based on D. J. Bernstein reference implementation at
 * http://cr.yp.to/chacha.html:
 *
 * chacha-regs.c version 20080118
 * D. J. Bernstein
 * Public domain.
 */

#include <config.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "types.h"
#include "g10lib.h"
#include "cipher.h"
#include "bufhelp.h"


#define CHACHA20_MIN_KEY_SIZE 16   /* Bytes.  */
#define CHACHA20_MAX_KEY_SIZE 32   /* Bytes.  */
#define CHACHA20_BLOCK_SIZE   64   /* Bytes.  */
#define CHACHA20_MIN_IV_SIZE   8   /* Bytes.  */
#define CHACHA20_MAX_IV_SIZE  12   /* Bytes.  */
#define CHACHA20_CTR_SIZE     16   /* Bytes.  */


/* USE_SSSE3 indicates whether to compile with Intel SSSE3 code. */
#undef USE_SSSE3
#if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_SSSE3) && \
    (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
# define USE_SSSE3 1
#endif

/* USE_AVX2 indicates whether to compile with Intel AVX2 code. */
#undef USE_AVX2
#if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_AVX2) && \
    (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
# define USE_AVX2 1
#endif

/* USE_ARMV7_NEON indicates whether to enable ARMv7 NEON assembly code. */
#undef USE_ARMV7_NEON
#ifdef ENABLE_NEON_SUPPORT
# if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) \
     && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) \
     && defined(HAVE_GCC_INLINE_ASM_NEON)
#  define USE_ARMV7_NEON 1
# endif
#endif

/* USE_AARCH64_SIMD indicates whether to enable ARMv8 SIMD assembly
 * code. */
#undef USE_AARCH64_SIMD
#ifdef ENABLE_NEON_SUPPORT
# if defined(__AARCH64EL__) \
     && defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) \
     && defined(HAVE_GCC_INLINE_ASM_AARCH64_NEON)
#  define USE_AARCH64_SIMD 1
# endif
#endif

/* Assembly implementations use SystemV ABI, ABI conversion and additional
 * stack to store XMM6-XMM15 needed on Win64. */
#undef ASM_FUNC_ABI
#undef ASM_EXTRA_STACK
#if defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)
# define ASM_FUNC_ABI __attribute__((sysv_abi))
#else
# define ASM_FUNC_ABI
#endif


typedef struct CHACHA20_context_s
{
  u32 input[16];
  unsigned char pad[CHACHA20_BLOCK_SIZE];
  unsigned int unused; /* bytes in the pad. */
  int use_ssse3:1;
  int use_avx2:1;
  int use_neon:1;
} CHACHA20_context_t;


#ifdef USE_SSSE3

unsigned int _gcry_chacha20_amd64_ssse3_blocks4(u32 *state, byte *dst,
                                                const byte *src,
                                                size_t nblks) ASM_FUNC_ABI;

+unsigned int _gcry_chacha20_amd64_ssse3_blocks1(u32 *state, byte *dst,
+                                                const byte *src,
+                                                size_t nblks) ASM_FUNC_ABI;
+
#endif /* USE_SSSE3 */

#ifdef USE_AVX2

unsigned int _gcry_chacha20_amd64_avx2_blocks8(u32 *state, byte *dst,
                                               const byte *src,
                                               size_t nblks) ASM_FUNC_ABI;

#endif /* USE_AVX2 */

#ifdef USE_ARMV7_NEON

unsigned int _gcry_chacha20_armv7_neon_blocks4(u32 *state, byte *dst,
                                               const byte *src,
                                               size_t nblks);

#endif /* USE_ARMV7_NEON */

#ifdef USE_AARCH64_SIMD

unsigned int _gcry_chacha20_aarch64_blocks4(u32 *state, byte *dst,
                                            const byte *src, size_t nblks);

#endif /* USE_AARCH64_SIMD */


static const char *selftest (void);


#define ROTATE(v,c)     (rol(v,c))
#define XOR(v,w)        ((v) ^ (w))
#define PLUS(v,w)       ((u32)((v) + (w)))
#define PLUSONE(v)      (PLUS((v),1))

#define QUARTERROUND(a,b,c,d) \
  a = PLUS(a,b); d = ROTATE(XOR(d,a),16); \
  c = PLUS(c,d); b = ROTATE(XOR(b,c),12); \
  a = PLUS(a,b); d = ROTATE(XOR(d,a), 8); \
  c = PLUS(c,d); b = ROTATE(XOR(b,c), 7);

#define BUF_XOR_LE32(dst, src, offset, x) \
  buf_put_le32((dst) + (offset), buf_get_le32((src) + (offset)) ^ (x))

static unsigned int
-chacha20_blocks (u32 *input, byte *dst, const byte *src, size_t nblks)
+do_chacha20_blocks (u32 *input, byte *dst, const byte *src, size_t nblks)
{
  u32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
  unsigned int i;

  while (nblks)
    {
      x0 = input[0];
      x1 = input[1];
      x2 = input[2];
      x3 = input[3];
      x4 = input[4];
      x5 = input[5];
      x6 = input[6];
      x7 = input[7];
      x8 = input[8];
      x9 = input[9];
      x10 = input[10];
      x11 = input[11];
      x12 = input[12];
      x13 = input[13];
      x14 = input[14];
      x15 = input[15];

      for (i = 20; i > 0; i -= 2)
        {
          QUARTERROUND(x0, x4,  x8, x12)
          QUARTERROUND(x1, x5,  x9, x13)
          QUARTERROUND(x2, x6, x10, x14)
          QUARTERROUND(x3, x7, x11, x15)
          QUARTERROUND(x0, x5, x10, x15)
          QUARTERROUND(x1, x6, x11, x12)
          QUARTERROUND(x2, x7,  x8, x13)
          QUARTERROUND(x3, x4,  x9, x14)
        }

      x0 = PLUS(x0, input[0]);
      x1 = PLUS(x1, input[1]);
      x2 = PLUS(x2, input[2]);
      x3 = PLUS(x3, input[3]);
      x4 = PLUS(x4, input[4]);
      x5 = PLUS(x5, input[5]);
      x6 = PLUS(x6, input[6]);
      x7 = PLUS(x7, input[7]);
      x8 = PLUS(x8, input[8]);
      x9 = PLUS(x9, input[9]);
      x10 = PLUS(x10, input[10]);
      x11 = PLUS(x11, input[11]);
      x12 = PLUS(x12, input[12]);
      x13 = PLUS(x13, input[13]);
      x14 = PLUS(x14, input[14]);
      x15 = PLUS(x15, input[15]);

      input[12] = PLUSONE(input[12]);
      input[13] = PLUS(input[13], !input[12]);

      BUF_XOR_LE32(dst, src, 0, x0);
      BUF_XOR_LE32(dst, src, 4, x1);
      BUF_XOR_LE32(dst, src, 8, x2);
      BUF_XOR_LE32(dst, src, 12, x3);
      BUF_XOR_LE32(dst, src, 16, x4);
      BUF_XOR_LE32(dst, src, 20, x5);
      BUF_XOR_LE32(dst, src, 24, x6);
      BUF_XOR_LE32(dst, src, 28, x7);
      BUF_XOR_LE32(dst, src, 32, x8);
      BUF_XOR_LE32(dst, src, 36, x9);
      BUF_XOR_LE32(dst, src, 40, x10);
      BUF_XOR_LE32(dst, src, 44, x11);
      BUF_XOR_LE32(dst, src, 48, x12);
      BUF_XOR_LE32(dst, src, 52, x13);
      BUF_XOR_LE32(dst, src, 56, x14);
      BUF_XOR_LE32(dst, src, 60, x15);

      src += CHACHA20_BLOCK_SIZE;
      dst += CHACHA20_BLOCK_SIZE;
      nblks--;
    }

  /* burn_stack */
  return (17 * sizeof(u32) + 6 * sizeof(void *));
}

+static unsigned int
+chacha20_blocks (CHACHA20_context_t *ctx, byte *dst, const byte *src,
+                 size_t nblks)
+{
+#ifdef USE_SSSE3
+  if (ctx->use_ssse3)
+    {
+      return _gcry_chacha20_amd64_ssse3_blocks1(ctx->input, dst, src, nblks);
+    }
+#endif
+
+  return do_chacha20_blocks (ctx->input, dst, src, nblks);
+}
+
+
static void
chacha20_keysetup (CHACHA20_context_t *ctx, const byte *key,
                   unsigned int keylen)
{
  static const char sigma[16] = "expand 32-byte k";
  static const char tau[16] = "expand 16-byte k";
  const char *constants;

  ctx->input[4] = buf_get_le32(key + 0);
  ctx->input[5] = buf_get_le32(key + 4);
  ctx->input[6] = buf_get_le32(key + 8);
  ctx->input[7] = buf_get_le32(key + 12);
  if (keylen == CHACHA20_MAX_KEY_SIZE) /* 256 bits */
    {
      key += 16;
      constants = sigma;
    }
  else /* 128 bits */
    {
      constants = tau;
    }
  ctx->input[8] = buf_get_le32(key + 0);
  ctx->input[9] = buf_get_le32(key + 4);
  ctx->input[10] = buf_get_le32(key + 8);
  ctx->input[11] = buf_get_le32(key + 12);
  ctx->input[0] = buf_get_le32(constants + 0);
  ctx->input[1] = buf_get_le32(constants + 4);
  ctx->input[2] = buf_get_le32(constants + 8);
  ctx->input[3] = buf_get_le32(constants + 12);
}


static void
chacha20_ivsetup (CHACHA20_context_t * ctx, const byte *iv, size_t ivlen)
{
  if (ivlen == CHACHA20_CTR_SIZE)
    {
      ctx->input[12] = buf_get_le32 (iv + 0);
      ctx->input[13] = buf_get_le32 (iv + 4);
      ctx->input[14] = buf_get_le32 (iv + 8);
      ctx->input[15] = buf_get_le32 (iv + 12);
    }
  else if (ivlen == CHACHA20_MAX_IV_SIZE)
    {
      ctx->input[12] = 0;
      ctx->input[13] = buf_get_le32 (iv + 0);
      ctx->input[14] = buf_get_le32 (iv + 4);
      ctx->input[15] = buf_get_le32 (iv + 8);
    }
  else if (ivlen == CHACHA20_MIN_IV_SIZE)
    {
      ctx->input[12] = 0;
      ctx->input[13] = 0;
      ctx->input[14] = buf_get_le32 (iv + 0);
      ctx->input[15] = buf_get_le32 (iv + 4);
    }
  else
    {
      ctx->input[12] = 0;
      ctx->input[13] = 0;
      ctx->input[14] = 0;
      ctx->input[15] = 0;
    }
}
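Note (summary, not part of the patch): after chacha20_keysetup and chacha20_ivsetup the 16-word state that both the C and the assembly back ends consume has the usual ChaCha20 layout; in comment form:

  /* input[0..3]    constants: sigma ("expand 32-byte k"), or tau for
   *                128-bit keys
   * input[4..11]   key words (a 128-bit key is used for both halves)
   * input[12..15]  counter and nonce, depending on the IV length:
   *                  16-byte IV: all four words taken from the IV
   *                  12-byte IV: word 12 = counter, words 13..15 = nonce
   *                   8-byte IV: words 12..13 = counter, words 14..15 = nonce
   */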
static void
chacha20_setiv (void *context, const byte *iv, size_t ivlen)
{
  CHACHA20_context_t *ctx = (CHACHA20_context_t *) context;

  /* draft-nir-cfrg-chacha20-poly1305-02 defines 96-bit and 64-bit nonce. */

  if (iv && ivlen != CHACHA20_MAX_IV_SIZE && ivlen != CHACHA20_MIN_IV_SIZE
      && ivlen != CHACHA20_CTR_SIZE)
    log_info ("WARNING: chacha20_setiv: bad ivlen=%u\n", (u32) ivlen);

  if (iv && (ivlen == CHACHA20_MAX_IV_SIZE || ivlen == CHACHA20_MIN_IV_SIZE
             || ivlen == CHACHA20_CTR_SIZE))
    chacha20_ivsetup (ctx, iv, ivlen);
  else
    chacha20_ivsetup (ctx, NULL, 0);

  /* Reset the unused pad bytes counter.  */
  ctx->unused = 0;
}


static gcry_err_code_t
chacha20_do_setkey (CHACHA20_context_t *ctx,
                    const byte *key, unsigned int keylen)
{
  static int initialized;
  static const char *selftest_failed;
  unsigned int features = _gcry_get_hw_features ();

  if (!initialized)
    {
      initialized = 1;
      selftest_failed = selftest ();
      if (selftest_failed)
        log_error ("CHACHA20 selftest failed (%s)\n", selftest_failed);
    }
  if (selftest_failed)
    return GPG_ERR_SELFTEST_FAILED;

  if (keylen != CHACHA20_MAX_KEY_SIZE && keylen != CHACHA20_MIN_KEY_SIZE)
    return GPG_ERR_INV_KEYLEN;

#ifdef USE_SSSE3
  ctx->use_ssse3 = (features & HWF_INTEL_SSSE3) != 0;
#endif
#ifdef USE_AVX2
  ctx->use_avx2 = (features & HWF_INTEL_AVX2) != 0;
#endif
#ifdef USE_ARMV7_NEON
  ctx->use_neon = (features & HWF_ARM_NEON) != 0;
#endif
#ifdef USE_AARCH64_SIMD
  ctx->use_neon = (features & HWF_ARM_NEON) != 0;
#endif

  (void)features;

  chacha20_keysetup (ctx, key, keylen);

  /* We default to a zero nonce.  */
  chacha20_setiv (ctx, NULL, 0);

  return 0;
}


static gcry_err_code_t
chacha20_setkey (void *context, const byte *key, unsigned int keylen,
                 gcry_cipher_hd_t hd)
{
  CHACHA20_context_t *ctx = (CHACHA20_context_t *) context;
  gcry_err_code_t rc = chacha20_do_setkey (ctx, key, keylen);
  (void)hd;
  _gcry_burn_stack (4 + sizeof (void *) + 4 * sizeof (void *));
  return rc;
}


static void
chacha20_encrypt_stream (void *context, byte *outbuf, const byte *inbuf,
                         size_t length)
{
  static const unsigned char zero_pad[CHACHA20_BLOCK_SIZE] = { 0, };
  CHACHA20_context_t *ctx = (CHACHA20_context_t *) context;
  unsigned int nburn, burn = 0;

  if (!length)
    return;

  if (ctx->unused)
    {
      unsigned char *p = ctx->pad;
      size_t n;

      gcry_assert (ctx->unused < CHACHA20_BLOCK_SIZE);

      n = ctx->unused;
      if (n > length)
        n = length;

      buf_xor (outbuf, inbuf, p + CHACHA20_BLOCK_SIZE - ctx->unused, n);
      length -= n;
      outbuf += n;
      inbuf += n;
      ctx->unused -= n;

      if (!length)
        return;
      gcry_assert (!ctx->unused);
    }

#ifdef USE_AVX2
  if (ctx->use_avx2 && length >= CHACHA20_BLOCK_SIZE * 8)
    {
      size_t nblocks = length / CHACHA20_BLOCK_SIZE;
      nblocks -= nblocks % 8;
      nburn = _gcry_chacha20_amd64_avx2_blocks8(ctx->input, outbuf, inbuf,
                                                nblocks);
      burn = nburn > burn ? nburn : burn;
      length -= nblocks * CHACHA20_BLOCK_SIZE;
      outbuf += nblocks * CHACHA20_BLOCK_SIZE;
      inbuf += nblocks * CHACHA20_BLOCK_SIZE;
    }
#endif

#ifdef USE_SSSE3
  if (ctx->use_ssse3 && length >= CHACHA20_BLOCK_SIZE * 4)
    {
      size_t nblocks = length / CHACHA20_BLOCK_SIZE;
      nblocks -= nblocks % 4;
      nburn = _gcry_chacha20_amd64_ssse3_blocks4(ctx->input, outbuf, inbuf,
                                                 nblocks);
      burn = nburn > burn ? nburn : burn;
      length -= nblocks * CHACHA20_BLOCK_SIZE;
      outbuf += nblocks * CHACHA20_BLOCK_SIZE;
      inbuf += nblocks * CHACHA20_BLOCK_SIZE;
    }
#endif

#ifdef USE_ARMV7_NEON
  if (ctx->use_neon && length >= CHACHA20_BLOCK_SIZE * 4)
    {
      size_t nblocks = length / CHACHA20_BLOCK_SIZE;
      nblocks -= nblocks % 4;
      nburn = _gcry_chacha20_armv7_neon_blocks4(ctx->input, outbuf, inbuf,
                                                nblocks);
      burn = nburn > burn ? nburn : burn;
      length -= nblocks * CHACHA20_BLOCK_SIZE;
      outbuf += nblocks * CHACHA20_BLOCK_SIZE;
      inbuf += nblocks * CHACHA20_BLOCK_SIZE;
    }
#endif

#ifdef USE_AARCH64_SIMD
  if (ctx->use_neon && length >= CHACHA20_BLOCK_SIZE * 4)
    {
      size_t nblocks = length / CHACHA20_BLOCK_SIZE;
      nblocks -= nblocks % 4;
      nburn = _gcry_chacha20_aarch64_blocks4(ctx->input, outbuf, inbuf,
                                             nblocks);
      burn = nburn > burn ? nburn : burn;
      length -= nblocks * CHACHA20_BLOCK_SIZE;
      outbuf += nblocks * CHACHA20_BLOCK_SIZE;
      inbuf += nblocks * CHACHA20_BLOCK_SIZE;
    }
#endif

  if (length >= CHACHA20_BLOCK_SIZE)
    {
      size_t nblocks = length / CHACHA20_BLOCK_SIZE;
-      nburn = chacha20_blocks(ctx->input, outbuf, inbuf, nblocks);
+      nburn = chacha20_blocks(ctx, outbuf, inbuf, nblocks);
      burn = nburn > burn ? nburn : burn;
      length -= nblocks * CHACHA20_BLOCK_SIZE;
      outbuf += nblocks * CHACHA20_BLOCK_SIZE;
      inbuf += nblocks * CHACHA20_BLOCK_SIZE;
    }

  if (length > 0)
    {
-      nburn = chacha20_blocks(ctx->input, ctx->pad, zero_pad, 1);
+      nburn = chacha20_blocks(ctx, ctx->pad, zero_pad, 1);
      burn = nburn > burn ? nburn : burn;

      buf_xor (outbuf, inbuf, ctx->pad, length);
      ctx->unused = CHACHA20_BLOCK_SIZE - length;
    }

  _gcry_burn_stack (burn);
}
static const char *
selftest (void)
{
  byte ctxbuf[sizeof(CHACHA20_context_t) + 15];
  CHACHA20_context_t *ctx;
  byte scratch[127 + 1];
  byte buf[512 + 64 + 4];
  int i;

  /* From draft-strombergson-chacha-test-vectors */
  static byte key_1[] = {
    0xc4, 0x6e, 0xc1, 0xb1, 0x8c, 0xe8, 0xa8, 0x78,
    0x72, 0x5a, 0x37, 0xe7, 0x80, 0xdf, 0xb7, 0x35,
    0x1f, 0x68, 0xed, 0x2e, 0x19, 0x4c, 0x79, 0xfb,
    0xc6, 0xae, 0xbe, 0xe1, 0xa6, 0x67, 0x97, 0x5d
  };
  static const byte nonce_1[] =
    { 0x1a, 0xda, 0x31, 0xd5, 0xcf, 0x68, 0x82, 0x21 };
  static const byte plaintext_1[127] = {
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  };
  static const byte ciphertext_1[127] = {
    0xf6, 0x3a, 0x89, 0xb7, 0x5c, 0x22, 0x71, 0xf9,
    0x36, 0x88, 0x16, 0x54, 0x2b, 0xa5, 0x2f, 0x06,
    0xed, 0x49, 0x24, 0x17, 0x92, 0x30, 0x2b, 0x00,
    0xb5, 0xe8, 0xf8, 0x0a, 0xe9, 0xa4, 0x73, 0xaf,
    0xc2, 0x5b, 0x21, 0x8f, 0x51, 0x9a, 0xf0, 0xfd,
    0xd4, 0x06, 0x36, 0x2e, 0x8d, 0x69, 0xde, 0x7f,
    0x54, 0xc6, 0x04, 0xa6, 0xe0, 0x0f, 0x35, 0x3f,
    0x11, 0x0f, 0x77, 0x1b, 0xdc, 0xa8, 0xab, 0x92,
    0xe5, 0xfb, 0xc3, 0x4e, 0x60, 0xa1, 0xd9, 0xa9,
    0xdb, 0x17, 0x34, 0x5b, 0x0a, 0x40, 0x27, 0x36,
    0x85, 0x3b, 0xf9, 0x10, 0xb0, 0x60, 0xbd, 0xf1,
    0xf8, 0x97, 0xb6, 0x29, 0x0f, 0x01, 0xd1, 0x38,
    0xae, 0x2c, 0x4c, 0x90, 0x22, 0x5b, 0xa9, 0xea,
    0x14, 0xd5, 0x18, 0xf5, 0x59, 0x29, 0xde, 0xa0,
    0x98, 0xca, 0x7a, 0x6c, 0xcf, 0xe6, 0x12, 0x27,
    0x05, 0x3c, 0x84, 0xe4, 0x9a, 0x4a, 0x33
  };

  /* 16-byte alignment required for amd64 implementation. */
  ctx = (CHACHA20_context_t *)((uintptr_t)(ctxbuf + 15) & ~(uintptr_t)15);

  chacha20_setkey (ctx, key_1, sizeof key_1, NULL);
  chacha20_setiv (ctx, nonce_1, sizeof nonce_1);
  scratch[sizeof (scratch) - 1] = 0;
  chacha20_encrypt_stream (ctx, scratch, plaintext_1, sizeof plaintext_1);
  if (memcmp (scratch, ciphertext_1, sizeof ciphertext_1))
    return "ChaCha20 encryption test 1 failed.";
  if (scratch[sizeof (scratch) - 1])
    return "ChaCha20 wrote too much.";
  chacha20_setkey (ctx, key_1, sizeof (key_1), NULL);
  chacha20_setiv (ctx, nonce_1, sizeof nonce_1);
  chacha20_encrypt_stream (ctx, scratch, scratch, sizeof plaintext_1);
  if (memcmp (scratch, plaintext_1, sizeof plaintext_1))
    return "ChaCha20 decryption test 1 failed.";

  for (i = 0; i < sizeof buf; i++)
    buf[i] = i;
  chacha20_setkey (ctx, key_1, sizeof key_1, NULL);
  chacha20_setiv (ctx, nonce_1, sizeof nonce_1);
  /*encrypt */
  chacha20_encrypt_stream (ctx, buf, buf, sizeof buf);
  /*decrypt */
  chacha20_setkey (ctx, key_1, sizeof key_1, NULL);
  chacha20_setiv (ctx, nonce_1, sizeof nonce_1);
  chacha20_encrypt_stream (ctx, buf, buf, 1);
  chacha20_encrypt_stream (ctx, buf + 1, buf + 1, (sizeof buf) - 1 - 1);
  chacha20_encrypt_stream (ctx, buf + (sizeof buf) - 1, buf + (sizeof buf) - 1,
                           1);
  for (i = 0; i < sizeof buf; i++)
    if (buf[i] != (byte) i)
      return "ChaCha20 encryption test 2 failed.";

  chacha20_setkey (ctx, key_1, sizeof key_1, NULL);
  chacha20_setiv (ctx, nonce_1, sizeof nonce_1);
  /* encrypt */
  for (i = 0; i < sizeof buf; i++)
    chacha20_encrypt_stream (ctx, &buf[i], &buf[i], 1);
  /* decrypt */
  chacha20_setkey (ctx, key_1, sizeof key_1, NULL);
  chacha20_setiv (ctx, nonce_1, sizeof nonce_1);
  chacha20_encrypt_stream (ctx, buf, buf, sizeof buf);
  for (i = 0; i < sizeof buf; i++)
    if (buf[i] != (byte) i)
      return "ChaCha20 encryption test 3 failed.";

  return NULL;
}


gcry_cipher_spec_t _gcry_cipher_spec_chacha20 = {
  GCRY_CIPHER_CHACHA20, {0, 0},   /* flags */
  "CHACHA20",                     /* name */
  NULL,                           /* aliases */
  NULL,                           /* oids */
  1,                              /* blocksize in bytes. */
  CHACHA20_MAX_KEY_SIZE * 8,      /* standard key length in bits. */
  sizeof (CHACHA20_context_t),
  chacha20_setkey, NULL, NULL, chacha20_encrypt_stream, chacha20_encrypt_stream,
  NULL, NULL, chacha20_setiv
};
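Note (illustrative, not part of the patch): these code paths are reached through the normal libgcrypt stream-cipher API; the SSSE3 1-way routine added here is picked automatically on capable CPUs for the sub-4-block tail of a message. A minimal caller, with error handling trimmed, might look like:

  #include <gcrypt.h>

  /* Encrypts 'buf' in place with ChaCha20 under a 256-bit key and a
     96-bit nonce; returns 0 on success, -1 on any error.  */
  int encrypt_with_chacha20 (const unsigned char key[32],
                             const unsigned char nonce[12],
                             unsigned char *buf, size_t len)
  {
    gcry_cipher_hd_t hd;
    gcry_error_t err;

    err = gcry_cipher_open (&hd, GCRY_CIPHER_CHACHA20,
                            GCRY_CIPHER_MODE_STREAM, 0);
    if (err)
      return -1;
    if (!err) err = gcry_cipher_setkey (hd, key, 32);
    if (!err) err = gcry_cipher_setiv (hd, nonce, 12);
    if (!err) err = gcry_cipher_encrypt (hd, buf, len, NULL, 0); /* in place */
    gcry_cipher_close (hd);
    return err ? -1 : 0;
  }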