diff --git a/cipher/chacha20-amd64-avx512.S b/cipher/chacha20-amd64-avx512.S index 8b4d7499..682798fe 100644 --- a/cipher/chacha20-amd64-avx512.S +++ b/cipher/chacha20-amd64-avx512.S @@ -1,300 +1,735 @@ /* chacha20-amd64-avx512.S - AVX512 implementation of ChaCha20 cipher * * Copyright (C) 2022 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . */ /* * Based on D. J. Bernstein reference implementation at * http://cr.yp.to/chacha.html: * * chacha-regs.c version 20080118 * D. J. Bernstein * Public domain. */ #ifdef __x86_64 #include #if defined(HAVE_GCC_INLINE_ASM_AVX512) && \ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) .text #include "asm-common-amd64.h" /* register macros */ #define INPUT %rdi #define DST %rsi #define SRC %rdx #define NBLKS %rcx #define ROUND %eax /* vector registers */ #define X0 %zmm0 #define X1 %zmm1 #define X2 %zmm2 #define X3 %zmm3 #define X4 %zmm4 #define X5 %zmm5 #define X6 %zmm6 #define X7 %zmm7 #define X8 %zmm8 #define X9 %zmm9 #define X10 %zmm10 #define X11 %zmm11 #define X12 %zmm12 #define X13 %zmm13 #define X14 %zmm14 #define X15 %zmm15 +#define X0y %ymm0 +#define X1y %ymm1 +#define X2y %ymm2 +#define X3y %ymm3 +#define X4y %ymm4 +#define X5y %ymm5 +#define X6y %ymm6 +#define X7y %ymm7 +#define X8y %ymm8 +#define X9y %ymm9 +#define X10y %ymm10 +#define X11y %ymm11 +#define X12y %ymm12 +#define X13y %ymm13 +#define X14y %ymm14 +#define X15y %ymm15 +#define X0x %xmm0 +#define X1x %xmm1 +#define X2x %xmm2 +#define X3x %xmm3 +#define X4x %xmm4 +#define X5x %xmm5 +#define X6x %xmm6 +#define X7x %xmm7 +#define X8x %xmm8 +#define X9x %xmm9 +#define X10x %xmm10 +#define X11x %xmm11 +#define X12x %xmm12 +#define X13x %xmm13 +#define X14x %xmm14 +#define X15x %xmm15 #define TMP0 %zmm16 #define TMP1 %zmm17 +#define TMP0y %ymm16 +#define TMP1y %ymm17 +#define TMP0x %xmm16 +#define TMP1x %xmm17 #define COUNTER_ADD %zmm18 +#define COUNTER_ADDy %ymm18 +#define COUNTER_ADDx %xmm18 #define X12_SAVE %zmm19 +#define X12_SAVEy %ymm19 +#define X12_SAVEx %xmm19 #define X13_SAVE %zmm20 +#define X13_SAVEy %ymm20 +#define X13_SAVEx %xmm20 #define S0 %zmm21 #define S1 %zmm22 #define S2 %zmm23 #define S3 %zmm24 #define S4 %zmm25 #define S5 %zmm26 #define S6 %zmm27 #define S7 %zmm28 #define S8 %zmm29 #define S14 %zmm30 #define S15 %zmm31 +#define S0y %ymm21 +#define S1y %ymm22 +#define S2y %ymm23 +#define S3y %ymm24 +#define S4y %ymm25 +#define S5y %ymm26 +#define S6y %ymm27 +#define S7y %ymm28 +#define S8y %ymm29 +#define S14y %ymm30 +#define S15y %ymm31 +#define S0x %xmm21 +#define S1x %xmm22 +#define S2x %xmm23 +#define S3x %xmm24 +#define S4x %xmm25 +#define S5x %xmm26 +#define S6x %xmm27 +#define S7x %xmm28 +#define S8x %xmm29 +#define S14x %xmm30 +#define S15x %xmm31 /********************************************************************** helper macros 
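For orientation, the S0..S15 registers defined above hold broadcast copies of the corresponding words of the caller's state array (INPUT). The word layout, as built by chacha20_keysetup()/chacha20_ivsetup() later in this patch, is the usual ChaCha20 one; a reference comment (not part of the patch):

/* ChaCha20 state as indexed by this file: u32 input[16], little endian.
 *   input[0..3]    constants "expand 32-byte k" (or "expand 16-byte k" for 128-bit keys)
 *   input[4..11]   key
 *   input[12]      low 32 bits of the block counter, incremented per block
 *   input[13]      counter high word or first nonce word, depending on IV size
 *   input[14..15]  nonce
 * S0..S8, S14 and S15 cache the words that stay constant across the round loop;
 * words 9..13 are re-broadcast or rebuilt inside the loop. */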
**********************************************************************/ /* 4x4 32-bit integer matrix transpose */ #define transpose_4x4(x0,x1,x2,x3,t1,t2) \ vpunpckhdq x1, x0, t2; \ vpunpckldq x1, x0, x0; \ \ vpunpckldq x3, x2, t1; \ vpunpckhdq x3, x2, x2; \ \ vpunpckhqdq t1, x0, x1; \ vpunpcklqdq t1, x0, x0; \ \ vpunpckhqdq x2, t2, x3; \ vpunpcklqdq x2, t2, x2; /* 4x4 128-bit matrix transpose */ #define transpose_16byte_4x4(x0,x1,x2,x3,t1,t2) \ vshufi32x4 $0xee, x1, x0, t2; \ vshufi32x4 $0x44, x1, x0, x0; \ \ vshufi32x4 $0x44, x3, x2, t1; \ vshufi32x4 $0xee, x3, x2, x2; \ \ vshufi32x4 $0xdd, t1, x0, x1; \ vshufi32x4 $0x88, t1, x0, x0; \ \ vshufi32x4 $0xdd, x2, t2, x3; \ vshufi32x4 $0x88, x2, t2, x2; +/* 2x2 128-bit matrix transpose */ +#define transpose_16byte_2x2(x0,x1,t1) \ + vmovdqa32 x0, t1; \ + vshufi32x4 $0x0, x1, x0, x0; \ + vshufi32x4 $0x3, x1, t1, x1; + #define xor_src_dst_4x4(dst, src, offset, add, x0, x4, x8, x12) \ vpxord (offset + 0 * (add))(src), x0, x0; \ vpxord (offset + 1 * (add))(src), x4, x4; \ vpxord (offset + 2 * (add))(src), x8, x8; \ vpxord (offset + 3 * (add))(src), x12, x12; \ vmovdqu32 x0, (offset + 0 * (add))(dst); \ vmovdqu32 x4, (offset + 1 * (add))(dst); \ vmovdqu32 x8, (offset + 2 * (add))(dst); \ vmovdqu32 x12, (offset + 3 * (add))(dst); #define xor_src_dst(dst, src, offset, xreg) \ vpxord offset(src), xreg, xreg; \ vmovdqu32 xreg, offset(dst); #define clear_vec4(v0,v1,v2,v3) \ vpxord v0, v0, v0; \ vpxord v1, v1, v1; \ vpxord v2, v2, v2; \ vpxord v3, v3, v3; #define clear_zmm16_zmm31() \ clear_vec4(%xmm16, %xmm20, %xmm24, %xmm28); \ clear_vec4(%xmm17, %xmm21, %xmm25, %xmm29); \ clear_vec4(%xmm18, %xmm22, %xmm26, %xmm30); \ clear_vec4(%xmm19, %xmm23, %xmm27, %xmm31); /********************************************************************** - 16-way chacha20 + 16-way (zmm), 8-way (ymm), 4-way (xmm) chacha20 **********************************************************************/ #define ROTATE2(v1,v2,c) \ vprold $(c), v1, v1; \ vprold $(c), v2, v2; #define XOR(ds,s) \ vpxord s, ds, ds; #define PLUS(ds,s) \ vpaddd s, ds, ds; -#define QUARTERROUND2(a1,b1,c1,d1,a2,b2,c2,d2) \ +#define QUARTERROUND2V(a1,b1,c1,d1,a2,b2,c2,d2) \ PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2); \ ROTATE2(d1, d2, 16); \ PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \ ROTATE2(b1, b2, 12); \ PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2); \ ROTATE2(d1, d2, 8); \ PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \ ROTATE2(b1, b2, 7); +/********************************************************************** + 1-way/2-way (xmm) chacha20 + **********************************************************************/ + +#define ROTATE(v1,c) \ + vprold $(c), v1, v1; \ + +#define WORD_SHUF(v1,shuf) \ + vpshufd $shuf, v1, v1; + +#define QUARTERROUND1H(x0,x1,x2,x3,shuf_x1,shuf_x2,shuf_x3) \ + PLUS(x0, x1); XOR(x3, x0); ROTATE(x3, 16); \ + PLUS(x2, x3); XOR(x1, x2); ROTATE(x1, 12); \ + PLUS(x0, x1); XOR(x3, x0); ROTATE(x3, 8); \ + PLUS(x2, x3); \ + WORD_SHUF(x3, shuf_x3); \ + XOR(x1, x2); \ + WORD_SHUF(x2, shuf_x2); \ + ROTATE(x1, 7); \ + WORD_SHUF(x1, shuf_x1); + +#define QUARTERROUND2H(x0,x1,x2,x3,y0,y1,y2,y3,shuf_x1,shuf_x2,shuf_x3) \ + PLUS(x0, x1); PLUS(y0, y1); XOR(x3, x0); XOR(y3, y0); \ + ROTATE(x3, 16); ROTATE(y3, 16); \ + PLUS(x2, x3); PLUS(y2, y3); XOR(x1, x2); XOR(y1, y2); \ + ROTATE(x1, 12); ROTATE(y1, 12); \ + PLUS(x0, x1); PLUS(y0, y1); XOR(x3, x0); XOR(y3, y0); \ + ROTATE(x3, 8); ROTATE(y3, 8); \ + PLUS(x2, x3); PLUS(y2, y3); \ + WORD_SHUF(x3, shuf_x3); WORD_SHUF(y3, shuf_x3); \ + 
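For reference, QUARTERROUND2V above runs the standard ChaCha20 quarter round on every 32-bit lane of two register sets at once (vpaddd/vpxord/vprold), while QUARTERROUND1H/QUARTERROUND2H additionally fold in the vpshufd word rotations (0x39, 0x4e, 0x93) that move a single block between column and diagonal form. The scalar quarter round, matching the QUARTERROUND macro in chacha20.c below, is (sketch for comparison only):

#include <stdint.h>

static inline uint32_t rol32 (uint32_t v, int c)
{
  return (v << c) | (v >> (32 - c));
}

static inline void
quarterround (uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
{
  *a += *b; *d = rol32 (*d ^ *a, 16);
  *c += *d; *b = rol32 (*b ^ *c, 12);
  *a += *b; *d = rol32 (*d ^ *a, 8);
  *c += *d; *b = rol32 (*b ^ *c, 7);
}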
XOR(x1, x2); XOR(y1, y2); \ + WORD_SHUF(x2, shuf_x2); WORD_SHUF(y2, shuf_x2); \ + ROTATE(x1, 7); ROTATE(y1, 7); \ + WORD_SHUF(x1, shuf_x1); WORD_SHUF(y1, shuf_x1); + .align 64 ELF(.type _gcry_chacha20_amd64_avx512_data,@object;) _gcry_chacha20_amd64_avx512_data: -.Linc_counter: - .byte 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 +.Lcounter_0_1_2_3: +.Lcounter_0_1: + .long 0,0,0,0 .Lone: .long 1,0,0,0 +.Lcounter_2_3: +.Ltwo: + .long 2,0,0,0 +.Lthree: + .long 3,0,0,0 +.Linc_counter: + .byte 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 ELF(.size _gcry_chacha20_amd64_avx512_data,.-_gcry_chacha20_amd64_avx512_data) .align 16 -.globl _gcry_chacha20_amd64_avx512_blocks16 -ELF(.type _gcry_chacha20_amd64_avx512_blocks16,@function;) -_gcry_chacha20_amd64_avx512_blocks16: +.globl _gcry_chacha20_amd64_avx512_blocks +ELF(.type _gcry_chacha20_amd64_avx512_blocks,@function;) +_gcry_chacha20_amd64_avx512_blocks: /* input: * %rdi: input * %rsi: dst * %rdx: src - * %rcx: nblks (multiple of 16) + * %rcx: nblks */ CFI_STARTPROC(); vpxord %xmm16, %xmm16, %xmm16; - vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */ + vpopcntb %xmm16, %xmm16; /* spec stop for old AVX512 CPUs */ + + cmpq $4, NBLKS; + jb .Lskip_vertical_handling; + /* Load constants */ vpmovzxbd .Linc_counter rRIP, COUNTER_ADD; + kxnorq %k1, %k1, %k1; + + cmpq $16, NBLKS; + jae .Lprocess_16v; + + /* Preload state to YMM registers */ + vpbroadcastd (0 * 4)(INPUT), S0y; + vpbroadcastd (1 * 4)(INPUT), S1y; + vpbroadcastd (2 * 4)(INPUT), S2y; + vpbroadcastd (3 * 4)(INPUT), S3y; + vpbroadcastd (4 * 4)(INPUT), S4y; + vpbroadcastd (5 * 4)(INPUT), S5y; + vpbroadcastd (6 * 4)(INPUT), S6y; + vpbroadcastd (7 * 4)(INPUT), S7y; + vpbroadcastd (8 * 4)(INPUT), S8y; + vpbroadcastd (14 * 4)(INPUT), S14y; + vpbroadcastd (15 * 4)(INPUT), S15y; + jmp .Lskip16v; + +.align 16 +.Lprocess_16v: + /* Process 16 ChaCha20 blocks */ - /* Preload state */ + /* Preload state to ZMM registers */ vpbroadcastd (0 * 4)(INPUT), S0; vpbroadcastd (1 * 4)(INPUT), S1; vpbroadcastd (2 * 4)(INPUT), S2; vpbroadcastd (3 * 4)(INPUT), S3; vpbroadcastd (4 * 4)(INPUT), S4; vpbroadcastd (5 * 4)(INPUT), S5; vpbroadcastd (6 * 4)(INPUT), S6; vpbroadcastd (7 * 4)(INPUT), S7; vpbroadcastd (8 * 4)(INPUT), S8; vpbroadcastd (14 * 4)(INPUT), S14; vpbroadcastd (15 * 4)(INPUT), S15; -.align 16 -.Loop16: movl $20, ROUND; + subq $16, NBLKS; /* Construct counter vectors X12 and X13 */ - vpbroadcastd (12 * 4)(INPUT), X12; + vpmovm2d %k1, X9; + vpaddd (12 * 4)(INPUT){1to16}, COUNTER_ADD, X12; vpbroadcastd (13 * 4)(INPUT), X13; - vpaddd COUNTER_ADD, X12, X12; vpcmpud $6, X12, COUNTER_ADD, %k2; - vpaddd .Lone rRIP {1to16}, X13, X13{%k2}; + vpsubd X9, X13, X13{%k2}; vmovdqa32 X12, X12_SAVE; vmovdqa32 X13, X13_SAVE; /* Load vectors */ vmovdqa32 S0, X0; vmovdqa32 S4, X4; vmovdqa32 S8, X8; vmovdqa32 S1, X1; vmovdqa32 S5, X5; vpbroadcastd (9 * 4)(INPUT), X9; - QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13) + QUARTERROUND2V(X0, X4, X8, X12, X1, X5, X9, X13) vmovdqa32 S2, X2; vmovdqa32 S6, X6; vpbroadcastd (10 * 4)(INPUT), X10; vmovdqa32 S14, X14; vmovdqa32 S3, X3; vmovdqa32 S7, X7; vpbroadcastd (11 * 4)(INPUT), X11; vmovdqa32 S15, X15; /* Update counter */ addq $16, (12 * 4)(INPUT); - jmp .Lround2_entry; + jmp .Lround2_entry_16v; .align 16 -.Lround2: - QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14) - QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13) -.Lround2_entry: +.Loop16v: + movl $20, ROUND; + subq $16, NBLKS; + + vmovdqa32 S0, X0; + vmovdqa32 S4, X4; + vmovdqa32 S8, X8; + transpose_16byte_4x4(X1, X5, X9, X13, TMP0, 
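The vpmovm2d/vpaddd/vpcmpud/vpsubd sequence used in .Lprocess_16v and .Loop16v builds the per-lane block counters and carries into state word 13 when the 32-bit counter wraps. A scalar restatement of what each lane computes (sketch only; the helper name is made up):

#include <stdint.h>

static void
build_counter_lanes (const uint32_t input[16], uint32_t x12[16], uint32_t x13[16])
{
  unsigned int i;

  for (i = 0; i < 16; i++)                /* 8 or 4 lanes in the ymm/xmm paths */
    {
      x12[i] = input[12] + i;             /* per-lane counter, may wrap mod 2^32 */
      x13[i] = input[13] + (x12[i] < i);  /* +1 exactly when the addition wrapped;
                                           * vpcmpud $6 builds that mask and vpsubd
                                           * of an all-ones vector adds the carry */
    }
  /* The stored counter is then advanced: addq $16, (12 * 4)(INPUT). */
}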
TMP1); + xor_src_dst_4x4(DST, SRC, (64 * 1), 256, X1, X5, X9, X13); + vpmovm2d %k1, X9; + vpaddd (12 * 4)(INPUT){1to16}, COUNTER_ADD, X12; + vpbroadcastd (13 * 4)(INPUT), X13; + vpcmpud $6, X12, COUNTER_ADD, %k2; + vpsubd X9, X13, X13{%k2}; + vmovdqa32 S1, X1; + vmovdqa32 S5, X5; + vpbroadcastd (9 * 4)(INPUT), X9; + vmovdqa32 X12, X12_SAVE; + vmovdqa32 X13, X13_SAVE; + QUARTERROUND2V(X0, X4, X8, X12, X1, X5, X9, X13) + transpose_16byte_4x4(X2, X6, X10, X14, TMP0, TMP1); + xor_src_dst_4x4(DST, SRC, (64 * 2), 256, X2, X6, X10, X14); + vmovdqa32 S2, X2; + vmovdqa32 S6, X6; + vpbroadcastd (10 * 4)(INPUT), X10; + vmovdqa32 S14, X14; + transpose_16byte_4x4(X3, X7, X11, X15, TMP0, TMP1); + xor_src_dst_4x4(DST, SRC, (64 * 3), 256, X3, X7, X11, X15); + leaq (16 * 64)(SRC), SRC; + leaq (16 * 64)(DST), DST; + vmovdqa32 S3, X3; + vmovdqa32 S7, X7; + vpbroadcastd (11 * 4)(INPUT), X11; + vmovdqa32 S15, X15; + + /* Update counter */ + addq $16, (12 * 4)(INPUT); + jmp .Lround2_entry_16v; + +.align 16 +.Lround2_16v: + QUARTERROUND2V(X2, X7, X8, X13, X3, X4, X9, X14) + QUARTERROUND2V(X0, X4, X8, X12, X1, X5, X9, X13) +.align 16 +.Lround2_entry_16v: + QUARTERROUND2V(X2, X6, X10, X14, X3, X7, X11, X15) + QUARTERROUND2V(X0, X5, X10, X15, X1, X6, X11, X12) subl $2, ROUND; - QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15) - QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12) - jnz .Lround2; + jnz .Lround2_16v; -.Lround2_end: PLUS(X0, S0); PLUS(X1, S1); - PLUS(X5, S5); - PLUS(X6, S6); - PLUS(X10, (10 * 4)(INPUT){1to16}); - PLUS(X11, (11 * 4)(INPUT){1to16}); - PLUS(X15, S15); - PLUS(X12, X12_SAVE); - QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14) - + QUARTERROUND2V(X2, X7, X8, X13, X3, X4, X9, X14) PLUS(X2, S2); PLUS(X3, S3); + transpose_4x4(X0, X1, X2, X3, TMP0, TMP1); PLUS(X4, S4); + PLUS(X5, S5); + PLUS(X6, S6); PLUS(X7, S7); - transpose_4x4(X0, X1, X2, X3, TMP0, TMP1); transpose_4x4(X4, X5, X6, X7, TMP0, TMP1); PLUS(X8, S8); PLUS(X9, (9 * 4)(INPUT){1to16}); + PLUS(X10, (10 * 4)(INPUT){1to16}); + PLUS(X11, (11 * 4)(INPUT){1to16}); + transpose_4x4(X8, X9, X10, X11, TMP0, TMP1); + PLUS(X12, X12_SAVE); PLUS(X13, X13_SAVE); PLUS(X14, S14); - transpose_4x4(X8, X9, X10, X11, TMP0, TMP1); + PLUS(X15, S15); transpose_4x4(X12, X13, X14, X15, TMP0, TMP1); transpose_16byte_4x4(X0, X4, X8, X12, TMP0, TMP1); - xor_src_dst_4x4(DST, SRC, (64 * 0), (64 * 4), X0, X4, X8, X12); + xor_src_dst_4x4(DST, SRC, (64 * 0), 256, X0, X4, X8, X12); + + cmpq $16, NBLKS; + jae .Loop16v; + transpose_16byte_4x4(X1, X5, X9, X13, TMP0, TMP1); - xor_src_dst_4x4(DST, SRC, (64 * 1), (64 * 4), X1, X5, X9, X13); + xor_src_dst_4x4(DST, SRC, (64 * 1), 256, X1, X5, X9, X13); transpose_16byte_4x4(X2, X6, X10, X14, TMP0, TMP1); - xor_src_dst_4x4(DST, SRC, (64 * 2), (64 * 4), X2, X6, X10, X14); + xor_src_dst_4x4(DST, SRC, (64 * 2), 256, X2, X6, X10, X14); transpose_16byte_4x4(X3, X7, X11, X15, TMP0, TMP1); - xor_src_dst_4x4(DST, SRC, (64 * 3), (64 * 4), X3, X7, X11, X15); + xor_src_dst_4x4(DST, SRC, (64 * 3), 256, X3, X7, X11, X15); - subq $16, NBLKS; leaq (16 * 64)(SRC), SRC; leaq (16 * 64)(DST), DST; - jnz .Loop16; - /* clear the used vector registers */ +.align 16 +.Lskip16v: + cmpq $8, NBLKS; + jb .Lskip8v; + + /* Process 8 ChaCha20 blocks */ + + /* Construct counter vectors X12 and X13 */ + vpmovm2d %k1, X9y; + vpaddd (12 * 4)(INPUT){1to8}, COUNTER_ADDy, X12y; + vpbroadcastd (13 * 4)(INPUT), X13y; + vpcmpud $6, X12y, COUNTER_ADDy, %k2; + vpsubd X9y, X13y, X13y{%k2}; + vmovdqa32 X12y, X12_SAVEy; + vmovdqa32 X13y, X13_SAVEy; + + /* Load vectors */ + 
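Taken together, the labels .Lprocess_16v/.Loop16v, .Lskip16v, .Lskip8v and .Lskip4v plus the horizontal 2-block/1-block handlers further down implement the following block-count cascade. This is an illustrative pseudo-C outline under the obvious reading of the jumps, not code from the patch:

/* Hypothetical restatement of the control flow of
 * _gcry_chacha20_amd64_avx512_blocks. */
static void
avx512_blocks_outline (size_t nblks)
{
  if (nblks >= 4)
    {
      while (nblks >= 16)   /* zmm path: 16 blocks per loop iteration */
        nblks -= 16;
      if (nblks >= 8)       /* ymm path: one 8-block pass */
        nblks -= 8;
      if (nblks >= 4)       /* xmm path: one 4-block pass */
        nblks -= 4;
    }
  if (nblks >= 2)           /* horizontal xmm rounds: two blocks at once */
    nblks -= 2;
  if (nblks == 1)           /* final single block */
    nblks -= 1;
}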
vmovdqa32 S0y, X0y; + vmovdqa32 S4y, X4y; + vmovdqa32 S8y, X8y; + vmovdqa32 S1y, X1y; + vmovdqa32 S5y, X5y; + vpbroadcastd (9 * 4)(INPUT), X9y; + vmovdqa32 S2y, X2y; + vmovdqa32 S6y, X6y; + vpbroadcastd (10 * 4)(INPUT), X10y; + vmovdqa32 S14y, X14y; + vmovdqa32 S3y, X3y; + vmovdqa32 S7y, X7y; + vpbroadcastd (11 * 4)(INPUT), X11y; + vmovdqa32 S15y, X15y; + + /* Update counter */ + addq $8, (12 * 4)(INPUT); + + movl $20, ROUND; + subq $8, NBLKS; +.align 16 +.Lround2_8v: + QUARTERROUND2V(X0y, X4y, X8y, X12y, X1y, X5y, X9y, X13y) + QUARTERROUND2V(X2y, X6y, X10y, X14y, X3y, X7y, X11y, X15y) + QUARTERROUND2V(X0y, X5y, X10y, X15y, X1y, X6y, X11y, X12y) + QUARTERROUND2V(X2y, X7y, X8y, X13y, X3y, X4y, X9y, X14y) + subl $2, ROUND; + jnz .Lround2_8v; + + PLUS(X0y, S0y); + PLUS(X1y, S1y); + PLUS(X2y, S2y); + PLUS(X3y, S3y); + transpose_4x4(X0y, X1y, X2y, X3y, TMP0y, TMP1y); + PLUS(X4y, S4y); + PLUS(X5y, S5y); + PLUS(X6y, S6y); + PLUS(X7y, S7y); + transpose_4x4(X4y, X5y, X6y, X7y, TMP0y, TMP1y); + PLUS(X8y, S8y); + transpose_16byte_2x2(X0y, X4y, TMP0y); + PLUS(X9y, (9 * 4)(INPUT){1to8}); + transpose_16byte_2x2(X1y, X5y, TMP0y); + PLUS(X10y, (10 * 4)(INPUT){1to8}); + transpose_16byte_2x2(X2y, X6y, TMP0y); + PLUS(X11y, (11 * 4)(INPUT){1to8}); + transpose_16byte_2x2(X3y, X7y, TMP0y); + xor_src_dst_4x4(DST, SRC, (16 * 0), 64, X0y, X1y, X2y, X3y); + transpose_4x4(X8y, X9y, X10y, X11y, TMP0y, TMP1y); + PLUS(X12y, X12_SAVEy); + PLUS(X13y, X13_SAVEy); + PLUS(X14y, S14y); + PLUS(X15y, S15y); + xor_src_dst_4x4(DST, SRC, (16 * 16), 64, X4y, X5y, X6y, X7y); + transpose_4x4(X12y, X13y, X14y, X15y, TMP0y, TMP1y); + transpose_16byte_2x2(X8y, X12y, TMP0y); + transpose_16byte_2x2(X9y, X13y, TMP0y); + transpose_16byte_2x2(X10y, X14y, TMP0y); + transpose_16byte_2x2(X11y, X15y, TMP0y); + xor_src_dst_4x4(DST, SRC, (16 * 2), 64, X8y, X9y, X10y, X11y); + xor_src_dst_4x4(DST, SRC, (16 * 18), 64, X12y, X13y, X14y, X15y); + + leaq (8 * 64)(SRC), SRC; + leaq (8 * 64)(DST), DST; + +.align 16 +.Lskip8v: + cmpq $4, NBLKS; + jb .Lskip4v; + + /* Process 4 ChaCha20 blocks */ + + /* Construct counter vectors X12 and X13 */ + vpmovm2d %k1, X9x; + vpaddd (12 * 4)(INPUT){1to4}, COUNTER_ADDx, X12x; + vpbroadcastd (13 * 4)(INPUT), X13x; + vpcmpud $6, X12x, COUNTER_ADDx, %k2; + vpsubd X9x, X13x, X13x{%k2}; + vmovdqa32 X12x, X12_SAVEx; + vmovdqa32 X13x, X13_SAVEx; + + /* Load vectors */ + vmovdqa32 S0x, X0x; + vmovdqa32 S4x, X4x; + vmovdqa32 S8x, X8x; + vmovdqa32 S1x, X1x; + vmovdqa32 S5x, X5x; + vpbroadcastd (9 * 4)(INPUT), X9x; + vmovdqa32 S2x, X2x; + vmovdqa32 S6x, X6x; + vpbroadcastd (10 * 4)(INPUT), X10x; + vmovdqa32 S14x, X14x; + vmovdqa32 S3x, X3x; + vmovdqa32 S7x, X7x; + vpbroadcastd (11 * 4)(INPUT), X11x; + vmovdqa32 S15x, X15x; + + /* Update counter */ + addq $4, (12 * 4)(INPUT); + + movl $20, ROUND; + subq $4, NBLKS; +.align 16 +.Lround2_4v: + QUARTERROUND2V(X0x, X4x, X8x, X12x, X1x, X5x, X9x, X13x) + QUARTERROUND2V(X2x, X6x, X10x, X14x, X3x, X7x, X11x, X15x) + QUARTERROUND2V(X0x, X5x, X10x, X15x, X1x, X6x, X11x, X12x) + QUARTERROUND2V(X2x, X7x, X8x, X13x, X3x, X4x, X9x, X14x) + subl $2, ROUND; + jnz .Lround2_4v; + + PLUS(X0x, S0x); + PLUS(X1x, S1x); + PLUS(X2x, S2x); + PLUS(X3x, S3x); + transpose_4x4(X0x, X1x, X2x, X3x, TMP0x, TMP1x); + PLUS(X4x, S4x); + PLUS(X5x, S5x); + PLUS(X6x, S6x); + PLUS(X7x, S7x); + xor_src_dst_4x4(DST, SRC, (16 * 0), 64, X0x, X1x, X2x, X3x); + transpose_4x4(X4x, X5x, X6x, X7x, TMP0x, TMP1x); + PLUS(X8x, S8x); + PLUS(X9x, (9 * 4)(INPUT){1to4}); + PLUS(X10x, (10 * 4)(INPUT){1to4}); + PLUS(X11x, (11 * 
4)(INPUT){1to4}); + xor_src_dst_4x4(DST, SRC, (16 * 1), 64, X4x, X5x, X6x, X7x); + transpose_4x4(X8x, X9x, X10x, X11x, TMP0x, TMP1x); + PLUS(X12x, X12_SAVEx); + PLUS(X13x, X13_SAVEx); + PLUS(X14x, S14x); + PLUS(X15x, S15x); + xor_src_dst_4x4(DST, SRC, (16 * 2), 64, X8x, X9x, X10x, X11x); + transpose_4x4(X12x, X13x, X14x, X15x, TMP0x, TMP1x); + xor_src_dst_4x4(DST, SRC, (16 * 3), 64, X12x, X13x, X14x, X15x); + + leaq (4 * 64)(SRC), SRC; + leaq (4 * 64)(DST), DST; + +.align 16 +.Lskip4v: + /* clear AVX512 registers */ + kxorq %k2, %k2, %k2; + vzeroupper; clear_zmm16_zmm31(); - kxord %k2, %k2, %k2; + +.align 16 +.Lskip_vertical_handling: + cmpq $0, NBLKS; + je .Ldone; + + /* Load state */ + vmovdqu (0 * 4)(INPUT), X10x; + vmovdqu (4 * 4)(INPUT), X11x; + vmovdqu (8 * 4)(INPUT), X12x; + vmovdqu (12 * 4)(INPUT), X13x; + + /* Load constant */ + vmovdqa .Lone rRIP, X4x; + + cmpq $1, NBLKS; + je .Lhandle1; + + /* Process two ChaCha20 blocks (XMM) */ + movl $20, ROUND; + subq $2, NBLKS; + + vmovdqa X10x, X0x; + vmovdqa X11x, X1x; + vmovdqa X12x, X2x; + vmovdqa X13x, X3x; + + vmovdqa X10x, X8x; + vmovdqa X11x, X9x; + vmovdqa X12x, X14x; + vpaddq X4x, X13x, X15x; + vmovdqa X15x, X7x; + +.align 16 +.Lround2_2: + QUARTERROUND2H(X0x, X1x, X2x, X3x, X8x, X9x, X14x, X15x, + 0x39, 0x4e, 0x93); + QUARTERROUND2H(X0x, X1x, X2x, X3x, X8x, X9x, X14x, X15x, + 0x93, 0x4e, 0x39); + subl $2, ROUND; + jnz .Lround2_2; + + PLUS(X0x, X10x); + PLUS(X1x, X11x); + PLUS(X2x, X12x); + PLUS(X3x, X13x); + + vpaddq .Ltwo rRIP, X13x, X13x; /* Update counter */ + + xor_src_dst_4x4(DST, SRC, 0 * 4, 4 * 4, X0x, X1x, X2x, X3x); + + PLUS(X8x, X10x); + PLUS(X9x, X11x); + PLUS(X14x, X12x); + PLUS(X15x, X7x); + + xor_src_dst_4x4(DST, SRC, 16 * 4, 4 * 4, X8x, X9x, X14x, X15x); + lea (2 * 64)(DST), DST; + lea (2 * 64)(SRC), SRC; + + cmpq $0, NBLKS; + je .Lskip1; + +.align 16 +.Lhandle1: + /* Process one ChaCha20 block (XMM) */ + movl $20, ROUND; + subq $1, NBLKS; + + vmovdqa X10x, X0x; + vmovdqa X11x, X1x; + vmovdqa X12x, X2x; + vmovdqa X13x, X3x; + +.align 16 +.Lround2_1: + QUARTERROUND1H(X0x, X1x, X2x, X3x, 0x39, 0x4e, 0x93); + QUARTERROUND1H(X0x, X1x, X2x, X3x, 0x93, 0x4e, 0x39); + subl $2, ROUND; + jnz .Lround2_1; + + PLUS(X0x, X10x); + PLUS(X1x, X11x); + PLUS(X2x, X12x); + PLUS(X3x, X13x); + + vpaddq X4x, X13x, X13x; /* Update counter */ + + xor_src_dst_4x4(DST, SRC, 0 * 4, 4 * 4, X0x, X1x, X2x, X3x); + +.align 16 +.Lskip1: + /* Store counter */ + vmovdqu X13x, (12 * 4)(INPUT); + +.align 16 +.Ldone: vzeroall; /* clears ZMM0-ZMM15 */ - /* eax zeroed by round loop. */ + xorl %eax, %eax; ret_spec_stop; CFI_ENDPROC(); -ELF(.size _gcry_chacha20_amd64_avx512_blocks16, - .-_gcry_chacha20_amd64_avx512_blocks16;) +ELF(.size _gcry_chacha20_amd64_avx512_blocks, + .-_gcry_chacha20_amd64_avx512_blocks;) #endif /*defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)*/ #endif /*__x86_64*/ diff --git a/cipher/chacha20.c b/cipher/chacha20.c index f0cb8721..a7e0dd63 100644 --- a/cipher/chacha20.c +++ b/cipher/chacha20.c @@ -1,1407 +1,1413 @@ /* chacha20.c - Bernstein's ChaCha20 cipher * Copyright (C) 2014,2017-2019 Jussi Kivilinna * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser general Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. 
* * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . * * For a description of the algorithm, see: * http://cr.yp.to/chacha.html */ /* * Based on D. J. Bernstein reference implementation at * http://cr.yp.to/chacha.html: * * chacha-regs.c version 20080118 * D. J. Bernstein * Public domain. */ #include #include #include #include #include "types.h" #include "g10lib.h" #include "cipher.h" #include "cipher-internal.h" #include "bufhelp.h" #define CHACHA20_MIN_KEY_SIZE 16 /* Bytes. */ #define CHACHA20_MAX_KEY_SIZE 32 /* Bytes. */ #define CHACHA20_BLOCK_SIZE 64 /* Bytes. */ #define CHACHA20_MIN_IV_SIZE 8 /* Bytes. */ #define CHACHA20_MAX_IV_SIZE 12 /* Bytes. */ #define CHACHA20_CTR_SIZE 16 /* Bytes. */ /* USE_SSSE3 indicates whether to compile with Intel SSSE3 code. */ #undef USE_SSSE3 #if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_SSSE3) && \ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) # define USE_SSSE3 1 #endif /* USE_AVX2 indicates whether to compile with Intel AVX2 code. */ #undef USE_AVX2 #if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_AVX2) && \ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) # define USE_AVX2 1 #endif /* USE_AVX512 indicates whether to compile with Intel AVX512 code. */ #undef USE_AVX512 #if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_AVX512) && \ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) # define USE_AVX512 1 #endif /* USE_ARMV7_NEON indicates whether to enable ARMv7 NEON assembly code. */ #undef USE_ARMV7_NEON #ifdef ENABLE_NEON_SUPPORT # if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) \ && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) \ && defined(HAVE_GCC_INLINE_ASM_NEON) # define USE_ARMV7_NEON 1 # endif #endif /* USE_AARCH64_SIMD indicates whether to enable ARMv8 SIMD assembly * code. */ #undef USE_AARCH64_SIMD #ifdef ENABLE_NEON_SUPPORT # if defined(__AARCH64EL__) \ && defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) \ && defined(HAVE_GCC_INLINE_ASM_AARCH64_NEON) # define USE_AARCH64_SIMD 1 # endif #endif /* USE_PPC_VEC indicates whether to enable PowerPC vector * accelerated code. */ #undef USE_PPC_VEC #ifdef ENABLE_PPC_CRYPTO_SUPPORT # if defined(HAVE_COMPATIBLE_CC_PPC_ALTIVEC) && \ defined(HAVE_GCC_INLINE_ASM_PPC_ALTIVEC) # if __GNUC__ >= 4 # define USE_PPC_VEC 1 # endif # endif #endif /* USE_S390X_VX indicates whether to enable zSeries code. */ #undef USE_S390X_VX #if defined (__s390x__) && __GNUC__ >= 4 && __ARCH__ >= 9 # if defined(HAVE_GCC_INLINE_ASM_S390X_VX) # define USE_S390X_VX 1 # endif /* USE_S390X_VX */ #endif /* Assembly implementations use SystemV ABI, ABI conversion and additional * stack to store XMM6-XMM15 needed on Win64. */ #undef ASM_FUNC_ABI #undef ASM_EXTRA_STACK #if defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS) # define ASM_FUNC_ABI __attribute__((sysv_abi)) #else # define ASM_FUNC_ABI #endif typedef struct CHACHA20_context_s { u32 input[16]; unsigned char pad[CHACHA20_BLOCK_SIZE]; unsigned int unused; /* bytes in the pad. 
*/ unsigned int use_ssse3:1; unsigned int use_avx2:1; unsigned int use_avx512:1; unsigned int use_neon:1; unsigned int use_ppc:1; unsigned int use_p10:1; unsigned int use_s390x:1; } CHACHA20_context_t; #ifdef USE_SSSE3 unsigned int _gcry_chacha20_amd64_ssse3_blocks4(u32 *state, byte *dst, const byte *src, size_t nblks) ASM_FUNC_ABI; unsigned int _gcry_chacha20_amd64_ssse3_blocks1(u32 *state, byte *dst, const byte *src, size_t nblks) ASM_FUNC_ABI; unsigned int _gcry_chacha20_poly1305_amd64_ssse3_blocks4( u32 *state, byte *dst, const byte *src, size_t nblks, void *poly1305_state, const byte *poly1305_src) ASM_FUNC_ABI; unsigned int _gcry_chacha20_poly1305_amd64_ssse3_blocks1( u32 *state, byte *dst, const byte *src, size_t nblks, void *poly1305_state, const byte *poly1305_src) ASM_FUNC_ABI; #endif /* USE_SSSE3 */ #ifdef USE_AVX2 unsigned int _gcry_chacha20_amd64_avx2_blocks8(u32 *state, byte *dst, const byte *src, size_t nblks) ASM_FUNC_ABI; unsigned int _gcry_chacha20_poly1305_amd64_avx2_blocks8( u32 *state, byte *dst, const byte *src, size_t nblks, void *poly1305_state, const byte *poly1305_src) ASM_FUNC_ABI; #endif /* USE_AVX2 */ #ifdef USE_AVX512 -unsigned int _gcry_chacha20_amd64_avx512_blocks16(u32 *state, byte *dst, - const byte *src, - size_t nblks) ASM_FUNC_ABI; +unsigned int _gcry_chacha20_amd64_avx512_blocks(u32 *state, byte *dst, + const byte *src, + size_t nblks) ASM_FUNC_ABI; #endif /* USE_AVX2 */ #ifdef USE_PPC_VEC #ifndef WORDS_BIGENDIAN unsigned int _gcry_chacha20_p10le_8x(u32 *state, byte *dst, const byte *src, size_t len); #endif unsigned int _gcry_chacha20_ppc8_blocks4(u32 *state, byte *dst, const byte *src, size_t nblks); unsigned int _gcry_chacha20_ppc8_blocks1(u32 *state, byte *dst, const byte *src, size_t nblks); #undef USE_PPC_VEC_POLY1305 #if SIZEOF_UNSIGNED_LONG == 8 #define USE_PPC_VEC_POLY1305 1 unsigned int _gcry_chacha20_poly1305_ppc8_blocks4( u32 *state, byte *dst, const byte *src, size_t nblks, POLY1305_STATE *st, const byte *poly1305_src); #endif /* SIZEOF_UNSIGNED_LONG == 8 */ #endif /* USE_PPC_VEC */ #ifdef USE_S390X_VX unsigned int _gcry_chacha20_s390x_vx_blocks8(u32 *state, byte *dst, const byte *src, size_t nblks); unsigned int _gcry_chacha20_s390x_vx_blocks4_2_1(u32 *state, byte *dst, const byte *src, size_t nblks); #undef USE_S390X_VX_POLY1305 #if SIZEOF_UNSIGNED_LONG == 8 #define USE_S390X_VX_POLY1305 1 unsigned int _gcry_chacha20_poly1305_s390x_vx_blocks8( u32 *state, byte *dst, const byte *src, size_t nblks, POLY1305_STATE *st, const byte *poly1305_src); unsigned int _gcry_chacha20_poly1305_s390x_vx_blocks4_2_1( u32 *state, byte *dst, const byte *src, size_t nblks, POLY1305_STATE *st, const byte *poly1305_src); #endif /* SIZEOF_UNSIGNED_LONG == 8 */ #endif /* USE_S390X_VX */ #ifdef USE_ARMV7_NEON unsigned int _gcry_chacha20_armv7_neon_blocks4(u32 *state, byte *dst, const byte *src, size_t nblks); #endif /* USE_ARMV7_NEON */ #ifdef USE_AARCH64_SIMD unsigned int _gcry_chacha20_aarch64_blocks4(u32 *state, byte *dst, const byte *src, size_t nblks); unsigned int _gcry_chacha20_poly1305_aarch64_blocks4( u32 *state, byte *dst, const byte *src, size_t nblks, void *poly1305_state, const byte *poly1305_src); #endif /* USE_AARCH64_SIMD */ static const char *selftest (void); #define ROTATE(v,c) (rol(v,c)) #define XOR(v,w) ((v) ^ (w)) #define PLUS(v,w) ((u32)((v) + (w))) #define PLUSONE(v) (PLUS((v),1)) #define QUARTERROUND(a,b,c,d) \ a = PLUS(a,b); d = ROTATE(XOR(d,a),16); \ c = PLUS(c,d); b = ROTATE(XOR(b,c),12); \ a = PLUS(a,b); d = ROTATE(XOR(d,a), 8); \ c = 
PLUS(c,d); b = ROTATE(XOR(b,c), 7); #define BUF_XOR_LE32(dst, src, offset, x) \ buf_put_le32((dst) + (offset), buf_get_le32((src) + (offset)) ^ (x)) static unsigned int do_chacha20_blocks (u32 *input, byte *dst, const byte *src, size_t nblks) { u32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; unsigned int i; while (nblks) { x0 = input[0]; x1 = input[1]; x2 = input[2]; x3 = input[3]; x4 = input[4]; x5 = input[5]; x6 = input[6]; x7 = input[7]; x8 = input[8]; x9 = input[9]; x10 = input[10]; x11 = input[11]; x12 = input[12]; x13 = input[13]; x14 = input[14]; x15 = input[15]; for (i = 20; i > 0; i -= 2) { QUARTERROUND(x0, x4, x8, x12) QUARTERROUND(x1, x5, x9, x13) QUARTERROUND(x2, x6, x10, x14) QUARTERROUND(x3, x7, x11, x15) QUARTERROUND(x0, x5, x10, x15) QUARTERROUND(x1, x6, x11, x12) QUARTERROUND(x2, x7, x8, x13) QUARTERROUND(x3, x4, x9, x14) } x0 = PLUS(x0, input[0]); x1 = PLUS(x1, input[1]); x2 = PLUS(x2, input[2]); x3 = PLUS(x3, input[3]); x4 = PLUS(x4, input[4]); x5 = PLUS(x5, input[5]); x6 = PLUS(x6, input[6]); x7 = PLUS(x7, input[7]); x8 = PLUS(x8, input[8]); x9 = PLUS(x9, input[9]); x10 = PLUS(x10, input[10]); x11 = PLUS(x11, input[11]); x12 = PLUS(x12, input[12]); x13 = PLUS(x13, input[13]); x14 = PLUS(x14, input[14]); x15 = PLUS(x15, input[15]); input[12] = PLUSONE(input[12]); input[13] = PLUS(input[13], !input[12]); BUF_XOR_LE32(dst, src, 0, x0); BUF_XOR_LE32(dst, src, 4, x1); BUF_XOR_LE32(dst, src, 8, x2); BUF_XOR_LE32(dst, src, 12, x3); BUF_XOR_LE32(dst, src, 16, x4); BUF_XOR_LE32(dst, src, 20, x5); BUF_XOR_LE32(dst, src, 24, x6); BUF_XOR_LE32(dst, src, 28, x7); BUF_XOR_LE32(dst, src, 32, x8); BUF_XOR_LE32(dst, src, 36, x9); BUF_XOR_LE32(dst, src, 40, x10); BUF_XOR_LE32(dst, src, 44, x11); BUF_XOR_LE32(dst, src, 48, x12); BUF_XOR_LE32(dst, src, 52, x13); BUF_XOR_LE32(dst, src, 56, x14); BUF_XOR_LE32(dst, src, 60, x15); src += CHACHA20_BLOCK_SIZE; dst += CHACHA20_BLOCK_SIZE; nblks--; } /* burn_stack */ return (17 * sizeof(u32) + 6 * sizeof(void *)); } static unsigned int chacha20_blocks (CHACHA20_context_t *ctx, byte *dst, const byte *src, size_t nblks) { +#ifdef USE_AVX512 + if (ctx->use_avx512) + { + return _gcry_chacha20_amd64_avx512_blocks(ctx->input, dst, src, nblks); + } +#endif + #ifdef USE_SSSE3 if (ctx->use_ssse3) { return _gcry_chacha20_amd64_ssse3_blocks1(ctx->input, dst, src, nblks); } #endif #ifdef USE_PPC_VEC if (ctx->use_ppc) { return _gcry_chacha20_ppc8_blocks1(ctx->input, dst, src, nblks); } #endif #ifdef USE_S390X_VX if (ctx->use_s390x) { return _gcry_chacha20_s390x_vx_blocks4_2_1(ctx->input, dst, src, nblks); } #endif return do_chacha20_blocks (ctx->input, dst, src, nblks); } static void chacha20_keysetup (CHACHA20_context_t *ctx, const byte *key, unsigned int keylen) { static const char sigma[16] = "expand 32-byte k"; static const char tau[16] = "expand 16-byte k"; const char *constants; ctx->input[4] = buf_get_le32(key + 0); ctx->input[5] = buf_get_le32(key + 4); ctx->input[6] = buf_get_le32(key + 8); ctx->input[7] = buf_get_le32(key + 12); if (keylen == CHACHA20_MAX_KEY_SIZE) /* 256 bits */ { key += 16; constants = sigma; } else /* 128 bits */ { constants = tau; } ctx->input[8] = buf_get_le32(key + 0); ctx->input[9] = buf_get_le32(key + 4); ctx->input[10] = buf_get_le32(key + 8); ctx->input[11] = buf_get_le32(key + 12); ctx->input[0] = buf_get_le32(constants + 0); ctx->input[1] = buf_get_le32(constants + 4); ctx->input[2] = buf_get_le32(constants + 8); ctx->input[3] = buf_get_le32(constants + 12); } static void chacha20_ivsetup 
(CHACHA20_context_t * ctx, const byte *iv, size_t ivlen) { if (ivlen == CHACHA20_CTR_SIZE) { ctx->input[12] = buf_get_le32 (iv + 0); ctx->input[13] = buf_get_le32 (iv + 4); ctx->input[14] = buf_get_le32 (iv + 8); ctx->input[15] = buf_get_le32 (iv + 12); } else if (ivlen == CHACHA20_MAX_IV_SIZE) { ctx->input[12] = 0; ctx->input[13] = buf_get_le32 (iv + 0); ctx->input[14] = buf_get_le32 (iv + 4); ctx->input[15] = buf_get_le32 (iv + 8); } else if (ivlen == CHACHA20_MIN_IV_SIZE) { ctx->input[12] = 0; ctx->input[13] = 0; ctx->input[14] = buf_get_le32 (iv + 0); ctx->input[15] = buf_get_le32 (iv + 4); } else { ctx->input[12] = 0; ctx->input[13] = 0; ctx->input[14] = 0; ctx->input[15] = 0; } } static void chacha20_setiv (void *context, const byte *iv, size_t ivlen) { CHACHA20_context_t *ctx = (CHACHA20_context_t *) context; /* draft-nir-cfrg-chacha20-poly1305-02 defines 96-bit and 64-bit nonce. */ if (iv && ivlen != CHACHA20_MAX_IV_SIZE && ivlen != CHACHA20_MIN_IV_SIZE && ivlen != CHACHA20_CTR_SIZE) log_info ("WARNING: chacha20_setiv: bad ivlen=%u\n", (u32) ivlen); if (iv && (ivlen == CHACHA20_MAX_IV_SIZE || ivlen == CHACHA20_MIN_IV_SIZE || ivlen == CHACHA20_CTR_SIZE)) chacha20_ivsetup (ctx, iv, ivlen); else chacha20_ivsetup (ctx, NULL, 0); /* Reset the unused pad bytes counter. */ ctx->unused = 0; } static gcry_err_code_t chacha20_do_setkey (CHACHA20_context_t *ctx, const byte *key, unsigned int keylen) { static int initialized; static const char *selftest_failed; unsigned int features = _gcry_get_hw_features (); if (!initialized) { initialized = 1; selftest_failed = selftest (); if (selftest_failed) log_error ("CHACHA20 selftest failed (%s)\n", selftest_failed); } if (selftest_failed) return GPG_ERR_SELFTEST_FAILED; if (keylen != CHACHA20_MAX_KEY_SIZE && keylen != CHACHA20_MIN_KEY_SIZE) return GPG_ERR_INV_KEYLEN; #ifdef USE_SSSE3 ctx->use_ssse3 = (features & HWF_INTEL_SSSE3) != 0; #endif #ifdef USE_AVX512 ctx->use_avx512 = (features & HWF_INTEL_AVX512) != 0; #endif #ifdef USE_AVX2 ctx->use_avx2 = (features & HWF_INTEL_AVX2) != 0; #endif #ifdef USE_ARMV7_NEON ctx->use_neon = (features & HWF_ARM_NEON) != 0; #endif #ifdef USE_AARCH64_SIMD ctx->use_neon = (features & HWF_ARM_NEON) != 0; #endif #ifdef USE_PPC_VEC ctx->use_ppc = (features & HWF_PPC_ARCH_2_07) != 0; # ifndef WORDS_BIGENDIAN ctx->use_p10 = (features & HWF_PPC_ARCH_3_10) != 0; # ifdef ENABLE_FORCE_SOFT_HWFEATURES /* HWF_PPC_ARCH_3_10 above is used as soft HW-feature indicator for P10. * Actual implementation works with HWF_PPC_ARCH_3_00 also. */ ctx->use_p10 |= (features & HWF_PPC_ARCH_3_00) != 0; # endif # endif #endif #ifdef USE_S390X_VX ctx->use_s390x = (features & HWF_S390X_VX) != 0; #endif (void)features; chacha20_keysetup (ctx, key, keylen); /* We default to a zero nonce. 
*/ chacha20_setiv (ctx, NULL, 0); return 0; } static gcry_err_code_t chacha20_setkey (void *context, const byte *key, unsigned int keylen, cipher_bulk_ops_t *bulk_ops) { CHACHA20_context_t *ctx = (CHACHA20_context_t *) context; gcry_err_code_t rc = chacha20_do_setkey (ctx, key, keylen); (void)bulk_ops; _gcry_burn_stack (4 + sizeof (void *) + 4 * sizeof (void *)); return rc; } static unsigned int do_chacha20_encrypt_stream_tail (CHACHA20_context_t *ctx, byte *outbuf, const byte *inbuf, size_t length) { static const unsigned char zero_pad[CHACHA20_BLOCK_SIZE] = { 0, }; unsigned int nburn, burn = 0; #ifdef USE_AVX512 - if (ctx->use_avx512 && length >= CHACHA20_BLOCK_SIZE * 16) + if (ctx->use_avx512 && length >= CHACHA20_BLOCK_SIZE) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; - nblocks -= nblocks % 16; - nburn = _gcry_chacha20_amd64_avx512_blocks16(ctx->input, outbuf, inbuf, - nblocks); + nburn = _gcry_chacha20_amd64_avx512_blocks(ctx->input, outbuf, inbuf, + nblocks); burn = nburn > burn ? nburn : burn; - length -= nblocks * CHACHA20_BLOCK_SIZE; + length %= CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; } #endif #ifdef USE_AVX2 if (ctx->use_avx2 && length >= CHACHA20_BLOCK_SIZE * 8) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nblocks -= nblocks % 8; nburn = _gcry_chacha20_amd64_avx2_blocks8(ctx->input, outbuf, inbuf, nblocks); burn = nburn > burn ? nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; } #endif #ifdef USE_SSSE3 if (ctx->use_ssse3 && length >= CHACHA20_BLOCK_SIZE * 4) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nblocks -= nblocks % 4; nburn = _gcry_chacha20_amd64_ssse3_blocks4(ctx->input, outbuf, inbuf, nblocks); burn = nburn > burn ? nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; } #endif #ifdef USE_ARMV7_NEON if (ctx->use_neon && length >= CHACHA20_BLOCK_SIZE * 4) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nblocks -= nblocks % 4; nburn = _gcry_chacha20_armv7_neon_blocks4(ctx->input, outbuf, inbuf, nblocks); burn = nburn > burn ? nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; } #endif #ifdef USE_AARCH64_SIMD if (ctx->use_neon && length >= CHACHA20_BLOCK_SIZE * 4) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nblocks -= nblocks % 4; nburn = _gcry_chacha20_aarch64_blocks4(ctx->input, outbuf, inbuf, nblocks); burn = nburn > burn ? nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; } #endif #ifdef USE_PPC_VEC if (ctx->use_ppc && length >= CHACHA20_BLOCK_SIZE * 4) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nblocks -= nblocks % 4; #ifndef WORDS_BIGENDIAN /* * A workaround to skip counter overflow. This is rare. */ if (ctx->use_p10 && nblocks >= 8 && ((u64)ctx->input[12] + nblocks) <= 0xffffffffU) { size_t len = nblocks * CHACHA20_BLOCK_SIZE; nburn = _gcry_chacha20_p10le_8x(ctx->input, outbuf, inbuf, len); } else #endif { nburn = _gcry_chacha20_ppc8_blocks4(ctx->input, outbuf, inbuf, nblocks); } burn = nburn > burn ? 
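As a worked example of the relaxed AVX512 condition above (any request of at least one 64-byte block now goes through _gcry_chacha20_amd64_avx512_blocks, with no multiple-of-16 requirement), assume a 1000-byte request:

  size_t length = 1000;                           /* assumed example size */
  size_t nblocks = length / CHACHA20_BLOCK_SIZE;  /* 15 blocks, all passed to the AVX512
                                                   * routine (8 + 4 + 2 + 1 internally) */
  length %= CHACHA20_BLOCK_SIZE;                  /* 40 tail bytes: one extra pad block is
                                                   * generated below and ctx->unused becomes
                                                   * 64 - 40 = 24 */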
nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; } #endif #ifdef USE_S390X_VX if (ctx->use_s390x && length >= CHACHA20_BLOCK_SIZE * 8) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nblocks -= nblocks % 8; nburn = _gcry_chacha20_s390x_vx_blocks8(ctx->input, outbuf, inbuf, nblocks); burn = nburn > burn ? nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; } #endif if (length >= CHACHA20_BLOCK_SIZE) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nburn = chacha20_blocks(ctx, outbuf, inbuf, nblocks); burn = nburn > burn ? nburn : burn; - length -= nblocks * CHACHA20_BLOCK_SIZE; + length %= CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; } if (length > 0) { nburn = chacha20_blocks(ctx, ctx->pad, zero_pad, 1); burn = nburn > burn ? nburn : burn; buf_xor (outbuf, inbuf, ctx->pad, length); ctx->unused = CHACHA20_BLOCK_SIZE - length; } if (burn) burn += 5 * sizeof(void *); return burn; } static void chacha20_encrypt_stream (void *context, byte *outbuf, const byte *inbuf, size_t length) { CHACHA20_context_t *ctx = (CHACHA20_context_t *) context; unsigned int nburn, burn = 0; if (!length) return; if (ctx->unused) { unsigned char *p = ctx->pad; size_t n; gcry_assert (ctx->unused < CHACHA20_BLOCK_SIZE); n = ctx->unused; if (n > length) n = length; buf_xor (outbuf, inbuf, p + CHACHA20_BLOCK_SIZE - ctx->unused, n); length -= n; outbuf += n; inbuf += n; ctx->unused -= n; if (!length) return; gcry_assert (!ctx->unused); } nburn = do_chacha20_encrypt_stream_tail (ctx, outbuf, inbuf, length); burn = nburn > burn ? nburn : burn; if (burn) _gcry_burn_stack (burn); } gcry_err_code_t _gcry_chacha20_poly1305_encrypt(gcry_cipher_hd_t c, byte *outbuf, const byte *inbuf, size_t length) { CHACHA20_context_t *ctx = (void *) &c->context.c; unsigned int nburn, burn = 0; byte *authptr = NULL; if (!length) return 0; if (ctx->unused) { unsigned char *p = ctx->pad; size_t n; gcry_assert (ctx->unused < CHACHA20_BLOCK_SIZE); n = ctx->unused; if (n > length) n = length; buf_xor (outbuf, inbuf, p + CHACHA20_BLOCK_SIZE - ctx->unused, n); nburn = _gcry_poly1305_update_burn (&c->u_mode.poly1305.ctx, outbuf, n); burn = nburn > burn ? nburn : burn; length -= n; outbuf += n; inbuf += n; ctx->unused -= n; if (!length) { if (burn) _gcry_burn_stack (burn); return 0; } gcry_assert (!ctx->unused); } gcry_assert (c->u_mode.poly1305.ctx.leftover == 0); if (0) { } #ifdef USE_AVX512 else if (ctx->use_avx512) { /* Skip stitched chacha20-poly1305 for AVX512. */ authptr = NULL; } #endif #ifdef USE_AVX2 else if (ctx->use_avx2 && length >= CHACHA20_BLOCK_SIZE * 8) { nburn = _gcry_chacha20_amd64_avx2_blocks8(ctx->input, outbuf, inbuf, 8); burn = nburn > burn ? nburn : burn; authptr = outbuf; length -= 8 * CHACHA20_BLOCK_SIZE; outbuf += 8 * CHACHA20_BLOCK_SIZE; inbuf += 8 * CHACHA20_BLOCK_SIZE; } #endif #ifdef USE_SSSE3 else if (ctx->use_ssse3 && length >= CHACHA20_BLOCK_SIZE * 4) { nburn = _gcry_chacha20_amd64_ssse3_blocks4(ctx->input, outbuf, inbuf, 4); burn = nburn > burn ? nburn : burn; authptr = outbuf; length -= 4 * CHACHA20_BLOCK_SIZE; outbuf += 4 * CHACHA20_BLOCK_SIZE; inbuf += 4 * CHACHA20_BLOCK_SIZE; } else if (ctx->use_ssse3 && length >= CHACHA20_BLOCK_SIZE * 2) { nburn = _gcry_chacha20_amd64_ssse3_blocks1(ctx->input, outbuf, inbuf, 2); burn = nburn > burn ? 
nburn : burn; authptr = outbuf; length -= 2 * CHACHA20_BLOCK_SIZE; outbuf += 2 * CHACHA20_BLOCK_SIZE; inbuf += 2 * CHACHA20_BLOCK_SIZE; } else if (ctx->use_ssse3 && length >= CHACHA20_BLOCK_SIZE) { nburn = _gcry_chacha20_amd64_ssse3_blocks1(ctx->input, outbuf, inbuf, 1); burn = nburn > burn ? nburn : burn; authptr = outbuf; length -= 1 * CHACHA20_BLOCK_SIZE; outbuf += 1 * CHACHA20_BLOCK_SIZE; inbuf += 1 * CHACHA20_BLOCK_SIZE; } #endif #ifdef USE_AARCH64_SIMD else if (ctx->use_neon && length >= CHACHA20_BLOCK_SIZE * 4) { nburn = _gcry_chacha20_aarch64_blocks4(ctx->input, outbuf, inbuf, 4); burn = nburn > burn ? nburn : burn; authptr = outbuf; length -= 4 * CHACHA20_BLOCK_SIZE; outbuf += 4 * CHACHA20_BLOCK_SIZE; inbuf += 4 * CHACHA20_BLOCK_SIZE; } #endif #ifdef USE_PPC_VEC_POLY1305 else if (ctx->use_ppc && ctx->use_p10) { /* Skip stitched chacha20-poly1305 for P10. */ authptr = NULL; } else if (ctx->use_ppc && length >= CHACHA20_BLOCK_SIZE * 4) { nburn = _gcry_chacha20_ppc8_blocks4(ctx->input, outbuf, inbuf, 4); burn = nburn > burn ? nburn : burn; authptr = outbuf; length -= 4 * CHACHA20_BLOCK_SIZE; outbuf += 4 * CHACHA20_BLOCK_SIZE; inbuf += 4 * CHACHA20_BLOCK_SIZE; } #endif #ifdef USE_S390X_VX_POLY1305 else if (ctx->use_s390x && length >= 2 * CHACHA20_BLOCK_SIZE * 8) { nburn = _gcry_chacha20_s390x_vx_blocks8(ctx->input, outbuf, inbuf, 8); burn = nburn > burn ? nburn : burn; authptr = outbuf; length -= 8 * CHACHA20_BLOCK_SIZE; outbuf += 8 * CHACHA20_BLOCK_SIZE; inbuf += 8 * CHACHA20_BLOCK_SIZE; } else if (ctx->use_s390x && length >= CHACHA20_BLOCK_SIZE * 4) { nburn = _gcry_chacha20_s390x_vx_blocks4_2_1(ctx->input, outbuf, inbuf, 4); burn = nburn > burn ? nburn : burn; authptr = outbuf; length -= 4 * CHACHA20_BLOCK_SIZE; outbuf += 4 * CHACHA20_BLOCK_SIZE; inbuf += 4 * CHACHA20_BLOCK_SIZE; } else if (ctx->use_s390x && length >= CHACHA20_BLOCK_SIZE * 2) { nburn = _gcry_chacha20_s390x_vx_blocks4_2_1(ctx->input, outbuf, inbuf, 2); burn = nburn > burn ? nburn : burn; authptr = outbuf; length -= 2 * CHACHA20_BLOCK_SIZE; outbuf += 2 * CHACHA20_BLOCK_SIZE; inbuf += 2 * CHACHA20_BLOCK_SIZE; } else if (ctx->use_s390x && length >= CHACHA20_BLOCK_SIZE) { nburn = _gcry_chacha20_s390x_vx_blocks4_2_1(ctx->input, outbuf, inbuf, 1); burn = nburn > burn ? nburn : burn; authptr = outbuf; length -= 1 * CHACHA20_BLOCK_SIZE; outbuf += 1 * CHACHA20_BLOCK_SIZE; inbuf += 1 * CHACHA20_BLOCK_SIZE; } #endif if (authptr) { size_t authoffset = outbuf - authptr; #ifdef USE_AVX2 if (ctx->use_avx2 && length >= 8 * CHACHA20_BLOCK_SIZE && authoffset >= 8 * CHACHA20_BLOCK_SIZE) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nblocks -= nblocks % 8; nburn = _gcry_chacha20_poly1305_amd64_avx2_blocks8( ctx->input, outbuf, inbuf, nblocks, &c->u_mode.poly1305.ctx.state, authptr); burn = nburn > burn ? nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; authptr += nblocks * CHACHA20_BLOCK_SIZE; } #endif #ifdef USE_SSSE3 if (ctx->use_ssse3) { if (length >= 4 * CHACHA20_BLOCK_SIZE && authoffset >= 4 * CHACHA20_BLOCK_SIZE) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nblocks -= nblocks % 4; nburn = _gcry_chacha20_poly1305_amd64_ssse3_blocks4( ctx->input, outbuf, inbuf, nblocks, &c->u_mode.poly1305.ctx.state, authptr); burn = nburn > burn ? 
nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; authptr += nblocks * CHACHA20_BLOCK_SIZE; } if (length >= CHACHA20_BLOCK_SIZE && authoffset >= CHACHA20_BLOCK_SIZE) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nburn = _gcry_chacha20_poly1305_amd64_ssse3_blocks1( ctx->input, outbuf, inbuf, nblocks, &c->u_mode.poly1305.ctx.state, authptr); burn = nburn > burn ? nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; authptr += nblocks * CHACHA20_BLOCK_SIZE; } } #endif #ifdef USE_AARCH64_SIMD if (ctx->use_neon && length >= 4 * CHACHA20_BLOCK_SIZE && authoffset >= 4 * CHACHA20_BLOCK_SIZE) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nblocks -= nblocks % 4; nburn = _gcry_chacha20_poly1305_aarch64_blocks4( ctx->input, outbuf, inbuf, nblocks, &c->u_mode.poly1305.ctx.state, authptr); burn = nburn > burn ? nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; authptr += nblocks * CHACHA20_BLOCK_SIZE; } #endif #ifdef USE_PPC_VEC_POLY1305 if (ctx->use_ppc && length >= 4 * CHACHA20_BLOCK_SIZE && authoffset >= 4 * CHACHA20_BLOCK_SIZE) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nblocks -= nblocks % 4; nburn = _gcry_chacha20_poly1305_ppc8_blocks4( ctx->input, outbuf, inbuf, nblocks, &c->u_mode.poly1305.ctx.state, authptr); burn = nburn > burn ? nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; authptr += nblocks * CHACHA20_BLOCK_SIZE; } #endif #ifdef USE_S390X_VX_POLY1305 if (ctx->use_s390x) { if (length >= 8 * CHACHA20_BLOCK_SIZE && authoffset >= 8 * CHACHA20_BLOCK_SIZE) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nblocks -= nblocks % 8; burn = _gcry_chacha20_poly1305_s390x_vx_blocks8( ctx->input, outbuf, inbuf, nblocks, &c->u_mode.poly1305.ctx.state, authptr); burn = nburn > burn ? nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; authptr += nblocks * CHACHA20_BLOCK_SIZE; } if (length >= CHACHA20_BLOCK_SIZE && authoffset >= CHACHA20_BLOCK_SIZE) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; burn = _gcry_chacha20_poly1305_s390x_vx_blocks4_2_1( ctx->input, outbuf, inbuf, nblocks, &c->u_mode.poly1305.ctx.state, authptr); burn = nburn > burn ? nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; authptr += nblocks * CHACHA20_BLOCK_SIZE; } } #endif if (authoffset > 0) { _gcry_poly1305_update (&c->u_mode.poly1305.ctx, authptr, authoffset); authptr += authoffset; authoffset = 0; } gcry_assert(authptr == outbuf); } while (length) { size_t currlen = length; /* Since checksumming is done after encryption, process input in 24KiB * chunks to keep data loaded in L1 cache for checksumming. However * only do splitting if input is large enough so that last chunks does * not end up being short. */ if (currlen > 32 * 1024) currlen = 24 * 1024; nburn = do_chacha20_encrypt_stream_tail (ctx, outbuf, inbuf, currlen); burn = nburn > burn ? nburn : burn; nburn = _gcry_poly1305_update_burn (&c->u_mode.poly1305.ctx, outbuf, currlen); burn = nburn > burn ? 
nburn : burn; outbuf += currlen; inbuf += currlen; length -= currlen; } if (burn) _gcry_burn_stack (burn); return 0; } gcry_err_code_t _gcry_chacha20_poly1305_decrypt(gcry_cipher_hd_t c, byte *outbuf, const byte *inbuf, size_t length) { CHACHA20_context_t *ctx = (void *) &c->context.c; unsigned int nburn, burn = 0; #if defined(USE_AVX512) || defined(USE_PPC_VEC_POLY1305) \ || defined(USE_AVX2) || defined(USE_SSSE3) || defined(USE_AARCH64_SIMD) \ || defined(USE_S390X_VX_POLY1305) int skip_stitched = 0; #endif if (!length) return 0; if (ctx->unused) { unsigned char *p = ctx->pad; size_t n; gcry_assert (ctx->unused < CHACHA20_BLOCK_SIZE); n = ctx->unused; if (n > length) n = length; nburn = _gcry_poly1305_update_burn (&c->u_mode.poly1305.ctx, inbuf, n); burn = nburn > burn ? nburn : burn; buf_xor (outbuf, inbuf, p + CHACHA20_BLOCK_SIZE - ctx->unused, n); length -= n; outbuf += n; inbuf += n; ctx->unused -= n; if (!length) { if (burn) _gcry_burn_stack (burn); return 0; } gcry_assert (!ctx->unused); } gcry_assert (c->u_mode.poly1305.ctx.leftover == 0); #ifdef USE_AVX512 if (ctx->use_avx512) { /* Skip stitched chacha20-poly1305 for AVX512. */ skip_stitched = 1; } #endif #ifdef USE_PPC_VEC_POLY1305 if (ctx->use_ppc && ctx->use_p10) { /* Skip stitched chacha20-poly1305 for P10. */ skip_stitched = 1; } #endif #ifdef USE_AVX2 if (!skip_stitched && ctx->use_avx2 && length >= 8 * CHACHA20_BLOCK_SIZE) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nblocks -= nblocks % 8; nburn = _gcry_chacha20_poly1305_amd64_avx2_blocks8( ctx->input, outbuf, inbuf, nblocks, &c->u_mode.poly1305.ctx.state, inbuf); burn = nburn > burn ? nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; } #endif #ifdef USE_SSSE3 if (!skip_stitched && ctx->use_ssse3) { if (length >= 4 * CHACHA20_BLOCK_SIZE) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nblocks -= nblocks % 4; nburn = _gcry_chacha20_poly1305_amd64_ssse3_blocks4( ctx->input, outbuf, inbuf, nblocks, &c->u_mode.poly1305.ctx.state, inbuf); burn = nburn > burn ? nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; } if (length >= CHACHA20_BLOCK_SIZE) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nburn = _gcry_chacha20_poly1305_amd64_ssse3_blocks1( ctx->input, outbuf, inbuf, nblocks, &c->u_mode.poly1305.ctx.state, inbuf); burn = nburn > burn ? nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; } } #endif #ifdef USE_AARCH64_SIMD if (!skip_stitched && ctx->use_neon && length >= 4 * CHACHA20_BLOCK_SIZE) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nblocks -= nblocks % 4; nburn = _gcry_chacha20_poly1305_aarch64_blocks4( ctx->input, outbuf, inbuf, nblocks, &c->u_mode.poly1305.ctx.state, inbuf); burn = nburn > burn ? nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; } #endif #ifdef USE_PPC_VEC_POLY1305 /* skip stitch for p10 */ if (!skip_stitched && ctx->use_ppc && length >= 4 * CHACHA20_BLOCK_SIZE) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nblocks -= nblocks % 4; nburn = _gcry_chacha20_poly1305_ppc8_blocks4( ctx->input, outbuf, inbuf, nblocks, &c->u_mode.poly1305.ctx.state, inbuf); burn = nburn > burn ? 
nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; } #endif #ifdef USE_S390X_VX_POLY1305 if (!skip_stitched && ctx->use_s390x) { if (length >= 8 * CHACHA20_BLOCK_SIZE) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nblocks -= nblocks % 8; nburn = _gcry_chacha20_poly1305_s390x_vx_blocks8( ctx->input, outbuf, inbuf, nblocks, &c->u_mode.poly1305.ctx.state, inbuf); burn = nburn > burn ? nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; } if (length >= CHACHA20_BLOCK_SIZE) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nburn = _gcry_chacha20_poly1305_s390x_vx_blocks4_2_1( ctx->input, outbuf, inbuf, nblocks, &c->u_mode.poly1305.ctx.state, inbuf); burn = nburn > burn ? nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; } } #endif while (length) { size_t currlen = length; /* Since checksumming is done before decryption, process input in 24KiB * chunks to keep data loaded in L1 cache for decryption. However only * do splitting if input is large enough so that last chunks does not * end up being short. */ if (currlen > 32 * 1024) currlen = 24 * 1024; nburn = _gcry_poly1305_update_burn (&c->u_mode.poly1305.ctx, inbuf, currlen); burn = nburn > burn ? nburn : burn; nburn = do_chacha20_encrypt_stream_tail (ctx, outbuf, inbuf, currlen); burn = nburn > burn ? nburn : burn; outbuf += currlen; inbuf += currlen; length -= currlen; } if (burn) _gcry_burn_stack (burn); return 0; } static const char * selftest (void) { byte ctxbuf[sizeof(CHACHA20_context_t) + 15]; CHACHA20_context_t *ctx; byte scratch[127 + 1]; byte buf[512 + 64 + 4]; int i; /* From draft-strombergson-chacha-test-vectors */ static byte key_1[] = { 0xc4, 0x6e, 0xc1, 0xb1, 0x8c, 0xe8, 0xa8, 0x78, 0x72, 0x5a, 0x37, 0xe7, 0x80, 0xdf, 0xb7, 0x35, 0x1f, 0x68, 0xed, 0x2e, 0x19, 0x4c, 0x79, 0xfb, 0xc6, 0xae, 0xbe, 0xe1, 0xa6, 0x67, 0x97, 0x5d }; static const byte nonce_1[] = { 0x1a, 0xda, 0x31, 0xd5, 0xcf, 0x68, 0x82, 0x21 }; static const byte plaintext_1[127] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, }; static const byte ciphertext_1[127] = { 0xf6, 0x3a, 0x89, 0xb7, 0x5c, 0x22, 0x71, 0xf9, 0x36, 0x88, 0x16, 0x54, 0x2b, 0xa5, 0x2f, 0x06, 0xed, 0x49, 0x24, 0x17, 0x92, 0x30, 0x2b, 0x00, 0xb5, 0xe8, 0xf8, 0x0a, 0xe9, 0xa4, 0x73, 0xaf, 0xc2, 0x5b, 0x21, 0x8f, 0x51, 0x9a, 0xf0, 0xfd, 0xd4, 0x06, 0x36, 0x2e, 0x8d, 0x69, 0xde, 0x7f, 0x54, 0xc6, 0x04, 0xa6, 0xe0, 0x0f, 0x35, 0x3f, 0x11, 0x0f, 0x77, 0x1b, 0xdc, 0xa8, 0xab, 0x92, 0xe5, 0xfb, 0xc3, 0x4e, 0x60, 0xa1, 0xd9, 0xa9, 0xdb, 0x17, 0x34, 0x5b, 0x0a, 0x40, 0x27, 0x36, 0x85, 0x3b, 0xf9, 0x10, 0xb0, 0x60, 0xbd, 0xf1, 0xf8, 0x97, 0xb6, 0x29, 
0x0f, 0x01, 0xd1, 0x38, 0xae, 0x2c, 0x4c, 0x90, 0x22, 0x5b, 0xa9, 0xea, 0x14, 0xd5, 0x18, 0xf5, 0x59, 0x29, 0xde, 0xa0, 0x98, 0xca, 0x7a, 0x6c, 0xcf, 0xe6, 0x12, 0x27, 0x05, 0x3c, 0x84, 0xe4, 0x9a, 0x4a, 0x33 }; /* 16-byte alignment required for amd64 implementation. */ ctx = (CHACHA20_context_t *)((uintptr_t)(ctxbuf + 15) & ~(uintptr_t)15); chacha20_setkey (ctx, key_1, sizeof key_1, NULL); chacha20_setiv (ctx, nonce_1, sizeof nonce_1); scratch[sizeof (scratch) - 1] = 0; chacha20_encrypt_stream (ctx, scratch, plaintext_1, sizeof plaintext_1); if (memcmp (scratch, ciphertext_1, sizeof ciphertext_1)) return "ChaCha20 encryption test 1 failed."; if (scratch[sizeof (scratch) - 1]) return "ChaCha20 wrote too much."; chacha20_setkey (ctx, key_1, sizeof (key_1), NULL); chacha20_setiv (ctx, nonce_1, sizeof nonce_1); chacha20_encrypt_stream (ctx, scratch, scratch, sizeof plaintext_1); if (memcmp (scratch, plaintext_1, sizeof plaintext_1)) return "ChaCha20 decryption test 1 failed."; for (i = 0; i < sizeof buf; i++) buf[i] = i; chacha20_setkey (ctx, key_1, sizeof key_1, NULL); chacha20_setiv (ctx, nonce_1, sizeof nonce_1); /*encrypt */ chacha20_encrypt_stream (ctx, buf, buf, sizeof buf); /*decrypt */ chacha20_setkey (ctx, key_1, sizeof key_1, NULL); chacha20_setiv (ctx, nonce_1, sizeof nonce_1); chacha20_encrypt_stream (ctx, buf, buf, 1); chacha20_encrypt_stream (ctx, buf + 1, buf + 1, (sizeof buf) - 1 - 1); chacha20_encrypt_stream (ctx, buf + (sizeof buf) - 1, buf + (sizeof buf) - 1, 1); for (i = 0; i < sizeof buf; i++) if (buf[i] != (byte) i) return "ChaCha20 encryption test 2 failed."; chacha20_setkey (ctx, key_1, sizeof key_1, NULL); chacha20_setiv (ctx, nonce_1, sizeof nonce_1); /* encrypt */ for (i = 0; i < sizeof buf; i++) chacha20_encrypt_stream (ctx, &buf[i], &buf[i], 1); /* decrypt */ chacha20_setkey (ctx, key_1, sizeof key_1, NULL); chacha20_setiv (ctx, nonce_1, sizeof nonce_1); chacha20_encrypt_stream (ctx, buf, buf, sizeof buf); for (i = 0; i < sizeof buf; i++) if (buf[i] != (byte) i) return "ChaCha20 encryption test 3 failed."; return NULL; } gcry_cipher_spec_t _gcry_cipher_spec_chacha20 = { GCRY_CIPHER_CHACHA20, {0, 0}, /* flags */ "CHACHA20", /* name */ NULL, /* aliases */ NULL, /* oids */ 1, /* blocksize in bytes. */ CHACHA20_MAX_KEY_SIZE * 8, /* standard key length in bits. */ sizeof (CHACHA20_context_t), chacha20_setkey, NULL, NULL, chacha20_encrypt_stream, chacha20_encrypt_stream, NULL, NULL, chacha20_setiv };
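Because every block count is now routed through the single AVX512 entry point, any stream-cipher use of ChaCha20 exercises the new code on AVX512 hardware. A minimal caller sketch using the public libgcrypt API (the function name and buffer sizes are just an example):

#include <gcrypt.h>

static int
chacha20_encrypt_example (unsigned char *out, const unsigned char *in, size_t len,
                          const unsigned char key[32], const unsigned char nonce[12])
{
  gcry_cipher_hd_t hd;
  gcry_error_t err;

  gcry_check_version (NULL);
  err = gcry_cipher_open (&hd, GCRY_CIPHER_CHACHA20, GCRY_CIPHER_MODE_STREAM, 0);
  if (err)
    return -1;

  /* 256-bit key and 96-bit nonce; see chacha20_setkey/chacha20_setiv above. */
  err = gcry_cipher_setkey (hd, key, 32);
  if (!err)
    err = gcry_cipher_setiv (hd, nonce, 12);
  /* Arbitrary lengths are fine; whole 64-byte blocks reach the AVX512 routine
   * and any remainder is served from the keystream pad. */
  if (!err)
    err = gcry_cipher_encrypt (hd, out, len, in, len);

  gcry_cipher_close (hd);
  return err ? -1 : 0;
}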