diff --git a/cipher/cipher-gcm-intel-pclmul.c b/cipher/cipher-gcm-intel-pclmul.c index b7324e8f..78a9e338 100644 --- a/cipher/cipher-gcm-intel-pclmul.c +++ b/cipher/cipher-gcm-intel-pclmul.c @@ -1,1525 +1,2025 @@ /* cipher-gcm-intel-pclmul.c - Intel PCLMUL accelerated Galois Counter Mode * implementation * Copyright (C) 2013-2014,2019,2022 Jussi Kivilinna <jussi.kivilinna@iki.fi> * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see <http://www.gnu.org/licenses/>. */ #include <config.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <errno.h> #include "g10lib.h" #include "cipher.h" #include "bufhelp.h" #include "./cipher-internal.h" #ifdef GCM_USE_INTEL_PCLMUL #if _GCRY_GCC_VERSION >= 40400 /* 4.4 */ /* Prevent compiler from issuing SSE instructions between asm blocks. */ # pragma GCC target("no-sse") #endif #if __clang__ # pragma clang attribute push (__attribute__((target("no-sse"))), apply_to = function) #endif #define ALWAYS_INLINE inline __attribute__((always_inline)) #define NO_INSTRUMENT_FUNCTION __attribute__((no_instrument_function)) #define ASM_FUNC_ATTR NO_INSTRUMENT_FUNCTION #define ASM_FUNC_ATTR_INLINE ASM_FUNC_ATTR ALWAYS_INLINE #define GCM_INTEL_USE_VPCLMUL_AVX2 (1 << 0) #define GCM_INTEL_AGGR8_TABLE_INITIALIZED (1 << 1) #define GCM_INTEL_AGGR16_TABLE_INITIALIZED (1 << 2) +#define GCM_INTEL_USE_VPCLMUL_AVX512 (1 << 3) +#define GCM_INTEL_AGGR32_TABLE_INITIALIZED (1 << 4) /* Intel PCLMUL ghash based on white paper: "Intel® Carry-Less Multiplication Instruction and its Usage for Computing the GCM Mode - Rev 2.01"; Shay Gueron, Michael E. Kounavis. */ static ASM_FUNC_ATTR_INLINE void reduction(void) { /* input: <xmm1:xmm3> */ asm volatile (/* first phase of the reduction */ "movdqa %%xmm3, %%xmm6\n\t" "movdqa %%xmm3, %%xmm5\n\t" "psllq $1, %%xmm6\n\t" /* packed right shifting << 63 */ "pxor %%xmm3, %%xmm6\n\t" "psllq $57, %%xmm5\n\t" /* packed right shifting << 57 */ "psllq $62, %%xmm6\n\t" /* packed right shifting << 62 */ "pxor %%xmm5, %%xmm6\n\t" /* xor the shifted versions */ "pshufd $0x6a, %%xmm6, %%xmm5\n\t" "pshufd $0xae, %%xmm6, %%xmm6\n\t" "pxor %%xmm5, %%xmm3\n\t" /* first phase of the reduction complete */ /* second phase of the reduction */ "pxor %%xmm3, %%xmm1\n\t" /* xor the shifted versions */ "psrlq $1, %%xmm3\n\t" /* packed left shifting >> 1 */ "pxor %%xmm3, %%xmm6\n\t" "psrlq $1, %%xmm3\n\t" /* packed left shifting >> 2 */ "pxor %%xmm3, %%xmm1\n\t" "psrlq $5, %%xmm3\n\t" /* packed left shifting >> 7 */ "pxor %%xmm3, %%xmm6\n\t" "pxor %%xmm6, %%xmm1\n\t" /* the result is in xmm1 */ ::: "memory" ); } static ASM_FUNC_ATTR_INLINE void gfmul_pclmul(void) { /* Input: XMM0 and XMM1, Output: XMM1. Input XMM0 stays unmodified. Input must be converted to little-endian. */ asm volatile (/* gfmul, xmm0 has operator a and xmm1 has operator b. 
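The reduction()/gfmul_pclmul() pair is the scalar core that every wider path below unrolls. As a readability aid, the same Karatsuba split and two-phase shift reduction can be restated with PCLMUL intrinsics; this sketch is mine (helper names invented, operands assumed already byte-reflected and with H pre-shifted by gcm_lsh() as elsewhere in this file), not code from the patch:

#include <wmmintrin.h>  /* _mm_clmulepi64_si128 */
#include <emmintrin.h>

static inline __m128i
ghash_reduce_sketch (__m128i lo, __m128i hi)
{
  /* First phase: fold the low half by x^63, x^62 and x^57. */
  __m128i t = _mm_xor_si128 (_mm_slli_epi64 (lo, 1), lo);
  __m128i u = _mm_slli_epi64 (lo, 57);
  t = _mm_xor_si128 (_mm_slli_epi64 (t, 62), u);
  lo = _mm_xor_si128 (lo, _mm_shuffle_epi32 (t, 0x6a));
  t = _mm_shuffle_epi32 (t, 0xae);
  /* Second phase: fold by x^1, x^2 and x^7. */
  hi = _mm_xor_si128 (hi, lo);
  lo = _mm_srli_epi64 (lo, 1);
  t = _mm_xor_si128 (t, lo);
  lo = _mm_srli_epi64 (lo, 1);
  hi = _mm_xor_si128 (hi, lo);
  lo = _mm_srli_epi64 (lo, 5);
  t = _mm_xor_si128 (t, lo);
  return _mm_xor_si128 (hi, t);  /* reduced 128-bit result */
}

static inline __m128i
ghash_gfmul_sketch (__m128i a, __m128i b)
{
  /* Karatsuba: three 64x64 carry-less multiplies instead of four. */
  __m128i asum = _mm_xor_si128 (_mm_shuffle_epi32 (a, 0x4e), a); /* a0^a1 */
  __m128i bsum = _mm_xor_si128 (_mm_shuffle_epi32 (b, 0x4e), b); /* b0^b1 */
  __m128i lo  = _mm_clmulepi64_si128 (a, b, 0x00);               /* a0*b0 */
  __m128i hi  = _mm_clmulepi64_si128 (a, b, 0x11);               /* a1*b1 */
  __m128i mid = _mm_clmulepi64_si128 (asum, bsum, 0x00);
  mid = _mm_xor_si128 (mid, _mm_xor_si128 (lo, hi));             /* middle term */
  lo = _mm_xor_si128 (lo, _mm_slli_si128 (mid, 8));
  hi = _mm_xor_si128 (hi, _mm_srli_si128 (mid, 8));
  return ghash_reduce_sketch (lo, hi);
}

The pshufd immediates 0x4e, 0x6a and 0xae map one-to-one onto the _mm_shuffle_epi32 calls, so the sketch follows the asm instruction for instruction.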
*/ "pshufd $78, %%xmm0, %%xmm2\n\t" "pshufd $78, %%xmm1, %%xmm4\n\t" "pxor %%xmm0, %%xmm2\n\t" /* xmm2 holds a0+a1 */ "pxor %%xmm1, %%xmm4\n\t" /* xmm4 holds b0+b1 */ "movdqa %%xmm0, %%xmm3\n\t" "pclmulqdq $0, %%xmm1, %%xmm3\n\t" /* xmm3 holds a0*b0 */ "pclmulqdq $17, %%xmm0, %%xmm1\n\t" /* xmm6 holds a1*b1 */ "movdqa %%xmm3, %%xmm5\n\t" "pclmulqdq $0, %%xmm2, %%xmm4\n\t" /* xmm4 holds (a0+a1)*(b0+b1) */ "pxor %%xmm1, %%xmm5\n\t" /* xmm5 holds a0*b0+a1*b1 */ "pxor %%xmm5, %%xmm4\n\t" /* xmm4 holds a0*b0+a1*b1+(a0+a1)*(b0+b1) */ "movdqa %%xmm4, %%xmm5\n\t" "psrldq $8, %%xmm4\n\t" "pslldq $8, %%xmm5\n\t" "pxor %%xmm5, %%xmm3\n\t" "pxor %%xmm4, %%xmm1\n\t" /* holds the result of the carry-less multiplication of xmm0 by xmm1 */ ::: "memory" ); reduction(); } #define GFMUL_AGGR4_ASM_1(be_to_le) \ /* perform clmul and merge results... */ \ "movdqu 2*16(%[h_table]), %%xmm2\n\t" /* Load H4 */ \ "movdqu 0*16(%[buf]), %%xmm5\n\t" \ be_to_le("pshufb %[be_mask], %%xmm5\n\t") /* be => le */ \ "pxor %%xmm5, %%xmm1\n\t" \ \ "pshufd $78, %%xmm2, %%xmm5\n\t" \ "pshufd $78, %%xmm1, %%xmm4\n\t" \ "pxor %%xmm2, %%xmm5\n\t" /* xmm5 holds 4:a0+a1 */ \ "pxor %%xmm1, %%xmm4\n\t" /* xmm4 holds 4:b0+b1 */ \ "movdqa %%xmm2, %%xmm3\n\t" \ "pclmulqdq $0, %%xmm1, %%xmm3\n\t" /* xmm3 holds 4:a0*b0 */ \ "pclmulqdq $17, %%xmm2, %%xmm1\n\t" /* xmm1 holds 4:a1*b1 */ \ "pclmulqdq $0, %%xmm5, %%xmm4\n\t" /* xmm4 holds 4:(a0+a1)*(b0+b1) */ \ \ "movdqu 1*16(%[h_table]), %%xmm5\n\t" /* Load H3 */ \ "movdqu 1*16(%[buf]), %%xmm2\n\t" \ be_to_le("pshufb %[be_mask], %%xmm2\n\t") /* be => le */ \ \ "pshufd $78, %%xmm5, %%xmm0\n\t" \ "pshufd $78, %%xmm2, %%xmm7\n\t" \ "pxor %%xmm5, %%xmm0\n\t" /* xmm0 holds 3:a0+a1 */ \ "pxor %%xmm2, %%xmm7\n\t" /* xmm7 holds 3:b0+b1 */ \ "movdqa %%xmm5, %%xmm6\n\t" \ "pclmulqdq $0, %%xmm2, %%xmm6\n\t" /* xmm6 holds 3:a0*b0 */ \ "pclmulqdq $17, %%xmm5, %%xmm2\n\t" /* xmm2 holds 3:a1*b1 */ \ "pclmulqdq $0, %%xmm0, %%xmm7\n\t" /* xmm7 holds 3:(a0+a1)*(b0+b1) */ \ \ "movdqu 2*16(%[buf]), %%xmm5\n\t" \ be_to_le("pshufb %[be_mask], %%xmm5\n\t") /* be => le */ \ \ "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 3+4:a0*b0 */ \ "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 3+4:a1*b1 */ \ "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 3+4:(a0+a1)*(b0+b1) */ \ \ "movdqu 0*16(%[h_table]), %%xmm2\n\t" /* Load H2 */ \ \ "pshufd $78, %%xmm2, %%xmm0\n\t" \ "pshufd $78, %%xmm5, %%xmm7\n\t" \ "pxor %%xmm2, %%xmm0\n\t" /* xmm0 holds 2:a0+a1 */ \ "pxor %%xmm5, %%xmm7\n\t" /* xmm7 holds 2:b0+b1 */ \ "movdqa %%xmm2, %%xmm6\n\t" \ "pclmulqdq $0, %%xmm5, %%xmm6\n\t" /* xmm6 holds 2:a0*b0 */ \ "pclmulqdq $17, %%xmm2, %%xmm5\n\t" /* xmm5 holds 2:a1*b1 */ \ "pclmulqdq $0, %%xmm0, %%xmm7\n\t" /* xmm7 holds 2:(a0+a1)*(b0+b1) */ \ \ "movdqu 3*16(%[buf]), %%xmm2\n\t" \ be_to_le("pshufb %[be_mask], %%xmm2\n\t") /* be => le */ \ \ "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 2+3+4:a0*b0 */ \ "pxor %%xmm5, %%xmm1\n\t" /* xmm1 holds 2+3+4:a1*b1 */ \ "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 2+3+4:(a0+a1)*(b0+b1) */ #define GFMUL_AGGR4_ASM_2() \ "movdqu %[h_1], %%xmm5\n\t" /* Load H1 */ \ \ "pshufd $78, %%xmm5, %%xmm0\n\t" \ "pshufd $78, %%xmm2, %%xmm7\n\t" \ "pxor %%xmm5, %%xmm0\n\t" /* xmm0 holds 1:a0+a1 */ \ "pxor %%xmm2, %%xmm7\n\t" /* xmm7 holds 1:b0+b1 */ \ "movdqa %%xmm5, %%xmm6\n\t" \ "pclmulqdq $0, %%xmm2, %%xmm6\n\t" /* xmm6 holds 1:a0*b0 */ \ "pclmulqdq $17, %%xmm5, %%xmm2\n\t" /* xmm2 holds 1:a1*b1 */ \ "pclmulqdq $0, %%xmm0, %%xmm7\n\t" /* xmm7 holds 1:(a0+a1)*(b0+b1) */ \ \ "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 1+2+3+4:a0*b0 */ \ "pxor %%xmm2, 
%%xmm1\n\t" /* xmm1 holds 1+2+3+4:a1*b1 */ \ "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 1+2+3+4:(a0+a1)*(b0+b1) */ \ \ /* aggregated reduction... */ \ "movdqa %%xmm3, %%xmm5\n\t" \ "pxor %%xmm1, %%xmm5\n\t" /* xmm5 holds a0*b0+a1*b1 */ \ "pxor %%xmm5, %%xmm4\n\t" /* xmm4 holds a0*b0+a1*b1+(a0+a1)*(b0+b1) */ \ "movdqa %%xmm4, %%xmm5\n\t" \ "psrldq $8, %%xmm4\n\t" \ "pslldq $8, %%xmm5\n\t" \ "pxor %%xmm5, %%xmm3\n\t" \ "pxor %%xmm4, %%xmm1\n\t" /* holds the result of the \ carry-less multiplication of xmm0 \ by xmm1 */ #define be_to_le(...) __VA_ARGS__ #define le_to_le(...) /*_*/ static ASM_FUNC_ATTR_INLINE void gfmul_pclmul_aggr4(const void *buf, const void *h_1, const void *h_table, const unsigned char *be_mask) { /* Input: Hash: XMM1 Output: Hash: XMM1 */ asm volatile (GFMUL_AGGR4_ASM_1(be_to_le) : : [buf] "r" (buf), [h_table] "r" (h_table), [be_mask] "m" (*be_mask) : "memory" ); asm volatile (GFMUL_AGGR4_ASM_2() : : [h_1] "m" (*(const unsigned char *)h_1) : "memory" ); reduction(); } static ASM_FUNC_ATTR_INLINE void gfmul_pclmul_aggr4_le(const void *buf, const void *h_1, const void *h_table) { /* Input: Hash: XMM1 Output: Hash: XMM1 */ asm volatile (GFMUL_AGGR4_ASM_1(le_to_le) : : [buf] "r" (buf), [h_table] "r" (h_table) : "memory" ); asm volatile (GFMUL_AGGR4_ASM_2() : : [h_1] "m" (*(const unsigned char *)h_1) : "memory" ); reduction(); } #ifdef __x86_64__ #define GFMUL_AGGR8_ASM(be_to_le) \ /* Load H6, H7, H8. */ \ "movdqu 6*16(%[h_table]), %%xmm10\n\t" \ "movdqu 5*16(%[h_table]), %%xmm9\n\t" \ "movdqu 4*16(%[h_table]), %%xmm8\n\t" \ \ /* perform clmul and merge results... */ \ "movdqu 0*16(%[buf]), %%xmm5\n\t" \ "movdqu 1*16(%[buf]), %%xmm2\n\t" \ be_to_le("pshufb %%xmm15, %%xmm5\n\t") /* be => le */ \ be_to_le("pshufb %%xmm15, %%xmm2\n\t") /* be => le */ \ "pxor %%xmm5, %%xmm1\n\t" \ \ "pshufd $78, %%xmm10, %%xmm5\n\t" \ "pshufd $78, %%xmm1, %%xmm4\n\t" \ "pxor %%xmm10, %%xmm5\n\t" /* xmm5 holds 8:a0+a1 */ \ "pxor %%xmm1, %%xmm4\n\t" /* xmm4 holds 8:b0+b1 */ \ "movdqa %%xmm10, %%xmm3\n\t" \ "pclmulqdq $0, %%xmm1, %%xmm3\n\t" /* xmm3 holds 8:a0*b0 */ \ "pclmulqdq $17, %%xmm10, %%xmm1\n\t" /* xmm1 holds 8:a1*b1 */ \ "pclmulqdq $0, %%xmm5, %%xmm4\n\t" /* xmm4 holds 8:(a0+a1)*(b0+b1) */ \ \ "pshufd $78, %%xmm9, %%xmm11\n\t" \ "pshufd $78, %%xmm2, %%xmm7\n\t" \ "pxor %%xmm9, %%xmm11\n\t" /* xmm11 holds 7:a0+a1 */ \ "pxor %%xmm2, %%xmm7\n\t" /* xmm7 holds 7:b0+b1 */ \ "movdqa %%xmm9, %%xmm6\n\t" \ "pclmulqdq $0, %%xmm2, %%xmm6\n\t" /* xmm6 holds 7:a0*b0 */ \ "pclmulqdq $17, %%xmm9, %%xmm2\n\t" /* xmm2 holds 7:a1*b1 */ \ "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 7:(a0+a1)*(b0+b1) */ \ \ "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 7+8:a0*b0 */ \ "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 7+8:a1*b1 */ \ "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 7+8:(a0+a1)*(b0+b1) */ \ \ "movdqu 2*16(%[buf]), %%xmm5\n\t" \ "movdqu 3*16(%[buf]), %%xmm2\n\t" \ be_to_le("pshufb %%xmm15, %%xmm5\n\t") /* be => le */ \ be_to_le("pshufb %%xmm15, %%xmm2\n\t") /* be => le */ \ \ "pshufd $78, %%xmm8, %%xmm11\n\t" \ "pshufd $78, %%xmm5, %%xmm7\n\t" \ "pxor %%xmm8, %%xmm11\n\t" /* xmm11 holds 6:a0+a1 */ \ "pxor %%xmm5, %%xmm7\n\t" /* xmm7 holds 6:b0+b1 */ \ "movdqa %%xmm8, %%xmm6\n\t" \ "pclmulqdq $0, %%xmm5, %%xmm6\n\t" /* xmm6 holds 6:a0*b0 */ \ "pclmulqdq $17, %%xmm8, %%xmm5\n\t" /* xmm5 holds 6:a1*b1 */ \ "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 6:(a0+a1)*(b0+b1) */ \ \ /* Load H3, H4, H5. 
*/ \ "movdqu 3*16(%[h_table]), %%xmm10\n\t" \ "movdqu 2*16(%[h_table]), %%xmm9\n\t" \ "movdqu 1*16(%[h_table]), %%xmm8\n\t" \ \ "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 6+7+8:a0*b0 */ \ "pxor %%xmm5, %%xmm1\n\t" /* xmm1 holds 6+7+8:a1*b1 */ \ "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 6+7+8:(a0+a1)*(b0+b1) */ \ \ "pshufd $78, %%xmm10, %%xmm11\n\t" \ "pshufd $78, %%xmm2, %%xmm7\n\t" \ "pxor %%xmm10, %%xmm11\n\t" /* xmm11 holds 5:a0+a1 */ \ "pxor %%xmm2, %%xmm7\n\t" /* xmm7 holds 5:b0+b1 */ \ "movdqa %%xmm10, %%xmm6\n\t" \ "pclmulqdq $0, %%xmm2, %%xmm6\n\t" /* xmm6 holds 5:a0*b0 */ \ "pclmulqdq $17, %%xmm10, %%xmm2\n\t" /* xmm2 holds 5:a1*b1 */ \ "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 5:(a0+a1)*(b0+b1) */ \ \ "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 5+6+7+8:a0*b0 */ \ "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 5+6+7+8:a1*b1 */ \ "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 5+6+7+8:(a0+a1)*(b0+b1) */ \ \ "movdqu 4*16(%[buf]), %%xmm5\n\t" \ "movdqu 5*16(%[buf]), %%xmm2\n\t" \ be_to_le("pshufb %%xmm15, %%xmm5\n\t") /* be => le */ \ be_to_le("pshufb %%xmm15, %%xmm2\n\t") /* be => le */ \ \ "pshufd $78, %%xmm9, %%xmm11\n\t" \ "pshufd $78, %%xmm5, %%xmm7\n\t" \ "pxor %%xmm9, %%xmm11\n\t" /* xmm11 holds 4:a0+a1 */ \ "pxor %%xmm5, %%xmm7\n\t" /* xmm7 holds 4:b0+b1 */ \ "movdqa %%xmm9, %%xmm6\n\t" \ "pclmulqdq $0, %%xmm5, %%xmm6\n\t" /* xmm6 holds 4:a0*b0 */ \ "pclmulqdq $17, %%xmm9, %%xmm5\n\t" /* xmm5 holds 4:a1*b1 */ \ "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 4:(a0+a1)*(b0+b1) */ \ \ "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 4+5+6+7+8:a0*b0 */ \ "pxor %%xmm5, %%xmm1\n\t" /* xmm1 holds 4+5+6+7+8:a1*b1 */ \ "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 4+5+6+7+8:(a0+a1)*(b0+b1) */ \ \ "pshufd $78, %%xmm8, %%xmm11\n\t" \ "pshufd $78, %%xmm2, %%xmm7\n\t" \ "pxor %%xmm8, %%xmm11\n\t" /* xmm11 holds 3:a0+a1 */ \ "pxor %%xmm2, %%xmm7\n\t" /* xmm7 holds 3:b0+b1 */ \ "movdqa %%xmm8, %%xmm6\n\t" \ "pclmulqdq $0, %%xmm2, %%xmm6\n\t" /* xmm6 holds 3:a0*b0 */ \ "pclmulqdq $17, %%xmm8, %%xmm2\n\t" /* xmm2 holds 3:a1*b1 */ \ "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 3:(a0+a1)*(b0+b1) */ \ \ "movdqu 0*16(%[h_table]), %%xmm8\n\t" /* Load H2 */ \ \ "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 3+4+5+6+7+8:a0*b0 */ \ "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 3+4+5+6+7+8:a1*b1 */ \ "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 3+4+5+6+7+8:(a0+a1)*(b0+b1) */ \ \ "movdqu 6*16(%[buf]), %%xmm5\n\t" \ "movdqu 7*16(%[buf]), %%xmm2\n\t" \ be_to_le("pshufb %%xmm15, %%xmm5\n\t") /* be => le */ \ be_to_le("pshufb %%xmm15, %%xmm2\n\t") /* be => le */ \ \ "pshufd $78, %%xmm8, %%xmm11\n\t" \ "pshufd $78, %%xmm5, %%xmm7\n\t" \ "pxor %%xmm8, %%xmm11\n\t" /* xmm11 holds 2:a0+a1 */ \ "pxor %%xmm5, %%xmm7\n\t" /* xmm7 holds 2:b0+b1 */ \ "movdqa %%xmm8, %%xmm6\n\t" \ "pclmulqdq $0, %%xmm5, %%xmm6\n\t" /* xmm6 holds 2:a0*b0 */ \ "pclmulqdq $17, %%xmm8, %%xmm5\n\t" /* xmm5 holds 2:a1*b1 */ \ "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 2:(a0+a1)*(b0+b1) */ \ \ "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 2+3+4+5+6+7+8:a0*b0 */ \ "pxor %%xmm5, %%xmm1\n\t" /* xmm1 holds 2+3+4+5+6+7+8:a1*b1 */ \ "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 2+3+4+5+6+7+8:(a0+a1)*(b0+b1) */ \ \ "pshufd $78, %%xmm0, %%xmm11\n\t" \ "pshufd $78, %%xmm2, %%xmm7\n\t" \ "pxor %%xmm0, %%xmm11\n\t" /* xmm11 holds 1:a0+a1 */ \ "pxor %%xmm2, %%xmm7\n\t" /* xmm7 holds 1:b0+b1 */ \ "movdqa %%xmm0, %%xmm6\n\t" \ "pclmulqdq $0, %%xmm2, %%xmm6\n\t" /* xmm6 holds 1:a0*b0 */ \ "pclmulqdq $17, %%xmm0, %%xmm2\n\t" /* xmm2 holds 1:a1*b1 */ \ "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 
1:(a0+a1)*(b0+b1) */ \ \ "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 1+2+3+4+5+6+7+8:a0*b0 */ \ "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 1+2+3+4+5+6+7+8:a1*b1 */ \ "pxor %%xmm7, %%xmm4\n\t"/* xmm4 holds 1+2+3+4+5+6+7+8:(a0+a1)*(b0+b1) */ \ \ /* aggregated reduction... */ \ "movdqa %%xmm3, %%xmm5\n\t" \ "pxor %%xmm1, %%xmm5\n\t" /* xmm5 holds a0*b0+a1*b1 */ \ "pxor %%xmm5, %%xmm4\n\t" /* xmm4 holds a0*b0+a1*b1+(a0+a1)*(b0+b1) */ \ "movdqa %%xmm4, %%xmm5\n\t" \ "psrldq $8, %%xmm4\n\t" \ "pslldq $8, %%xmm5\n\t" \ "pxor %%xmm5, %%xmm3\n\t" \ "pxor %%xmm4, %%xmm1\n\t" /* holds the result of the \ carry-less multiplication of xmm0 \ by xmm1 */ static ASM_FUNC_ATTR_INLINE void gfmul_pclmul_aggr8(const void *buf, const void *h_table) { /* Input: H¹: XMM0 bemask: XMM15 Hash: XMM1 Output: Hash: XMM1 Inputs XMM0 and XMM15 stays unmodified. */ asm volatile (GFMUL_AGGR8_ASM(be_to_le) : : [buf] "r" (buf), [h_table] "r" (h_table) : "memory" ); reduction(); } static ASM_FUNC_ATTR_INLINE void gfmul_pclmul_aggr8_le(const void *buf, const void *h_table) { /* Input: H¹: XMM0 Hash: XMM1 Output: Hash: XMM1 Inputs XMM0 and XMM15 stays unmodified. */ asm volatile (GFMUL_AGGR8_ASM(le_to_le) : : [buf] "r" (buf), [h_table] "r" (h_table) : "memory" ); reduction(); } #ifdef GCM_USE_INTEL_VPCLMUL_AVX2 #define GFMUL_AGGR16_ASM_VPCMUL_AVX2(be_to_le) \ /* perform clmul and merge results... */ \ "vmovdqu 0*16(%[buf]), %%ymm5\n\t" \ "vmovdqu 2*16(%[buf]), %%ymm2\n\t" \ be_to_le("vpshufb %%ymm15, %%ymm5, %%ymm5\n\t") /* be => le */ \ be_to_le("vpshufb %%ymm15, %%ymm2, %%ymm2\n\t") /* be => le */ \ "vpxor %%ymm5, %%ymm1, %%ymm1\n\t" \ \ "vpshufd $78, %%ymm0, %%ymm5\n\t" \ "vpshufd $78, %%ymm1, %%ymm4\n\t" \ "vpxor %%ymm0, %%ymm5, %%ymm5\n\t" /* ymm5 holds 15|16:a0+a1 */ \ "vpxor %%ymm1, %%ymm4, %%ymm4\n\t" /* ymm4 holds 15|16:b0+b1 */ \ "vpclmulqdq $0, %%ymm1, %%ymm0, %%ymm3\n\t" /* ymm3 holds 15|16:a0*b0 */ \ "vpclmulqdq $17, %%ymm0, %%ymm1, %%ymm1\n\t" /* ymm1 holds 15|16:a1*b1 */ \ "vpclmulqdq $0, %%ymm5, %%ymm4, %%ymm4\n\t" /* ymm4 holds 15|16:(a0+a1)*(b0+b1) */ \ \ "vmovdqu %[h1_h2], %%ymm0\n\t" \ \ "vpshufd $78, %%ymm13, %%ymm14\n\t" \ "vpshufd $78, %%ymm2, %%ymm7\n\t" \ "vpxor %%ymm13, %%ymm14, %%ymm14\n\t" /* ymm14 holds 13|14:a0+a1 */ \ "vpxor %%ymm2, %%ymm7, %%ymm7\n\t" /* ymm7 holds 13|14:b0+b1 */ \ "vpclmulqdq $0, %%ymm2, %%ymm13, %%ymm6\n\t" /* ymm6 holds 13|14:a0*b0 */ \ "vpclmulqdq $17, %%ymm13, %%ymm2, %%ymm2\n\t" /* ymm2 holds 13|14:a1*b1 */ \ "vpclmulqdq $0, %%ymm14, %%ymm7, %%ymm7\n\t" /* ymm7 holds 13|14:(a0+a1)*(b0+b1) */\ \ "vpxor %%ymm6, %%ymm3, %%ymm3\n\t" /* ymm3 holds 13+15|14+16:a0*b0 */ \ "vpxor %%ymm2, %%ymm1, %%ymm1\n\t" /* ymm1 holds 13+15|14+16:a1*b1 */ \ "vpxor %%ymm7, %%ymm4, %%ymm4\n\t" /* ymm4 holds 13+15|14+16:(a0+a1)*(b0+b1) */ \ \ "vmovdqu 4*16(%[buf]), %%ymm5\n\t" \ "vmovdqu 6*16(%[buf]), %%ymm2\n\t" \ be_to_le("vpshufb %%ymm15, %%ymm5, %%ymm5\n\t") /* be => le */ \ be_to_le("vpshufb %%ymm15, %%ymm2, %%ymm2\n\t") /* be => le */ \ \ "vpshufd $78, %%ymm12, %%ymm14\n\t" \ "vpshufd $78, %%ymm5, %%ymm7\n\t" \ "vpxor %%ymm12, %%ymm14, %%ymm14\n\t" /* ymm14 holds 11|12:a0+a1 */ \ "vpxor %%ymm5, %%ymm7, %%ymm7\n\t" /* ymm7 holds 11|12:b0+b1 */ \ "vpclmulqdq $0, %%ymm5, %%ymm12, %%ymm6\n\t" /* ymm6 holds 11|12:a0*b0 */ \ "vpclmulqdq $17, %%ymm12, %%ymm5, %%ymm5\n\t" /* ymm5 holds 11|12:a1*b1 */ \ "vpclmulqdq $0, %%ymm14, %%ymm7, %%ymm7\n\t" /* ymm7 holds 11|12:(a0+a1)*(b0+b1) */\ \ "vpxor %%ymm6, %%ymm3, %%ymm3\n\t" /* ymm3 holds 11+13+15|12+14+16:a0*b0 */ \ "vpxor %%ymm5, %%ymm1, %%ymm1\n\t" /* ymm1 
holds 11+13+15|12+14+16:a1*b1 */ \ "vpxor %%ymm7, %%ymm4, %%ymm4\n\t" /* ymm4 holds 11+13+15|12+14+16:(a0+a1)*(b0+b1) */\ \ "vpshufd $78, %%ymm11, %%ymm14\n\t" \ "vpshufd $78, %%ymm2, %%ymm7\n\t" \ "vpxor %%ymm11, %%ymm14, %%ymm14\n\t" /* ymm14 holds 9|10:a0+a1 */ \ "vpxor %%ymm2, %%ymm7, %%ymm7\n\t" /* ymm7 holds 9|10:b0+b1 */ \ "vpclmulqdq $0, %%ymm2, %%ymm11, %%ymm6\n\t" /* ymm6 holds 9|10:a0*b0 */ \ "vpclmulqdq $17, %%ymm11, %%ymm2, %%ymm2\n\t" /* ymm2 holds 9|10:a1*b1 */ \ "vpclmulqdq $0, %%ymm14, %%ymm7, %%ymm7\n\t" /* ymm7 holds 9|10:(a0+a1)*(b0+b1) */ \ \ "vpxor %%ymm6, %%ymm3, %%ymm3\n\t" /* ymm3 holds 9+11+…+15|10+12+…+16:a0*b0 */ \ "vpxor %%ymm2, %%ymm1, %%ymm1\n\t" /* ymm1 holds 9+11+…+15|10+12+…+16:a1*b1 */ \ "vpxor %%ymm7, %%ymm4, %%ymm4\n\t" /* ymm4 holds 9+11+…+15|10+12+…+16:(a0+a1)*(b0+b1) */\ \ "vmovdqu 8*16(%[buf]), %%ymm5\n\t" \ "vmovdqu 10*16(%[buf]), %%ymm2\n\t" \ be_to_le("vpshufb %%ymm15, %%ymm5, %%ymm5\n\t") /* be => le */ \ be_to_le("vpshufb %%ymm15, %%ymm2, %%ymm2\n\t") /* be => le */ \ \ "vpshufd $78, %%ymm10, %%ymm14\n\t" \ "vpshufd $78, %%ymm5, %%ymm7\n\t" \ "vpxor %%ymm10, %%ymm14, %%ymm14\n\t" /* ymm14 holds 7|8:a0+a1 */ \ "vpxor %%ymm5, %%ymm7, %%ymm7\n\t" /* ymm7 holds 7|8:b0+b1 */ \ "vpclmulqdq $0, %%ymm5, %%ymm10, %%ymm6\n\t" /* ymm6 holds 7|8:a0*b0 */ \ "vpclmulqdq $17, %%ymm10, %%ymm5, %%ymm5\n\t" /* ymm5 holds 7|8:a1*b1 */ \ "vpclmulqdq $0, %%ymm14, %%ymm7, %%ymm7\n\t" /* ymm7 holds 7|8:(a0+a1)*(b0+b1) */ \ \ "vpxor %%ymm6, %%ymm3, %%ymm3\n\t" /* ymm3 holds 7+9+…+15|8+10+…+16:a0*b0 */ \ "vpxor %%ymm5, %%ymm1, %%ymm1\n\t" /* ymm1 holds 7+9+…+15|8+10+…+16:a1*b1 */ \ "vpxor %%ymm7, %%ymm4, %%ymm4\n\t" /* ymm4 holds 7+9+…+15|8+10+…+16:(a0+a1)*(b0+b1) */\ \ "vpshufd $78, %%ymm9, %%ymm14\n\t" \ "vpshufd $78, %%ymm2, %%ymm7\n\t" \ "vpxor %%ymm9, %%ymm14, %%ymm14\n\t" /* ymm14 holds 5|6:a0+a1 */ \ "vpxor %%ymm2, %%ymm7, %%ymm7\n\t" /* ymm7 holds 5|6:b0+b1 */ \ "vpclmulqdq $0, %%ymm2, %%ymm9, %%ymm6\n\t" /* ymm6 holds 5|6:a0*b0 */ \ "vpclmulqdq $17, %%ymm9, %%ymm2, %%ymm2\n\t" /* ymm2 holds 5|6:a1*b1 */ \ "vpclmulqdq $0, %%ymm14, %%ymm7, %%ymm7\n\t" /* ymm7 holds 5|6:(a0+a1)*(b0+b1) */ \ \ "vpxor %%ymm6, %%ymm3, %%ymm3\n\t" /* ymm3 holds 5+7+…+15|6+8+…+16:a0*b0 */ \ "vpxor %%ymm2, %%ymm1, %%ymm1\n\t" /* ymm1 holds 5+7+…+15|6+8+…+16:a1*b1 */ \ "vpxor %%ymm7, %%ymm4, %%ymm4\n\t" /* ymm4 holds 5+7+…+15|6+8+…+16:(a0+a1)*(b0+b1) */\ \ "vmovdqu 12*16(%[buf]), %%ymm5\n\t" \ "vmovdqu 14*16(%[buf]), %%ymm2\n\t" \ be_to_le("vpshufb %%ymm15, %%ymm5, %%ymm5\n\t") /* be => le */ \ be_to_le("vpshufb %%ymm15, %%ymm2, %%ymm2\n\t") /* be => le */ \ \ "vpshufd $78, %%ymm8, %%ymm14\n\t" \ "vpshufd $78, %%ymm5, %%ymm7\n\t" \ "vpxor %%ymm8, %%ymm14, %%ymm14\n\t" /* ymm14 holds 3|4:a0+a1 */ \ "vpxor %%ymm5, %%ymm7, %%ymm7\n\t" /* ymm7 holds 3|4:b0+b1 */ \ "vpclmulqdq $0, %%ymm5, %%ymm8, %%ymm6\n\t" /* ymm6 holds 3|4:a0*b0 */ \ "vpclmulqdq $17, %%ymm8, %%ymm5, %%ymm5\n\t" /* ymm5 holds 3|4:a1*b1 */ \ "vpclmulqdq $0, %%ymm14, %%ymm7, %%ymm7\n\t" /* ymm7 holds 3|4:(a0+a1)*(b0+b1) */ \ \ "vpxor %%ymm6, %%ymm3, %%ymm3\n\t" /* ymm3 holds 3+5+…+15|4+6+…+16:a0*b0 */ \ "vpxor %%ymm5, %%ymm1, %%ymm1\n\t" /* ymm1 holds 3+5+…+15|4+6+…+16:a1*b1 */ \ "vpxor %%ymm7, %%ymm4, %%ymm4\n\t" /* ymm4 holds 3+5+…+15|4+6+…+16:(a0+a1)*(b0+b1) */\ \ "vpshufd $78, %%ymm0, %%ymm14\n\t" \ "vpshufd $78, %%ymm2, %%ymm7\n\t" \ "vpxor %%ymm0, %%ymm14, %%ymm14\n\t" /* ymm14 holds 1|2:a0+a1 */ \ "vpxor %%ymm2, %%ymm7, %%ymm7\n\t" /* ymm7 holds 1|2:b0+b1 */ \ "vpclmulqdq $0, %%ymm2, %%ymm0, %%ymm6\n\t" /* ymm6 holds 
1|2:a0*b0 */ \ "vpclmulqdq $17, %%ymm0, %%ymm2, %%ymm2\n\t" /* ymm2 holds 1|2:a1*b1 */ \ "vpclmulqdq $0, %%ymm14, %%ymm7, %%ymm7\n\t" /* ymm7 holds 1|2:(a0+a1)*(b0+b1) */ \ \ "vmovdqu %[h15_h16], %%ymm0\n\t" \ \ "vpxor %%ymm6, %%ymm3, %%ymm3\n\t" /* ymm3 holds 1+3+…+15|2+4+…+16:a0*b0 */ \ "vpxor %%ymm2, %%ymm1, %%ymm1\n\t" /* ymm1 holds 1+3+…+15|2+4+…+16:a1*b1 */ \ "vpxor %%ymm7, %%ymm4, %%ymm4\n\t" /* ymm4 holds 1+3+…+15|2+4+…+16:(a0+a1)*(b0+b1) */\ \ /* aggregated reduction... */ \ "vpxor %%ymm1, %%ymm3, %%ymm5\n\t" /* ymm5 holds a0*b0+a1*b1 */ \ "vpxor %%ymm5, %%ymm4, %%ymm4\n\t" /* ymm4 holds a0*b0+a1*b1+(a0+a1)*(b0+b1) */ \ "vpslldq $8, %%ymm4, %%ymm5\n\t" \ "vpsrldq $8, %%ymm4, %%ymm4\n\t" \ "vpxor %%ymm5, %%ymm3, %%ymm3\n\t" \ "vpxor %%ymm4, %%ymm1, %%ymm1\n\t" /* holds the result of the \ carry-less multiplication of ymm0 \ by ymm1 */ \ \ /* first phase of the reduction */ \ "vpsllq $1, %%ymm3, %%ymm6\n\t" /* packed right shifting << 63 */ \ "vpxor %%ymm3, %%ymm6, %%ymm6\n\t" \ "vpsllq $57, %%ymm3, %%ymm5\n\t" /* packed right shifting << 57 */ \ "vpsllq $62, %%ymm6, %%ymm6\n\t" /* packed right shifting << 62 */ \ "vpxor %%ymm5, %%ymm6, %%ymm6\n\t" /* xor the shifted versions */ \ "vpshufd $0x6a, %%ymm6, %%ymm5\n\t" \ "vpshufd $0xae, %%ymm6, %%ymm6\n\t" \ "vpxor %%ymm5, %%ymm3, %%ymm3\n\t" /* first phase of the reduction complete */ \ \ /* second phase of the reduction */ \ "vpxor %%ymm3, %%ymm1, %%ymm1\n\t" /* xor the shifted versions */ \ "vpsrlq $1, %%ymm3, %%ymm3\n\t" /* packed left shifting >> 1 */ \ "vpxor %%ymm3, %%ymm6, %%ymm6\n\t" \ "vpsrlq $1, %%ymm3, %%ymm3\n\t" /* packed left shifting >> 2 */ \ "vpxor %%ymm3, %%ymm1, %%ymm1\n\t" \ "vpsrlq $5, %%ymm3, %%ymm3\n\t" /* packed left shifting >> 7 */ \ "vpxor %%ymm3, %%ymm6, %%ymm6\n\t" \ "vpxor %%ymm6, %%ymm1, %%ymm1\n\t" /* the result is in ymm1 */ \ \ /* merge 128-bit halves */ \ "vextracti128 $1, %%ymm1, %%xmm2\n\t" \ "vpxor %%xmm2, %%xmm1, %%xmm1\n\t" static ASM_FUNC_ATTR_INLINE void gfmul_vpclmul_avx2_aggr16(const void *buf, const void *h_table, const u64 *h1_h2_h15_h16) { /* Input: Hx: YMM0, YMM8, YMM9, YMM10, YMM11, YMM12, YMM13 bemask: YMM15 Hash: XMM1 Output: Hash: XMM1 Inputs YMM0, YMM8, YMM9, YMM10, YMM11, YMM12, YMM13 and YMM15 stay unmodified. */ asm volatile (GFMUL_AGGR16_ASM_VPCMUL_AVX2(be_to_le) : : [buf] "r" (buf), [h_table] "r" (h_table), [h1_h2] "m" (h1_h2_h15_h16[0]), [h15_h16] "m" (h1_h2_h15_h16[4]) : "memory" ); } static ASM_FUNC_ATTR_INLINE void gfmul_vpclmul_avx2_aggr16_le(const void *buf, const void *h_table, const u64 *h1_h2_h15_h16) { /* Input: Hx: YMM0, YMM8, YMM9, YMM10, YMM11, YMM12, YMM13 bemask: YMM15 Hash: XMM1 Output: Hash: XMM1 Inputs YMM0, YMM8, YMM9, YMM10, YMM11, YMM12, YMM13 and YMM15 stay unmodified. */ asm volatile (GFMUL_AGGR16_ASM_VPCMUL_AVX2(le_to_le) : : [buf] "r" (buf), [h_table] "r" (h_table), [h1_h2] "m" (h1_h2_h15_h16[0]), [h15_h16] "m" (h1_h2_h15_h16[4]) : "memory" ); } static ASM_FUNC_ATTR_INLINE void gfmul_pclmul_avx2(void) { /* Input: YMM0 and YMM1, Output: YMM1. Input YMM0 stays unmodified. Input must be converted to little-endian. */ asm volatile (/* gfmul, ymm0 has operator a and ymm1 has operator b. 
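GFMUL_AGGR16_ASM_VPCMUL_AVX2 above processes two blocks per 256-bit register: each 128-bit lane runs an independent Karatsuba step against a matching pair of key powers, and the two lane results are only folded together by the vextracti128/vpxor at the end of the macro. One such paired step, restated with VPCLMULQDQ intrinsics (a simplified sketch; the accumulator struct and names are mine, and the real macro staggers loads between steps):

#include <immintrin.h>  /* AVX2 + VPCLMULQDQ */

typedef struct { __m256i lo, hi, mid; } aggr2_acc;

static inline void
aggr2_step_sketch (aggr2_acc *acc, __m256i hpair, __m256i blocks)
{
  __m256i hsum = _mm256_xor_si256 (_mm256_shuffle_epi32 (hpair, 0x4e), hpair);
  __m256i bsum = _mm256_xor_si256 (_mm256_shuffle_epi32 (blocks, 0x4e), blocks);
  /* Two independent 64x64 clmuls per instruction, one per 128-bit lane. */
  acc->lo  = _mm256_xor_si256 (acc->lo,
                               _mm256_clmulepi64_epi128 (blocks, hpair, 0x00));
  acc->hi  = _mm256_xor_si256 (acc->hi,
                               _mm256_clmulepi64_epi128 (blocks, hpair, 0x11));
  acc->mid = _mm256_xor_si256 (acc->mid,
                               _mm256_clmulepi64_epi128 (bsum, hsum, 0x00));
}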
*/ "vpshufd $78, %%ymm0, %%ymm2\n\t" "vpshufd $78, %%ymm1, %%ymm4\n\t" "vpxor %%ymm0, %%ymm2, %%ymm2\n\t" /* ymm2 holds a0+a1 */ "vpxor %%ymm1, %%ymm4, %%ymm4\n\t" /* ymm4 holds b0+b1 */ "vpclmulqdq $0, %%ymm1, %%ymm0, %%ymm3\n\t" /* ymm3 holds a0*b0 */ "vpclmulqdq $17, %%ymm0, %%ymm1, %%ymm1\n\t" /* ymm6 holds a1*b1 */ "vpclmulqdq $0, %%ymm2, %%ymm4, %%ymm4\n\t" /* ymm4 holds (a0+a1)*(b0+b1) */ "vpxor %%ymm1, %%ymm3, %%ymm5\n\t" /* ymm5 holds a0*b0+a1*b1 */ "vpxor %%ymm5, %%ymm4, %%ymm4\n\t" /* ymm4 holds a0*b0+a1*b1+(a0+a1)*(b0+b1) */ "vpslldq $8, %%ymm4, %%ymm5\n\t" "vpsrldq $8, %%ymm4, %%ymm4\n\t" "vpxor %%ymm5, %%ymm3, %%ymm3\n\t" "vpxor %%ymm4, %%ymm1, %%ymm1\n\t" /* holds the result of the carry-less multiplication of ymm0 by ymm1 */ /* first phase of the reduction */ "vpsllq $1, %%ymm3, %%ymm6\n\t" /* packed right shifting << 63 */ "vpxor %%ymm3, %%ymm6, %%ymm6\n\t" "vpsllq $57, %%ymm3, %%ymm5\n\t" /* packed right shifting << 57 */ "vpsllq $62, %%ymm6, %%ymm6\n\t" /* packed right shifting << 62 */ "vpxor %%ymm5, %%ymm6, %%ymm6\n\t" /* xor the shifted versions */ "vpshufd $0x6a, %%ymm6, %%ymm5\n\t" "vpshufd $0xae, %%ymm6, %%ymm6\n\t" "vpxor %%ymm5, %%ymm3, %%ymm3\n\t" /* first phase of the reduction complete */ /* second phase of the reduction */ "vpxor %%ymm3, %%ymm1, %%ymm1\n\t" /* xor the shifted versions */ "vpsrlq $1, %%ymm3, %%ymm3\n\t" /* packed left shifting >> 1 */ "vpxor %%ymm3, %%ymm6, %%ymm6\n\t" "vpsrlq $1, %%ymm3, %%ymm3\n\t" /* packed left shifting >> 2 */ "vpxor %%ymm3, %%ymm1, %%ymm1\n\t" "vpsrlq $5, %%ymm3, %%ymm3\n\t" /* packed left shifting >> 7 */ "vpxor %%ymm3, %%ymm6, %%ymm6\n\t" "vpxor %%ymm6, %%ymm1, %%ymm1\n\t" /* the result is in ymm1 */ ::: "memory" ); } static ASM_FUNC_ATTR_INLINE void gcm_lsh_avx2(void *h, unsigned int hoffs) { static const u64 pconst[4] __attribute__ ((aligned (32))) = { U64_C(0x0000000000000001), U64_C(0xc200000000000000), U64_C(0x0000000000000001), U64_C(0xc200000000000000) }; asm volatile ("vmovdqu %[h], %%ymm2\n\t" "vpshufd $0xff, %%ymm2, %%ymm3\n\t" "vpsrad $31, %%ymm3, %%ymm3\n\t" "vpslldq $8, %%ymm2, %%ymm4\n\t" "vpand %[pconst], %%ymm3, %%ymm3\n\t" "vpaddq %%ymm2, %%ymm2, %%ymm2\n\t" "vpsrlq $63, %%ymm4, %%ymm4\n\t" "vpxor %%ymm3, %%ymm2, %%ymm2\n\t" "vpxor %%ymm4, %%ymm2, %%ymm2\n\t" "vmovdqu %%ymm2, %[h]\n\t" : [h] "+m" (*((byte *)h + hoffs)) : [pconst] "m" (*pconst) : "memory" ); } static ASM_FUNC_ATTR_INLINE void load_h1h2_to_ymm1(gcry_cipher_hd_t c) { unsigned int key_pos = offsetof(struct gcry_cipher_handle, u_mode.gcm.u_ghash_key.key); unsigned int table_pos = offsetof(struct gcry_cipher_handle, u_mode.gcm.gcm_table); if (key_pos + 16 == table_pos) { /* Optimization: Table follows immediately after key. 
*/ asm volatile ("vmovdqu %[key], %%ymm1\n\t" : : [key] "m" (*c->u_mode.gcm.u_ghash_key.key) : "memory"); } else { asm volatile ("vmovdqa %[key], %%xmm1\n\t" "vinserti128 $1, 0*16(%[h_table]), %%ymm1, %%ymm1\n\t" : : [h_table] "r" (c->u_mode.gcm.gcm_table), [key] "m" (*c->u_mode.gcm.u_ghash_key.key) : "memory"); } } static ASM_FUNC_ATTR void ghash_setup_aggr8_avx2(gcry_cipher_hd_t c) { c->u_mode.gcm.hw_impl_flags |= GCM_INTEL_AGGR8_TABLE_INITIALIZED; asm volatile (/* load H⁴ */ "vbroadcasti128 3*16(%[h_table]), %%ymm0\n\t" : : [h_table] "r" (c->u_mode.gcm.gcm_table) : "memory"); /* load H <<< 1, H² <<< 1 */ load_h1h2_to_ymm1 (c); gfmul_pclmul_avx2 (); /* H<<<1•H⁴ => H⁵, H²<<<1•H⁴ => H⁶ */ asm volatile ("vmovdqu %%ymm1, 3*16(%[h_table])\n\t" /* load H³ <<< 1, H⁴ <<< 1 */ "vmovdqu 1*16(%[h_table]), %%ymm1\n\t" : : [h_table] "r" (c->u_mode.gcm.gcm_table) : "memory"); gfmul_pclmul_avx2 (); /* H³<<<1•H⁴ => H⁷, H⁴<<<1•H⁴ => H⁸ */ asm volatile ("vmovdqu %%ymm1, 6*16(%[h_table])\n\t" /* store H⁸ for aggr16 setup */ "vmovdqu %%ymm1, 5*16(%[h_table])\n\t" : : [h_table] "r" (c->u_mode.gcm.gcm_table) : "memory"); gcm_lsh_avx2 (c->u_mode.gcm.gcm_table, 3 * 16); /* H⁵ <<< 1, H⁶ <<< 1 */ gcm_lsh_avx2 (c->u_mode.gcm.gcm_table, 5 * 16); /* H⁷ <<< 1, H⁸ <<< 1 */ } static ASM_FUNC_ATTR void ghash_setup_aggr16_avx2(gcry_cipher_hd_t c) { c->u_mode.gcm.hw_impl_flags |= GCM_INTEL_AGGR16_TABLE_INITIALIZED; asm volatile (/* load H⁸ */ "vbroadcasti128 7*16(%[h_table]), %%ymm0\n\t" : : [h_table] "r" (c->u_mode.gcm.gcm_table) : "memory"); /* load H <<< 1, H² <<< 1 */ load_h1h2_to_ymm1 (c); gfmul_pclmul_avx2 (); /* H<<<1•H⁸ => H⁹, H²<<<1•H⁸ => H¹⁰ */ asm volatile ("vmovdqu %%ymm1, 7*16(%[h_table])\n\t" /* load H³ <<< 1, H⁴ <<< 1 */ "vmovdqu 1*16(%[h_table]), %%ymm1\n\t" : : [h_table] "r" (c->u_mode.gcm.gcm_table) : "memory"); gfmul_pclmul_avx2 (); /* H³<<<1•H⁸ => H¹¹, H⁴<<<1•H⁸ => H¹² */ asm volatile ("vmovdqu %%ymm1, 9*16(%[h_table])\n\t" /* load H⁵ <<< 1, H⁶ <<< 1 */ "vmovdqu 3*16(%[h_table]), %%ymm1\n\t" : : [h_table] "r" (c->u_mode.gcm.gcm_table) : "memory"); gfmul_pclmul_avx2 (); /* H⁵<<<1•H⁸ => H¹³, H⁶<<<1•H⁸ => H¹⁴ */ asm volatile ("vmovdqu %%ymm1, 11*16(%[h_table])\n\t" /* load H⁷ <<< 1, H⁸ <<< 1 */ "vmovdqu 5*16(%[h_table]), %%ymm1\n\t" : : [h_table] "r" (c->u_mode.gcm.gcm_table) : "memory"); gfmul_pclmul_avx2 (); /* H⁷<<<1•H⁸ => H¹⁵, H⁸<<<1•H⁸ => H¹⁶ */ - asm volatile ("vmovdqu %%ymm1, 13*16(%[h_table])\n\t" + asm volatile ("vmovdqu %%ymm1, 14*16(%[h_table])\n\t" /* store H¹⁶ for aggr32 setup */ + "vmovdqu %%ymm1, 13*16(%[h_table])\n\t" : : [h_table] "r" (c->u_mode.gcm.gcm_table) : "memory"); gcm_lsh_avx2 (c->u_mode.gcm.gcm_table, 7 * 16); /* H⁹ <<< 1, H¹⁰ <<< 1 */ gcm_lsh_avx2 (c->u_mode.gcm.gcm_table, 9 * 16); /* H¹¹ <<< 1, H¹² <<< 1 */ gcm_lsh_avx2 (c->u_mode.gcm.gcm_table, 11 * 16); /* H¹³ <<< 1, H¹⁴ <<< 1 */ gcm_lsh_avx2 (c->u_mode.gcm.gcm_table, 13 * 16); /* H¹⁵ <<< 1, H¹⁶ <<< 1 */ } #endif /* GCM_USE_INTEL_VPCLMUL_AVX2 */ + +#ifdef GCM_USE_INTEL_VPCLMUL_AVX512 + +#define GFMUL_AGGR32_ASM_VPCMUL_AVX512(be_to_le) \ + /* perform clmul and merge results... 
*/ \ + "vmovdqu64 0*16(%[buf]), %%zmm5\n\t" \ + "vmovdqu64 4*16(%[buf]), %%zmm2\n\t" \ + be_to_le("vpshufb %%zmm15, %%zmm5, %%zmm5\n\t") /* be => le */ \ + be_to_le("vpshufb %%zmm15, %%zmm2, %%zmm2\n\t") /* be => le */ \ + "vpxorq %%zmm5, %%zmm1, %%zmm1\n\t" \ + \ + "vpshufd $78, %%zmm0, %%zmm5\n\t" \ + "vpshufd $78, %%zmm1, %%zmm4\n\t" \ + "vpxorq %%zmm0, %%zmm5, %%zmm5\n\t" /* zmm5 holds 29|…|32:a0+a1 */ \ + "vpxorq %%zmm1, %%zmm4, %%zmm4\n\t" /* zmm4 holds 29|…|32:b0+b1 */ \ + "vpclmulqdq $0, %%zmm1, %%zmm0, %%zmm3\n\t" /* zmm3 holds 29|…|32:a0*b0 */ \ + "vpclmulqdq $17, %%zmm0, %%zmm1, %%zmm1\n\t" /* zmm1 holds 29|…|32:a1*b1 */ \ + "vpclmulqdq $0, %%zmm5, %%zmm4, %%zmm4\n\t" /* zmm4 holds 29|…|32:(a0+a1)*(b0+b1) */ \ + \ + "vpshufd $78, %%zmm13, %%zmm14\n\t" \ + "vpshufd $78, %%zmm2, %%zmm7\n\t" \ + "vpxorq %%zmm13, %%zmm14, %%zmm14\n\t" /* zmm14 holds 25|…|28:a0+a1 */ \ + "vpxorq %%zmm2, %%zmm7, %%zmm7\n\t" /* zmm7 holds 25|…|28:b0+b1 */ \ + "vpclmulqdq $0, %%zmm2, %%zmm13, %%zmm17\n\t" /* zmm17 holds 25|…|28:a0*b0 */ \ + "vpclmulqdq $17, %%zmm13, %%zmm2, %%zmm18\n\t" /* zmm18 holds 25|…|28:a1*b1 */ \ + "vpclmulqdq $0, %%zmm14, %%zmm7, %%zmm19\n\t" /* zmm19 holds 25|…|28:(a0+a1)*(b0+b1) */\ + \ + "vmovdqu64 8*16(%[buf]), %%zmm5\n\t" \ + "vmovdqu64 12*16(%[buf]), %%zmm2\n\t" \ + be_to_le("vpshufb %%zmm15, %%zmm5, %%zmm5\n\t") /* be => le */ \ + be_to_le("vpshufb %%zmm15, %%zmm2, %%zmm2\n\t") /* be => le */ \ + \ + "vpshufd $78, %%zmm12, %%zmm14\n\t" \ + "vpshufd $78, %%zmm5, %%zmm7\n\t" \ + "vpxorq %%zmm12, %%zmm14, %%zmm14\n\t" /* zmm14 holds 21|…|24:a0+a1 */ \ + "vpxorq %%zmm5, %%zmm7, %%zmm7\n\t" /* zmm7 holds 21|…|24:b0+b1 */ \ + "vpclmulqdq $0, %%zmm5, %%zmm12, %%zmm6\n\t" /* zmm6 holds 21|…|24:a0*b0 */ \ + "vpclmulqdq $17, %%zmm12, %%zmm5, %%zmm5\n\t" /* zmm5 holds 21|…|24:a1*b1 */ \ + "vpclmulqdq $0, %%zmm14, %%zmm7, %%zmm7\n\t" /* zmm7 holds 21|…|24:(a0+a1)*(b0+b1) */\ + \ + "vpternlogq $0x96, %%zmm6, %%zmm17, %%zmm3\n\t" /* zmm3 holds 21+…|…|…+32:a0*b0 */ \ + "vpternlogq $0x96, %%zmm5, %%zmm18, %%zmm1\n\t" /* zmm1 holds 21+…|…|…+32:a1*b1 */ \ + "vpternlogq $0x96, %%zmm7, %%zmm19, %%zmm4\n\t" /* zmm4 holds 21+…|…|…+32:(a0+a1)*(b0+b1) */\ + \ + "vpshufd $78, %%zmm11, %%zmm14\n\t" \ + "vpshufd $78, %%zmm2, %%zmm7\n\t" \ + "vpxorq %%zmm11, %%zmm14, %%zmm14\n\t" /* zmm14 holds 17|…|20:a0+a1 */ \ + "vpxorq %%zmm2, %%zmm7, %%zmm7\n\t" /* zmm7 holds 17|…|20:b0+b1 */ \ + "vpclmulqdq $0, %%zmm2, %%zmm11, %%zmm17\n\t" /* zmm17 holds 17|…|20:a0*b0 */ \ + "vpclmulqdq $17, %%zmm11, %%zmm2, %%zmm18\n\t" /* zmm18 holds 17|…|20:a1*b1 */ \ + "vpclmulqdq $0, %%zmm14, %%zmm7, %%zmm19\n\t" /* zmm19 holds 17|…|20:(a0+a1)*(b0+b1) */\ + \ + "vmovdqu64 16*16(%[buf]), %%zmm5\n\t" \ + "vmovdqu64 20*16(%[buf]), %%zmm2\n\t" \ + be_to_le("vpshufb %%zmm15, %%zmm5, %%zmm5\n\t") /* be => le */ \ + be_to_le("vpshufb %%zmm15, %%zmm2, %%zmm2\n\t") /* be => le */ \ + \ + "vpshufd $78, %%zmm10, %%zmm14\n\t" \ + "vpshufd $78, %%zmm5, %%zmm7\n\t" \ + "vpxorq %%zmm10, %%zmm14, %%zmm14\n\t" /* zmm14 holds 13|…|16:a0+a1 */ \ + "vpxorq %%zmm5, %%zmm7, %%zmm7\n\t" /* zmm7 holds 13|…|16:b0+b1 */ \ + "vpclmulqdq $0, %%zmm5, %%zmm10, %%zmm6\n\t" /* zmm6 holds 13|…|16:a0*b0 */ \ + "vpclmulqdq $17, %%zmm10, %%zmm5, %%zmm5\n\t" /* zmm5 holds 13|…|16:a1*b1 */ \ + "vpclmulqdq $0, %%zmm14, %%zmm7, %%zmm7\n\t" /* zmm7 holds 13|…|16:(a0+a1)*(b0+b1) */ \ + \ + "vpternlogq $0x96, %%zmm6, %%zmm17, %%zmm3\n\t" /* zmm3 holds 13+…|…|…+32:a0*b0 */ \ + "vpternlogq $0x96, %%zmm5, %%zmm18, %%zmm1\n\t" /* zmm1 holds 13+…|…|…+32:a1*b1 */ \ + 
"vpternlogq $0x96, %%zmm7, %%zmm19, %%zmm4\n\t" /* zmm4 holds 13+…|…|…+32:(a0+a1)*(b0+b1) */\ + \ + "vpshufd $78, %%zmm9, %%zmm14\n\t" \ + "vpshufd $78, %%zmm2, %%zmm7\n\t" \ + "vpxorq %%zmm9, %%zmm14, %%zmm14\n\t" /* zmm14 holds 9|…|12:a0+a1 */ \ + "vpxorq %%zmm2, %%zmm7, %%zmm7\n\t" /* zmm7 holds 9|…|12:b0+b1 */ \ + "vpclmulqdq $0, %%zmm2, %%zmm9, %%zmm17\n\t" /* zmm17 holds 9|…|12:a0*b0 */ \ + "vpclmulqdq $17, %%zmm9, %%zmm2, %%zmm18\n\t" /* zmm18 holds 9|…|12:a1*b1 */ \ + "vpclmulqdq $0, %%zmm14, %%zmm7, %%zmm19\n\t" /* zmm19 holds 9|…|12:(a0+a1)*(b0+b1) */\ + \ + "vmovdqu64 24*16(%[buf]), %%zmm5\n\t" \ + "vmovdqu64 28*16(%[buf]), %%zmm2\n\t" \ + be_to_le("vpshufb %%zmm15, %%zmm5, %%zmm5\n\t") /* be => le */ \ + be_to_le("vpshufb %%zmm15, %%zmm2, %%zmm2\n\t") /* be => le */ \ + \ + "vpshufd $78, %%zmm8, %%zmm14\n\t" \ + "vpshufd $78, %%zmm5, %%zmm7\n\t" \ + "vpxorq %%zmm8, %%zmm14, %%zmm14\n\t" /* zmm14 holds 5|…|8:a0+a1 */ \ + "vpxorq %%zmm5, %%zmm7, %%zmm7\n\t" /* zmm7 holds 5|…|8:b0+b1 */ \ + "vpclmulqdq $0, %%zmm5, %%zmm8, %%zmm6\n\t" /* zmm6 holds 5|…|8:a0*b0 */ \ + "vpclmulqdq $17, %%zmm8, %%zmm5, %%zmm5\n\t" /* zmm5 holds 5|…|8:a1*b1 */ \ + "vpclmulqdq $0, %%zmm14, %%zmm7, %%zmm7\n\t" /* zmm7 holds 5|…|8:(a0+a1)*(b0+b1) */ \ + \ + "vpternlogq $0x96, %%zmm6, %%zmm17, %%zmm3\n\t" /* zmm3 holds 5+…|…|…+32:a0*b0 */ \ + "vpternlogq $0x96, %%zmm5, %%zmm18, %%zmm1\n\t" /* zmm1 holds 5+…|…|…+32:a1*b1 */ \ + "vpternlogq $0x96, %%zmm7, %%zmm19, %%zmm4\n\t" /* zmm4 holds 5+…|…|…+32:(a0+a1)*(b0+b1) */\ + \ + "vpshufd $78, %%zmm16, %%zmm14\n\t" \ + "vpshufd $78, %%zmm2, %%zmm7\n\t" \ + "vpxorq %%zmm16, %%zmm14, %%zmm14\n\t" /* zmm14 holds 1|…|4:a0+a1 */ \ + "vpxorq %%zmm2, %%zmm7, %%zmm7\n\t" /* zmm7 holds 1|2:b0+b1 */ \ + "vpclmulqdq $0, %%zmm2, %%zmm16, %%zmm6\n\t" /* zmm6 holds 1|2:a0*b0 */ \ + "vpclmulqdq $17, %%zmm16, %%zmm2, %%zmm2\n\t" /* zmm2 holds 1|2:a1*b1 */ \ + "vpclmulqdq $0, %%zmm14, %%zmm7, %%zmm7\n\t" /* zmm7 holds 1|2:(a0+a1)*(b0+b1) */ \ + \ + "vpxorq %%zmm6, %%zmm3, %%zmm3\n\t" /* zmm3 holds 1+3+…+15|2+4+…+16:a0*b0 */ \ + "vpxorq %%zmm2, %%zmm1, %%zmm1\n\t" /* zmm1 holds 1+3+…+15|2+4+…+16:a1*b1 */ \ + "vpxorq %%zmm7, %%zmm4, %%zmm4\n\t" /* zmm4 holds 1+3+…+15|2+4+…+16:(a0+a1)*(b0+b1) */\ + \ + /* aggregated reduction... 
*/ \ + "vpternlogq $0x96, %%zmm1, %%zmm3, %%zmm4\n\t" /* zmm4 holds \ + * a0*b0+a1*b1+(a0+a1)*(b0+b1) */ \ + "vpslldq $8, %%zmm4, %%zmm5\n\t" \ + "vpsrldq $8, %%zmm4, %%zmm4\n\t" \ + "vpxorq %%zmm5, %%zmm3, %%zmm3\n\t" \ + "vpxorq %%zmm4, %%zmm1, %%zmm1\n\t" /* holds the result of the \ + carry-less multiplication of zmm0 \ + by zmm1 */ \ + \ + /* first phase of the reduction */ \ + "vpsllq $1, %%zmm3, %%zmm6\n\t" /* packed right shifting << 63 */ \ + "vpxorq %%zmm3, %%zmm6, %%zmm6\n\t" \ + "vpsllq $57, %%zmm3, %%zmm5\n\t" /* packed right shifting << 57 */ \ + "vpsllq $62, %%zmm6, %%zmm6\n\t" /* packed right shifting << 62 */ \ + "vpxorq %%zmm5, %%zmm6, %%zmm6\n\t" /* xor the shifted versions */ \ + "vpshufd $0x6a, %%zmm6, %%zmm5\n\t" \ + "vpshufd $0xae, %%zmm6, %%zmm6\n\t" \ + "vpxorq %%zmm5, %%zmm3, %%zmm3\n\t" /* first phase of the reduction complete */ \ + \ + /* second phase of the reduction */ \ + "vpsrlq $1, %%zmm3, %%zmm2\n\t" /* packed left shifting >> 1 */ \ + "vpsrlq $2, %%zmm3, %%zmm4\n\t" /* packed left shifting >> 2 */ \ + "vpsrlq $7, %%zmm3, %%zmm5\n\t" /* packed left shifting >> 7 */ \ + "vpternlogq $0x96, %%zmm3, %%zmm2, %%zmm1\n\t" /* xor the shifted versions */ \ + "vpternlogq $0x96, %%zmm4, %%zmm5, %%zmm6\n\t" \ + "vpxorq %%zmm6, %%zmm1, %%zmm1\n\t" /* the result is in zmm1 */ \ + \ + /* merge 256-bit halves */ \ + "vextracti64x4 $1, %%zmm1, %%ymm2\n\t" \ + "vpxor %%ymm2, %%ymm1, %%ymm1\n\t" \ + /* merge 128-bit halves */ \ + "vextracti128 $1, %%ymm1, %%xmm2\n\t" \ + "vpxor %%xmm2, %%xmm1, %%xmm1\n\t" + +static ASM_FUNC_ATTR_INLINE void +gfmul_vpclmul_avx512_aggr32(const void *buf, const void *h_table) +{ + /* Input: + Hx: ZMM0, ZMM8, ZMM9, ZMM10, ZMM11, ZMM12, ZMM13, ZMM16 + bemask: ZMM15 + Hash: XMM1 + Output: + Hash: XMM1 + Inputs ZMM0, ZMM8, ZMM9, ZMM10, ZMM11, ZMM12, ZMM13, ZMM16 and YMM15 stay + unmodified. + */ + asm volatile (GFMUL_AGGR32_ASM_VPCMUL_AVX512(be_to_le) + : + : [buf] "r" (buf), + [h_table] "r" (h_table) + : "memory" ); +} + +static ASM_FUNC_ATTR_INLINE void +gfmul_vpclmul_avx512_aggr32_le(const void *buf, const void *h_table) +{ + /* Input: + Hx: ZMM0, ZMM8, ZMM9, ZMM10, ZMM11, ZMM12, ZMM13, ZMM16 + bemask: ZMM15 + Hash: XMM1 + Output: + Hash: XMM1 + Inputs ZMM0, ZMM8, ZMM9, ZMM10, ZMM11, ZMM12, ZMM13, ZMM16 and YMM15 stay + unmodified. + */ + asm volatile (GFMUL_AGGR32_ASM_VPCMUL_AVX512(le_to_le) + : + : [buf] "r" (buf), + [h_table] "r" (h_table) + : "memory" ); +} + +static ASM_FUNC_ATTR_INLINE +void gfmul_pclmul_avx512(void) +{ + /* Input: ZMM0 and ZMM1, Output: ZMM1. Input ZMM0 stays unmodified. + Input must be converted to little-endian. + */ + asm volatile (/* gfmul, zmm0 has operator a and zmm1 has operator b. 
*/ + "vpshufd $78, %%zmm0, %%zmm2\n\t" + "vpshufd $78, %%zmm1, %%zmm4\n\t" + "vpxorq %%zmm0, %%zmm2, %%zmm2\n\t" /* zmm2 holds a0+a1 */ + "vpxorq %%zmm1, %%zmm4, %%zmm4\n\t" /* zmm4 holds b0+b1 */ + + "vpclmulqdq $0, %%zmm1, %%zmm0, %%zmm3\n\t" /* zmm3 holds a0*b0 */ + "vpclmulqdq $17, %%zmm0, %%zmm1, %%zmm1\n\t" /* zmm6 holds a1*b1 */ + "vpclmulqdq $0, %%zmm2, %%zmm4, %%zmm4\n\t" /* zmm4 holds (a0+a1)*(b0+b1) */ + + "vpternlogq $0x96, %%zmm1, %%zmm3, %%zmm4\n\t" /* zmm4 holds + * a0*b0+a1*b1+(a0+a1)*(b0+b1) */ + "vpslldq $8, %%zmm4, %%zmm5\n\t" + "vpsrldq $8, %%zmm4, %%zmm4\n\t" + "vpxorq %%zmm5, %%zmm3, %%zmm3\n\t" + "vpxorq %%zmm4, %%zmm1, %%zmm1\n\t" /* holds the result of the + carry-less multiplication of zmm0 + by zmm1 */ + + /* first phase of the reduction */ + "vpsllq $1, %%zmm3, %%zmm6\n\t" /* packed right shifting << 63 */ + "vpxorq %%zmm3, %%zmm6, %%zmm6\n\t" + "vpsllq $57, %%zmm3, %%zmm5\n\t" /* packed right shifting << 57 */ + "vpsllq $62, %%zmm6, %%zmm6\n\t" /* packed right shifting << 62 */ + "vpxorq %%zmm5, %%zmm6, %%zmm6\n\t" /* xor the shifted versions */ + "vpshufd $0x6a, %%zmm6, %%zmm5\n\t" + "vpshufd $0xae, %%zmm6, %%zmm6\n\t" + "vpxorq %%zmm5, %%zmm3, %%zmm3\n\t" /* first phase of the reduction complete */ + + /* second phase of the reduction */ + "vpsrlq $1, %%zmm3, %%zmm2\n\t" /* packed left shifting >> 1 */ + "vpsrlq $2, %%zmm3, %%zmm4\n\t" /* packed left shifting >> 2 */ + "vpsrlq $7, %%zmm3, %%zmm5\n\t" /* packed left shifting >> 7 */ + "vpternlogq $0x96, %%zmm3, %%zmm2, %%zmm1\n\t" /* xor the shifted versions */ + "vpternlogq $0x96, %%zmm4, %%zmm5, %%zmm6\n\t" + "vpxorq %%zmm6, %%zmm1, %%zmm1\n\t" /* the result is in zmm1 */ + ::: "memory" ); +} + +static ASM_FUNC_ATTR_INLINE void +gcm_lsh_avx512(void *h, unsigned int hoffs) +{ + static const u64 pconst[8] __attribute__ ((aligned (64))) = + { + U64_C(0x0000000000000001), U64_C(0xc200000000000000), + U64_C(0x0000000000000001), U64_C(0xc200000000000000), + U64_C(0x0000000000000001), U64_C(0xc200000000000000), + U64_C(0x0000000000000001), U64_C(0xc200000000000000) + }; + + asm volatile ("vmovdqu64 %[h], %%zmm2\n\t" + "vpshufd $0xff, %%zmm2, %%zmm3\n\t" + "vpsrad $31, %%zmm3, %%zmm3\n\t" + "vpslldq $8, %%zmm2, %%zmm4\n\t" + "vpandq %[pconst], %%zmm3, %%zmm3\n\t" + "vpaddq %%zmm2, %%zmm2, %%zmm2\n\t" + "vpsrlq $63, %%zmm4, %%zmm4\n\t" + "vpternlogq $0x96, %%zmm4, %%zmm3, %%zmm2\n\t" + "vmovdqu64 %%zmm2, %[h]\n\t" + : [h] "+m" (*((byte *)h + hoffs)) + : [pconst] "m" (*pconst) + : "memory" ); +} + +static ASM_FUNC_ATTR_INLINE void +load_h1h4_to_zmm1(gcry_cipher_hd_t c) +{ + unsigned int key_pos = + offsetof(struct gcry_cipher_handle, u_mode.gcm.u_ghash_key.key); + unsigned int table_pos = + offsetof(struct gcry_cipher_handle, u_mode.gcm.gcm_table); + + if (key_pos + 16 == table_pos) + { + /* Optimization: Table follows immediately after key. 
*/ + asm volatile ("vmovdqu64 %[key], %%zmm1\n\t" + : + : [key] "m" (*c->u_mode.gcm.u_ghash_key.key) + : "memory"); + } + else + { + asm volatile ("vmovdqu64 -1*16(%[h_table]), %%zmm1\n\t" + "vinserti64x2 $0, %[key], %%zmm1, %%zmm1\n\t" + : + : [h_table] "r" (c->u_mode.gcm.gcm_table), + [key] "m" (*c->u_mode.gcm.u_ghash_key.key) + : "memory"); + } +} + +static ASM_FUNC_ATTR void +ghash_setup_aggr8_avx512(gcry_cipher_hd_t c) +{ + c->u_mode.gcm.hw_impl_flags |= GCM_INTEL_AGGR8_TABLE_INITIALIZED; + + asm volatile (/* load H⁴ */ + "vbroadcasti64x2 3*16(%[h_table]), %%zmm0\n\t" + : + : [h_table] "r" (c->u_mode.gcm.gcm_table) + : "memory"); + /* load H <<< 1, H² <<< 1, H³ <<< 1, H⁴ <<< 1 */ + load_h1h4_to_zmm1 (c); + + gfmul_pclmul_avx512 (); /* H<<<1•H⁴ => H⁵, …, H⁴<<<1•H⁴ => H⁸ */ + + asm volatile ("vmovdqu64 %%zmm1, 4*16(%[h_table])\n\t" /* store H⁸ for aggr16 setup */ + "vmovdqu64 %%zmm1, 3*16(%[h_table])\n\t" + : + : [h_table] "r" (c->u_mode.gcm.gcm_table) + : "memory"); + + gcm_lsh_avx512 (c->u_mode.gcm.gcm_table, 3 * 16); /* H⁵ <<< 1, …, H⁸ <<< 1 */ +} + +static ASM_FUNC_ATTR void +ghash_setup_aggr16_avx512(gcry_cipher_hd_t c) +{ + c->u_mode.gcm.hw_impl_flags |= GCM_INTEL_AGGR16_TABLE_INITIALIZED; + + asm volatile (/* load H⁸ */ + "vbroadcasti64x2 7*16(%[h_table]), %%zmm0\n\t" + : + : [h_table] "r" (c->u_mode.gcm.gcm_table) + : "memory"); + /* load H <<< 1, H² <<< 1, H³ <<< 1, H⁴ <<< 1 */ + load_h1h4_to_zmm1 (c); + + gfmul_pclmul_avx512 (); /* H<<<1•H⁸ => H⁹, … , H⁴<<<1•H⁸ => H¹² */ + + asm volatile ("vmovdqu64 %%zmm1, 7*16(%[h_table])\n\t" + /* load H⁵ <<< 1, …, H⁸ <<< 1 */ + "vmovdqu64 3*16(%[h_table]), %%zmm1\n\t" + : + : [h_table] "r" (c->u_mode.gcm.gcm_table) + : "memory"); + + gfmul_pclmul_avx512 (); /* H⁵<<<1•H⁸ => H¹¹, … , H⁸<<<1•H⁸ => H¹⁶ */ + + asm volatile ("vmovdqu64 %%zmm1, 12*16(%[h_table])\n\t" /* store H¹⁶ for aggr32 setup */ + "vmovdqu64 %%zmm1, 11*16(%[h_table])\n\t" + : + : [h_table] "r" (c->u_mode.gcm.gcm_table) + : "memory"); + + gcm_lsh_avx512 (c->u_mode.gcm.gcm_table, 7 * 16); /* H⁹ <<< 1, …, H¹² <<< 1 */ + gcm_lsh_avx512 (c->u_mode.gcm.gcm_table, 11 * 16); /* H¹³ <<< 1, …, H¹⁶ <<< 1 */ +} + +static ASM_FUNC_ATTR void +ghash_setup_aggr32_avx512(gcry_cipher_hd_t c) +{ + c->u_mode.gcm.hw_impl_flags |= GCM_INTEL_AGGR32_TABLE_INITIALIZED; + + asm volatile (/* load H¹⁶ */ + "vbroadcasti64x2 15*16(%[h_table]), %%zmm0\n\t" + : + : [h_table] "r" (c->u_mode.gcm.gcm_table) + : "memory"); + /* load H <<< 1, H² <<< 1, H³ <<< 1, H⁴ <<< 1 */ + load_h1h4_to_zmm1 (c); + + gfmul_pclmul_avx512 (); /* H<<<1•H¹⁶ => H¹⁷, …, H⁴<<<1•H¹⁶ => H²⁰ */ + + asm volatile ("vmovdqu64 %%zmm1, 15*16(%[h_table])\n\t" + /* load H⁵ <<< 1, …, H⁸ <<< 1 */ + "vmovdqu64 3*16(%[h_table]), %%zmm1\n\t" + : + : [h_table] "r" (c->u_mode.gcm.gcm_table) + : "memory"); + + gfmul_pclmul_avx512 (); /* H⁵<<<1•H¹⁶ => H²¹, …, H⁹<<<1•H¹⁶ => H²⁴ */ + + asm volatile ("vmovdqu64 %%zmm1, 19*16(%[h_table])\n\t" + /* load H⁹ <<< 1, …, H¹² <<< 1 */ + "vmovdqu64 7*16(%[h_table]), %%zmm1\n\t" + : + : [h_table] "r" (c->u_mode.gcm.gcm_table) + : "memory"); + + gfmul_pclmul_avx512 (); /* H⁹<<<1•H¹⁶ => H²⁵, …, H¹²<<<1•H¹⁶ => H²⁸ */ + + asm volatile ("vmovdqu64 %%zmm1, 23*16(%[h_table])\n\t" + /* load H¹³ <<< 1, …, H¹⁶ <<< 1 */ + "vmovdqu64 11*16(%[h_table]), %%zmm1\n\t" + : + : [h_table] "r" (c->u_mode.gcm.gcm_table) + : "memory"); + + gfmul_pclmul_avx512 (); /* H¹³<<<1•H¹⁶ => H²⁹, …, H¹⁶<<<1•H¹⁶ => H³² */ + + asm volatile ("vmovdqu64 %%zmm1, 27*16(%[h_table])\n\t" + : + : [h_table] "r" (c->u_mode.gcm.gcm_table) + : "memory"); + + 
gcm_lsh_avx512 (c->u_mode.gcm.gcm_table, 15 * 16); + gcm_lsh_avx512 (c->u_mode.gcm.gcm_table, 19 * 16); + gcm_lsh_avx512 (c->u_mode.gcm.gcm_table, 23 * 16); + gcm_lsh_avx512 (c->u_mode.gcm.gcm_table, 27 * 16); +} + +static const u64 swap128b_perm[8] __attribute__ ((aligned (64))) = + { + /* For swapping order of 128bit lanes in 512bit register using vpermq. */ + 6, 7, 4, 5, 2, 3, 0, 1 + }; + +#endif /* GCM_USE_INTEL_VPCLMUL_AVX512 */ #endif /* __x86_64__ */ static unsigned int ASM_FUNC_ATTR _gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf, size_t nblocks); static unsigned int ASM_FUNC_ATTR _gcry_polyval_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf, size_t nblocks); static ASM_FUNC_ATTR_INLINE void gcm_lsh(void *h, unsigned int hoffs) { static const u64 pconst[2] __attribute__ ((aligned (16))) = { U64_C(0x0000000000000001), U64_C(0xc200000000000000) }; asm volatile ("movdqu %[h], %%xmm2\n\t" "pshufd $0xff, %%xmm2, %%xmm3\n\t" "movdqa %%xmm2, %%xmm4\n\t" "psrad $31, %%xmm3\n\t" "pslldq $8, %%xmm4\n\t" "pand %[pconst], %%xmm3\n\t" "paddq %%xmm2, %%xmm2\n\t" "psrlq $63, %%xmm4\n\t" "pxor %%xmm3, %%xmm2\n\t" "pxor %%xmm4, %%xmm2\n\t" "movdqu %%xmm2, %[h]\n\t" : [h] "+m" (*((byte *)h + hoffs)) : [pconst] "m" (*pconst) : "memory" ); } void ASM_FUNC_ATTR _gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c, unsigned int hw_features) { static const unsigned char be_mask[16] __attribute__ ((aligned (16))) = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; #if defined(__x86_64__) && defined(__WIN64__) char win64tmp[10 * 16]; /* XMM6-XMM15 need to be restored after use. */ asm volatile ("movdqu %%xmm6, 0*16(%0)\n\t" "movdqu %%xmm7, 1*16(%0)\n\t" "movdqu %%xmm8, 2*16(%0)\n\t" "movdqu %%xmm9, 3*16(%0)\n\t" "movdqu %%xmm10, 4*16(%0)\n\t" "movdqu %%xmm11, 5*16(%0)\n\t" "movdqu %%xmm12, 6*16(%0)\n\t" "movdqu %%xmm13, 7*16(%0)\n\t" "movdqu %%xmm14, 8*16(%0)\n\t" "movdqu %%xmm15, 9*16(%0)\n\t" : : "r" (win64tmp) : "memory" ); #endif (void)hw_features; c->u_mode.gcm.hw_impl_flags = 0; c->u_mode.gcm.ghash_fn = _gcry_ghash_intel_pclmul; c->u_mode.gcm.polyval_fn = _gcry_polyval_intel_pclmul; /* Swap endianness of hsub. 
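swap128b_perm above is fed to vpermq as a qword index vector: destination qword i takes source qword perm[i], which reverses the four 128-bit lanes (pairs of qwords) of a ZMM register. The ghash loop needs this because gcm_table stores powers in ascending order while the aggregation must multiply the first-loaded (oldest) data block by the highest power. A tiny plain-C self-check of the index math (illustrative only):

#include <stdint.h>
#include <stdio.h>

int
main (void)
{
  static const uint64_t perm[8] = { 6, 7, 4, 5, 2, 3, 0, 1 };
  uint64_t src[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };  /* lane k = qwords 2k,2k+1 */
  uint64_t dst[8];
  int i;

  for (i = 0; i < 8; i++)
    dst[i] = src[perm[i]];  /* what vpermq does per qword */
  for (i = 0; i < 8; i++)
    printf ("%u ", (unsigned)dst[i]);  /* prints: 6 7 4 5 2 3 0 1 */
  printf ("\n");
  return 0;
}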
*/ asm volatile ("movdqu (%[key]), %%xmm0\n\t" "pshufb %[be_mask], %%xmm0\n\t" "movdqu %%xmm0, (%[key])\n\t" : : [key] "r" (c->u_mode.gcm.u_ghash_key.key), [be_mask] "m" (*be_mask) : "memory"); gcm_lsh (c->u_mode.gcm.u_ghash_key.key, 0); /* H <<< 1 */ asm volatile ("movdqa %%xmm0, %%xmm1\n\t" "movdqu (%[key]), %%xmm0\n\t" /* load H <<< 1 */ : : [key] "r" (c->u_mode.gcm.u_ghash_key.key) : "memory"); gfmul_pclmul (); /* H<<<1•H => H² */ asm volatile ("movdqu %%xmm1, 0*16(%[h_table])\n\t" : : [h_table] "r" (c->u_mode.gcm.gcm_table) : "memory"); gcm_lsh (c->u_mode.gcm.gcm_table, 0 * 16); /* H² <<< 1 */ if (0) { } #ifdef GCM_USE_INTEL_VPCLMUL_AVX2 else if ((hw_features & HWF_INTEL_VAES_VPCLMUL) && (hw_features & HWF_INTEL_AVX2)) { c->u_mode.gcm.hw_impl_flags |= GCM_INTEL_USE_VPCLMUL_AVX2; +#ifdef GCM_USE_INTEL_VPCLMUL_AVX512 + if (hw_features & HWF_INTEL_AVX512) + c->u_mode.gcm.hw_impl_flags |= GCM_INTEL_USE_VPCLMUL_AVX512; +#endif + asm volatile (/* H² */ "vinserti128 $1, %%xmm1, %%ymm1, %%ymm1\n\t" /* load H <<< 1, H² <<< 1 */ "vinserti128 $1, 0*16(%[h_table]), %%ymm0, %%ymm0\n\t" : : [h_table] "r" (c->u_mode.gcm.gcm_table) : "memory"); gfmul_pclmul_avx2 (); /* H<<<1•H² => H³, H²<<<1•H² => H⁴ */ asm volatile ("vmovdqu %%ymm1, 2*16(%[h_table])\n\t" /* store H⁴ for aggr8 setup */ "vmovdqu %%ymm1, 1*16(%[h_table])\n\t" : : [h_table] "r" (c->u_mode.gcm.gcm_table) : "memory"); gcm_lsh_avx2 (c->u_mode.gcm.gcm_table, 1 * 16); /* H³ <<< 1, H⁴ <<< 1 */ asm volatile ("vzeroupper\n\t" ::: "memory" ); } #endif /* GCM_USE_INTEL_VPCLMUL_AVX2 */ else { asm volatile ("movdqa %%xmm1, %%xmm7\n\t" ::: "memory"); gfmul_pclmul (); /* H<<<1•H² => H³ */ asm volatile ("movdqa %%xmm7, %%xmm0\n\t" "movdqu %%xmm1, 1*16(%[h_table])\n\t" "movdqu 0*16(%[h_table]), %%xmm1\n\t" /* load H² <<< 1 */ : : [h_table] "r" (c->u_mode.gcm.gcm_table) : "memory"); gfmul_pclmul (); /* H²<<<1•H² => H⁴ */ asm volatile ("movdqu %%xmm1, 3*16(%[h_table])\n\t" /* store H⁴ for aggr8 setup */ "movdqu %%xmm1, 2*16(%[h_table])\n\t" : : [h_table] "r" (c->u_mode.gcm.gcm_table) : "memory"); gcm_lsh (c->u_mode.gcm.gcm_table, 1 * 16); /* H³ <<< 1 */ gcm_lsh (c->u_mode.gcm.gcm_table, 2 * 16); /* H⁴ <<< 1 */ } /* Clear/restore used registers. 
*/ asm volatile ("pxor %%xmm0, %%xmm0\n\t" "pxor %%xmm1, %%xmm1\n\t" "pxor %%xmm2, %%xmm2\n\t" "pxor %%xmm3, %%xmm3\n\t" "pxor %%xmm4, %%xmm4\n\t" "pxor %%xmm5, %%xmm5\n\t" "pxor %%xmm6, %%xmm6\n\t" "pxor %%xmm7, %%xmm7\n\t" ::: "memory" ); #ifdef __x86_64__ #ifdef __WIN64__ asm volatile ("movdqu 0*16(%0), %%xmm6\n\t" "movdqu 1*16(%0), %%xmm7\n\t" "movdqu 2*16(%0), %%xmm8\n\t" "movdqu 3*16(%0), %%xmm9\n\t" "movdqu 4*16(%0), %%xmm10\n\t" "movdqu 5*16(%0), %%xmm11\n\t" "movdqu 6*16(%0), %%xmm12\n\t" "movdqu 7*16(%0), %%xmm13\n\t" "movdqu 8*16(%0), %%xmm14\n\t" "movdqu 9*16(%0), %%xmm15\n\t" : : "r" (win64tmp) : "memory" ); #else asm volatile ("pxor %%xmm8, %%xmm8\n\t" "pxor %%xmm9, %%xmm9\n\t" "pxor %%xmm10, %%xmm10\n\t" "pxor %%xmm11, %%xmm11\n\t" "pxor %%xmm12, %%xmm12\n\t" "pxor %%xmm13, %%xmm13\n\t" "pxor %%xmm14, %%xmm14\n\t" "pxor %%xmm15, %%xmm15\n\t" ::: "memory" ); #endif /* __WIN64__ */ #endif /* __x86_64__ */ } #ifdef __x86_64__ static ASM_FUNC_ATTR void ghash_setup_aggr8(gcry_cipher_hd_t c) { c->u_mode.gcm.hw_impl_flags |= GCM_INTEL_AGGR8_TABLE_INITIALIZED; asm volatile ("movdqa 3*16(%[h_table]), %%xmm0\n\t" /* load H⁴ */ "movdqu %[key], %%xmm1\n\t" /* load H <<< 1 */ : : [h_table] "r" (c->u_mode.gcm.gcm_table), [key] "m" (*c->u_mode.gcm.u_ghash_key.key) : "memory"); gfmul_pclmul (); /* H<<<1•H⁴ => H⁵ */ asm volatile ("movdqu %%xmm1, 3*16(%[h_table])\n\t" "movdqu 0*16(%[h_table]), %%xmm1\n\t" /* load H² <<< 1 */ : : [h_table] "r" (c->u_mode.gcm.gcm_table) : "memory"); gfmul_pclmul (); /* H²<<<1•H⁴ => H⁶ */ asm volatile ("movdqu %%xmm1, 4*16(%[h_table])\n\t" "movdqu 1*16(%[h_table]), %%xmm1\n\t" /* load H³ <<< 1 */ : : [h_table] "r" (c->u_mode.gcm.gcm_table) : "memory"); gfmul_pclmul (); /* H³<<<1•H⁴ => H⁷ */ asm volatile ("movdqu %%xmm1, 5*16(%[h_table])\n\t" "movdqu 2*16(%[h_table]), %%xmm1\n\t" /* load H⁴ <<< 1 */ : : [h_table] "r" (c->u_mode.gcm.gcm_table) : "memory"); gfmul_pclmul (); /* H⁴<<<1•H⁴ => H⁸ */ asm volatile ("movdqu %%xmm1, 6*16(%[h_table])\n\t" "movdqu %%xmm1, 7*16(%[h_table])\n\t" /* store H⁸ for aggr16 setup */ : : [h_table] "r" (c->u_mode.gcm.gcm_table) : "memory"); gcm_lsh (c->u_mode.gcm.gcm_table, 3 * 16); /* H⁵ <<< 1 */ gcm_lsh (c->u_mode.gcm.gcm_table, 4 * 16); /* H⁶ <<< 1 */ gcm_lsh (c->u_mode.gcm.gcm_table, 5 * 16); /* H⁷ <<< 1 */ gcm_lsh (c->u_mode.gcm.gcm_table, 6 * 16); /* H⁸ <<< 1 */ } #endif /* __x86_64__ */ unsigned int ASM_FUNC_ATTR _gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf, size_t nblocks) { static const unsigned char be_mask[16] __attribute__ ((aligned (16))) = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; const unsigned int blocksize = GCRY_GCM_BLOCK_LEN; #if defined(__x86_64__) && defined(__WIN64__) char win64tmp[10 * 16]; #endif if (nblocks == 0) return 0; #if defined(__x86_64__) && defined(__WIN64__) /* XMM6-XMM15 need to be restored after use. */ asm volatile ("movdqu %%xmm6, 0*16(%0)\n\t" "movdqu %%xmm7, 1*16(%0)\n\t" "movdqu %%xmm8, 2*16(%0)\n\t" "movdqu %%xmm9, 3*16(%0)\n\t" "movdqu %%xmm10, 4*16(%0)\n\t" "movdqu %%xmm11, 5*16(%0)\n\t" "movdqu %%xmm12, 6*16(%0)\n\t" "movdqu %%xmm13, 7*16(%0)\n\t" "movdqu %%xmm14, 8*16(%0)\n\t" "movdqu %%xmm15, 9*16(%0)\n\t" : : "r" (win64tmp) : "memory" ); #endif /* Preload hash. 
*/ asm volatile ("movdqa %[be_mask], %%xmm7\n\t" "movdqu %[hash], %%xmm1\n\t" "pshufb %%xmm7, %%xmm1\n\t" /* be => le */ : : [hash] "m" (*result), [be_mask] "m" (*be_mask) : "memory" ); #if defined(GCM_USE_INTEL_VPCLMUL_AVX2) if (nblocks >= 16 - && (c->u_mode.gcm.hw_impl_flags & GCM_INTEL_USE_VPCLMUL_AVX2)) + && ((c->u_mode.gcm.hw_impl_flags & GCM_INTEL_USE_VPCLMUL_AVX2) + || (c->u_mode.gcm.hw_impl_flags & GCM_INTEL_USE_VPCLMUL_AVX512))) { - u64 h1_h2_h15_h16[4*2]; - - asm volatile ("vinserti128 $1, %%xmm7, %%ymm7, %%ymm15\n\t" - "vmovdqa %%xmm1, %%xmm8\n\t" - ::: "memory" ); - - if (!(c->u_mode.gcm.hw_impl_flags & GCM_INTEL_AGGR8_TABLE_INITIALIZED)) +#if defined(GCM_USE_INTEL_VPCLMUL_AVX512) + if (nblocks >= 32 + && (c->u_mode.gcm.hw_impl_flags & GCM_INTEL_USE_VPCLMUL_AVX512)) { - ghash_setup_aggr8_avx2 (c); + asm volatile ("vpopcntb %%zmm7, %%zmm15\n\t" /* spec stop for old AVX512 CPUs */ + "vshufi64x2 $0, %%zmm7, %%zmm7, %%zmm15\n\t" + "vmovdqa %%xmm1, %%xmm8\n\t" + "vmovdqu64 %[swapperm], %%zmm14\n\t" + : + : [swapperm] "m" (swap128b_perm), + [h_table] "r" (c->u_mode.gcm.gcm_table) + : "memory" ); + + if (!(c->u_mode.gcm.hw_impl_flags & GCM_INTEL_AGGR32_TABLE_INITIALIZED)) + { + if (!(c->u_mode.gcm.hw_impl_flags & GCM_INTEL_AGGR16_TABLE_INITIALIZED)) + { + if (!(c->u_mode.gcm.hw_impl_flags & GCM_INTEL_AGGR8_TABLE_INITIALIZED)) + ghash_setup_aggr8_avx512 (c); /* Clobbers registers XMM0-XMM7. */ + + ghash_setup_aggr16_avx512 (c); /* Clobbers registers XMM0-XMM7. */ + } + + ghash_setup_aggr32_avx512 (c); /* Clobbers registers XMM0-XMM7. */ + } + + /* Preload H1-H32. */ + load_h1h4_to_zmm1 (c); + asm volatile ("vpermq %%zmm1, %%zmm14, %%zmm16\n\t" /* H1|H2|H3|H4 */ + "vmovdqa %%xmm8, %%xmm1\n\t" + "vpermq 27*16(%[h_table]), %%zmm14, %%zmm0\n\t" /* H28|H29|H31|H32 */ + "vpermq 23*16(%[h_table]), %%zmm14, %%zmm13\n\t" /* H25|H26|H27|H28 */ + "vpermq 19*16(%[h_table]), %%zmm14, %%zmm12\n\t" /* H21|H22|H23|H24 */ + "vpermq 15*16(%[h_table]), %%zmm14, %%zmm11\n\t" /* H17|H18|H19|H20 */ + "vpermq 11*16(%[h_table]), %%zmm14, %%zmm10\n\t" /* H13|H14|H15|H16 */ + "vpermq 7*16(%[h_table]), %%zmm14, %%zmm9\n\t" /* H9|H10|H11|H12 */ + "vpermq 3*16(%[h_table]), %%zmm14, %%zmm8\n\t" /* H4|H6|H7|H8 */ + : + : [h_1] "m" (*c->u_mode.gcm.u_ghash_key.key), + [h_table] "r" (c->u_mode.gcm.gcm_table) + : "memory" ); + + while (nblocks >= 32) + { + gfmul_vpclmul_avx512_aggr32 (buf, c->u_mode.gcm.gcm_table); + + buf += 32 * blocksize; + nblocks -= 32; + } + + asm volatile ("vmovdqa %%xmm15, %%xmm7\n\t" + "vpxorq %%zmm16, %%zmm16, %%zmm16\n\t" + "vpxorq %%zmm17, %%zmm17, %%zmm17\n\t" + "vpxorq %%zmm18, %%zmm18, %%zmm18\n\t" + "vpxorq %%zmm19, %%zmm19, %%zmm19\n\t" + : + : + : "memory" ); } - if (!(c->u_mode.gcm.hw_impl_flags & GCM_INTEL_AGGR16_TABLE_INITIALIZED)) +#endif /* GCM_USE_INTEL_VPCLMUL_AVX512 */ + + if (nblocks >= 16) { - ghash_setup_aggr16_avx2 (c); - } + u64 h1_h2_h15_h16[4*2]; - /* Preload H1, H2, H3, H4, H5, H6, H7, H8, H9, H10, H11, H12. 
*/ - asm volatile ("vmovdqa %%xmm8, %%xmm1\n\t" - "vmovdqu 0*16(%[h_table]), %%xmm7\n\t" - "vpxor %%xmm8, %%xmm8, %%xmm8\n\t" - "vperm2i128 $0x23, 13*16(%[h_table]), %%ymm8, %%ymm0\n\t" /* H15|H16 */ - "vperm2i128 $0x23, 11*16(%[h_table]), %%ymm8, %%ymm13\n\t" /* H13|H14 */ - "vperm2i128 $0x23, 9*16(%[h_table]), %%ymm8, %%ymm12\n\t" /* H11|H12 */ - "vperm2i128 $0x23, 7*16(%[h_table]), %%ymm8, %%ymm11\n\t" /* H9|H10 */ - "vperm2i128 $0x23, 5*16(%[h_table]), %%ymm8, %%ymm10\n\t" /* H7|H8 */ - "vperm2i128 $0x23, 3*16(%[h_table]), %%ymm8, %%ymm9\n\t" /* H5|H6 */ - "vperm2i128 $0x23, 1*16(%[h_table]), %%ymm8, %%ymm8\n\t" /* H3|H4 */ - "vinserti128 $1, %[h_1], %%ymm7, %%ymm7\n\t" /* H1|H2 */ - "vmovdqu %%ymm0, %[h15_h16]\n\t" - "vmovdqu %%ymm7, %[h1_h2]\n\t" - : [h1_h2] "=m" (h1_h2_h15_h16[0]), - [h15_h16] "=m" (h1_h2_h15_h16[4]) - : [h_1] "m" (*c->u_mode.gcm.u_ghash_key.key), - [h_table] "r" (c->u_mode.gcm.gcm_table) - : "memory" ); + asm volatile ("vinserti128 $1, %%xmm7, %%ymm7, %%ymm15\n\t" + "vmovdqa %%xmm1, %%xmm8\n\t" + ::: "memory" ); - while (nblocks >= 16) - { - gfmul_vpclmul_avx2_aggr16 (buf, c->u_mode.gcm.gcm_table, - h1_h2_h15_h16); + if (!(c->u_mode.gcm.hw_impl_flags & GCM_INTEL_AGGR16_TABLE_INITIALIZED)) + { + if (!(c->u_mode.gcm.hw_impl_flags & GCM_INTEL_AGGR8_TABLE_INITIALIZED)) + ghash_setup_aggr8_avx2 (c); /* Clobbers registers XMM0-XMM7. */ + + ghash_setup_aggr16_avx2 (c); /* Clobbers registers XMM0-XMM7. */ + } + + /* Preload H1-H16. */ + load_h1h2_to_ymm1 (c); + asm volatile ("vperm2i128 $0x23, %%ymm1, %%ymm1, %%ymm7\n\t" /* H1|H2 */ + "vmovdqa %%xmm8, %%xmm1\n\t" + "vpxor %%xmm8, %%xmm8, %%xmm8\n\t" + "vperm2i128 $0x23, 13*16(%[h_table]), %%ymm8, %%ymm0\n\t" /* H15|H16 */ + "vperm2i128 $0x23, 11*16(%[h_table]), %%ymm8, %%ymm13\n\t" /* H13|H14 */ + "vperm2i128 $0x23, 9*16(%[h_table]), %%ymm8, %%ymm12\n\t" /* H11|H12 */ + "vperm2i128 $0x23, 7*16(%[h_table]), %%ymm8, %%ymm11\n\t" /* H9|H10 */ + "vperm2i128 $0x23, 5*16(%[h_table]), %%ymm8, %%ymm10\n\t" /* H7|H8 */ + "vperm2i128 $0x23, 3*16(%[h_table]), %%ymm8, %%ymm9\n\t" /* H5|H6 */ + "vperm2i128 $0x23, 1*16(%[h_table]), %%ymm8, %%ymm8\n\t" /* H3|H4 */ + "vmovdqu %%ymm0, %[h15_h16]\n\t" + "vmovdqu %%ymm7, %[h1_h2]\n\t" + : [h1_h2] "=m" (h1_h2_h15_h16[0]), + [h15_h16] "=m" (h1_h2_h15_h16[4]) + : [h_1] "m" (*c->u_mode.gcm.u_ghash_key.key), + [h_table] "r" (c->u_mode.gcm.gcm_table) + : "memory" ); + + while (nblocks >= 16) + { + gfmul_vpclmul_avx2_aggr16 (buf, c->u_mode.gcm.gcm_table, + h1_h2_h15_h16); - buf += 16 * blocksize; - nblocks -= 16; + buf += 16 * blocksize; + nblocks -= 16; + } + + asm volatile ("vmovdqu %%ymm15, %[h15_h16]\n\t" + "vmovdqu %%ymm15, %[h1_h2]\n\t" + "vmovdqa %%xmm15, %%xmm7\n\t" + : + [h1_h2] "=m" (h1_h2_h15_h16[0]), + [h15_h16] "=m" (h1_h2_h15_h16[4]) + : + : "memory" ); } - /* Clear used x86-64/XMM registers. 
*/ - asm volatile("vmovdqu %%ymm15, %[h15_h16]\n\t" - "vmovdqu %%ymm15, %[h1_h2]\n\t" - "vzeroupper\n\t" -#ifndef __WIN64__ - "pxor %%xmm8, %%xmm8\n\t" - "pxor %%xmm9, %%xmm9\n\t" - "pxor %%xmm10, %%xmm10\n\t" - "pxor %%xmm11, %%xmm11\n\t" - "pxor %%xmm12, %%xmm12\n\t" - "pxor %%xmm13, %%xmm13\n\t" - "pxor %%xmm14, %%xmm14\n\t" - "pxor %%xmm15, %%xmm15\n\t" -#endif - "movdqa %[be_mask], %%xmm7\n\t" - : [h1_h2] "=m" (h1_h2_h15_h16[0]), - [h15_h16] "=m" (h1_h2_h15_h16[4]) - : [be_mask] "m" (*be_mask) - : "memory" ); + asm volatile ("vzeroupper\n\t" ::: "memory" ); } #endif /* GCM_USE_INTEL_VPCLMUL_AVX2 */ #ifdef __x86_64__ if (nblocks >= 8) { asm volatile ("movdqa %%xmm7, %%xmm15\n\t" + "movdqa %%xmm1, %%xmm8\n\t" ::: "memory" ); if (!(c->u_mode.gcm.hw_impl_flags & GCM_INTEL_AGGR8_TABLE_INITIALIZED)) - { - asm volatile ("movdqa %%xmm1, %%xmm8\n\t" - ::: "memory" ); - ghash_setup_aggr8 (c); - asm volatile ("movdqa %%xmm8, %%xmm1\n\t" - ::: "memory" ); - } + ghash_setup_aggr8 (c); /* Clobbers registers XMM0-XMM7. */ /* Preload H1. */ - asm volatile ("movdqa %[h_1], %%xmm0\n\t" - : - : [h_1] "m" (*c->u_mode.gcm.u_ghash_key.key) - : "memory" ); + asm volatile ("movdqa %%xmm8, %%xmm1\n\t" + "movdqa %[h_1], %%xmm0\n\t" + : + : [h_1] "m" (*c->u_mode.gcm.u_ghash_key.key) + : "memory" ); while (nblocks >= 8) { gfmul_pclmul_aggr8 (buf, c->u_mode.gcm.gcm_table); buf += 8 * blocksize; nblocks -= 8; } - -#ifndef __WIN64__ - /* Clear used x86-64/XMM registers. */ - asm volatile( "pxor %%xmm8, %%xmm8\n\t" - "pxor %%xmm9, %%xmm9\n\t" - "pxor %%xmm10, %%xmm10\n\t" - "pxor %%xmm11, %%xmm11\n\t" - "pxor %%xmm12, %%xmm12\n\t" - "pxor %%xmm13, %%xmm13\n\t" - "pxor %%xmm14, %%xmm14\n\t" - "pxor %%xmm15, %%xmm15\n\t" - ::: "memory" ); -#endif } #endif /* __x86_64__ */ while (nblocks >= 4) { gfmul_pclmul_aggr4 (buf, c->u_mode.gcm.u_ghash_key.key, c->u_mode.gcm.gcm_table, be_mask); buf += 4 * blocksize; nblocks -= 4; } if (nblocks) { /* Preload H1. */ asm volatile ("movdqa %[h_1], %%xmm0\n\t" : : [h_1] "m" (*c->u_mode.gcm.u_ghash_key.key) : "memory" ); while (nblocks) { asm volatile ("movdqu %[buf], %%xmm2\n\t" "pshufb %[be_mask], %%xmm2\n\t" /* be => le */ "pxor %%xmm2, %%xmm1\n\t" : : [buf] "m" (*buf), [be_mask] "m" (*be_mask) : "memory" ); gfmul_pclmul (); buf += blocksize; nblocks--; } } /* Store hash. */ asm volatile ("pshufb %[be_mask], %%xmm1\n\t" /* be => le */ "movdqu %%xmm1, %[hash]\n\t" : [hash] "=m" (*result) : [be_mask] "m" (*be_mask) : "memory" ); -#if defined(__x86_64__) && defined(__WIN64__) /* Clear/restore used registers. 
*/ - asm volatile( "pxor %%xmm0, %%xmm0\n\t" - "pxor %%xmm1, %%xmm1\n\t" - "pxor %%xmm2, %%xmm2\n\t" - "pxor %%xmm3, %%xmm3\n\t" - "pxor %%xmm4, %%xmm4\n\t" - "pxor %%xmm5, %%xmm5\n\t" - "movdqu 0*16(%0), %%xmm6\n\t" - "movdqu 1*16(%0), %%xmm7\n\t" - "movdqu 2*16(%0), %%xmm8\n\t" - "movdqu 3*16(%0), %%xmm9\n\t" - "movdqu 4*16(%0), %%xmm10\n\t" - "movdqu 5*16(%0), %%xmm11\n\t" - "movdqu 6*16(%0), %%xmm12\n\t" - "movdqu 7*16(%0), %%xmm13\n\t" - "movdqu 8*16(%0), %%xmm14\n\t" - "movdqu 9*16(%0), %%xmm15\n\t" - : - : "r" (win64tmp) - : "memory" ); + asm volatile ("pxor %%xmm0, %%xmm0\n\t" + "pxor %%xmm1, %%xmm1\n\t" + "pxor %%xmm2, %%xmm2\n\t" + "pxor %%xmm3, %%xmm3\n\t" + "pxor %%xmm4, %%xmm4\n\t" + "pxor %%xmm5, %%xmm5\n\t" + "pxor %%xmm6, %%xmm6\n\t" + "pxor %%xmm7, %%xmm7\n\t" + : + : + : "memory" ); +#ifdef __x86_64__ +#ifdef __WIN64__ + asm volatile ("movdqu 0*16(%0), %%xmm6\n\t" + "movdqu 1*16(%0), %%xmm7\n\t" + "movdqu 2*16(%0), %%xmm8\n\t" + "movdqu 3*16(%0), %%xmm9\n\t" + "movdqu 4*16(%0), %%xmm10\n\t" + "movdqu 5*16(%0), %%xmm11\n\t" + "movdqu 6*16(%0), %%xmm12\n\t" + "movdqu 7*16(%0), %%xmm13\n\t" + "movdqu 8*16(%0), %%xmm14\n\t" + "movdqu 9*16(%0), %%xmm15\n\t" + : + : "r" (win64tmp) + : "memory" ); #else /* Clear used registers. */ - asm volatile( "pxor %%xmm0, %%xmm0\n\t" - "pxor %%xmm1, %%xmm1\n\t" - "pxor %%xmm2, %%xmm2\n\t" - "pxor %%xmm3, %%xmm3\n\t" - "pxor %%xmm4, %%xmm4\n\t" - "pxor %%xmm5, %%xmm5\n\t" - "pxor %%xmm6, %%xmm6\n\t" - "pxor %%xmm7, %%xmm7\n\t" - ::: "memory" ); -#endif + asm volatile ( + "pxor %%xmm8, %%xmm8\n\t" + "pxor %%xmm9, %%xmm9\n\t" + "pxor %%xmm10, %%xmm10\n\t" + "pxor %%xmm11, %%xmm11\n\t" + "pxor %%xmm12, %%xmm12\n\t" + "pxor %%xmm13, %%xmm13\n\t" + "pxor %%xmm14, %%xmm14\n\t" + "pxor %%xmm15, %%xmm15\n\t" + : + : + : "memory" ); +#endif /* __WIN64__ */ +#endif /* __x86_64__ */ return 0; } unsigned int ASM_FUNC_ATTR _gcry_polyval_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf, size_t nblocks) { static const unsigned char be_mask[16] __attribute__ ((aligned (16))) = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; const unsigned int blocksize = GCRY_GCM_BLOCK_LEN; #if defined(__x86_64__) && defined(__WIN64__) char win64tmp[10 * 16]; #endif if (nblocks == 0) return 0; #if defined(__x86_64__) && defined(__WIN64__) /* XMM6-XMM15 need to be restored after use. */ asm volatile ("movdqu %%xmm6, 0*16(%0)\n\t" "movdqu %%xmm7, 1*16(%0)\n\t" "movdqu %%xmm8, 2*16(%0)\n\t" "movdqu %%xmm9, 3*16(%0)\n\t" "movdqu %%xmm10, 4*16(%0)\n\t" "movdqu %%xmm11, 5*16(%0)\n\t" "movdqu %%xmm12, 6*16(%0)\n\t" "movdqu %%xmm13, 7*16(%0)\n\t" "movdqu %%xmm14, 8*16(%0)\n\t" "movdqu %%xmm15, 9*16(%0)\n\t" : : "r" (win64tmp) : "memory" ); #endif /* Preload hash. 
*/ asm volatile ("pxor %%xmm7, %%xmm7\n\t" "movdqu %[hash], %%xmm1\n\t" "pshufb %[be_mask], %%xmm1\n\t" /* be => le */ : : [hash] "m" (*result), [be_mask] "m" (*be_mask) : "memory" ); #if defined(GCM_USE_INTEL_VPCLMUL_AVX2) if (nblocks >= 16 - && (c->u_mode.gcm.hw_impl_flags & GCM_INTEL_USE_VPCLMUL_AVX2)) + && ((c->u_mode.gcm.hw_impl_flags & GCM_INTEL_USE_VPCLMUL_AVX2) + || (c->u_mode.gcm.hw_impl_flags & GCM_INTEL_USE_VPCLMUL_AVX512))) { - u64 h1_h2_h15_h16[4*2]; - - asm volatile ("vmovdqa %%xmm1, %%xmm8\n\t" - ::: "memory" ); - - if (!(c->u_mode.gcm.hw_impl_flags & GCM_INTEL_AGGR8_TABLE_INITIALIZED)) +#if defined(GCM_USE_INTEL_VPCLMUL_AVX512) + if (nblocks >= 32 + && (c->u_mode.gcm.hw_impl_flags & GCM_INTEL_USE_VPCLMUL_AVX512)) { - ghash_setup_aggr8_avx2 (c); + asm volatile ("vpopcntb %%zmm7, %%zmm15\n\t" /* spec stop for old AVX512 CPUs */ + "vmovdqa %%xmm1, %%xmm8\n\t" + "vmovdqu64 %[swapperm], %%zmm14\n\t" + : + : [swapperm] "m" (swap128b_perm), + [h_table] "r" (c->u_mode.gcm.gcm_table) + : "memory" ); + + if (!(c->u_mode.gcm.hw_impl_flags & GCM_INTEL_AGGR32_TABLE_INITIALIZED)) + { + if (!(c->u_mode.gcm.hw_impl_flags & GCM_INTEL_AGGR16_TABLE_INITIALIZED)) + { + if (!(c->u_mode.gcm.hw_impl_flags & GCM_INTEL_AGGR8_TABLE_INITIALIZED)) + ghash_setup_aggr8_avx512 (c); /* Clobbers registers XMM0-XMM7. */ + + ghash_setup_aggr16_avx512 (c); /* Clobbers registers XMM0-XMM7. */ + } + + ghash_setup_aggr32_avx512 (c); /* Clobbers registers XMM0-XMM7. */ + } + + /* Preload H1-H32. */ + load_h1h4_to_zmm1 (c); + asm volatile ("vpermq %%zmm1, %%zmm14, %%zmm16\n\t" /* H1|H2|H3|H4 */ + "vmovdqa %%xmm8, %%xmm1\n\t" + "vpermq 27*16(%[h_table]), %%zmm14, %%zmm0\n\t" /* H28|H29|H31|H32 */ + "vpermq 23*16(%[h_table]), %%zmm14, %%zmm13\n\t" /* H25|H26|H27|H28 */ + "vpermq 19*16(%[h_table]), %%zmm14, %%zmm12\n\t" /* H21|H22|H23|H24 */ + "vpermq 15*16(%[h_table]), %%zmm14, %%zmm11\n\t" /* H17|H18|H19|H20 */ + "vpermq 11*16(%[h_table]), %%zmm14, %%zmm10\n\t" /* H13|H14|H15|H16 */ + "vpermq 7*16(%[h_table]), %%zmm14, %%zmm9\n\t" /* H9|H10|H11|H12 */ + "vpermq 3*16(%[h_table]), %%zmm14, %%zmm8\n\t" /* H4|H6|H7|H8 */ + : + : [h_1] "m" (*c->u_mode.gcm.u_ghash_key.key), + [h_table] "r" (c->u_mode.gcm.gcm_table) + : "memory" ); + + while (nblocks >= 32) + { + gfmul_vpclmul_avx512_aggr32_le (buf, c->u_mode.gcm.gcm_table); + + buf += 32 * blocksize; + nblocks -= 32; + } + + asm volatile ("vpxor %%xmm7, %%xmm7, %%xmm7\n\t" + "vpxorq %%zmm16, %%zmm16, %%zmm16\n\t" + "vpxorq %%zmm17, %%zmm17, %%zmm17\n\t" + "vpxorq %%zmm18, %%zmm18, %%zmm18\n\t" + "vpxorq %%zmm19, %%zmm19, %%zmm19\n\t" + : + : + : "memory" ); } - if (!(c->u_mode.gcm.hw_impl_flags & GCM_INTEL_AGGR16_TABLE_INITIALIZED)) - { - ghash_setup_aggr16_avx2 (c); - } - - /* Preload H1, H2, H3, H4, H5, H6, H7, H8, H9, H10, H11, H12. 
*/ - asm volatile ("vmovdqa %%xmm8, %%xmm1\n\t" - "vpxor %%xmm8, %%xmm8, %%xmm8\n\t" - "vmovdqu 0*16(%[h_table]), %%xmm7\n\t" - "vperm2i128 $0x23, 13*16(%[h_table]), %%ymm8, %%ymm0\n\t" /* H15|H16 */ - "vperm2i128 $0x23, 11*16(%[h_table]), %%ymm8, %%ymm13\n\t" /* H13|H14 */ - "vperm2i128 $0x23, 9*16(%[h_table]), %%ymm8, %%ymm12\n\t" /* H11|H12 */ - "vperm2i128 $0x23, 7*16(%[h_table]), %%ymm8, %%ymm11\n\t" /* H9|H10 */ - "vperm2i128 $0x23, 5*16(%[h_table]), %%ymm8, %%ymm10\n\t" /* H7|H8 */ - "vperm2i128 $0x23, 3*16(%[h_table]), %%ymm8, %%ymm9\n\t" /* H5|H6 */ - "vperm2i128 $0x23, 1*16(%[h_table]), %%ymm8, %%ymm8\n\t" /* H3|H4 */ - "vinserti128 $1, %[h_1], %%ymm7, %%ymm7\n\t" /* H1|H2 */ - "vmovdqu %%ymm0, %[h15_h16]\n\t" - "vmovdqu %%ymm7, %[h1_h2]\n\t" - : [h1_h2] "=m" (h1_h2_h15_h16[0]), - [h15_h16] "=m" (h1_h2_h15_h16[4]) - : [h_1] "m" (*c->u_mode.gcm.u_ghash_key.key), - [h_table] "r" (c->u_mode.gcm.gcm_table) - : "memory" ); +#endif - while (nblocks >= 16) + if (nblocks >= 16) { - gfmul_vpclmul_avx2_aggr16_le (buf, c->u_mode.gcm.gcm_table, - h1_h2_h15_h16); + u64 h1_h2_h15_h16[4*2]; + + asm volatile ("vmovdqa %%xmm1, %%xmm8\n\t" + ::: "memory" ); - buf += 16 * blocksize; - nblocks -= 16; + if (!(c->u_mode.gcm.hw_impl_flags & GCM_INTEL_AGGR16_TABLE_INITIALIZED)) + { + if (!(c->u_mode.gcm.hw_impl_flags & GCM_INTEL_AGGR8_TABLE_INITIALIZED)) + ghash_setup_aggr8_avx2 (c); /* Clobbers registers XMM0-XMM7. */ + + ghash_setup_aggr16_avx2 (c); /* Clobbers registers XMM0-XMM7. */ + } + + /* Preload H1-H16. */ + load_h1h2_to_ymm1 (c); + asm volatile ("vperm2i128 $0x23, %%ymm1, %%ymm1, %%ymm7\n\t" /* H1|H2 */ + "vmovdqa %%xmm8, %%xmm1\n\t" + "vpxor %%xmm8, %%xmm8, %%xmm8\n\t" + "vperm2i128 $0x23, 13*16(%[h_table]), %%ymm8, %%ymm0\n\t" /* H15|H16 */ + "vperm2i128 $0x23, 11*16(%[h_table]), %%ymm8, %%ymm13\n\t" /* H13|H14 */ + "vperm2i128 $0x23, 9*16(%[h_table]), %%ymm8, %%ymm12\n\t" /* H11|H12 */ + "vperm2i128 $0x23, 7*16(%[h_table]), %%ymm8, %%ymm11\n\t" /* H9|H10 */ + "vperm2i128 $0x23, 5*16(%[h_table]), %%ymm8, %%ymm10\n\t" /* H7|H8 */ + "vperm2i128 $0x23, 3*16(%[h_table]), %%ymm8, %%ymm9\n\t" /* H5|H6 */ + "vperm2i128 $0x23, 1*16(%[h_table]), %%ymm8, %%ymm8\n\t" /* H3|H4 */ + "vmovdqu %%ymm0, %[h15_h16]\n\t" + "vmovdqu %%ymm7, %[h1_h2]\n\t" + : [h1_h2] "=m" (h1_h2_h15_h16[0]), + [h15_h16] "=m" (h1_h2_h15_h16[4]) + : [h_1] "m" (*c->u_mode.gcm.u_ghash_key.key), + [h_table] "r" (c->u_mode.gcm.gcm_table) + : "memory" ); + + while (nblocks >= 16) + { + gfmul_vpclmul_avx2_aggr16_le (buf, c->u_mode.gcm.gcm_table, + h1_h2_h15_h16); + + buf += 16 * blocksize; + nblocks -= 16; + } + + asm volatile ("vpxor %%xmm7, %%xmm7, %%xmm7\n\t" + "vmovdqu %%ymm7, %[h15_h16]\n\t" + "vmovdqu %%ymm7, %[h1_h2]\n\t" + : [h1_h2] "=m" (h1_h2_h15_h16[0]), + [h15_h16] "=m" (h1_h2_h15_h16[4]) + : + : "memory" ); } - /* Clear used x86-64/XMM registers. 
*/ - asm volatile("vpxor %%xmm7, %%xmm7, %%xmm7\n\t" - "vmovdqu %%ymm7, %[h15_h16]\n\t" - "vmovdqu %%ymm7, %[h1_h2]\n\t" - "vzeroupper\n\t" -#ifndef __WIN64__ - "pxor %%xmm8, %%xmm8\n\t" - "pxor %%xmm9, %%xmm9\n\t" - "pxor %%xmm10, %%xmm10\n\t" - "pxor %%xmm11, %%xmm11\n\t" - "pxor %%xmm12, %%xmm12\n\t" - "pxor %%xmm13, %%xmm13\n\t" - "pxor %%xmm14, %%xmm14\n\t" -#endif - : [h1_h2] "=m" (h1_h2_h15_h16[0]), - [h15_h16] "=m" (h1_h2_h15_h16[4]) - : - : "memory" ); + asm volatile ("vzeroupper\n\t" ::: "memory" ); } #endif /* GCM_USE_INTEL_VPCLMUL_AVX2 */ #ifdef __x86_64__ if (nblocks >= 8) { + asm volatile ("movdqa %%xmm1, %%xmm8\n\t" + ::: "memory" ); + if (!(c->u_mode.gcm.hw_impl_flags & GCM_INTEL_AGGR8_TABLE_INITIALIZED)) - { - asm volatile ("movdqa %%xmm1, %%xmm8\n\t" - ::: "memory" ); - ghash_setup_aggr8 (c); - asm volatile ("movdqa %%xmm8, %%xmm1\n\t" - ::: "memory" ); - } + ghash_setup_aggr8 (c); /* Clobbers registers XMM0-XMM7. */ /* Preload H1. */ - asm volatile ("pxor %%xmm15, %%xmm15\n\t" - "movdqa %[h_1], %%xmm0\n\t" - : - : [h_1] "m" (*c->u_mode.gcm.u_ghash_key.key) - : "memory" ); + asm volatile ("movdqa %%xmm8, %%xmm1\n\t" + "pxor %%xmm15, %%xmm15\n\t" + "movdqa %[h_1], %%xmm0\n\t" + : + : [h_1] "m" (*c->u_mode.gcm.u_ghash_key.key) + : "memory" ); while (nblocks >= 8) { gfmul_pclmul_aggr8_le (buf, c->u_mode.gcm.gcm_table); buf += 8 * blocksize; nblocks -= 8; } -#ifndef __WIN64__ - /* Clear used x86-64/XMM registers. */ - asm volatile( "pxor %%xmm8, %%xmm8\n\t" - "pxor %%xmm9, %%xmm9\n\t" - "pxor %%xmm10, %%xmm10\n\t" - "pxor %%xmm11, %%xmm11\n\t" - "pxor %%xmm12, %%xmm12\n\t" - "pxor %%xmm13, %%xmm13\n\t" - "pxor %%xmm14, %%xmm14\n\t" - "pxor %%xmm15, %%xmm15\n\t" - ::: "memory" ); -#endif } #endif while (nblocks >= 4) { gfmul_pclmul_aggr4_le (buf, c->u_mode.gcm.u_ghash_key.key, c->u_mode.gcm.gcm_table); buf += 4 * blocksize; nblocks -= 4; } if (nblocks) { /* Preload H1. */ asm volatile ("movdqa %[h_1], %%xmm0\n\t" : : [h_1] "m" (*c->u_mode.gcm.u_ghash_key.key) : "memory" ); while (nblocks) { asm volatile ("movdqu %[buf], %%xmm2\n\t" "pxor %%xmm2, %%xmm1\n\t" : : [buf] "m" (*buf) : "memory" ); gfmul_pclmul (); buf += blocksize; nblocks--; } } /* Store hash. */ asm volatile ("pshufb %[be_mask], %%xmm1\n\t" /* be => le */ "movdqu %%xmm1, %[hash]\n\t" : [hash] "=m" (*result) : [be_mask] "m" (*be_mask) : "memory" ); -#if defined(__x86_64__) && defined(__WIN64__) /* Clear/restore used registers. 
*/ - asm volatile( "pxor %%xmm0, %%xmm0\n\t" - "pxor %%xmm1, %%xmm1\n\t" - "pxor %%xmm2, %%xmm2\n\t" - "pxor %%xmm3, %%xmm3\n\t" - "pxor %%xmm4, %%xmm4\n\t" - "pxor %%xmm5, %%xmm5\n\t" - "movdqu 0*16(%0), %%xmm6\n\t" - "movdqu 1*16(%0), %%xmm7\n\t" - "movdqu 2*16(%0), %%xmm8\n\t" - "movdqu 3*16(%0), %%xmm9\n\t" - "movdqu 4*16(%0), %%xmm10\n\t" - "movdqu 5*16(%0), %%xmm11\n\t" - "movdqu 6*16(%0), %%xmm12\n\t" - "movdqu 7*16(%0), %%xmm13\n\t" - "movdqu 8*16(%0), %%xmm14\n\t" - "movdqu 9*16(%0), %%xmm15\n\t" - : - : "r" (win64tmp) - : "memory" ); + asm volatile ("pxor %%xmm0, %%xmm0\n\t" + "pxor %%xmm1, %%xmm1\n\t" + "pxor %%xmm2, %%xmm2\n\t" + "pxor %%xmm3, %%xmm3\n\t" + "pxor %%xmm4, %%xmm4\n\t" + "pxor %%xmm5, %%xmm5\n\t" + "pxor %%xmm6, %%xmm6\n\t" + "pxor %%xmm7, %%xmm7\n\t" + : + : + : "memory" ); +#ifdef __x86_64__ +#ifdef __WIN64__ + asm volatile ("movdqu 0*16(%0), %%xmm6\n\t" + "movdqu 1*16(%0), %%xmm7\n\t" + "movdqu 2*16(%0), %%xmm8\n\t" + "movdqu 3*16(%0), %%xmm9\n\t" + "movdqu 4*16(%0), %%xmm10\n\t" + "movdqu 5*16(%0), %%xmm11\n\t" + "movdqu 6*16(%0), %%xmm12\n\t" + "movdqu 7*16(%0), %%xmm13\n\t" + "movdqu 8*16(%0), %%xmm14\n\t" + "movdqu 9*16(%0), %%xmm15\n\t" + : + : "r" (win64tmp) + : "memory" ); #else /* Clear used registers. */ - asm volatile( "pxor %%xmm0, %%xmm0\n\t" - "pxor %%xmm1, %%xmm1\n\t" - "pxor %%xmm2, %%xmm2\n\t" - "pxor %%xmm3, %%xmm3\n\t" - "pxor %%xmm4, %%xmm4\n\t" - "pxor %%xmm5, %%xmm5\n\t" - "pxor %%xmm6, %%xmm6\n\t" - "pxor %%xmm7, %%xmm7\n\t" - ::: "memory" ); -#endif + asm volatile ( + "pxor %%xmm8, %%xmm8\n\t" + "pxor %%xmm9, %%xmm9\n\t" + "pxor %%xmm10, %%xmm10\n\t" + "pxor %%xmm11, %%xmm11\n\t" + "pxor %%xmm12, %%xmm12\n\t" + "pxor %%xmm13, %%xmm13\n\t" + "pxor %%xmm14, %%xmm14\n\t" + "pxor %%xmm15, %%xmm15\n\t" + : + : + : "memory" ); +#endif /* __WIN64__ */ +#endif /* __x86_64__ */ return 0; } #if __clang__ # pragma clang attribute pop #endif #endif /* GCM_USE_INTEL_PCLMUL */ diff --git a/cipher/cipher-internal.h b/cipher/cipher-internal.h index e31ac860..e1ff0437 100644 --- a/cipher/cipher-internal.h +++ b/cipher/cipher-internal.h @@ -1,957 +1,965 @@ /* cipher-internal.h - Internal defs for cipher.c * Copyright (C) 2011 Free Software Foundation, Inc. * * This file is part of Libgcrypt. * * Libgcrypt is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser general Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * Libgcrypt is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program; if not, see . */ #ifndef G10_CIPHER_INTERNAL_H #define G10_CIPHER_INTERNAL_H #include "./poly1305-internal.h" /* The maximum supported size of a block in bytes. */ #define MAX_BLOCKSIZE 16 /* The length for an OCB block. Although OCB supports any block length it does not make sense to use a 64 bit blocklen (and cipher) because this reduces the security margin to an unacceptable state. Thus we require a cipher with 128 bit blocklength. */ #define OCB_BLOCK_LEN (128/8) /* The size of the pre-computed L table for OCB. This takes the same size as the table used for GCM and thus we don't save anything by not using such a table. 
*/ #define OCB_L_TABLE_SIZE 16 /* Check the above constants. */ #if OCB_BLOCK_LEN > MAX_BLOCKSIZE # error OCB_BLOCKLEN > MAX_BLOCKSIZE #endif /* Magic values for the context structure. */ #define CTX_MAGIC_NORMAL 0x24091964 #define CTX_MAGIC_SECURE 0x46919042 /* Try to use 16 byte aligned cipher context for better performance. We use the aligned attribute, thus it is only possible to implement this with gcc. */ #undef NEED_16BYTE_ALIGNED_CONTEXT #ifdef HAVE_GCC_ATTRIBUTE_ALIGNED # define NEED_16BYTE_ALIGNED_CONTEXT 1 #endif /* Undef this symbol to trade GCM speed for 256 bytes of memory per context */ #define GCM_USE_TABLES 1 /* GCM_USE_INTEL_PCLMUL indicates whether to compile GCM with Intel PCLMUL code. */ #undef GCM_USE_INTEL_PCLMUL #if defined(ENABLE_PCLMUL_SUPPORT) && defined(GCM_USE_TABLES) # if ((defined(__i386__) && SIZEOF_UNSIGNED_LONG == 4) || defined(__x86_64__)) # if __GNUC__ >= 4 # define GCM_USE_INTEL_PCLMUL 1 # endif # endif #endif /* GCM_USE_INTEL_PCLMUL */ /* GCM_USE_INTEL_VPCLMUL_AVX2 indicates whether to compile GCM with Intel VPCLMUL/AVX2 code. */ #undef GCM_USE_INTEL_VPCLMUL_AVX2 #if defined(__x86_64__) && defined(GCM_USE_INTEL_PCLMUL) && \ defined(ENABLE_AVX2_SUPPORT) && defined(HAVE_GCC_INLINE_ASM_VAES_VPCLMUL) # define GCM_USE_INTEL_VPCLMUL_AVX2 1 #endif /* GCM_USE_INTEL_VPCLMUL_AVX2 */ +/* GCM_USE_INTEL_VPCLMUL_AVX512 indicates whether to compile GCM with Intel + VPCLMUL/AVX512 code. */ +#undef GCM_USE_INTEL_VPCLMUL_AVX512 +#if defined(__x86_64__) && defined(GCM_USE_INTEL_VPCLMUL_AVX2) && \ + defined(ENABLE_AVX512_SUPPORT) && defined(HAVE_GCC_INLINE_ASM_AVX512) +# define GCM_USE_INTEL_VPCLMUL_AVX512 1 +#endif /* GCM_USE_INTEL_VPCLMUL_AVX512 */ + /* GCM_USE_ARM_PMULL indicates whether to compile GCM with ARMv8 PMULL code. */ #undef GCM_USE_ARM_PMULL #if defined(ENABLE_ARM_CRYPTO_SUPPORT) && defined(GCM_USE_TABLES) # if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) \ && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) \ && defined(HAVE_GCC_INLINE_ASM_AARCH32_CRYPTO) # define GCM_USE_ARM_PMULL 1 # elif defined(__AARCH64EL__) && \ defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \ defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO) # define GCM_USE_ARM_PMULL 1 # endif #endif /* GCM_USE_ARM_PMULL */ /* GCM_USE_ARM_NEON indicates whether to compile GCM with ARMv7 NEON code. */ #undef GCM_USE_ARM_NEON #if defined(GCM_USE_TABLES) #if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) && \ defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \ defined(HAVE_GCC_INLINE_ASM_NEON) # define GCM_USE_ARM_NEON 1 #endif #endif /* GCM_USE_ARM_NEON */ /* GCM_USE_S390X_CRYPTO indicates whether to enable zSeries code. */ #undef GCM_USE_S390X_CRYPTO #if defined(HAVE_GCC_INLINE_ASM_S390X) # define GCM_USE_S390X_CRYPTO 1 #endif /* GCM_USE_S390X_CRYPTO */ /* GCM_USE_PPC_VPMSUM indicates whether to compile GCM with PPC Power 8 * polynomial multiplication instruction. */ #undef GCM_USE_PPC_VPMSUM #if defined(GCM_USE_TABLES) #if defined(ENABLE_PPC_CRYPTO_SUPPORT) && defined(__powerpc64__) && \ defined(HAVE_COMPATIBLE_CC_PPC_ALTIVEC) && \ defined(HAVE_GCC_INLINE_ASM_PPC_ALTIVEC) && __GNUC__ >= 4 # define GCM_USE_PPC_VPMSUM 1 # define NEED_16BYTE_ALIGNED_CONTEXT 1 /* this also aligns gcm_table */ #endif #endif /* GCM_USE_PPC_VPMSUM */ typedef unsigned int (*ghash_fn_t) (gcry_cipher_hd_t c, byte *result, const byte *buf, size_t nblocks); /* A structure with function pointers for mode operations. 
*/ typedef struct cipher_mode_ops { gcry_err_code_t (*encrypt)(gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen, const unsigned char *inbuf, size_t inbuflen); gcry_err_code_t (*decrypt)(gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen, const unsigned char *inbuf, size_t inbuflen); gcry_err_code_t (*setiv)(gcry_cipher_hd_t c, const unsigned char *iv, size_t ivlen); gcry_err_code_t (*authenticate)(gcry_cipher_hd_t c, const unsigned char *abuf, size_t abuflen); gcry_err_code_t (*get_tag)(gcry_cipher_hd_t c, unsigned char *outtag, size_t taglen); gcry_err_code_t (*check_tag)(gcry_cipher_hd_t c, const unsigned char *intag, size_t taglen); } cipher_mode_ops_t; /* A structure with function pointers for bulk operations. The cipher algorithm setkey function initializes them when bulk operations are available and the actual encryption routines use them if they are not NULL. */ typedef struct cipher_bulk_ops { void (*cfb_enc)(void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks); void (*cfb_dec)(void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks); void (*cbc_enc)(void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int cbc_mac); void (*cbc_dec)(void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks); void (*ofb_enc)(void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks); void (*ctr_enc)(void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks); void (*ctr32le_enc)(void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks); size_t (*ocb_crypt)(gcry_cipher_hd_t c, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int encrypt); size_t (*ocb_auth)(gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks); void (*xts_crypt)(void *context, unsigned char *tweak, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int encrypt); size_t (*gcm_crypt)(gcry_cipher_hd_t c, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int encrypt); } cipher_bulk_ops_t; /* A VIA processor with the Padlock engine as well as the Intel AES_NI instructions require an alignment of most data on a 16 byte boundary. Because we trick out the compiler while allocating the context, the align attribute as used in rijndael.c does not work on its own. Thus we need to make sure that the entire context structure is a aligned on that boundary. We achieve this by defining a new type and use that instead of our usual alignment type. */ typedef union { PROPERLY_ALIGNED_TYPE foo; #ifdef NEED_16BYTE_ALIGNED_CONTEXT char bar[16] __attribute__ ((aligned (16))); #endif char c[1]; } cipher_context_alignment_t; /* Storage structure for CMAC, for CMAC and EAX modes. */ typedef struct { /* The initialization vector. Also contains tag after finalization. */ union { cipher_context_alignment_t iv_align; unsigned char iv[MAX_BLOCKSIZE]; } u_iv; /* Subkeys for tag creation, not cleared by gcry_cipher_reset. */ unsigned char subkeys[2][MAX_BLOCKSIZE]; /* Space to save partial input lengths for MAC. */ unsigned char macbuf[MAX_BLOCKSIZE]; int mac_unused; /* Number of unprocessed bytes in MACBUF. */ unsigned int tag:1; /* Set to 1 if tag has been finalized. */ } gcry_cmac_context_t; /* The handle structure. */ struct gcry_cipher_handle { int magic; size_t actual_handle_size; /* Allocated size of this handle. */ size_t handle_offset; /* Offset to the malloced block. 
*/ gcry_cipher_spec_t *spec; /* The algorithm id. This is a hack required because the module interface does not easily allow to retrieve this value. */ int algo; /* A structure with function pointers for mode operations. */ cipher_mode_ops_t mode_ops; /* A structure with function pointers for bulk operations. Due to limitations of the module system (we don't want to change the API) we need to keep these function pointers here. */ cipher_bulk_ops_t bulk; int mode; unsigned int flags; struct { unsigned int key:1; /* Set to 1 if a key has been set. */ unsigned int iv:1; /* Set to 1 if a IV has been set. */ unsigned int tag:1; /* Set to 1 if a tag is finalized. */ unsigned int finalize:1; /* Next encrypt/decrypt has the final data. */ unsigned int allow_weak_key:1; /* Set to 1 if weak keys are allowed. */ } marks; /* The initialization vector. For best performance we make sure that it is properly aligned. In particular some implementations of bulk operations expect an 16 byte aligned IV. IV is also used to store CBC-MAC in CCM mode; counter IV is stored in U_CTR. For OCB mode it is used for the offset value. */ union { cipher_context_alignment_t iv_align; unsigned char iv[MAX_BLOCKSIZE]; } u_iv; /* The counter for CTR mode. This field is also used by AESWRAP and thus we can't use the U_IV union. For OCB mode it is used for the checksum. */ union { cipher_context_alignment_t iv_align; unsigned char ctr[MAX_BLOCKSIZE]; } u_ctr; /* Space to save an IV or CTR for chaining operations. */ unsigned char lastiv[MAX_BLOCKSIZE]; int unused; /* Number of unused bytes in LASTIV. */ union { /* Mode specific storage for CCM mode. */ struct { u64 encryptlen; u64 aadlen; unsigned int authlen; /* Space to save partial input lengths for MAC. */ unsigned char macbuf[GCRY_CCM_BLOCK_LEN]; int mac_unused; /* Number of unprocessed bytes in MACBUF. */ unsigned char s0[GCRY_CCM_BLOCK_LEN]; unsigned int nonce:1; /* Set to 1 if nonce has been set. */ unsigned int lengths:1; /* Set to 1 if CCM length parameters has been processed. */ } ccm; /* Mode specific storage for Poly1305 mode. */ struct { /* byte counter for AAD. */ u32 aadcount[2]; /* byte counter for data. */ u32 datacount[2]; unsigned int aad_finalized:1; unsigned int bytecount_over_limits:1; poly1305_context_t ctx; } poly1305; /* Mode specific storage for CMAC mode. */ gcry_cmac_context_t cmac; /* Mode specific storage for EAX mode. */ struct { /* CMAC for header (AAD). */ gcry_cmac_context_t cmac_header; /* CMAC for ciphertext. */ gcry_cmac_context_t cmac_ciphertext; } eax; /* Mode specific storage for GCM mode and GCM-SIV mode. */ struct { /* The interim tag for GCM mode. */ union { cipher_context_alignment_t iv_align; unsigned char tag[MAX_BLOCKSIZE]; } u_tag; /* Space to save partial input lengths for MAC. */ unsigned char macbuf[GCRY_CCM_BLOCK_LEN]; int mac_unused; /* Number of unprocessed bytes in MACBUF. */ /* byte counters for GCM */ u32 aadlen[2]; u32 datalen[2]; /* encrypted tag counter */ unsigned char tagiv[MAX_BLOCKSIZE]; unsigned int ghash_data_finalized:1; unsigned int ghash_aad_finalized:1; unsigned int datalen_over_limits:1; unsigned int disallow_encryption_because_of_setiv_in_fips_mode:1; /* --- Following members are not cleared in gcry_cipher_reset --- */ /* GHASH multiplier from key. */ union { cipher_context_alignment_t iv_align; unsigned char key[MAX_BLOCKSIZE]; } u_ghash_key; /* Pre-calculated table for GCM. 
*/ #ifdef GCM_USE_TABLES #if (SIZEOF_UNSIGNED_LONG == 8 || defined(__x86_64__)) #define GCM_TABLES_USE_U64 1 u64 gcm_table[4 * 16]; #else #undef GCM_TABLES_USE_U64 u32 gcm_table[8 * 16]; #endif #endif /* GHASH implementation in use. */ ghash_fn_t ghash_fn; /* POLYVAL implementation in use (GCM-SIV). */ ghash_fn_t polyval_fn; /* Key length used for GCM-SIV key generating key. */ unsigned int siv_keylen; /* Flags for accelerated implementations. */ unsigned int hw_impl_flags; } gcm; /* Mode specific storage for OCB mode. */ struct { /* --- Following members are not cleared in gcry_cipher_reset --- */ /* Helper variables and pre-computed table of L values. */ unsigned char L_star[OCB_BLOCK_LEN]; unsigned char L_dollar[OCB_BLOCK_LEN]; unsigned char L0L1[OCB_BLOCK_LEN]; unsigned char L[OCB_L_TABLE_SIZE][OCB_BLOCK_LEN]; /* --- Following members are cleared in gcry_cipher_reset --- */ /* The tag is valid if marks.tag has been set. */ unsigned char tag[OCB_BLOCK_LEN]; /* A buffer to hold the offset for the AAD processing. */ unsigned char aad_offset[OCB_BLOCK_LEN]; /* A buffer to hold the current sum of AAD processing. We can't use tag here because tag may already hold the preprocessed checksum of the data. */ unsigned char aad_sum[OCB_BLOCK_LEN]; /* A buffer to store AAD data not yet processed. */ unsigned char aad_leftover[OCB_BLOCK_LEN]; /* Number of data/aad blocks processed so far. */ u64 data_nblocks; u64 aad_nblocks; /* Number of valid bytes in AAD_LEFTOVER. */ unsigned char aad_nleftover; /* Length of the tag. Fixed for now but may eventually be specified using a set of gcry_cipher_flags. */ unsigned char taglen; /* Flags indicating that the final data/aad block has been processed. */ unsigned int data_finalized:1; unsigned int aad_finalized:1; } ocb; /* Mode specific storage for XTS mode. */ struct { /* Pointer to tweak cipher context, allocated after actual * cipher context. */ char *tweak_context; } xts; /* Mode specific storage for SIV mode. */ struct { /* Tag used for decryption. */ unsigned char dec_tag[GCRY_SIV_BLOCK_LEN]; /* S2V state. */ unsigned char s2v_d[GCRY_SIV_BLOCK_LEN]; /* Number of AAD elements processed. */ unsigned int aad_count:8; /* Flags for SIV state. */ unsigned int dec_tag_set:1; /* --- Following members are not cleared in gcry_cipher_reset --- */ /* S2V CMAC state. */ gcry_cmac_context_t s2v_cmac; unsigned char s2v_zero_block[GCRY_SIV_BLOCK_LEN]; /* Pointer to CTR cipher context, allocated after actual * cipher context. */ char *ctr_context; } siv; /* Mode specific storage for WRAP mode. */ struct { unsigned char plen[4]; } wrap; } u_mode; /* What follows are two contexts of the cipher in use. The first one needs to be aligned well enough for the cipher operation whereas the second one is a copy created by cipher_setkey and used by cipher_reset. That second copy has no need for proper aligment because it is only accessed by memcpy. 
*/ cipher_context_alignment_t context; }; /*-- cipher-cbc.c --*/ gcry_err_code_t _gcry_cipher_cbc_encrypt /* */ (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen, const unsigned char *inbuf, size_t inbuflen); gcry_err_code_t _gcry_cipher_cbc_decrypt /* */ (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen, const unsigned char *inbuf, size_t inbuflen); gcry_err_code_t _gcry_cipher_cbc_cts_encrypt /* */ (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen, const unsigned char *inbuf, size_t inbuflen); gcry_err_code_t _gcry_cipher_cbc_cts_decrypt /* */ (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen, const unsigned char *inbuf, size_t inbuflen); /*-- cipher-cfb.c --*/ gcry_err_code_t _gcry_cipher_cfb_encrypt /* */ (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen, const unsigned char *inbuf, size_t inbuflen); gcry_err_code_t _gcry_cipher_cfb_decrypt /* */ (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen, const unsigned char *inbuf, size_t inbuflen); gcry_err_code_t _gcry_cipher_cfb8_encrypt /* */ (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen, const unsigned char *inbuf, size_t inbuflen); gcry_err_code_t _gcry_cipher_cfb8_decrypt /* */ (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen, const unsigned char *inbuf, size_t inbuflen); /*-- cipher-ofb.c --*/ gcry_err_code_t _gcry_cipher_ofb_encrypt /* */ (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen, const unsigned char *inbuf, size_t inbuflen); /*-- cipher-ctr.c --*/ gcry_err_code_t _gcry_cipher_ctr_encrypt_ctx /* */ (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen, const unsigned char *inbuf, size_t inbuflen, void *algo_context); gcry_err_code_t _gcry_cipher_ctr_encrypt /* */ (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen, const unsigned char *inbuf, size_t inbuflen); /*-- cipher-aeswrap.c --*/ gcry_err_code_t _gcry_cipher_keywrap_encrypt /* */ (gcry_cipher_hd_t c, byte *outbuf, size_t outbuflen, const byte *inbuf, size_t inbuflen); gcry_err_code_t _gcry_cipher_keywrap_encrypt_padding /* */ (gcry_cipher_hd_t c, byte *outbuf, size_t outbuflen, const byte *inbuf, size_t inbuflen); gcry_err_code_t _gcry_cipher_keywrap_decrypt_auto /* */ (gcry_cipher_hd_t c, byte *outbuf, size_t outbuflen, const byte *inbuf, size_t inbuflen); /*-- cipher-ccm.c --*/ gcry_err_code_t _gcry_cipher_ccm_encrypt /* */ (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen, const unsigned char *inbuf, size_t inbuflen); gcry_err_code_t _gcry_cipher_ccm_decrypt /* */ (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen, const unsigned char *inbuf, size_t inbuflen); gcry_err_code_t _gcry_cipher_ccm_set_nonce /* */ (gcry_cipher_hd_t c, const unsigned char *nonce, size_t noncelen); gcry_err_code_t _gcry_cipher_ccm_authenticate /* */ (gcry_cipher_hd_t c, const unsigned char *abuf, size_t abuflen); gcry_err_code_t _gcry_cipher_ccm_set_lengths /* */ (gcry_cipher_hd_t c, u64 encryptedlen, u64 aadlen, u64 taglen); gcry_err_code_t _gcry_cipher_ccm_get_tag /* */ (gcry_cipher_hd_t c, unsigned char *outtag, size_t taglen); gcry_err_code_t _gcry_cipher_ccm_check_tag /* */ (gcry_cipher_hd_t c, const unsigned char *intag, size_t taglen); /*-- cipher-cmac.c --*/ gcry_err_code_t _gcry_cmac_generate_subkeys /* */ (gcry_cipher_hd_t c, gcry_cmac_context_t *ctx); gcry_err_code_t _gcry_cmac_write /* */ (gcry_cipher_hd_t c, gcry_cmac_context_t *ctx, const byte * inbuf, size_t inlen); gcry_err_code_t _gcry_cmac_final /* */ (gcry_cipher_hd_t c, 
gcry_cmac_context_t *ctx); void _gcry_cmac_reset (gcry_cmac_context_t *ctx); /*-- cipher-eax.c --*/ gcry_err_code_t _gcry_cipher_eax_encrypt /* */ (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen, const unsigned char *inbuf, size_t inbuflen); gcry_err_code_t _gcry_cipher_eax_decrypt /* */ (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen, const unsigned char *inbuf, size_t inbuflen); gcry_err_code_t _gcry_cipher_eax_set_nonce /* */ (gcry_cipher_hd_t c, const unsigned char *nonce, size_t noncelen); gcry_err_code_t _gcry_cipher_eax_authenticate /* */ (gcry_cipher_hd_t c, const unsigned char *aadbuf, size_t aadbuflen); gcry_err_code_t _gcry_cipher_eax_get_tag /* */ (gcry_cipher_hd_t c, unsigned char *outtag, size_t taglen); gcry_err_code_t _gcry_cipher_eax_check_tag /* */ (gcry_cipher_hd_t c, const unsigned char *intag, size_t taglen); gcry_err_code_t _gcry_cipher_eax_setkey /* */ (gcry_cipher_hd_t c); /*-- cipher-gcm.c --*/ gcry_err_code_t _gcry_cipher_gcm_encrypt /* */ (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen, const unsigned char *inbuf, size_t inbuflen); gcry_err_code_t _gcry_cipher_gcm_decrypt /* */ (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen, const unsigned char *inbuf, size_t inbuflen); gcry_err_code_t _gcry_cipher_gcm_setiv /* */ (gcry_cipher_hd_t c, const unsigned char *iv, size_t ivlen); gcry_err_code_t _gcry_cipher_gcm_authenticate /* */ (gcry_cipher_hd_t c, const unsigned char *aadbuf, size_t aadbuflen); gcry_err_code_t _gcry_cipher_gcm_get_tag /* */ (gcry_cipher_hd_t c, unsigned char *outtag, size_t taglen); gcry_err_code_t _gcry_cipher_gcm_check_tag /* */ (gcry_cipher_hd_t c, const unsigned char *intag, size_t taglen); void _gcry_cipher_gcm_setkey /* */ (gcry_cipher_hd_t c); void _gcry_cipher_gcm_setupM /* */ (gcry_cipher_hd_t c); /*-- cipher-poly1305.c --*/ gcry_err_code_t _gcry_cipher_poly1305_encrypt /* */ (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen, const unsigned char *inbuf, size_t inbuflen); gcry_err_code_t _gcry_cipher_poly1305_decrypt /* */ (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen, const unsigned char *inbuf, size_t inbuflen); gcry_err_code_t _gcry_cipher_poly1305_setiv /* */ (gcry_cipher_hd_t c, const unsigned char *iv, size_t ivlen); gcry_err_code_t _gcry_cipher_poly1305_authenticate /* */ (gcry_cipher_hd_t c, const unsigned char *aadbuf, size_t aadbuflen); gcry_err_code_t _gcry_cipher_poly1305_get_tag /* */ (gcry_cipher_hd_t c, unsigned char *outtag, size_t taglen); gcry_err_code_t _gcry_cipher_poly1305_check_tag /* */ (gcry_cipher_hd_t c, const unsigned char *intag, size_t taglen); void _gcry_cipher_poly1305_setkey /* */ (gcry_cipher_hd_t c); /*-- chacha20.c --*/ gcry_err_code_t _gcry_chacha20_poly1305_encrypt /* */ (gcry_cipher_hd_t c, byte *outbuf, const byte *inbuf, size_t length); gcry_err_code_t _gcry_chacha20_poly1305_decrypt /* */ (gcry_cipher_hd_t c, byte *outbuf, const byte *inbuf, size_t length); /*-- cipher-ocb.c --*/ gcry_err_code_t _gcry_cipher_ocb_encrypt /* */ (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen, const unsigned char *inbuf, size_t inbuflen); gcry_err_code_t _gcry_cipher_ocb_decrypt /* */ (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen, const unsigned char *inbuf, size_t inbuflen); gcry_err_code_t _gcry_cipher_ocb_set_nonce /* */ (gcry_cipher_hd_t c, const unsigned char *nonce, size_t noncelen); gcry_err_code_t _gcry_cipher_ocb_authenticate /* */ (gcry_cipher_hd_t c, const unsigned char *abuf, size_t abuflen); 
gcry_err_code_t _gcry_cipher_ocb_get_tag /* */ (gcry_cipher_hd_t c, unsigned char *outtag, size_t taglen); gcry_err_code_t _gcry_cipher_ocb_check_tag /* */ (gcry_cipher_hd_t c, const unsigned char *intag, size_t taglen); void _gcry_cipher_ocb_setkey /* */ (gcry_cipher_hd_t c); /*-- cipher-xts.c --*/ gcry_err_code_t _gcry_cipher_xts_encrypt /* */ (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen, const unsigned char *inbuf, size_t inbuflen); gcry_err_code_t _gcry_cipher_xts_decrypt /* */ (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen, const unsigned char *inbuf, size_t inbuflen); /*-- cipher-siv.c --*/ gcry_err_code_t _gcry_cipher_siv_encrypt /* */ (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen, const unsigned char *inbuf, size_t inbuflen); gcry_err_code_t _gcry_cipher_siv_decrypt /* */ (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen, const unsigned char *inbuf, size_t inbuflen); gcry_err_code_t _gcry_cipher_siv_set_nonce /* */ (gcry_cipher_hd_t c, const unsigned char *nonce, size_t noncelen); gcry_err_code_t _gcry_cipher_siv_authenticate /* */ (gcry_cipher_hd_t c, const unsigned char *abuf, size_t abuflen); gcry_err_code_t _gcry_cipher_siv_set_decryption_tag /* */ (gcry_cipher_hd_t c, const byte *tag, size_t taglen); gcry_err_code_t _gcry_cipher_siv_get_tag /* */ (gcry_cipher_hd_t c, unsigned char *outtag, size_t taglen); gcry_err_code_t _gcry_cipher_siv_check_tag /* */ (gcry_cipher_hd_t c, const unsigned char *intag, size_t taglen); gcry_err_code_t _gcry_cipher_siv_setkey /* */ (gcry_cipher_hd_t c, const unsigned char *ctrkey, size_t ctrkeylen); /*-- cipher-gcm-siv.c --*/ gcry_err_code_t _gcry_cipher_gcm_siv_encrypt /* */ (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen, const unsigned char *inbuf, size_t inbuflen); gcry_err_code_t _gcry_cipher_gcm_siv_decrypt /* */ (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen, const unsigned char *inbuf, size_t inbuflen); gcry_err_code_t _gcry_cipher_gcm_siv_set_nonce /* */ (gcry_cipher_hd_t c, const unsigned char *nonce, size_t noncelen); gcry_err_code_t _gcry_cipher_gcm_siv_authenticate /* */ (gcry_cipher_hd_t c, const unsigned char *abuf, size_t abuflen); gcry_err_code_t _gcry_cipher_gcm_siv_set_decryption_tag /* */ (gcry_cipher_hd_t c, const byte *tag, size_t taglen); gcry_err_code_t _gcry_cipher_gcm_siv_get_tag /* */ (gcry_cipher_hd_t c, unsigned char *outtag, size_t taglen); gcry_err_code_t _gcry_cipher_gcm_siv_check_tag /* */ (gcry_cipher_hd_t c, const unsigned char *intag, size_t taglen); gcry_err_code_t _gcry_cipher_gcm_siv_setkey /* */ (gcry_cipher_hd_t c, unsigned int keylen); /* Return the L-value for block N. Note: 'cipher_ocb.c' ensures that N * will never be multiple of 65536 (1 << OCB_L_TABLE_SIZE), thus N can * be directly passed to _gcry_ctz() function and resulting index will * never overflow the table. */ static inline const unsigned char * ocb_get_l (gcry_cipher_hd_t c, u64 n) { unsigned long ntz; #if ((defined(__i386__) || defined(__x86_64__)) && __GNUC__ >= 4) /* Assumes that N != 0. */ asm ("rep;bsfl %k[low], %k[ntz]\n\t" : [ntz] "=r" (ntz) : [low] "r" ((unsigned long)n) : "cc"); #else ntz = _gcry_ctz (n); #endif return c->u_mode.ocb.L[ntz]; } /* Return bit-shift of blocksize. */ static inline unsigned int _gcry_blocksize_shift(gcry_cipher_hd_t c) { /* Only blocksizes 8 and 16 are used. Return value in such way * that compiler can optimize calling functions based on this. */ return c->spec->blocksize == 8 ? 
3 : 4; } /* Optimized function for adding value to cipher block. */ static inline void cipher_block_add(void *_dstsrc, unsigned int add, size_t blocksize) { byte *dstsrc = _dstsrc; u64 s[2]; if (blocksize == 8) { buf_put_be64(dstsrc + 0, buf_get_be64(dstsrc + 0) + add); } else /* blocksize == 16 */ { s[0] = buf_get_be64(dstsrc + 8); s[1] = buf_get_be64(dstsrc + 0); s[0] += add; s[1] += (s[0] < add); buf_put_be64(dstsrc + 8, s[0]); buf_put_be64(dstsrc + 0, s[1]); } } /* Optimized function for cipher block copying */ static inline void cipher_block_cpy(void *_dst, const void *_src, size_t blocksize) { byte *dst = _dst; const byte *src = _src; u64 s[2]; if (blocksize == 8) { buf_put_he64(dst + 0, buf_get_he64(src + 0)); } else /* blocksize == 16 */ { s[0] = buf_get_he64(src + 0); s[1] = buf_get_he64(src + 8); buf_put_he64(dst + 0, s[0]); buf_put_he64(dst + 8, s[1]); } } /* Optimized function for cipher block xoring */ static inline void cipher_block_xor(void *_dst, const void *_src1, const void *_src2, size_t blocksize) { byte *dst = _dst; const byte *src1 = _src1; const byte *src2 = _src2; u64 s1[2]; u64 s2[2]; if (blocksize == 8) { buf_put_he64(dst + 0, buf_get_he64(src1 + 0) ^ buf_get_he64(src2 + 0)); } else /* blocksize == 16 */ { s1[0] = buf_get_he64(src1 + 0); s1[1] = buf_get_he64(src1 + 8); s2[0] = buf_get_he64(src2 + 0); s2[1] = buf_get_he64(src2 + 8); buf_put_he64(dst + 0, s1[0] ^ s2[0]); buf_put_he64(dst + 8, s1[1] ^ s2[1]); } } /* Optimized function for in-place cipher block xoring */ static inline void cipher_block_xor_1(void *_dst, const void *_src, size_t blocksize) { cipher_block_xor (_dst, _dst, _src, blocksize); } /* Optimized function for cipher block xoring with two destination cipher blocks. Used mainly by CFB mode encryption. */ static inline void cipher_block_xor_2dst(void *_dst1, void *_dst2, const void *_src, size_t blocksize) { byte *dst1 = _dst1; byte *dst2 = _dst2; const byte *src = _src; u64 d2[2]; u64 s[2]; if (blocksize == 8) { d2[0] = buf_get_he64(dst2 + 0) ^ buf_get_he64(src + 0); buf_put_he64(dst2 + 0, d2[0]); buf_put_he64(dst1 + 0, d2[0]); } else /* blocksize == 16 */ { s[0] = buf_get_he64(src + 0); s[1] = buf_get_he64(src + 8); d2[0] = buf_get_he64(dst2 + 0); d2[1] = buf_get_he64(dst2 + 8); d2[0] = d2[0] ^ s[0]; d2[1] = d2[1] ^ s[1]; buf_put_he64(dst2 + 0, d2[0]); buf_put_he64(dst2 + 8, d2[1]); buf_put_he64(dst1 + 0, d2[0]); buf_put_he64(dst1 + 8, d2[1]); } } /* Optimized function for combined cipher block xoring and copying. Used by mainly CBC mode decryption. */ static inline void cipher_block_xor_n_copy_2(void *_dst_xor, const void *_src_xor, void *_srcdst_cpy, const void *_src_cpy, size_t blocksize) { byte *dst_xor = _dst_xor; byte *srcdst_cpy = _srcdst_cpy; const byte *src_xor = _src_xor; const byte *src_cpy = _src_cpy; u64 sc[2]; u64 sx[2]; u64 sdc[2]; if (blocksize == 8) { sc[0] = buf_get_he64(src_cpy + 0); buf_put_he64(dst_xor + 0, buf_get_he64(srcdst_cpy + 0) ^ buf_get_he64(src_xor + 0)); buf_put_he64(srcdst_cpy + 0, sc[0]); } else /* blocksize == 16 */ { sc[0] = buf_get_he64(src_cpy + 0); sc[1] = buf_get_he64(src_cpy + 8); sx[0] = buf_get_he64(src_xor + 0); sx[1] = buf_get_he64(src_xor + 8); sdc[0] = buf_get_he64(srcdst_cpy + 0); sdc[1] = buf_get_he64(srcdst_cpy + 8); sx[0] ^= sdc[0]; sx[1] ^= sdc[1]; buf_put_he64(dst_xor + 0, sx[0]); buf_put_he64(dst_xor + 8, sx[1]); buf_put_he64(srcdst_cpy + 0, sc[0]); buf_put_he64(srcdst_cpy + 8, sc[1]); } } /* Optimized function for combined cipher block byte-swapping. 
*/ static inline void cipher_block_bswap (void *_dst_bswap, const void *_src_bswap, size_t blocksize) { byte *dst_bswap = _dst_bswap; const byte *src_bswap = _src_bswap; u64 t[2]; if (blocksize == 8) { buf_put_le64(dst_bswap, buf_get_be64(src_bswap)); } else { t[0] = buf_get_be64(src_bswap + 0); t[1] = buf_get_be64(src_bswap + 8); buf_put_le64(dst_bswap + 8, t[0]); buf_put_le64(dst_bswap + 0, t[1]); } } /* Optimized function for combined cipher block xoring and copying. Used by mainly CFB mode decryption. */ static inline void cipher_block_xor_n_copy(void *_dst_xor, void *_srcdst_cpy, const void *_src, size_t blocksize) { cipher_block_xor_n_copy_2(_dst_xor, _src, _srcdst_cpy, _src, blocksize); } #endif /*G10_CIPHER_INTERNAL_H*/
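Note on the dispatch introduced by this patch: both _gcry_ghash_intel_pclmul and _gcry_polyval_intel_pclmul now consume input in the widest aggregate the CPU supports (32 blocks with VPCLMUL/AVX512, then 16 blocks with VPCLMUL/AVX2, then 8, 4 and finally single blocks), and each power-of-two table of key powers is built lazily on top of the next smaller one, guarded by the GCM_INTEL_AGGR*_TABLE_INITIALIZED flags. The following is a minimal standalone C sketch of that control flow only; the names are illustrative stand-ins, not the libgcrypt API:

#include <stddef.h>

/* Illustrative flag bits, mirroring the GCM_INTEL_* flags above. */
#define USE_AVX512    (1 << 0)
#define AGGR8_READY   (1 << 1)
#define AGGR16_READY  (1 << 2)
#define AGGR32_READY  (1 << 3)

struct ghash_state
{
  unsigned int flags;
  /* ... hash value and table of key powers H^1..H^32 ... */
};

/* Hypothetical stand-ins for ghash_setup_aggr8/16/32: each one extends
   the table of key powers and flips its "ready" flag. */
static void setup_aggr8 (struct ghash_state *s)  { s->flags |= AGGR8_READY; }
static void setup_aggr16 (struct ghash_state *s) { s->flags |= AGGR16_READY; }
static void setup_aggr32 (struct ghash_state *s) { s->flags |= AGGR32_READY; }

void
ghash_blocks (struct ghash_state *s, const unsigned char *buf, size_t nblocks)
{
  if (nblocks >= 32 && (s->flags & USE_AVX512))
    {
      /* Tables are built on demand, smallest first, so a context that
         only ever hashes short inputs never pays for the big ones. */
      if (!(s->flags & AGGR32_READY))
        {
          if (!(s->flags & AGGR16_READY))
            {
              if (!(s->flags & AGGR8_READY))
                setup_aggr8 (s);
              setup_aggr16 (s);
            }
          setup_aggr32 (s);
        }

      while (nblocks >= 32)
        {
          /* One multiply-accumulate pass over 32 blocks, one reduction. */
          buf += 32 * 16;
          nblocks -= 32;
        }
    }

  /* ... then the 16-, 8- and 4-block aggregates and a single-block tail
     loop, exactly as in the two functions above ... */
  (void) buf;
  (void) nblocks;
}

The point of the nesting is that the setup cost grows with the aggregate size, so each table is only computed the first time an input long enough to use it arrives, and later calls skip straight to the processing loops.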
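The carry handling in cipher_block_add (blocksize == 16 case) deserves spelling out: the block is a 128-bit big-endian counter held as two 64-bit big-endian words, and overflow of the low word is detected by the unsigned comparison s[0] < add performed after the addition. Below is a self-contained check of that logic; get_be64/put_be64 are hand-rolled stand-ins for the buf_get_be64/buf_put_be64 helpers from bufhelp.h:

#include <stdint.h>
#include <string.h>
#include <assert.h>

/* Stand-ins for libgcrypt's buf_get_be64/buf_put_be64 (bufhelp.h). */
static uint64_t get_be64 (const uint8_t *p)
{
  uint64_t v = 0;
  for (int i = 0; i < 8; i++)
    v = (v << 8) | p[i];
  return v;
}

static void put_be64 (uint8_t *p, uint64_t v)
{
  for (int i = 7; i >= 0; i--)
    {
      p[i] = (uint8_t) v;
      v >>= 8;
    }
}

/* Same scheme as cipher_block_add for blocksize == 16: add to the low
   (big-endian) quadword, then propagate the carry into the high one. */
static void block16_add (uint8_t *blk, unsigned int add)
{
  uint64_t lo = get_be64 (blk + 8);
  uint64_t hi = get_be64 (blk + 0);
  lo += add;
  hi += (lo < add);          /* carry out of the low 64 bits */
  put_be64 (blk + 8, lo);
  put_be64 (blk + 0, hi);
}

int main (void)
{
  uint8_t blk[16];
  memset (blk, 0, sizeof blk);
  memset (blk + 8, 0xff, 8);       /* low quadword = 2^64 - 1 */
  block16_add (blk, 1);            /* must carry into the high quadword */
  assert (get_be64 (blk + 0) == 1 && get_be64 (blk + 8) == 0);
  return 0;
}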